From 01f9cfab8bf73653ff6df066155a01ce7892cf7d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 1 Mar 2017 11:50:24 +0000 Subject: [PATCH 1/6] qemu-options: explain disk I/O throttling options The disk I/O throttling options have been listed for a long time but never explained on the QEMU man page. Suggested-by: Nini Gu Cc: Alberto Garcia Signed-off-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Reviewed-by: Greg Kurz Message-id: 20170301115026.22621-2-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- qemu-options.hx | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index 99af8edf5f..9171bd5eec 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -635,6 +635,30 @@ file sectors into the image file. conversion of plain zero writes by the OS to driver specific optimized zero write commands. You may even choose "unmap" if @var{discard} is set to "unmap" to allow a zero write to be converted to an UNMAP operation. +@item bps=@var{b},bps_rd=@var{r},bps_wr=@var{w} +Specify bandwidth throttling limits in bytes per second, either for all request +types or for reads or writes only. Small values can lead to timeouts or hangs +inside the guest. A safe minimum for disks is 2 MB/s. +@item bps_max=@var{bm},bps_rd_max=@var{rm},bps_wr_max=@var{wm} +Specify bursts in bytes per second, either for all request types or for reads +or writes only. Bursts allow the guest I/O to spike above the limit +temporarily. +@item iops=@var{i},iops_rd=@var{r},iops_wr=@var{w} +Specify request rate limits in requests per second, either for all request +types or for reads or writes only. +@item iops_max=@var{bm},iops_rd_max=@var{rm},iops_wr_max=@var{wm} +Specify bursts in requests per second, either for all request types or for reads +or writes only. Bursts allow the guest I/O to spike above the limit +temporarily. 
+@item iops_size=@var{is} +Let every @var{is} bytes of a request count as a new request for iops +throttling purposes. Use this option to prevent guests from circumventing iops +limits by sending fewer but larger requests. +@item group=@var{g} +Join a throttling quota group with given name @var{g}. All drives that are +members of the same group are accounted for together. Use this option to +prevent guests from circumventing throttling limits by using many small disks +instead of a single larger disk. @end table By default, the @option{cache=writeback} mode is used. It will report data From ab08aec45f67a776ea37cee0bf94a34abb84ad97 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 1 Mar 2017 11:50:25 +0000 Subject: [PATCH 2/6] throttle: do not use invalid config in test The (burst) max parameter cannot be smaller than the avg parameter. There is a test case that uses avg = 56, max = 1 and gets away with it because no input validation is performed by the test case. This patch switches to valid test input parameters. 
Signed-off-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20170301115026.22621-3-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/test-throttle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test-throttle.c b/tests/test-throttle.c index bd7c501b2e..089e937356 100644 --- a/tests/test-throttle.c +++ b/tests/test-throttle.c @@ -205,8 +205,8 @@ static void test_config_functions(void) orig_cfg.buckets[THROTTLE_OPS_READ].avg = 69; orig_cfg.buckets[THROTTLE_OPS_WRITE].avg = 23; - orig_cfg.buckets[THROTTLE_BPS_TOTAL].max = 0; /* should be corrected */ - orig_cfg.buckets[THROTTLE_BPS_READ].max = 1; /* should not be corrected */ + orig_cfg.buckets[THROTTLE_BPS_TOTAL].max = 0; /* should be corrected */ + orig_cfg.buckets[THROTTLE_BPS_READ].max = 56; /* should not be corrected */ orig_cfg.buckets[THROTTLE_BPS_WRITE].max = 120; orig_cfg.buckets[THROTTLE_OPS_TOTAL].max = 150; @@ -246,8 +246,8 @@ static void test_config_functions(void) g_assert(final_cfg.buckets[THROTTLE_OPS_READ].avg == 69); g_assert(final_cfg.buckets[THROTTLE_OPS_WRITE].avg == 23); - g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].max == 15.3);/* fixed */ - g_assert(final_cfg.buckets[THROTTLE_BPS_READ].max == 1); /* not fixed */ + g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].max == 15.3); /* fixed */ + g_assert(final_cfg.buckets[THROTTLE_BPS_READ].max == 56); /* not fixed */ g_assert(final_cfg.buckets[THROTTLE_BPS_WRITE].max == 120); g_assert(final_cfg.buckets[THROTTLE_OPS_TOTAL].max == 150); From d72915c60bff51495529449750e051d01b03c62f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Wed, 1 Mar 2017 11:50:26 +0000 Subject: [PATCH 3/6] throttle: make throttle_config(throttle_get_config()) symmetric Throttling has a weird property that throttle_get_config() does not always return the same throttling settings that were given with throttle_config(). In other words, the set and get functions aren't symmetric. 
If .max is 0 then the throttling code assigns a default value of .avg / 10 in throttle_config(). This is an implementation detail of the throttling algorithm. When throttle_get_config() is called the .max value returned should still be 0. Users are exposed to this quirk via "info block" or "query-block" monitor commands. This has caused confusion because it looks like a bug when an unexpected value is reported. This patch hides the .max value adjustment in throttle_get_config() and updates test-throttle.c appropriately. Reported-by: Nini Gu Signed-off-by: Stefan Hajnoczi Reviewed-by: Alberto Garcia Message-id: 20170301115026.22621-4-stefanha@redhat.com Signed-off-by: Stefan Hajnoczi --- tests/test-throttle.c | 8 ++++---- util/throttle.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/test-throttle.c b/tests/test-throttle.c index 089e937356..a9201b1fea 100644 --- a/tests/test-throttle.c +++ b/tests/test-throttle.c @@ -205,8 +205,8 @@ static void test_config_functions(void) orig_cfg.buckets[THROTTLE_OPS_READ].avg = 69; orig_cfg.buckets[THROTTLE_OPS_WRITE].avg = 23; - orig_cfg.buckets[THROTTLE_BPS_TOTAL].max = 0; /* should be corrected */ - orig_cfg.buckets[THROTTLE_BPS_READ].max = 56; /* should not be corrected */ + orig_cfg.buckets[THROTTLE_BPS_TOTAL].max = 0; + orig_cfg.buckets[THROTTLE_BPS_READ].max = 56; orig_cfg.buckets[THROTTLE_BPS_WRITE].max = 120; orig_cfg.buckets[THROTTLE_OPS_TOTAL].max = 150; @@ -246,8 +246,8 @@ static void test_config_functions(void) g_assert(final_cfg.buckets[THROTTLE_OPS_READ].avg == 69); g_assert(final_cfg.buckets[THROTTLE_OPS_WRITE].avg == 23); - g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].max == 15.3); /* fixed */ - g_assert(final_cfg.buckets[THROTTLE_BPS_READ].max == 56); /* not fixed */ + g_assert(final_cfg.buckets[THROTTLE_BPS_TOTAL].max == 0); + g_assert(final_cfg.buckets[THROTTLE_BPS_READ].max == 56); g_assert(final_cfg.buckets[THROTTLE_BPS_WRITE].max == 120); 
g_assert(final_cfg.buckets[THROTTLE_OPS_TOTAL].max == 150); diff --git a/util/throttle.c b/util/throttle.c index 3817d9b904..3570ed25fc 100644 --- a/util/throttle.c +++ b/util/throttle.c @@ -380,6 +380,14 @@ static void throttle_fix_bucket(LeakyBucket *bkt) } } +/* undo internal bucket parameter changes (see throttle_fix_bucket()) */ +static void throttle_unfix_bucket(LeakyBucket *bkt) +{ + if (bkt->max < bkt->avg) { + bkt->max = 0; + } +} + /* take care of canceling a timer */ static void throttle_cancel_timer(QEMUTimer *timer) { @@ -420,7 +428,13 @@ void throttle_config(ThrottleState *ts, */ void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg) { + int i; + *cfg = ts->cfg; + + for (i = 0; i < BUCKETS_COUNT; i++) { + throttle_unfix_bucket(&cfg->buckets[i]); + } } From 3928d50bf151d2f25fde93432b0ee1d8eddd982d Mon Sep 17 00:00:00 2001 From: Lidong Chen Date: Thu, 13 Apr 2017 10:34:28 +0800 Subject: [PATCH 4/6] migration/block: use blk_pwrite_zeroes for each zero cluster BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default, this may cause the qcow2 file size to be bigger after migration. This patch checks each cluster, using blk_pwrite_zeroes for each zero cluster. [Initialize cluster_size to BLOCK_SIZE to prevent a gcc uninitialized variable compiler warning. In reality we always initialize cluster_size in a conditional but gcc doesn't know that. 
--Stefan] Reviewed-by: Stefan Hajnoczi Signed-off-by: Lidong Chen Message-id: 1492050868-16200-1-git-send-email-lidongchen@tencent.com Signed-off-by: Stefan Hajnoczi --- migration/block.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/migration/block.c b/migration/block.c index 7734ff728a..060087fa32 100644 --- a/migration/block.c +++ b/migration/block.c @@ -885,6 +885,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) int64_t total_sectors = 0; int nr_sectors; int ret; + BlockDriverInfo bdi; + int cluster_size = BLOCK_SIZE; do { addr = qemu_get_be64(f); @@ -919,6 +921,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) error_report_err(local_err); return -EINVAL; } + + ret = bdrv_get_info(blk_bs(blk), &bdi); + if (ret == 0 && bdi.cluster_size > 0 && + bdi.cluster_size <= BLOCK_SIZE && + BLOCK_SIZE % bdi.cluster_size == 0) { + cluster_size = bdi.cluster_size; + } else { + cluster_size = BLOCK_SIZE; + } } if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) { @@ -932,10 +943,30 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) nr_sectors * BDRV_SECTOR_SIZE, BDRV_REQ_MAY_UNMAP); } else { + int i; + int64_t cur_addr; + uint8_t *cur_buf; + buf = g_malloc(BLOCK_SIZE); qemu_get_buffer(f, buf, BLOCK_SIZE); - ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf, - nr_sectors * BDRV_SECTOR_SIZE, 0); + for (i = 0; i < BLOCK_SIZE / cluster_size; i++) { + cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size; + cur_buf = buf + i * cluster_size; + + if ((!block_mig_state.zero_blocks || + cluster_size < BLOCK_SIZE) && + buffer_is_zero(cur_buf, cluster_size)) { + ret = blk_pwrite_zeroes(blk, cur_addr, + cluster_size, + BDRV_REQ_MAY_UNMAP); + } else { + ret = blk_pwrite(blk, cur_addr, cur_buf, + cluster_size, 0); + } + if (ret < 0) { + break; + } + } g_free(buf); } From 205f8618beb8ec00f7617b9c737dfd273b1310eb Mon Sep 17 00:00:00 2001 From: Changlong Xie Date: Tue, 18 
Apr 2017 11:08:13 +0800 Subject: [PATCH 5/6] MAINTAINERS: update Wen's email address So he can get CC'ed on future patches and bugs for this feature. Signed-off-by: Changlong Xie Message-id: 1492484893-23435-1-git-send-email-xiecl.fnst@cn.fujitsu.com Signed-off-by: Stefan Hajnoczi --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c60235eaf6..5638992da8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1817,7 +1817,7 @@ S: Supported F: tests/image-fuzzer/ Replication -M: Wen Congyang +M: Wen Congyang M: Changlong Xie S: Supported F: replication* From 3ccc0a0163b932fe980dce8d26db4bf98b1900e9 Mon Sep 17 00:00:00 2001 From: Zhang Chen Date: Fri, 21 Apr 2017 15:12:47 +0800 Subject: [PATCH 6/6] MAINTAINERS: update my email address I'm leaving my job at Fujitsu; this email address will stop working this week. Update it to one that I will have access to later. Signed-off-by: Xie Changlong Message-id: 1492758767-19716-1-git-send-email-xiecl.fnst@cn.fujitsu.com Signed-off-by: Stefan Hajnoczi --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5638992da8..cae3b09f9c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1818,7 +1818,7 @@ F: tests/image-fuzzer/ Replication M: Wen Congyang -M: Changlong Xie +M: Xie Changlong S: Supported F: replication* F: block/replication.c