From eed325b92c3e68417121ea23f96e33af6a4654ed Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 28 Jan 2020 16:06:41 +0100 Subject: [PATCH 01/36] mirror: Store MirrorOp.co for debuggability If a coroutine is launched, but the coroutine pointer isn't stored anywhere, debugging any problems inside the coroutine is quite hard. Let's store the coroutine pointer of a mirror operation in MirrorOp to have it available in the debugger. Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- block/mirror.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/mirror.c b/block/mirror.c index f0f2d9dff1..8959e4255f 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -103,6 +103,7 @@ struct MirrorOp { bool is_pseudo_op; bool is_active_write; CoQueue waiting_requests; + Coroutine *co; QTAILQ_ENTRY(MirrorOp) next; }; @@ -429,6 +430,7 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset, default: abort(); } + op->co = co; QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next); qemu_coroutine_enter(co); From 7e6c4ff792734e196c8ca82564c56b5e7c6288ca Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 28 Jan 2020 16:09:28 +0100 Subject: [PATCH 02/36] mirror: Don't let an operation wait for itself mirror_wait_for_free_in_flight_slot() just picks a random operation to wait for. However, when mirror_co_read() waits for free slots, its MirrorOp is already in s->ops_in_flight, so if not enough slots are immediately available, an operation can end up waiting for itself to complete, which results in a hang. Fix this by passing the current MirrorOp and skipping this operation when picking an operation to wait for. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 Signed-off-by: Kevin Wolf Reviewed-by: Eric Blake --- block/mirror.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 8959e4255f..cacbc70014 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -283,11 +283,14 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, } static inline void coroutine_fn -mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) +mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) { MirrorOp *op; QTAILQ_FOREACH(op, &s->ops_in_flight, next) { + if (self == op) { + continue; + } /* Do not wait on pseudo ops, because it may in turn wait on * some other operation to start, which may in fact be the * caller of this function. Since there is only one pseudo op @@ -302,10 +305,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) } static inline void coroutine_fn -mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) { /* Only non-active operations use up in-flight slots */ - mirror_wait_for_any_operation(s, false); + mirror_wait_for_any_operation(s, self, false); } /* Perform a mirror copy operation. @@ -348,7 +351,7 @@ static void coroutine_fn mirror_co_read(void *opaque) while (s->buf_free_count < nb_chunks) { trace_mirror_yield_in_flight(s, op->offset, s->in_flight); - mirror_wait_for_free_in_flight_slot(s); + mirror_wait_for_free_in_flight_slot(s, op); } /* Now make a QEMUIOVector taking enough granularity-sized chunks @@ -555,7 +558,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) while (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield_in_flight(s, offset, s->in_flight); - mirror_wait_for_free_in_flight_slot(s); + mirror_wait_for_free_in_flight_slot(s, pseudo_op); } if (s->ret < 0) { @@ -609,7 +612,7 @@ static void mirror_free_init(MirrorBlockJob *s) static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) { while (s->in_flight > 0) { - mirror_wait_for_free_in_flight_slot(s); + mirror_wait_for_free_in_flight_slot(s, NULL); } } @@ -794,7 +797,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) if (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, s->in_flight); - mirror_wait_for_free_in_flight_slot(s); + mirror_wait_for_free_in_flight_slot(s, NULL); continue; } @@ -947,7 +950,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) /* Do not start passive operations while there are active * writes in progress */ while (s->in_active_write_counter) { - mirror_wait_for_any_operation(s, true); + mirror_wait_for_any_operation(s, NULL, true); } if (s->ret < 0) { @@ -973,7 +976,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); - mirror_wait_for_free_in_flight_slot(s); + mirror_wait_for_free_in_flight_slot(s, NULL); continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); From 2d4b5256cf6a091bb6c3516661be11b9ec690f95 Mon Sep 17 00:00:00 2001 From: Alberto Garcia Date: Thu, 13 Feb 2020 18:16:46 +0100 Subject: [PATCH 03/36] qcow2: Fix alignment checks in encrypted images MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I/O requests to encrypted media should be aligned to the sector size used by the underlying encryption method, not to BDRV_SECTOR_SIZE. Fortunately this doesn't break anything at the moment because both existing QCRYPTO_BLOCK_*_SECTOR_SIZE have the same value as BDRV_SECTOR_SIZE. The checks in qcow2_co_preadv_encrypted() are also unnecessary because they are repeated immediately afterwards in qcow2_co_encdec(). Signed-off-by: Alberto Garcia Message-Id: <20200213171646.15876-1-berto@igalia.com> Reviewed-by: Daniel P. Berrangé Signed-off-by: Kevin Wolf --- block/qcow2-threads.c | 12 ++++++++---- block/qcow2.c | 2 -- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/block/qcow2-threads.c b/block/qcow2-threads.c index 8f5a0d1ebe..77bb578cdf 100644 --- a/block/qcow2-threads.c +++ b/block/qcow2-threads.c @@ -246,12 +246,15 @@ qcow2_co_encdec(BlockDriverState *bs, uint64_t host_offset, .len = len, .func = func, }; + uint64_t sector_size; - assert(QEMU_IS_ALIGNED(guest_offset, BDRV_SECTOR_SIZE)); - assert(QEMU_IS_ALIGNED(host_offset, BDRV_SECTOR_SIZE)); - assert(QEMU_IS_ALIGNED(len, BDRV_SECTOR_SIZE)); assert(s->crypto); + sector_size = qcrypto_block_get_sector_size(s->crypto); + assert(QEMU_IS_ALIGNED(guest_offset, sector_size)); + assert(QEMU_IS_ALIGNED(host_offset, sector_size)); + assert(QEMU_IS_ALIGNED(len, sector_size)); + return len == 0 ? 0 : qcow2_co_process(bs, qcow2_encdec_pool_func, &arg); } @@ -270,7 +273,8 @@ qcow2_co_encdec(BlockDriverState *bs, uint64_t host_offset, * will be written to the underlying storage device at * @host_offset * - * @len - length of the buffer (must be a BDRV_SECTOR_SIZE multiple) + * @len - length of the buffer (must be a multiple of the encryption + * sector size) * * Depending on the encryption method, @host_offset and/or @guest_offset * may be used for generating the initialization vector for diff --git a/block/qcow2.c b/block/qcow2.c index ef96606f8d..8dcee5efec 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2068,8 +2068,6 @@ qcow2_co_preadv_encrypted(BlockDriverState *bs, goto fail; } - assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)); - assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)); if (qcow2_co_decrypt(bs, file_cluster_offset + offset_into_cluster(s, offset), offset, buf, bytes) < 0) From 8475ea48544b313cf533312846a4899ddecb799c Mon Sep 17 00:00:00 2001 From: Hikaru Nishida Date: Mon, 10 Feb 2020 02:51:56 +0900 Subject: [PATCH 04/36] block/vvfat: Do not unref qcow on closing backing bdrv Before this commit, BDRVVVFATState.qcow is unrefed in write_target_close on closing backing bdrv of vvfat. However, qcow bdrv is opend as a child of vvfat in enable_write_target() so it will be also unrefed on closing vvfat itself. This causes use-after-free of qcow on freeing vvfat which has backing bdrv and qcow bdrv as children in this order because bdrv_close(vvfat) tries to free qcow bdrv after freeing backing bdrv as QLIST_FOREACH_SAFE() loop keeps next pointer, but BdrvChild of qcow is already freed in bdrv_close(backing bdrv). Signed-off-by: Hikaru Nishida Message-Id: <20200209175156.85748-1-hikarupsp@gmail.com> Signed-off-by: Kevin Wolf --- block/vvfat.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/block/vvfat.c b/block/vvfat.c index 019b8f1341..ab800c4887 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -3124,17 +3124,10 @@ write_target_commit(BlockDriverState *bs, uint64_t offset, uint64_t bytes, return ret; } -static void write_target_close(BlockDriverState *bs) { - BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_unref_child(s->bs, s->qcow); - g_free(s->qcow_filename); -} - static BlockDriver vvfat_write_target = { .format_name = "vvfat_write_target", .instance_size = sizeof(void*), .bdrv_co_pwritev = write_target_commit, - .bdrv_close = write_target_close, }; static void vvfat_qcow_options(int *child_flags, QDict *child_options, From dea9052ef1ba12c83f17d394c70d7d710ea1dec9 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 11 Feb 2020 10:48:58 +0100 Subject: [PATCH 05/36] qcow2: update_refcount(): Reset old_table_index after qcow2_cache_put() In the case that update_refcount() frees a refcount block, it evicts it from the metadata cache. Before doing so, however, it returns the currently used refcount block to the cache because it might be the same. Returning the refcount block early means that we need to reset old_table_index so that we reload the refcount block in the next iteration if it is actually still in use. Fixes: f71c08ea8e60f035485a512fd2af8908567592f0 Signed-off-by: Kevin Wolf Message-Id: <20200211094900.17315-2-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block/qcow2-refcount.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index c963bc8de1..7ef1c0e42a 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -889,6 +889,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, offset); if (table != NULL) { qcow2_cache_put(s->refcount_block_cache, &refcount_block); + old_table_index = -1; qcow2_cache_discard(s->refcount_block_cache, table); } From c3b6658c1a5a3fb24d6c27b2594cf86146f75b22 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 11 Feb 2020 10:48:59 +0100 Subject: [PATCH 06/36] qcow2: Fix qcow2_alloc_cluster_abort() for external data file For external data file, cluster allocations return an offset in the data file and are not refcounted. In this case, there is nothing to do for qcow2_alloc_cluster_abort(). Freeing the same offset in the qcow2 file is wrong and causes crashes in the better case or image corruption in the worse case. Signed-off-by: Kevin Wolf Message-Id: <20200211094900.17315-3-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- block/qcow2-cluster.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index 1947f13a2d..78c95dfa16 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -1026,8 +1026,11 @@ err: void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m) { BDRVQcow2State *s = bs->opaque; - qcow2_free_clusters(bs, m->alloc_offset, m->nb_clusters << s->cluster_bits, - QCOW2_DISCARD_NEVER); + if (!has_data_file(bs)) { + qcow2_free_clusters(bs, m->alloc_offset, + m->nb_clusters << s->cluster_bits, + QCOW2_DISCARD_NEVER); + } } /* From a0cf8daf77548786ced84d773f06fc70571c5d38 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Tue, 11 Feb 2020 10:49:00 +0100 Subject: [PATCH 07/36] iotests: Test copy offloading with external data file This adds a test for 'qemu-img convert' with copy offloading where the target image has an external data file. If the test hosts supports it, it tests both the case where copy offloading is supported and the case where it isn't (otherwise we just test unsupported twice). More specifically, the case with unsupported copy offloading tests qcow2_alloc_cluster_abort() with external data files. Signed-off-by: Kevin Wolf Message-Id: <20200211094900.17315-4-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/244 | 14 ++++++++++++++ tests/qemu-iotests/244.out | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/tests/qemu-iotests/244 b/tests/qemu-iotests/244 index 0d1efee6ef..2ec1815e6f 100755 --- a/tests/qemu-iotests/244 +++ b/tests/qemu-iotests/244 @@ -197,6 +197,20 @@ $QEMU_IO -c 'read -P 0x11 0 1M' -f $IMGFMT "$TEST_IMG" | _filter_qemu_io $QEMU_IMG map --output=human "$TEST_IMG" | _filter_testdir $QEMU_IMG map --output=json "$TEST_IMG" +echo +echo "=== Copy offloading ===" +echo + +# Make use of copy offloading if the test host can provide it +_make_test_img -o "data_file=$TEST_IMG.data" 64M +$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" +$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" + +# blkdebug doesn't support copy offloading, so this tests the error path +$QEMU_IMG amend -f $IMGFMT -o "data_file=blkdebug::$TEST_IMG.data" "$TEST_IMG" +$QEMU_IMG convert -f $IMGFMT -O $IMGFMT -n -C "$TEST_IMG.src" "$TEST_IMG" +$QEMU_IMG compare -f $IMGFMT -F $IMGFMT "$TEST_IMG.src" "$TEST_IMG" + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/244.out b/tests/qemu-iotests/244.out index 6a3d0067cc..e6f4dc7993 100644 --- a/tests/qemu-iotests/244.out +++ b/tests/qemu-iotests/244.out @@ -122,4 +122,10 @@ Offset Length Mapped to File 0 0x100000 0 TEST_DIR/t.qcow2.data [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}, { "start": 1048576, "length": 66060288, "depth": 0, "zero": true, "data": false}] + +=== Copy offloading === + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 data_file=TEST_DIR/t.IMGFMT.data +Images are identical. +Images are identical. *** done From 5b1405db0fbe4c2600dbaa1a340110a3aecef173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Sat, 15 Feb 2020 17:15:55 +0100 Subject: [PATCH 08/36] block/qcow2-bitmap: Remove unneeded variable assignment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix warning reported by Clang static code analyzer: CC block/qcow2-bitmap.o block/qcow2-bitmap.c:650:5: warning: Value stored to 'ret' is never read ret = -EINVAL; ^ ~~~~~~~ Fixes: 88ddffae8 Reported-by: Clang Static Analyzer Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200215161557.4077-2-philmd@redhat.com> Reviewed-by: Richard Henderson Reviewed-by: Ján Tomko Reviewed-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Kevin Wolf --- block/qcow2-bitmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c index d41f5d049b..8cccc2c9f3 100644 --- a/block/qcow2-bitmap.c +++ b/block/qcow2-bitmap.c @@ -647,7 +647,6 @@ static Qcow2BitmapList *bitmap_list_load(BlockDriverState *bs, uint64_t offset, return bm_list; broken_dir: - ret = -EINVAL; error_setg(errp, "Broken bitmap directory"); fail: From 248e3ffb66c27cf95623f965b61b0eb97a607441 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:06 +0100 Subject: [PATCH 09/36] qapi: Document meaning of 'ignore' BlockdevOnError for jobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not obvious what 'ignore' actually means for block jobs: It could be continuing the job and returning success in the end despite the error (no block job does this). It could also mean continuing and returning failure in the end (this is what stream does). And it can mean retrying the failed request later (this is what backup, commit and mirror do). This (somewhat inconsistent) behaviour was introduced and described for stream and mirror in commit 32c81a4a6ec. backup and commit were introduced later and use the same model as mirror. Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-2-kwolf@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Kevin Wolf --- qapi/block-core.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/qapi/block-core.json b/qapi/block-core.json index 13dad62f44..1c32158d11 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1164,7 +1164,10 @@ # for jobs, cancel the job # # @ignore: ignore the error, only report a QMP event (BLOCK_IO_ERROR -# or BLOCK_JOB_ERROR) +# or BLOCK_JOB_ERROR). The backup, mirror and commit block jobs retry +# the failing request later and may still complete successfully. The +# stream block job continues to stream and will complete with an +# error. # # @enospc: same as @stop on ENOSPC, same as @report otherwise. # From d71e65ec1d08849a4e62a778ed4ea732621cea26 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:07 +0100 Subject: [PATCH 10/36] commit: Remove unused bytes_written MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bytes_written variable is only ever written to, it serves no purpose. This has actually been the case since the commit job was first introduced in commit 747ff602636. Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-3-kwolf@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Kevin Wolf --- block/commit.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/commit.c b/block/commit.c index 23c90b3b91..cce898a4f3 100644 --- a/block/commit.c +++ b/block/commit.c @@ -140,7 +140,6 @@ static int coroutine_fn commit_run(Job *job, Error **errp) int ret = 0; int64_t n = 0; /* bytes */ void *buf = NULL; - int bytes_written = 0; int64_t len, base_len; ret = len = blk_getlength(s->top); @@ -180,7 +179,6 @@ static int coroutine_fn commit_run(Job *job, Error **errp) trace_commit_one_iteration(s, offset, n, ret); if (copy) { ret = commit_populate(s->top, s->base, offset, n, buf); - bytes_written += n; } if (ret < 0) { BlockErrorAction action = From c5507b4d55f1f2989d3c108be387f62ed092ddc0 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:08 +0100 Subject: [PATCH 11/36] commit: Fix argument order for block_job_error_action() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The block_job_error_action() error call in the commit job gives the on_err and is_read arguments in the wrong order. Fix this. (Of course, hard-coded is_read = false is wrong, too, but that's a separate problem for a separate patch.) Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-4-kwolf@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Kevin Wolf --- block/commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/commit.c b/block/commit.c index cce898a4f3..8189f079d2 100644 --- a/block/commit.c +++ b/block/commit.c @@ -182,7 +182,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) } if (ret < 0) { BlockErrorAction action = - block_job_error_action(&s->common, false, s->on_error, -ret); + block_job_error_action(&s->common, s->on_error, false, -ret); if (action == BLOCK_ERROR_ACTION_REPORT) { goto out; } else { From 0c42e175fcc9d9ed888e4d7a30237ddf36454d19 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:09 +0100 Subject: [PATCH 12/36] commit: Inline commit_populate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit_populate() is a very short function and only called in a single place. Its return value doesn't tell us whether an error happened while reading or writing, which would be necessary for sending the right data in the BLOCK_JOB_ERROR QMP event. Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-5-kwolf@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Kevin Wolf --- block/commit.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/block/commit.c b/block/commit.c index 8189f079d2..2992a1012f 100644 --- a/block/commit.c +++ b/block/commit.c @@ -43,27 +43,6 @@ typedef struct CommitBlockJob { char *backing_file_str; } CommitBlockJob; -static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base, - int64_t offset, uint64_t bytes, - void *buf) -{ - int ret = 0; - - assert(bytes < SIZE_MAX); - - ret = blk_co_pread(bs, offset, bytes, buf, 0); - if (ret < 0) { - return ret; - } - - ret = blk_co_pwrite(base, offset, bytes, buf, 0); - if (ret < 0) { - return ret; - } - - return 0; -} - static int commit_prepare(Job *job) { CommitBlockJob *s = container_of(job, CommitBlockJob, common.job); @@ -178,7 +157,12 @@ static int coroutine_fn commit_run(Job *job, Error **errp) copy = (ret == 1); trace_commit_one_iteration(s, offset, n, ret); if (copy) { - ret = commit_populate(s->top, s->base, offset, n, buf); + assert(n < SIZE_MAX); + + ret = blk_co_pread(s->top, offset, n, buf, 0); + if (ret >= 0) { + ret = blk_co_pwrite(s->base, offset, n, buf, 0); + } } if (ret < 0) { BlockErrorAction action = From 9ad1e79f3f9b102a9aa05053347cf9874c7bc909 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:10 +0100 Subject: [PATCH 13/36] commit: Fix is_read for block_job_error_action() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit block_job_error_action() needs to know if reading from the top node or writing to the base node failed so that it can set the right 'operation' in the BLOCK_JOB_ERROR QMP event. Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-6-kwolf@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Kevin Wolf --- block/commit.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/commit.c b/block/commit.c index 2992a1012f..8e672799af 100644 --- a/block/commit.c +++ b/block/commit.c @@ -143,6 +143,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp) for (offset = 0; offset < len; offset += n) { bool copy; + bool error_in_source = true; /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. @@ -162,11 +163,15 @@ static int coroutine_fn commit_run(Job *job, Error **errp) ret = blk_co_pread(s->top, offset, n, buf, 0); if (ret >= 0) { ret = blk_co_pwrite(s->base, offset, n, buf, 0); + if (ret < 0) { + error_in_source = false; + } } } if (ret < 0) { BlockErrorAction action = - block_job_error_action(&s->common, s->on_error, false, -ret); + block_job_error_action(&s->common, s->on_error, + error_in_source, -ret); if (action == BLOCK_ERROR_ACTION_REPORT) { goto out; } else { From 8faad1c7fbabec51223297a9477dbc0d2a7d0a24 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:11 +0100 Subject: [PATCH 14/36] commit: Expose on-error option in QMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the error handling in the common block job is fixed, we can expose the on-error option in QMP instead of hard-coding it as 'report' in qmp_block_commit(). This fulfills the promise that the old comment in that function made, even if a bit later than expected: "This will be part of the QMP command, if/when the BlockdevOnError change for blkmirror makes it in". Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-7-kwolf@redhat.com> Reviewed-by: Ján Tomko Signed-off-by: Kevin Wolf --- blockdev.c | 8 ++++---- qapi/block-core.json | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/blockdev.c b/blockdev.c index c6a727cca9..374189a426 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3471,6 +3471,7 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, bool has_top, const char *top, bool has_backing_file, const char *backing_file, bool has_speed, int64_t speed, + bool has_on_error, BlockdevOnError on_error, bool has_filter_node_name, const char *filter_node_name, bool has_auto_finalize, bool auto_finalize, bool has_auto_dismiss, bool auto_dismiss, @@ -3481,15 +3482,14 @@ void qmp_block_commit(bool has_job_id, const char *job_id, const char *device, BlockDriverState *base_bs, *top_bs; AioContext *aio_context; Error *local_err = NULL; - /* This will be part of the QMP command, if/when the - * BlockdevOnError change for blkmirror makes it in - */ - BlockdevOnError on_error = BLOCKDEV_ON_ERROR_REPORT; int job_flags = JOB_DEFAULT; if (!has_speed) { speed = 0; } + if (!has_on_error) { + on_error = BLOCKDEV_ON_ERROR_REPORT; + } if (!has_filter_node_name) { filter_node_name = NULL; } diff --git a/qapi/block-core.json b/qapi/block-core.json index 1c32158d11..37d7ea7295 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -1658,6 +1658,9 @@ # # @speed: the maximum speed, in bytes per second # +# @on-error: the action to take on an error. 'ignore' means that the request +# should be retried. (default: report; Since: 5.0) +# # @filter-node-name: the node name that should be assigned to the # filter driver that the commit job inserts into the graph # above @top. If this option is not given, a node name is @@ -1694,6 +1697,7 @@ 'data': { '*job-id': 'str', 'device': 'str', '*base-node': 'str', '*base': 'str', '*top-node': 'str', '*top': 'str', '*backing-file': 'str', '*speed': 'int', + '*on-error': 'BlockdevOnError', '*filter-node-name': 'str', '*auto-finalize': 'bool', '*auto-dismiss': 'bool' } } From d439848941531f11ca94b714b57881e6c0c5ed2a Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Fri, 14 Feb 2020 21:08:12 +0100 Subject: [PATCH 15/36] iotests: Test error handling policies with block-commit This tests both read failure (from the top node) and write failure (to the base node) for on-error=report/stop/ignore. As block-commit actually starts two different types of block jobs (mirror.c for committing the active later, commit.c for intermediate layers), all tests are run for both cases. Signed-off-by: Kevin Wolf Message-Id: <20200214200812.28180-8-kwolf@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/040 | 283 +++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/040.out | 4 +- 2 files changed, 285 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040 index 2e7ee0e84f..32c82b4ec6 100755 --- a/tests/qemu-iotests/040 +++ b/tests/qemu-iotests/040 @@ -430,6 +430,289 @@ class TestReopenOverlay(ImageCommitTestCase): def test_reopen_overlay(self): self.run_commit_test(self.img1, self.img0) +class TestErrorHandling(iotests.QMPTestCase): + image_len = 2 * 1024 * 1024 + + def setUp(self): + iotests.create_image(backing_img, self.image_len) + qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, mid_img) + qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % mid_img, test_img) + + qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0x11 0 512k', mid_img) + qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0x22 0 512k', test_img) + + self.vm = iotests.VM() + self.vm.launch() + + self.blkdebug_file = iotests.file_path("blkdebug.conf") + + def tearDown(self): + self.vm.shutdown() + os.remove(test_img) + os.remove(mid_img) + os.remove(backing_img) + + def blockdev_add(self, **kwargs): + result = self.vm.qmp('blockdev-add', **kwargs) + self.assert_qmp(result, 'return', {}) + + def add_block_nodes(self, base_debug=None, mid_debug=None, top_debug=None): + self.blockdev_add(node_name='base-file', driver='file', + filename=backing_img) + self.blockdev_add(node_name='mid-file', driver='file', + filename=mid_img) + self.blockdev_add(node_name='top-file', driver='file', + filename=test_img) + + if base_debug: + self.blockdev_add(node_name='base-dbg', driver='blkdebug', + image='base-file', inject_error=base_debug) + if mid_debug: + self.blockdev_add(node_name='mid-dbg', driver='blkdebug', + image='mid-file', inject_error=mid_debug) + if top_debug: + self.blockdev_add(node_name='top-dbg', driver='blkdebug', + image='top-file', inject_error=top_debug) + + self.blockdev_add(node_name='base-fmt', driver='raw', + file=('base-dbg' if base_debug else 'base-file')) + self.blockdev_add(node_name='mid-fmt', driver=iotests.imgfmt, + file=('mid-dbg' if mid_debug else 'mid-file'), + backing='base-fmt') + self.blockdev_add(node_name='top-fmt', driver=iotests.imgfmt, + file=('top-dbg' if top_debug else 'top-file'), + backing='mid-fmt') + + def run_job(self, expected_events, error_pauses_job=False): + match_device = {'data': {'device': 'job0'}} + events = [ + ('BLOCK_JOB_COMPLETED', match_device), + ('BLOCK_JOB_CANCELLED', match_device), + ('BLOCK_JOB_ERROR', match_device), + ('BLOCK_JOB_READY', match_device), + ] + + completed = False + log = [] + while not completed: + ev = self.vm.events_wait(events, timeout=5.0) + if ev['event'] == 'BLOCK_JOB_COMPLETED': + completed = True + elif ev['event'] == 'BLOCK_JOB_ERROR': + if error_pauses_job: + result = self.vm.qmp('block-job-resume', device='job0') + self.assert_qmp(result, 'return', {}) + elif ev['event'] == 'BLOCK_JOB_READY': + result = self.vm.qmp('block-job-complete', device='job0') + self.assert_qmp(result, 'return', {}) + else: + self.fail("Unexpected event: %s" % ev) + log.append(iotests.filter_qmp_event(ev)) + + self.maxDiff = None + self.assertEqual(expected_events, log) + + def event_error(self, op, action): + return { + 'event': 'BLOCK_JOB_ERROR', + 'data': {'action': action, 'device': 'job0', 'operation': op}, + 'timestamp': {'microseconds': 'USECS', 'seconds': 'SECS'} + } + + def event_ready(self): + return { + 'event': 'BLOCK_JOB_READY', + 'data': {'device': 'job0', + 'len': 524288, + 'offset': 524288, + 'speed': 0, + 'type': 'commit'}, + 'timestamp': {'microseconds': 'USECS', 'seconds': 'SECS'}, + } + + def event_completed(self, errmsg=None, active=True): + max_len = 524288 if active else self.image_len + data = { + 'device': 'job0', + 'len': max_len, + 'offset': 0 if errmsg else max_len, + 'speed': 0, + 'type': 'commit' + } + if errmsg: + data['error'] = errmsg + + return { + 'event': 'BLOCK_JOB_COMPLETED', + 'data': data, + 'timestamp': {'microseconds': 'USECS', 'seconds': 'SECS'}, + } + + def blkdebug_event(self, event, is_raw=False): + if event: + return [{ + 'event': event, + 'sector': 512 if is_raw else 1024, + 'once': True, + }] + return None + + def prepare_and_start_job(self, on_error, active=True, + top_event=None, mid_event=None, base_event=None): + + top_debug = self.blkdebug_event(top_event) + mid_debug = self.blkdebug_event(mid_event) + base_debug = self.blkdebug_event(base_event, True) + + self.add_block_nodes(top_debug=top_debug, mid_debug=mid_debug, + base_debug=base_debug) + + result = self.vm.qmp('block-commit', job_id='job0', device='top-fmt', + top_node='top-fmt' if active else 'mid-fmt', + base_node='mid-fmt' if active else 'base-fmt', + on_error=on_error) + self.assert_qmp(result, 'return', {}) + + def testActiveReadErrorReport(self): + self.prepare_and_start_job('report', top_event='read_aio') + self.run_job([ + self.event_error('read', 'report'), + self.event_completed('Input/output error') + ]) + + self.vm.shutdown() + self.assertFalse(iotests.compare_images(test_img, mid_img), + 'target image matches source after error') + + def testActiveReadErrorStop(self): + self.prepare_and_start_job('stop', top_event='read_aio') + self.run_job([ + self.event_error('read', 'stop'), + self.event_ready(), + self.event_completed() + ], error_pauses_job=True) + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, mid_img), + 'target image does not match source after commit') + + def testActiveReadErrorIgnore(self): + self.prepare_and_start_job('ignore', top_event='read_aio') + self.run_job([ + self.event_error('read', 'ignore'), + self.event_ready(), + self.event_completed() + ]) + + # For commit, 'ignore' actually means retry, so this will succeed + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, mid_img), + 'target image does not match source after commit') + + def testActiveWriteErrorReport(self): + self.prepare_and_start_job('report', mid_event='write_aio') + self.run_job([ + self.event_error('write', 'report'), + self.event_completed('Input/output error') + ]) + + self.vm.shutdown() + self.assertFalse(iotests.compare_images(test_img, mid_img), + 'target image matches source after error') + + def testActiveWriteErrorStop(self): + self.prepare_and_start_job('stop', mid_event='write_aio') + self.run_job([ + self.event_error('write', 'stop'), + self.event_ready(), + self.event_completed() + ], error_pauses_job=True) + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, mid_img), + 'target image does not match source after commit') + + def testActiveWriteErrorIgnore(self): + self.prepare_and_start_job('ignore', mid_event='write_aio') + self.run_job([ + self.event_error('write', 'ignore'), + self.event_ready(), + self.event_completed() + ]) + + # For commit, 'ignore' actually means retry, so this will succeed + self.vm.shutdown() + self.assertTrue(iotests.compare_images(test_img, mid_img), + 'target image does not match source after commit') + + def testIntermediateReadErrorReport(self): + self.prepare_and_start_job('report', active=False, mid_event='read_aio') + self.run_job([ + self.event_error('read', 'report'), + self.event_completed('Input/output error', active=False) + ]) + + self.vm.shutdown() + self.assertFalse(iotests.compare_images(mid_img, backing_img, fmt2='raw'), + 'target image matches source after error') + + def testIntermediateReadErrorStop(self): + self.prepare_and_start_job('stop', active=False, mid_event='read_aio') + self.run_job([ + self.event_error('read', 'stop'), + self.event_completed(active=False) + ], error_pauses_job=True) + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(mid_img, backing_img, fmt2='raw'), + 'target image does not match source after commit') + + def testIntermediateReadErrorIgnore(self): + self.prepare_and_start_job('ignore', active=False, mid_event='read_aio') + self.run_job([ + self.event_error('read', 'ignore'), + self.event_completed(active=False) + ]) + + # For commit, 'ignore' actually means retry, so this will succeed + self.vm.shutdown() + self.assertTrue(iotests.compare_images(mid_img, backing_img, fmt2='raw'), + 'target image does not match source after commit') + + def testIntermediateWriteErrorReport(self): + self.prepare_and_start_job('report', active=False, base_event='write_aio') + self.run_job([ + self.event_error('write', 'report'), + self.event_completed('Input/output error', active=False) + ]) + + self.vm.shutdown() + self.assertFalse(iotests.compare_images(mid_img, backing_img, fmt2='raw'), + 'target image matches source after error') + + def testIntermediateWriteErrorStop(self): + self.prepare_and_start_job('stop', active=False, base_event='write_aio') + self.run_job([ + self.event_error('write', 'stop'), + self.event_completed(active=False) + ], error_pauses_job=True) + + self.vm.shutdown() + self.assertTrue(iotests.compare_images(mid_img, backing_img, fmt2='raw'), + 'target image does not match source after commit') + + def testIntermediateWriteErrorIgnore(self): + self.prepare_and_start_job('ignore', active=False, base_event='write_aio') + self.run_job([ + self.event_error('write', 'ignore'), + self.event_completed(active=False) + ]) + + # For commit, 'ignore' actually means retry, so this will succeed + self.vm.shutdown() + self.assertTrue(iotests.compare_images(mid_img, backing_img, fmt2='raw'), + 'target image does not match source after commit') + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'qed'], supported_protocols=['file']) diff --git a/tests/qemu-iotests/040.out b/tests/qemu-iotests/040.out index 220a5fa82c..6a917130b6 100644 --- a/tests/qemu-iotests/040.out +++ b/tests/qemu-iotests/040.out @@ -1,5 +1,5 @@ -............................................... +........................................................... ---------------------------------------------------------------------- -Ran 47 tests +Ran 59 tests OK From 0beab8119f5e9d98f4afd4c9b57935436fb54a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 18 Feb 2020 10:43:52 +0100 Subject: [PATCH 16/36] block: Remove superfluous semicolons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: 132ada80c4a Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200218094402.26625-4-philmd@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index 9c810534d6..9db0b973fe 100644 --- a/block.c +++ b/block.c @@ -2435,13 +2435,13 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs, if (bdrv_get_aio_context(child_bs) != ctx) { ret = bdrv_try_set_aio_context(child_bs, ctx, &local_err); if (ret < 0 && child_role->can_set_aio_ctx) { - GSList *ignore = g_slist_prepend(NULL, child);; + GSList *ignore = g_slist_prepend(NULL, child); ctx = bdrv_get_aio_context(child_bs); if (child_role->can_set_aio_ctx(child, ctx, &ignore, NULL)) { error_free(local_err); ret = 0; g_slist_free(ignore); - ignore = g_slist_prepend(NULL, child);; + ignore = g_slist_prepend(NULL, child); child_role->set_aio_ctx(child, ctx, &ignore); } g_slist_free(ignore); From 74e4a8a9614a96a7b7e34778d33e1f05a30d312d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Philippe=20Mathieu-Daud=C3=A9?= Date: Tue, 18 Feb 2020 10:43:53 +0100 Subject: [PATCH 17/36] block/io_uring: Remove superfluous semicolon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: 6663a0a3376 Signed-off-by: Philippe Mathieu-Daudé Message-Id: <20200218094402.26625-5-philmd@redhat.com> Signed-off-by: Kevin Wolf --- block/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/io_uring.c b/block/io_uring.c index 56892fd1ab..a3142ca989 100644 --- a/block/io_uring.c +++ b/block/io_uring.c @@ -187,7 +187,7 @@ static void luring_process_completions(LuringState *s) ret = 0; } } else { - ret = -ENOSPC;; + ret = -ENOSPC; } } end: From ca08d937e886aea7b9ce2c05d41e5c1aeec07a9f Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:36 +0100 Subject: [PATCH 18/36] blockdev: Allow external snapshots everywhere There is no good reason why we would allow external snapshots only on the first non-filter node in a chain. Parent BDSs should not care whether their child is replaced by a snapshot. (If they do care, they should announce that via freezing the chain, which is checked in bdrv_append() through bdrv_set_backing_hd().) Before we had bdrv_is_first_non_filter() here (since 212a5a8f095), there was a special function bdrv_check_ext_snapshot() that allowed snapshots by default, but block drivers could override this. Only blkverify did so, however. It is not clear to me why blkverify would do so; maybe just so that the testee block driver would not be replaced. The introducing commit f6186f49e2c does not explain why. Maybe because 08b24cfe376 would have been the correct solution? (Which adds a .supports_backing check.) Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-2-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- blockdev.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/blockdev.c b/blockdev.c index 374189a426..d83bb2beda 100644 --- a/blockdev.c +++ b/blockdev.c @@ -1592,11 +1592,6 @@ static void external_snapshot_prepare(BlkActionState *common, } } - if (!bdrv_is_first_non_filter(state->old_bs)) { - error_setg(errp, QERR_FEATURE_DISABLED, "snapshot"); - goto out; - } - if (action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC) { BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync.data; const char *format = s->has_format ? s->format : "qcow2"; From 7607074f429eeac94f7c9dd6adfb5a42b0d835ee Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:37 +0100 Subject: [PATCH 19/36] blockdev: Allow resizing everywhere Block nodes that do not allow resizing should not share BLK_PERM_RESIZE. It does not matter whether they are the first non-filter in their chain or not. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-3-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- blockdev.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/blockdev.c b/blockdev.c index d83bb2beda..45de0ba37e 100644 --- a/blockdev.c +++ b/blockdev.c @@ -3331,11 +3331,6 @@ void qmp_block_resize(bool has_device, const char *device, aio_context = bdrv_get_aio_context(bs); aio_context_acquire(aio_context); - if (!bdrv_is_first_non_filter(bs)) { - error_setg(errp, QERR_FEATURE_DISABLED, "resize"); - goto out; - } - if (size < 0) { error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "size", "a >0 size"); goto out; From a851ad4cacc57bae6161f95de836a43011f4b906 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:38 +0100 Subject: [PATCH 20/36] block: Drop bdrv_is_first_non_filter() It is unused now. (And it was ugly because it needed to explore all BDS chains from the top.) Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-4-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 26 -------------------------- include/block/block.h | 1 - 2 files changed, 27 deletions(-) diff --git a/block.c b/block.c index 9db0b973fe..145d0baf5e 100644 --- a/block.c +++ b/block.c @@ -6234,32 +6234,6 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, return false; } -/* This function checks if the candidate is the first non filter bs down it's - * bs chain. Since we don't have pointers to parents it explore all bs chains - * from the top. Some filters can choose not to pass down the recursion. - */ -bool bdrv_is_first_non_filter(BlockDriverState *candidate) -{ - BlockDriverState *bs; - BdrvNextIterator it; - - /* walk down the bs forest recursively */ - for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { - bool perm; - - /* try to recurse in this top level bs */ - perm = bdrv_recurse_is_first_non_filter(bs, candidate); - - /* candidate is the first non filter */ - if (perm) { - bdrv_next_cleanup(&it); - return true; - } - } - - return false; -} - BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp) { diff --git a/include/block/block.h b/include/block/block.h index 6cd566324d..6a8462a5bc 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -394,7 +394,6 @@ int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, /* external snapshots */ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, BlockDriverState *candidate); -bool bdrv_is_first_non_filter(BlockDriverState *candidate); /* check if a named node can be replaced when doing drive-mirror */ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, From f718ca147dad5b03b4cb75952966e59e34616382 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:39 +0100 Subject: [PATCH 21/36] iotests: Let 041 use -blockdev for quorum children Using -drive with default options means that a virtio-blk drive will be created that has write access to the to-be quorum children. Quorum should have exclusive write access to them, so we should use -blockdev instead. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-5-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 43556b9727..aa7d54d968 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -885,7 +885,10 @@ class TestRepairQuorum(iotests.QMPTestCase): # Assign a node name to each quorum image in order to manipulate # them opts = "node-name=img%i" % self.IMAGES.index(i) - self.vm = self.vm.add_drive(i, opts) + opts += ',driver=%s' % iotests.imgfmt + opts += ',file.driver=file' + opts += ',file.filename=%s' % i + self.vm = self.vm.add_blockdev(opts) self.vm.launch() From 37a3791b380584949046af6b62b54869390616dc Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:40 +0100 Subject: [PATCH 22/36] quorum: Fix child permissions Quorum cannot share WRITE or RESIZE on its children. Presumably, it only does so because as a filter, it seemed intuitively correct to point its .bdrv_child_perm to bdrv_filter_default_perm(). However, it is not really a filter, and bdrv_filter_default_perm() does not work for it, so we have to provide a custom .bdrv_child_perm implementation. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-6-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/quorum.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/block/quorum.c b/block/quorum.c index df68adcfaa..17b439056f 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1114,6 +1114,23 @@ static char *quorum_dirname(BlockDriverState *bs, Error **errp) return NULL; } +static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c, + const BdrvChildRole *role, + BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared) +{ + *nperm = perm & DEFAULT_PERM_PASSTHROUGH; + + /* + * We cannot share RESIZE or WRITE, as this would make the + * children differ from each other. + */ + *nshared = (shared & (BLK_PERM_CONSISTENT_READ | + BLK_PERM_WRITE_UNCHANGED)) + | DEFAULT_PERM_UNCHANGED; +} + static const char *const quorum_strong_runtime_opts[] = { QUORUM_OPT_VOTE_THRESHOLD, QUORUM_OPT_BLKVERIFY, @@ -1143,7 +1160,7 @@ static BlockDriver bdrv_quorum = { .bdrv_add_child = quorum_add_child, .bdrv_del_child = quorum_del_child, - .bdrv_child_perm = bdrv_filter_default_perms, + .bdrv_child_perm = quorum_child_perm, .is_filter = true, .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, From 5d69b5ab85d30201c1f5ef34651ba910977fc583 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:41 +0100 Subject: [PATCH 23/36] block: Add bdrv_recurse_can_replace() After a couple of follow-up patches, this function will replace bdrv_recurse_is_first_non_filter() in check_to_replace_node(). bdrv_recurse_is_first_non_filter() is both not sufficiently specific for check_to_replace_node() (it allows cases that should not be allowed, like replacing child nodes of quorum with dissenting data that have more parents than just quorum), and it is too restrictive (it is perfectly fine to replace filters). Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-7-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 38 ++++++++++++++++++++++++++++++++++++++ include/block/block_int.h | 10 ++++++++++ 2 files changed, 48 insertions(+) diff --git a/block.c b/block.c index 145d0baf5e..e2fefe5883 100644 --- a/block.c +++ b/block.c @@ -6234,6 +6234,44 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, return false; } +/* + * This function checks whether the given @to_replace is allowed to be + * replaced by a node that always shows the same data as @bs. This is + * used for example to verify whether the mirror job can replace + * @to_replace by the target mirrored from @bs. + * To be replaceable, @bs and @to_replace may either be guaranteed to + * always show the same data (because they are only connected through + * filters), or some driver may allow replacing one of its children + * because it can guarantee that this child's data is not visible at + * all (for example, for dissenting quorum children that have no other + * parents). + */ +bool bdrv_recurse_can_replace(BlockDriverState *bs, + BlockDriverState *to_replace) +{ + if (!bs || !bs->drv) { + return false; + } + + if (bs == to_replace) { + return true; + } + + /* See what the driver can do */ + if (bs->drv->bdrv_recurse_can_replace) { + return bs->drv->bdrv_recurse_can_replace(bs, to_replace); + } + + /* For filters without an own implementation, we can recurse on our own */ + if (bs->drv->is_filter) { + BdrvChild *child = bs->file ?: bs->backing; + return bdrv_recurse_can_replace(child->bs, to_replace); + } + + /* Safe default */ + return false; +} + BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp) { diff --git a/include/block/block_int.h b/include/block/block_int.h index 640fb82c78..eaefac210e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -102,6 +102,13 @@ struct BlockDriver { */ bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs, BlockDriverState *candidate); + /* + * Return true if @to_replace can be replaced by a BDS with the + * same data as @bs without it affecting @bs's behavior (that is, + * without it being visible to @bs's parents). + */ + bool (*bdrv_recurse_can_replace)(BlockDriverState *bs, + BlockDriverState *to_replace); int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename); int (*bdrv_probe_device)(const char *filename); @@ -1263,6 +1270,9 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c, uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared); +bool bdrv_recurse_can_replace(BlockDriverState *bs, + BlockDriverState *to_replace); + /* * Default implementation for drivers to pass bdrv_co_block_status() to * their file. From 998a6b2fc55b42c066260a2f75aa11a07e128808 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:42 +0100 Subject: [PATCH 24/36] blkverify: Implement .bdrv_recurse_can_replace() Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-8-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/blkverify.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/blkverify.c b/block/blkverify.c index 304b0a1368..0add3ab483 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -282,6 +282,20 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate); } +static bool blkverify_recurse_can_replace(BlockDriverState *bs, + BlockDriverState *to_replace) +{ + BDRVBlkverifyState *s = bs->opaque; + + /* + * blkverify quits the whole qemu process if there is a mismatch + * between bs->file->bs and s->test_file->bs. Therefore, we know + * know that both must match bs and we can recurse down to either. + */ + return bdrv_recurse_can_replace(bs->file->bs, to_replace) || + bdrv_recurse_can_replace(s->test_file->bs, to_replace); +} + static void blkverify_refresh_filename(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; @@ -328,6 +342,7 @@ static BlockDriver bdrv_blkverify = { .is_filter = true, .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, + .bdrv_recurse_can_replace = blkverify_recurse_can_replace, }; static void bdrv_blkverify_init(void) From a3ed794b36a60f544efcf00d146b95e03a786954 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:43 +0100 Subject: [PATCH 25/36] quorum: Implement .bdrv_recurse_can_replace() Signed-off-by: Max Reitz Message-Id: <20200218103454.296704-9-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/quorum.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/block/quorum.c b/block/quorum.c index 17b439056f..3ece6e4382 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -813,6 +813,59 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, return false; } +static bool quorum_recurse_can_replace(BlockDriverState *bs, + BlockDriverState *to_replace) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + /* + * We have no idea whether our children show the same data as + * this node (@bs). It is actually highly likely that + * @to_replace does not, because replacing a broken child is + * one of the main use cases here. + * + * We do know that the new BDS will match @bs, so replacing + * any of our children by it will be safe. It cannot change + * the data this quorum node presents to its parents. + * + * However, replacing @to_replace by @bs in any of our + * children's chains may change visible data somewhere in + * there. We therefore cannot recurse down those chains with + * bdrv_recurse_can_replace(). + * (More formally, bdrv_recurse_can_replace() requires that + * @to_replace will be replaced by something matching the @bs + * passed to it. We cannot guarantee that.) + * + * Thus, we can only check whether any of our immediate + * children matches @to_replace. + * + * (In the future, we might add a function to recurse down a + * chain that checks that nothing there cares about a change + * in data from the respective child in question. For + * example, most filters do not care when their child's data + * suddenly changes, as long as their parents do not care.) + */ + if (s->children[i]->bs == to_replace) { + /* + * We now have to ensure that there is no other parent + * that cares about replacing this child by a node with + * potentially different data. + * We do so by checking whether there are any other parents + * at all, which is stricter than necessary, but also very + * simple. (We may decide to implement something more + * complex and permissive when there is an actual need for + * it.) + */ + return QLIST_FIRST(&to_replace->parents) == s->children[i] && + QLIST_NEXT(s->children[i], next_parent) == NULL; + } + } + + return false; +} + static int quorum_valid_threshold(int threshold, int num_children, Error **errp) { @@ -1164,6 +1217,7 @@ static BlockDriver bdrv_quorum = { .is_filter = true, .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, + .bdrv_recurse_can_replace = quorum_recurse_can_replace, .strong_runtime_opts = quorum_strong_runtime_opts, }; From 810803a87ce88abd695f07f434d00e7cd1e122e2 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:44 +0100 Subject: [PATCH 26/36] block: Use bdrv_recurse_can_replace() Let check_to_replace_node() use the more specialized bdrv_recurse_can_replace() instead of bdrv_recurse_is_first_non_filter(), which is too restrictive (or, in the case of quorum, sometimes not restrictive enough). Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-10-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index e2fefe5883..80a0806eb0 100644 --- a/block.c +++ b/block.c @@ -6272,6 +6272,17 @@ bool bdrv_recurse_can_replace(BlockDriverState *bs, return false; } +/* + * Check whether the given @node_name can be replaced by a node that + * has the same data as @parent_bs. If so, return @node_name's BDS; + * NULL otherwise. + * + * @node_name must be a (recursive) *child of @parent_bs (or this + * function will return NULL). + * + * The result (whether the node can be replaced or not) is only valid + * for as long as no graph or permission changes occur. + */ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp) { @@ -6296,8 +6307,11 @@ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, * Another benefit is that this tests exclude backing files which are * blocked by the backing blockers. */ - if (!bdrv_recurse_is_first_non_filter(parent_bs, to_replace_bs)) { - error_setg(errp, "Only top most non filter can be replaced"); + if (!bdrv_recurse_can_replace(parent_bs, to_replace_bs)) { + error_setg(errp, "Cannot replace '%s' by a node mirrored from '%s', " + "because it cannot be guaranteed that doing so would not " + "lead to an abrupt change of visible data", + node_name, parent_bs->node_name); to_replace_bs = NULL; goto out; } From 6b4907cf4279e55207fc3fede5686324464ee413 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:45 +0100 Subject: [PATCH 27/36] block: Remove bdrv_recurse_is_first_non_filter() It no longer has any users. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-11-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block.c | 33 --------------------------------- block/blkverify.c | 15 --------------- block/copy-on-read.c | 9 --------- block/filter-compress.c | 9 --------- block/quorum.c | 18 ------------------ block/replication.c | 7 ------- block/throttle.c | 8 -------- include/block/block.h | 4 ---- include/block/block_int.h | 8 -------- 9 files changed, 111 deletions(-) diff --git a/block.c b/block.c index 80a0806eb0..946e3c896e 100644 --- a/block.c +++ b/block.c @@ -6201,39 +6201,6 @@ int bdrv_amend_options(BlockDriverState *bs, QemuOpts *opts, return bs->drv->bdrv_amend_options(bs, opts, status_cb, cb_opaque, errp); } -/* This function will be called by the bdrv_recurse_is_first_non_filter method - * of block filter and by bdrv_is_first_non_filter. - * It is used to test if the given bs is the candidate or recurse more in the - * node graph. - */ -bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - /* return false if basic checks fails */ - if (!bs || !bs->drv) { - return false; - } - - /* the code reached a non block filter driver -> check if the bs is - * the same as the candidate. It's the recursion termination condition. - */ - if (!bs->drv->is_filter) { - return bs == candidate; - } - /* Down this path the driver is a block filter driver */ - - /* If the block filter recursion method is defined use it to recurse down - * the node graph. - */ - if (bs->drv->bdrv_recurse_is_first_non_filter) { - return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate); - } - - /* the driver is a block filter but don't allow to recurse -> return false - */ - return false; -} - /* * This function checks whether the given @to_replace is allowed to be * replaced by a node that always shows the same data as @bs. This is diff --git a/block/blkverify.c b/block/blkverify.c index 0add3ab483..ba6b1853ae 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -268,20 +268,6 @@ static int blkverify_co_flush(BlockDriverState *bs) return bdrv_co_flush(s->test_file->bs); } -static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - BDRVBlkverifyState *s = bs->opaque; - - bool perm = bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); - - if (perm) { - return true; - } - - return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate); -} - static bool blkverify_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace) { @@ -341,7 +327,6 @@ static BlockDriver bdrv_blkverify = { .bdrv_co_flush = blkverify_co_flush, .is_filter = true, - .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, .bdrv_recurse_can_replace = blkverify_recurse_can_replace, }; diff --git a/block/copy-on-read.c b/block/copy-on-read.c index e95223d3cb..242d3ff055 100644 --- a/block/copy-on-read.c +++ b/block/copy-on-read.c @@ -118,13 +118,6 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked) } -static bool cor_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); -} - - static BlockDriver bdrv_copy_on_read = { .format_name = "copy-on-read", @@ -143,8 +136,6 @@ static BlockDriver bdrv_copy_on_read = { .bdrv_co_block_status = bdrv_co_block_status_from_file, - .bdrv_recurse_is_first_non_filter = cor_recurse_is_first_non_filter, - .has_variable_length = true, .is_filter = true, }; diff --git a/block/filter-compress.c b/block/filter-compress.c index 60137fb680..82c315b298 100644 --- a/block/filter-compress.c +++ b/block/filter-compress.c @@ -128,13 +128,6 @@ static void compress_lock_medium(BlockDriverState *bs, bool locked) } -static bool compress_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); -} - - static BlockDriver bdrv_compress = { .format_name = "compress", @@ -154,8 +147,6 @@ static BlockDriver bdrv_compress = { .bdrv_co_block_status = bdrv_co_block_status_from_file, - .bdrv_recurse_is_first_non_filter = compress_recurse_is_first_non_filter, - .has_variable_length = true, .is_filter = true, }; diff --git a/block/quorum.c b/block/quorum.c index 3ece6e4382..f57b0402d9 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -796,23 +796,6 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs) return result; } -static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - BDRVQuorumState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bool perm = bdrv_recurse_is_first_non_filter(s->children[i]->bs, - candidate); - if (perm) { - return true; - } - } - - return false; -} - static bool quorum_recurse_can_replace(BlockDriverState *bs, BlockDriverState *to_replace) { @@ -1216,7 +1199,6 @@ static BlockDriver bdrv_quorum = { .bdrv_child_perm = quorum_child_perm, .is_filter = true, - .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, .bdrv_recurse_can_replace = quorum_recurse_can_replace, .strong_runtime_opts = quorum_strong_runtime_opts, diff --git a/block/replication.c b/block/replication.c index 99532ce521..d6681b6c84 100644 --- a/block/replication.c +++ b/block/replication.c @@ -306,12 +306,6 @@ out: return ret; } -static bool replication_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); -} - static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp) { Error *local_err = NULL; @@ -699,7 +693,6 @@ static BlockDriver bdrv_replication = { .bdrv_co_writev = replication_co_writev, .is_filter = true, - .bdrv_recurse_is_first_non_filter = replication_recurse_is_first_non_filter, .has_variable_length = true, .strong_runtime_opts = replication_strong_runtime_opts, diff --git a/block/throttle.c b/block/throttle.c index 0349f42257..71f4bb0ad1 100644 --- a/block/throttle.c +++ b/block/throttle.c @@ -207,12 +207,6 @@ static void throttle_reopen_abort(BDRVReopenState *reopen_state) reopen_state->opaque = NULL; } -static bool throttle_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - return bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); -} - static void coroutine_fn throttle_co_drain_begin(BlockDriverState *bs) { ThrottleGroupMember *tgm = bs->opaque; @@ -252,8 +246,6 @@ static BlockDriver bdrv_throttle = { .bdrv_co_pwrite_zeroes = throttle_co_pwrite_zeroes, .bdrv_co_pdiscard = throttle_co_pdiscard, - .bdrv_recurse_is_first_non_filter = throttle_recurse_is_first_non_filter, - .bdrv_attach_aio_context = throttle_attach_aio_context, .bdrv_detach_aio_context = throttle_detach_aio_context, diff --git a/include/block/block.h b/include/block/block.h index 6a8462a5bc..314ce63ed9 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -391,10 +391,6 @@ int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, BlockDriverAmendStatusCB *status_cb, void *cb_opaque, Error **errp); -/* external snapshots */ -bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate); - /* check if a named node can be replaced when doing drive-mirror */ BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, const char *node_name, Error **errp); diff --git a/include/block/block_int.h b/include/block/block_int.h index eaefac210e..6f9fd5e20e 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -94,14 +94,6 @@ struct BlockDriver { * must implement them and return -ENOTSUP. */ bool is_filter; - /* for snapshots block filter like Quorum can implement the - * following recursive callback. - * It's purpose is to recurse on the filter children while calling - * bdrv_recurse_is_first_non_filter on them. - * For a sample implementation look in the future Quorum block filter. - */ - bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs, - BlockDriverState *candidate); /* * Return true if @to_replace can be replaced by a BDS with the * same data as @bs without it affecting @bs's behavior (that is, From 6e9cc0518113da423252a1fea328f27dc7bcf997 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:46 +0100 Subject: [PATCH 28/36] mirror: Double-check immediately before replacing There is no guarantee that we can still replace the node we want to replace at the end of the mirror job. Double-check by calling bdrv_recurse_can_replace(). Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-12-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/mirror.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index cacbc70014..447051dbc6 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -700,7 +700,19 @@ static int mirror_exit_common(Job *job) * drain potential other users of the BDS before changing the graph. */ assert(s->in_drain); bdrv_drained_begin(target_bs); - bdrv_replace_node(to_replace, target_bs, &local_err); + /* + * Cannot use check_to_replace_node() here, because that would + * check for an op blocker on @to_replace, and we have our own + * there. + */ + if (bdrv_recurse_can_replace(src, to_replace)) { + bdrv_replace_node(to_replace, target_bs, &local_err); + } else { + error_setg(&local_err, "Can no longer replace '%s' by '%s', " + "because it can no longer be guaranteed that doing so " + "would not lead to an abrupt change of visible data", + to_replace->node_name, target_bs->node_name); + } bdrv_drained_end(target_bs); if (local_err) { error_report_err(local_err); From 3c7f75b3219e6bd59cd4c6867e695449048ac152 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:47 +0100 Subject: [PATCH 29/36] quorum: Stop marking it as a filter Quorum is not a filter, for example because it cannot guarantee which of its children will serve the next request. Thus, any of its children may differ from the data visible to quorum's parents. We have other filters with multiple children, but they differ in this aspect: - blkverify quits the whole qemu process if its children differ. As such, we can always skip it when we want to skip it (as a filter node) by going to any of its children. Both have the same data. - replication generally serves requests from bs->file, so this is its only actually filtered child. - Block job filters currently only have one child, but they will probably get more children in the future. Still, they will always have only one actually filtered child. Having "filters" as a dedicated node category only makes sense if you can skip them by going to a one fixed child that always shows the same data as the filter node. Quorum cannot fulfill this, so it is not a filter. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-13-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- block/quorum.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/quorum.c b/block/quorum.c index f57b0402d9..6d7a56bd93 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -1198,7 +1198,6 @@ static BlockDriver bdrv_quorum = { .bdrv_child_perm = quorum_child_perm, - .is_filter = true, .bdrv_recurse_can_replace = quorum_recurse_can_replace, .strong_runtime_opts = quorum_strong_runtime_opts, From 6644d0e6192b36cdf2902c9774e1afb8ab2e7223 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:48 +0100 Subject: [PATCH 30/36] iotests: Use complete_and_wait() in 155 This way, we get to see errors during the completion phase. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-14-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/155 | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155 index e35b1d534b..f237868710 100755 --- a/tests/qemu-iotests/155 +++ b/tests/qemu-iotests/155 @@ -163,12 +163,7 @@ class MirrorBaseClass(BaseClass): self.assert_qmp(result, 'return', {}) - self.vm.event_wait('BLOCK_JOB_READY') - - result = self.vm.qmp('block-job-complete', device='mirror-job') - self.assert_qmp(result, 'return', {}) - - self.vm.event_wait('BLOCK_JOB_COMPLETED') + self.complete_and_wait('mirror-job') def testFull(self): self.runMirror('full') From 6a3d0f1e3fc6fa8c7d859951816858f3afc4a081 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:49 +0100 Subject: [PATCH 31/36] iotests: Add VM.assert_block_path() Signed-off-by: Max Reitz Message-Id: <20200218103454.296704-15-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/iotests.py | 59 +++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py index 0473e824ed..8815052eb5 100644 --- a/tests/qemu-iotests/iotests.py +++ b/tests/qemu-iotests/iotests.py @@ -714,6 +714,65 @@ class VM(qtest.QEMUQtestMachine): return fields.items() <= ret.items() + def assert_block_path(self, root, path, expected_node, graph=None): + """ + Check whether the node under the given path in the block graph + is @expected_node. + + @root is the node name of the node where the @path is rooted. + + @path is a string that consists of child names separated by + slashes. It must begin with a slash. + + Examples for @root + @path: + - root="qcow2-node", path="/backing/file" + - root="quorum-node", path="/children.2/file" + + Hypothetically, @path could be empty, in which case it would + point to @root. However, in practice this case is not useful + and hence not allowed. + + @expected_node may be None. (All elements of the path but the + leaf must still exist.) + + @graph may be None or the result of an x-debug-query-block-graph + call that has already been performed. + """ + if graph is None: + graph = self.qmp('x-debug-query-block-graph')['return'] + + iter_path = iter(path.split('/')) + + # Must start with a / + assert next(iter_path) == '' + + node = next((node for node in graph['nodes'] if node['name'] == root), + None) + + # An empty @path is not allowed, so the root node must be present + assert node is not None, 'Root node %s not found' % root + + for child_name in iter_path: + assert node is not None, 'Cannot follow path %s%s' % (root, path) + + try: + node_id = next(edge['child'] for edge in graph['edges'] \ + if edge['parent'] == node['id'] and + edge['name'] == child_name) + + node = next(node for node in graph['nodes'] \ + if node['id'] == node_id) + except StopIteration: + node = None + + if node is None: + assert expected_node is None, \ + 'No node found under %s (but expected %s)' % \ + (path, expected_node) + else: + assert node['name'] == expected_node, \ + 'Found node %s under %s (but expected %s)' % \ + (node['name'], path, expected_node) index_re = re.compile(r'([^\[]+)\[([^\]]+)\]') From 5d016a69e32dab4d796db69c9776688637858602 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:50 +0100 Subject: [PATCH 32/36] iotests/041: Drop superfluous shutdowns All tearDowns in 041 shutdown the VM. Thus, test cases do not need to do it themselves (unless they need the VM to be down for some post-operation check). Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-16-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index aa7d54d968..7b2cf5c2f8 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -80,7 +80,6 @@ class TestSingleDrive(iotests.QMPTestCase): self.cancel_and_wait(force=True) result = self.vm.qmp('query-block') self.assert_qmp(result, 'return[0]/inserted/file', test_img) - self.vm.shutdown() def test_cancel_after_ready(self): self.assert_no_active_block_jobs() @@ -201,8 +200,6 @@ class TestSingleDrive(iotests.QMPTestCase): self.assert_qmp(result, 'return[0]/node-name', 'top') self.assert_qmp(result, 'return[0]/backing/node-name', 'base') - self.vm.shutdown() - def test_medium_not_found(self): if iotests.qemu_default_machine != 'pc': return @@ -455,7 +452,6 @@ new_state = "1" self.assert_qmp(event, 'data/id', 'drive0') self.assert_no_active_block_jobs() - self.vm.shutdown() def test_ignore_read(self): self.assert_no_active_block_jobs() @@ -475,7 +471,6 @@ new_state = "1" result = self.vm.qmp('query-block-jobs') self.assert_qmp(result, 'return[0]/paused', False) self.complete_and_wait() - self.vm.shutdown() def test_large_cluster(self): self.assert_no_active_block_jobs() @@ -540,7 +535,6 @@ new_state = "1" self.complete_and_wait(wait_ready=False) self.assert_no_active_block_jobs() - self.vm.shutdown() class TestWriteErrors(iotests.QMPTestCase): image_len = 2 * 1024 * 1024 # MB @@ -614,7 +608,6 @@ new_state = "1" completed = True self.assert_no_active_block_jobs() - self.vm.shutdown() def test_ignore_write(self): self.assert_no_active_block_jobs() @@ -631,7 +624,6 @@ new_state = "1" result = self.vm.qmp('query-block-jobs') self.assert_qmp(result, 'return[0]/paused', False) self.complete_and_wait() - self.vm.shutdown() def test_stop_write(self): self.assert_no_active_block_jobs() @@ -667,7 +659,6 @@ new_state = "1" self.complete_and_wait(wait_ready=False) self.assert_no_active_block_jobs() - self.vm.shutdown() class TestSetSpeed(iotests.QMPTestCase): image_len = 80 * 1024 * 1024 # MB @@ -936,7 +927,6 @@ class TestRepairQuorum(iotests.QMPTestCase): # here we check that the last registered quorum file has not been # swapped out and unref self.assert_has_block_node(None, quorum_img3) - self.vm.shutdown() def test_cancel_after_ready(self): self.assert_no_active_block_jobs() @@ -1043,7 +1033,6 @@ class TestRepairQuorum(iotests.QMPTestCase): self.assert_has_block_node("repair0", quorum_repair_img) # TODO: a better test requiring some QEMU infrastructure will be added # to check that this file is really driven by quorum - self.vm.shutdown() # Test mirroring with a source that does not have any parents (not even a # BlockBackend) From c351afd6f3fe068ff370d872fae4ab096bba0e92 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:51 +0100 Subject: [PATCH 33/36] iotests: Resolve TODOs in 041 Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-17-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 7b2cf5c2f8..084da6baf3 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -909,8 +909,7 @@ class TestRepairQuorum(iotests.QMPTestCase): self.complete_and_wait(drive="job0") self.assert_has_block_node("repair0", quorum_repair_img) - # TODO: a better test requiring some QEMU infrastructure will be added - # to check that this file is really driven by quorum + self.vm.assert_block_path('quorum0', '/children.1', 'repair0') self.vm.shutdown() self.assertTrue(iotests.compare_images(quorum_img2, quorum_repair_img), 'target image does not match source after mirroring') @@ -1031,8 +1030,7 @@ class TestRepairQuorum(iotests.QMPTestCase): self.complete_and_wait('job0') self.assert_has_block_node("repair0", quorum_repair_img) - # TODO: a better test requiring some QEMU infrastructure will be added - # to check that this file is really driven by quorum + self.vm.assert_block_path('quorum0', '/children.1', 'repair0') # Test mirroring with a source that does not have any parents (not even a # BlockBackend) From 89e2194524b3b503775fe62f507242227d89827c Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:52 +0100 Subject: [PATCH 34/36] iotests: Use self.image_len in TestRepairQuorum 041's TestRepairQuorum has its own image_len, no need to refer to TestSingleDrive. (This patch allows commenting out TestSingleDrive to speed up 041 during test testing.) Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-18-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 084da6baf3..1d9e64ff6d 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -872,7 +872,7 @@ class TestRepairQuorum(iotests.QMPTestCase): # Add each individual quorum images for i in self.IMAGES: qemu_img('create', '-f', iotests.imgfmt, i, - str(TestSingleDrive.image_len)) + str(self.image_len)) # Assign a node name to each quorum image in order to manipulate # them opts = "node-name=img%i" % self.IMAGES.index(i) From a1da1878607adb9f5ab8e58e61ff987d90730fa9 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:53 +0100 Subject: [PATCH 35/36] iotests: Add tests for invalid Quorum @replaces Add two tests to see that you cannot replace a Quorum child with the mirror job while the child is in use by a different parent. Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-19-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 70 +++++++++++++++++++++++++++++++++++++- tests/qemu-iotests/041.out | 4 +-- 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 1d9e64ff6d..7af6de9124 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -20,6 +20,7 @@ import time import os +import re import iotests from iotests import qemu_img, qemu_io @@ -34,6 +35,8 @@ quorum_img3 = os.path.join(iotests.test_dir, 'quorum3.img') quorum_repair_img = os.path.join(iotests.test_dir, 'quorum_repair.img') quorum_snapshot_file = os.path.join(iotests.test_dir, 'quorum_snapshot.img') +nbd_sock_path = os.path.join(iotests.test_dir, 'nbd.sock') + class TestSingleDrive(iotests.QMPTestCase): image_len = 1 * 1024 * 1024 # MB qmp_cmd = 'drive-mirror' @@ -892,7 +895,8 @@ class TestRepairQuorum(iotests.QMPTestCase): def tearDown(self): self.vm.shutdown() - for i in self.IMAGES + [ quorum_repair_img, quorum_snapshot_file ]: + for i in self.IMAGES + [ quorum_repair_img, quorum_snapshot_file, + nbd_sock_path ]: # Do a try/except because the test may have deleted some images try: os.remove(i) @@ -1032,6 +1036,70 @@ class TestRepairQuorum(iotests.QMPTestCase): self.assert_has_block_node("repair0", quorum_repair_img) self.vm.assert_block_path('quorum0', '/children.1', 'repair0') + def test_with_other_parent(self): + """ + Check that we cannot replace a Quorum child when it has other + parents. + """ + result = self.vm.qmp('nbd-server-start', + addr={ + 'type': 'unix', + 'data': {'path': nbd_sock_path} + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('nbd-server-add', device='img1') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('drive-mirror', job_id='mirror', device='quorum0', + sync='full', node_name='repair0', replaces='img1', + target=quorum_repair_img, format=iotests.imgfmt) + self.assert_qmp(result, 'error/desc', + "Cannot replace 'img1' by a node mirrored from " + "'quorum0', because it cannot be guaranteed that doing " + "so would not lead to an abrupt change of visible data") + + def test_with_other_parents_after_mirror_start(self): + """ + The same as test_with_other_parent(), but add the NBD server + only when the mirror job is already running. + """ + result = self.vm.qmp('nbd-server-start', + addr={ + 'type': 'unix', + 'data': {'path': nbd_sock_path} + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('drive-mirror', job_id='mirror', device='quorum0', + sync='full', node_name='repair0', replaces='img1', + target=quorum_repair_img, format=iotests.imgfmt) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('nbd-server-add', device='img1') + self.assert_qmp(result, 'return', {}) + + # The full error message goes to stderr, we will check it later + self.complete_and_wait('mirror', + completion_error='Operation not permitted') + + # Should not have been replaced + self.vm.assert_block_path('quorum0', '/children.1', 'img1') + + # Check the full error message now + self.vm.shutdown() + log = self.vm.get_log() + log = re.sub(r'^\[I \d+\.\d+\] OPENED\n', '', log) + log = re.sub(r'^Formatting.*\n', '', log) + log = re.sub(r'\n\[I \+\d+\.\d+\] CLOSED\n?$', '', log) + log = re.sub(r'^%s: ' % os.path.basename(iotests.qemu_prog), '', log) + + self.assertEqual(log, + "Can no longer replace 'img1' by 'repair0', because " + + "it can no longer be guaranteed that doing so would " + + "not lead to an abrupt change of visible data") + + # Test mirroring with a source that does not have any parents (not even a # BlockBackend) class TestOrphanedSource(iotests.QMPTestCase): diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index f496be9197..ffc779b4d1 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -........................................................................................... +............................................................................................. ---------------------------------------------------------------------- -Ran 91 tests +Ran 93 tests OK From c45a88f4429d7a8f384b75f3fd3fed5138a6edca Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Tue, 18 Feb 2020 11:34:54 +0100 Subject: [PATCH 36/36] iotests: Check that @replaces can replace filters Signed-off-by: Max Reitz Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200218103454.296704-20-mreitz@redhat.com> Signed-off-by: Kevin Wolf --- tests/qemu-iotests/041 | 46 ++++++++++++++++++++++++++++++++++++++ tests/qemu-iotests/041.out | 4 ++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041 index 7af6de9124..5d67bf14bf 100755 --- a/tests/qemu-iotests/041 +++ b/tests/qemu-iotests/041 @@ -1190,6 +1190,52 @@ class TestOrphanedSource(iotests.QMPTestCase): self.assertFalse('mirror-filter' in nodes, 'Mirror filter node did not disappear') +# Test cases for @replaces that do not necessarily involve Quorum +class TestReplaces(iotests.QMPTestCase): + # Each of these test cases needs their own block graph, so do not + # create any nodes here + def setUp(self): + self.vm = iotests.VM() + self.vm.launch() + + def tearDown(self): + self.vm.shutdown() + for img in (test_img, target_img): + try: + os.remove(img) + except OSError: + pass + + @iotests.skip_if_unsupported(['copy-on-read']) + def test_replace_filter(self): + """ + Check that we can replace filter nodes. + """ + result = self.vm.qmp('blockdev-add', **{ + 'driver': 'copy-on-read', + 'node-name': 'filter0', + 'file': { + 'driver': 'copy-on-read', + 'node-name': 'filter1', + 'file': { + 'driver': 'null-co' + } + } + }) + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-add', + node_name='target', driver='null-co') + self.assert_qmp(result, 'return', {}) + + result = self.vm.qmp('blockdev-mirror', job_id='mirror', device='filter0', + target='target', sync='full', replaces='filter1') + self.assert_qmp(result, 'return', {}) + + self.complete_and_wait('mirror') + + self.vm.assert_block_path('filter0', '/file', 'target') + if __name__ == '__main__': iotests.main(supported_fmts=['qcow2', 'qed'], supported_protocols=['file'], diff --git a/tests/qemu-iotests/041.out b/tests/qemu-iotests/041.out index ffc779b4d1..877b76fd31 100644 --- a/tests/qemu-iotests/041.out +++ b/tests/qemu-iotests/041.out @@ -1,5 +1,5 @@ -............................................................................................. +.............................................................................................. ---------------------------------------------------------------------- -Ran 93 tests +Ran 94 tests OK