From aded6539d983280212e08d09f14157b1cb4d58cc Mon Sep 17 00:00:00 2001
From: Juan Quintela
Date: Tue, 11 Feb 2014 22:56:00 +0100
Subject: [PATCH 1/4] qemu_file: use fwrite() correctly

fwrite() returns the number of items written.  But when there is an
error, it can return a short write.

In the particular bug that I was tracking, I did a migration to a
read-only filesystem, and it was able to "complete" the migration
with success: fwrite() never returned a negative error code, nor
zero, always 4096 (migration writes chunks of about 14000 bytes).
Reading the file back was, of course, a bit more difficult.

To add insult to injury, if your amount of memory was big enough
(12GB in my case), it overwrote some important structure, and from
there on, malloc failed.  This check makes the problem go away.

Signed-off-by: Juan Quintela
Reviewed-by: Eric Blake
Reviewed-by: Markus Armbruster
Signed-off-by: Juan Quintela
---
 qemu-file.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/qemu-file.c b/qemu-file.c
index 9473b674ba..f074af15c3 100644
--- a/qemu-file.c
+++ b/qemu-file.c
@@ -100,7 +100,14 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos,
                             int size)
 {
     QEMUFileStdio *s = opaque;
-    return fwrite(buf, 1, size, s->stdio_file);
+    int res;
+
+    res = fwrite(buf, 1, size, s->stdio_file);
+
+    if (res != size) {
+        return -EIO; /* fake errno value */
+    }
+    return res;
 }
 
 static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size)
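For illustration, a minimal standalone sketch of the pattern the patch
adopts (put_buffer_checked() is a hypothetical name, not a QEMU
function): a caller that only checks fwrite()'s result for "negative or
zero" never notices a short write, so the short-write case has to be
turned into an explicit error.

#include <errno.h>
#include <stdio.h>

static int put_buffer_checked(FILE *f, const unsigned char *buf, int size)
{
    /* fwrite() reports the number of items written; on error this can
     * be any value in [0, size), never a negative errno value. */
    int res = fwrite(buf, 1, size, f);

    if (res != size) {
        return -EIO; /* fake errno value, as in the patch */
    }
    return res;
}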
From 24a370ef2351dc596a7e47508b952ddfba79ef94 Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Wed, 12 Feb 2014 17:20:10 +0000
Subject: [PATCH 2/4] Fix vmstate_info_int32_le comparison/assign

Fix the comparison in vmstate_info_int32_le so that it succeeds if the
loaded value is (l)ess than or (e)qual to the value in the field.

When the comparison succeeds, assign the loaded value.  This is a
change in behaviour, but I think it matches the original intent: the
idea is to check that the version/size of the thing you're loading is
below some limit, but you may well want to act on the actual
version/size found in the file.

Fix up the comment and name text as well.

Signed-off-by: Dr. David Alan Gilbert
Signed-off-by: Juan Quintela
---
 vmstate.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/vmstate.c b/vmstate.c
index 284b080f46..d1f5eb0e6a 100644
--- a/vmstate.c
+++ b/vmstate.c
@@ -321,23 +321,24 @@ const VMStateInfo vmstate_info_int32_equal = {
     .put = put_int32,
 };
 
-/* 32 bit int. See that the received value is the less or the same
-   than the one in the field */
+/* 32 bit int. Check that the received value is less than or equal to
+   the one in the field */
 static int get_int32_le(QEMUFile *f, void *pv, size_t size)
 {
-    int32_t *old = pv;
-    int32_t new;
-    qemu_get_sbe32s(f, &new);
+    int32_t *cur = pv;
+    int32_t loaded;
+    qemu_get_sbe32s(f, &loaded);
 
-    if (*old <= new) {
+    if (loaded <= *cur) {
+        *cur = loaded;
         return 0;
    }
     return -EINVAL;
 }
 
 const VMStateInfo vmstate_info_int32_le = {
-    .name = "int32 equal",
+    .name = "int32 le",
     .get = get_int32_le,
     .put = put_int32,
 };
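The new semantics in a nutshell (a minimal sketch with the QEMUFile
plumbing stripped away; check_int32_le() is a hypothetical stand-in for
get_int32_le()): loading succeeds only when the incoming value does not
exceed the field's current value, and on success the field is updated
to what was actually found in the file.

#include <errno.h>
#include <stdint.h>

static int check_int32_le(int32_t *cur, int32_t loaded)
{
    if (loaded <= *cur) {
        *cur = loaded;  /* new behaviour: act on the file's value */
        return 0;
    }
    return -EINVAL;     /* loaded value exceeds our limit */
}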
From 6d3cb1f970ee85361618f7ff02869180394e012d Mon Sep 17 00:00:00 2001
From: "Dr. David Alan Gilbert"
Date: Thu, 13 Feb 2014 19:44:45 +0000
Subject: [PATCH 3/4] Fix two XBZRLE corruption issues

Push zeroed pages into the XBZRLE cache: a page that was cached by
XBZRLE, zeroed, and then XBZRLE'd again was being compared against a
stale cache value.

Don't use qemu_put_buffer_async() to put pages from the XBZRLE cache,
since the cache might change before the data hits the wire.

Signed-off-by: Dr. David Alan Gilbert
Signed-off-by: Juan Quintela
---
 arch_init.c                    | 64 ++++++++++++++++++++++++++--------
 include/migration/page_cache.h |  2 +-
 page_cache.c                   |  2 +-
 3 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 80574a090c..fe1727922c 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -122,7 +122,6 @@ static void check_guest_throttling(void);
 #define RAM_SAVE_FLAG_XBZRLE   0x40
 /* 0x80 is reserved in migration.h start with 0x100 next */
 
-
 static struct defconfig_file {
     const char *filename;
     /* Indicates it is an user config file (disabled by -no-user-config) */
@@ -133,6 +132,7 @@ static struct defconfig_file {
     { NULL }, /* end of list */
 };
 
+static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
 
 int qemu_read_default_config_files(bool userconfig)
 {
@@ -273,6 +273,34 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
     return size;
 }
 
+/* This is the last block that we have visited searching for dirty pages
+ */
+static RAMBlock *last_seen_block;
+/* This is the last block from where we have sent data */
+static RAMBlock *last_sent_block;
+static ram_addr_t last_offset;
+static unsigned long *migration_bitmap;
+static uint64_t migration_dirty_pages;
+static uint32_t last_version;
+static bool ram_bulk_stage;
+
+/* Update the xbzrle cache to reflect a page that's been sent as all 0.
+ * The important thing is that a stale (not-yet-0'd) page be replaced
+ * by the new data.
+ * As a bonus, if the page wasn't in the cache it gets added so that
+ * when a small write is made into the 0'd page it gets XBZRLE sent
+ */
+static void xbzrle_cache_zero_page(ram_addr_t current_addr)
+{
+    if (ram_bulk_stage || !migrate_use_xbzrle()) {
+        return;
+    }
+
+    /* We don't care if this fails to allocate a new cache page
+     * as long as it updated an old one */
+    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE);
+}
+
 #define ENCODING_FLAG_XBZRLE 0x1
 
 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
@@ -329,18 +357,6 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
     return bytes_sent;
 }
 
-
-/* This is the last block that we have visited serching for dirty pages
- */
-static RAMBlock *last_seen_block;
-/* This is the last block from where we have sent data */
-static RAMBlock *last_sent_block;
-static ram_addr_t last_offset;
-static unsigned long *migration_bitmap;
-static uint64_t migration_dirty_pages;
-static uint32_t last_version;
-static bool ram_bulk_stage;
-
 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
                                                  ram_addr_t start)
@@ -512,6 +528,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
         } else {
             int ret;
             uint8_t *p;
+            bool send_async = true;
             int cont = (block == last_sent_block) ?
                 RAM_SAVE_FLAG_CONTINUE : 0;
@@ -522,6 +539,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 
             ret = ram_control_save_page(f, block->offset,
                                         offset, TARGET_PAGE_SIZE, &bytes_sent);
 
+            current_addr = block->offset + offset;
             if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
                 if (ret != RAM_SAVE_CONTROL_DELAYED) {
                     if (bytes_sent > 0) {
@@ -536,19 +554,35 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
                                          RAM_SAVE_FLAG_COMPRESS);
                 qemu_put_byte(f, 0);
                 bytes_sent++;
+                /* Must let xbzrle know, otherwise a previous (now 0'd) cached
+                 * page would be stale
+                 */
+                xbzrle_cache_zero_page(current_addr);
             } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
-                current_addr = block->offset + offset;
                 bytes_sent = save_xbzrle_page(f, p, current_addr, block,
                                               offset, cont, last_stage);
                 if (!last_stage) {
+                    /* We must send exactly what's in the xbzrle cache
+                     * even if the page wasn't xbzrle compressed, so that
+                     * it's right next time.
+                     */
                     p = get_cached_data(XBZRLE.cache, current_addr);
+
+                    /* Can't send this cached data async, since the cache page
+                     * might get updated before it gets to the wire
+                     */
+                    send_async = false;
                 }
             }
 
             /* XBZRLE overflow or normal page */
             if (bytes_sent == -1) {
                 bytes_sent = save_block_hdr(f, block, offset, cont,
                                             RAM_SAVE_FLAG_PAGE);
-                qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+                if (send_async) {
+                    qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+                } else {
+                    qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
+                }
                 bytes_sent += TARGET_PAGE_SIZE;
                 acct_info.norm_pages++;
             }
diff --git a/include/migration/page_cache.h b/include/migration/page_cache.h
index d156f0d398..2d5ce2dd7a 100644
--- a/include/migration/page_cache.h
+++ b/include/migration/page_cache.h
@@ -66,7 +66,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr);
  * @addr: page address
  * @pdata: pointer to the page
  */
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata);
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata);
 
 /**
  * cache_resize: resize the page cache. In case of size reduction the extra
diff --git a/page_cache.c b/page_cache.c
index 3ef6ee7ad2..b033681a93 100644
--- a/page_cache.c
+++ b/page_cache.c
@@ -150,7 +150,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr)
     return cache_get_by_addr(cache, addr)->it_data;
 }
 
-int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata)
+int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata)
 {
     CacheItem *it = NULL;
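The first fix is easiest to see as a sequence of migration rounds.
Below is a self-contained toy model of the invariant (plain C, not QEMU
code; the real cache is in page_cache.c and the real update is
xbzrle_cache_zero_page() above): the source's XBZRLE cache must mirror
what the destination last received, otherwise a later diff is computed
against the wrong reference page.

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define PAGE 8  /* toy page size */

int main(void)
{
    uint8_t src_cache[PAGE] = "AAAAAAA";  /* round 1: page sent via
                                           * XBZRLE, cached as D1 */
    uint8_t dst_page[PAGE]  = "AAAAAAA";  /* destination matches */

    /* Round 2: the page becomes all zeroes and is sent via the zero
     * page fast path, so the destination now holds zeroes... */
    memset(dst_page, 0, PAGE);

    /* ...which means the source cache must be updated too (the fix;
     * without this line, src_cache still holds stale D1): */
    memset(src_cache, 0, PAGE);           /* xbzrle_cache_zero_page() */

    /* Round 3: both sides agree on the reference page again, so an
     * XBZRLE diff computed against src_cache applies cleanly. */
    assert(memcmp(src_cache, dst_page, PAGE) == 0);
    return 0;
}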
From 41310c68781d742fa9bbfd5fcb1df9b7f23f5759 Mon Sep 17 00:00:00 2001
From: "Michael R. Hines"
Date: Thu, 19 Dec 2013 04:52:01 +0800
Subject: [PATCH 4/4] rdma: rename 'x-rdma' => 'rdma'

As far as we can tell, all known bugs have been fixed:

1. Parallel migrations are working
2. IPv6 migration is working
3. virt-test is working

I'm not comfortable sending the revised libvirt patch until this one
is accepted or the review suggestions are addressed, including
pin-all support: it does not make sense to drop the experimental
prefix for one thing and not the other, and that would mean too many
trips through the libvirt community.

Reviewed-by: Eric Blake
Signed-off-by: Michael R. Hines
Signed-off-by: Juan Quintela
---
 docs/rdma.txt    | 24 ++++++++++--------------
 migration-rdma.c |  2 +-
 migration.c      |  6 +++---
 qapi-schema.json |  7 +++----
 4 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/docs/rdma.txt b/docs/rdma.txt
index 2aca63bd72..1f5d9e9fe4 100644
--- a/docs/rdma.txt
+++ b/docs/rdma.txt
@@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely
 high-performance RDMA hardware using the following command:
 
 QEMU Monitor Command:
-$ migrate_set_capability x-rdma-pin-all on # disabled by default
+$ migrate_set_capability rdma-pin-all on # disabled by default
 
 Performing this action will cause all 8GB to be pinned, so if that's
 not what you want, then please ignore this step altogether.
@@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device
 
 Next, on the destination machine, add the following to the QEMU command line:
 
-qemu ..... -incoming x-rdma:host:port
+qemu ..... -incoming rdma:host:port
 
 Finally, perform the actual migration on the source machine:
 
 QEMU Monitor Command:
-$ migrate -d x-rdma:host:port
+$ migrate -d rdma:host:port
 
 PERFORMANCE
 ===========
@@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in
 active use and the VM itself is completely idle using the same 40 gbps
 infiniband link:
 
-1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
-2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
+1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps
+2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps
 
 These numbers would of course scale up to whatever size virtual machine
 you have to migrate using RDMA.
@@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration.
 
 TODO:
 =====
-1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be
-   renamed to 'rdma' after the experimental phase of this work has
-   completed upstream.
-2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
+1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits
    are not compatible with infinband memory pinning and will result in
    an aborted migration (but with the source VM left unaffected).
-3. Use of the recent /proc/<pid>/pagemap would likely speed up
+2. Use of the recent /proc/<pid>/pagemap would likely speed up
    the use of KSM and ballooning while using RDMA.
-4. Also, some form of balloon-device usage tracking would also
+3. Also, some form of balloon-device usage tracking would also
    help alleviate some issues.
-5. Move UNREGISTER requests to a separate thread.
-6. Use LRU to provide more fine-grained direction of UNREGISTER
+4. Use LRU to provide more fine-grained direction of UNREGISTER
    requests for unpinning memory in an overcommitted environment.
-7. Expose UNREGISTER support to the user by way of workload-specific
+5. Expose UNREGISTER support to the user by way of workload-specific
    hints about application behavior.
diff --git a/migration-rdma.c b/migration-rdma.c
index f94f3b4e3a..eeb4302215 100644
--- a/migration-rdma.c
+++ b/migration-rdma.c
@@ -3412,7 +3412,7 @@ void rdma_start_outgoing_migration(void *opaque,
     }
 
     ret = qemu_rdma_source_init(rdma, &local_err,
-        s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);
+        s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);
 
     if (ret) {
         goto err;
diff --git a/migration.c b/migration.c
index 25add6f9e2..14235b280a 100644
--- a/migration.c
+++ b/migration.c
@@ -82,7 +82,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
     if (strstart(uri, "tcp:", &p))
         tcp_start_incoming_migration(p, errp);
 #ifdef CONFIG_RDMA
-    else if (strstart(uri, "x-rdma:", &p))
+    else if (strstart(uri, "rdma:", &p))
         rdma_start_incoming_migration(p, errp);
 #endif
 #if !defined(WIN32)
@@ -438,7 +438,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     if (strstart(uri, "tcp:", &p)) {
         tcp_start_outgoing_migration(s, p, &local_err);
 #ifdef CONFIG_RDMA
-    } else if (strstart(uri, "x-rdma:", &p)) {
+    } else if (strstart(uri, "rdma:", &p)) {
         rdma_start_outgoing_migration(s, p, &local_err);
 #endif
 #if !defined(WIN32)
@@ -532,7 +532,7 @@ bool migrate_rdma_pin_all(void)
 
     s = migrate_get_current();
 
-    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL];
 }
 
 bool migrate_auto_converge(void)
diff --git a/qapi-schema.json b/qapi-schema.json
index fcb2280053..ac8ad24966 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -751,10 +751,9 @@
 # This feature allows us to minimize migration traffic for certain work
 # loads, by sending compressed difference of the pages
 #
-# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is
+# @rdma-pin-all: Controls whether or not the entire VM memory footprint is
 #          mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage.
-#          Disabled by default. Experimental: may (or may not) be renamed after
-#          further testing is complete. (since 1.6)
+#          Disabled by default. (since 2.0)
 #
 # @zero-blocks: During storage migration encode blocks of zeroes efficiently. This
 #          essentially saves 1MB of zeroes per block on the wire. Enabling requires
@@ -768,7 +767,7 @@
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] }
+  'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] }
 
 ##
 # @MigrationCapabilityStatus
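After this rename, management software selects the capability and the
transport under their stable names. A sketch of the corresponding QMP
exchange, using the migrate-set-capabilities and migrate commands
already declared in qapi-schema.json (host and port are placeholders):

{ "execute": "migrate-set-capabilities",
  "arguments": { "capabilities": [
      { "capability": "rdma-pin-all", "state": true } ] } }

{ "execute": "migrate",
  "arguments": { "uri": "rdma:dest-host:4444" } }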