diff --git a/arch_init.c b/arch_init.c index 80574a090c..fe1727922c 100644 --- a/arch_init.c +++ b/arch_init.c @@ -122,7 +122,6 @@ static void check_guest_throttling(void); #define RAM_SAVE_FLAG_XBZRLE 0x40 /* 0x80 is reserved in migration.h start with 0x100 next */ - static struct defconfig_file { const char *filename; /* Indicates it is an user config file (disabled by -no-user-config) */ @@ -133,6 +132,7 @@ static struct defconfig_file { { NULL }, /* end of list */ }; +static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE]; int qemu_read_default_config_files(bool userconfig) { @@ -273,6 +273,34 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset, return size; } +/* This is the last block that we have visited serching for dirty pages + */ +static RAMBlock *last_seen_block; +/* This is the last block from where we have sent data */ +static RAMBlock *last_sent_block; +static ram_addr_t last_offset; +static unsigned long *migration_bitmap; +static uint64_t migration_dirty_pages; +static uint32_t last_version; +static bool ram_bulk_stage; + +/* Update the xbzrle cache to reflect a page that's been sent as all 0. + * The important thing is that a stale (not-yet-0'd) page be replaced + * by the new data. + * As a bonus, if the page wasn't in the cache it gets added so that + * when a small write is made into the 0'd page it gets XBZRLE sent + */ +static void xbzrle_cache_zero_page(ram_addr_t current_addr) +{ + if (ram_bulk_stage || !migrate_use_xbzrle()) { + return; + } + + /* We don't care if this fails to allocate a new cache page + * as long as it updated an old one */ + cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE); +} + #define ENCODING_FLAG_XBZRLE 0x1 static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, @@ -329,18 +357,6 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, return bytes_sent; } - -/* This is the last block that we have visited serching for dirty pages - */ -static RAMBlock *last_seen_block; -/* This is the last block from where we have sent data */ -static RAMBlock *last_sent_block; -static ram_addr_t last_offset; -static unsigned long *migration_bitmap; -static uint64_t migration_dirty_pages; -static uint32_t last_version; -static bool ram_bulk_stage; - static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, ram_addr_t start) @@ -512,6 +528,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) } else { int ret; uint8_t *p; + bool send_async = true; int cont = (block == last_sent_block) ? RAM_SAVE_FLAG_CONTINUE : 0; @@ -522,6 +539,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) ret = ram_control_save_page(f, block->offset, offset, TARGET_PAGE_SIZE, &bytes_sent); + current_addr = block->offset + offset; if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { if (bytes_sent > 0) { @@ -536,19 +554,35 @@ static int ram_save_block(QEMUFile *f, bool last_stage) RAM_SAVE_FLAG_COMPRESS); qemu_put_byte(f, 0); bytes_sent++; + /* Must let xbzrle know, otherwise a previous (now 0'd) cached + * page would be stale + */ + xbzrle_cache_zero_page(current_addr); } else if (!ram_bulk_stage && migrate_use_xbzrle()) { - current_addr = block->offset + offset; bytes_sent = save_xbzrle_page(f, p, current_addr, block, offset, cont, last_stage); if (!last_stage) { + /* We must send exactly what's in the xbzrle cache + * even if the page wasn't xbzrle compressed, so that + * it's right next time. + */ p = get_cached_data(XBZRLE.cache, current_addr); + + /* Can't send this cached data async, since the cache page + * might get updated before it gets to the wire + */ + send_async = false; } } /* XBZRLE overflow or normal page */ if (bytes_sent == -1) { bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE); - qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + if (send_async) { + qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + } else { + qemu_put_buffer(f, p, TARGET_PAGE_SIZE); + } bytes_sent += TARGET_PAGE_SIZE; acct_info.norm_pages++; } diff --git a/docs/rdma.txt b/docs/rdma.txt index 2aca63bd72..1f5d9e9fe4 100644 --- a/docs/rdma.txt +++ b/docs/rdma.txt @@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely high-performance RDMA hardware using the following command: QEMU Monitor Command: -$ migrate_set_capability x-rdma-pin-all on # disabled by default +$ migrate_set_capability rdma-pin-all on # disabled by default Performing this action will cause all 8GB to be pinned, so if that's not what you want, then please ignore this step altogether. @@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device Next, on the destination machine, add the following to the QEMU command line: -qemu ..... -incoming x-rdma:host:port +qemu ..... -incoming rdma:host:port Finally, perform the actual migration on the source machine: QEMU Monitor Command: -$ migrate -d x-rdma:host:port +$ migrate -d rdma:host:port PERFORMANCE =========== @@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in active use and the VM itself is completely idle using the same 40 gbps infiniband link: -1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps -2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps +1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps +2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps These numbers would of course scale up to whatever size virtual machine you have to migrate using RDMA. @@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration. TODO: ===== -1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be - renamed to 'rdma' after the experimental phase of this work has - completed upstream. -2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits +1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits are not compatible with infinband memory pinning and will result in an aborted migration (but with the source VM left unaffected). -3. Use of the recent /proc//pagemap would likely speed up +2. Use of the recent /proc//pagemap would likely speed up the use of KSM and ballooning while using RDMA. -4. Also, some form of balloon-device usage tracking would also +3. Also, some form of balloon-device usage tracking would also help alleviate some issues. -5. Move UNREGISTER requests to a separate thread. -6. Use LRU to provide more fine-grained direction of UNREGISTER +4. Use LRU to provide more fine-grained direction of UNREGISTER requests for unpinning memory in an overcommitted environment. -7. Expose UNREGISTER support to the user by way of workload-specific +5. Expose UNREGISTER support to the user by way of workload-specific hints about application behavior. diff --git a/include/migration/page_cache.h b/include/migration/page_cache.h index d156f0d398..2d5ce2dd7a 100644 --- a/include/migration/page_cache.h +++ b/include/migration/page_cache.h @@ -66,7 +66,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr); * @addr: page address * @pdata: pointer to the page */ -int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata); +int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata); /** * cache_resize: resize the page cache. In case of size reduction the extra diff --git a/migration-rdma.c b/migration-rdma.c index f94f3b4e3a..eeb4302215 100644 --- a/migration-rdma.c +++ b/migration-rdma.c @@ -3412,7 +3412,7 @@ void rdma_start_outgoing_migration(void *opaque, } ret = qemu_rdma_source_init(rdma, &local_err, - s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]); + s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]); if (ret) { goto err; diff --git a/migration.c b/migration.c index 25add6f9e2..14235b280a 100644 --- a/migration.c +++ b/migration.c @@ -82,7 +82,7 @@ void qemu_start_incoming_migration(const char *uri, Error **errp) if (strstart(uri, "tcp:", &p)) tcp_start_incoming_migration(p, errp); #ifdef CONFIG_RDMA - else if (strstart(uri, "x-rdma:", &p)) + else if (strstart(uri, "rdma:", &p)) rdma_start_incoming_migration(p, errp); #endif #if !defined(WIN32) @@ -438,7 +438,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, if (strstart(uri, "tcp:", &p)) { tcp_start_outgoing_migration(s, p, &local_err); #ifdef CONFIG_RDMA - } else if (strstart(uri, "x-rdma:", &p)) { + } else if (strstart(uri, "rdma:", &p)) { rdma_start_outgoing_migration(s, p, &local_err); #endif #if !defined(WIN32) @@ -532,7 +532,7 @@ bool migrate_rdma_pin_all(void) s = migrate_get_current(); - return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]; + return s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]; } bool migrate_auto_converge(void) diff --git a/page_cache.c b/page_cache.c index 3ef6ee7ad2..b033681a93 100644 --- a/page_cache.c +++ b/page_cache.c @@ -150,7 +150,7 @@ uint8_t *get_cached_data(const PageCache *cache, uint64_t addr) return cache_get_by_addr(cache, addr)->it_data; } -int cache_insert(PageCache *cache, uint64_t addr, uint8_t *pdata) +int cache_insert(PageCache *cache, uint64_t addr, const uint8_t *pdata) { CacheItem *it = NULL; diff --git a/qapi-schema.json b/qapi-schema.json index fcb2280053..ac8ad24966 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -751,10 +751,9 @@ # This feature allows us to minimize migration traffic for certain work # loads, by sending compressed difference of the pages # -# @x-rdma-pin-all: Controls whether or not the entire VM memory footprint is +# @rdma-pin-all: Controls whether or not the entire VM memory footprint is # mlock()'d on demand or all at once. Refer to docs/rdma.txt for usage. -# Disabled by default. Experimental: may (or may not) be renamed after -# further testing is complete. (since 1.6) +# Disabled by default. (since 2.0) # # @zero-blocks: During storage migration encode blocks of zeroes efficiently. This # essentially saves 1MB of zeroes per block on the wire. Enabling requires @@ -768,7 +767,7 @@ # Since: 1.2 ## { 'enum': 'MigrationCapability', - 'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge', 'zero-blocks'] } + 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks'] } ## # @MigrationCapabilityStatus diff --git a/qemu-file.c b/qemu-file.c index 9473b674ba..f074af15c3 100644 --- a/qemu-file.c +++ b/qemu-file.c @@ -100,7 +100,14 @@ static int stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, int size) { QEMUFileStdio *s = opaque; - return fwrite(buf, 1, size, s->stdio_file); + int res; + + res = fwrite(buf, 1, size, s->stdio_file); + + if (res != size) { + return -EIO; /* fake errno value */ + } + return res; } static int stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, int size) diff --git a/vmstate.c b/vmstate.c index 284b080f46..d1f5eb0e6a 100644 --- a/vmstate.c +++ b/vmstate.c @@ -321,23 +321,24 @@ const VMStateInfo vmstate_info_int32_equal = { .put = put_int32, }; -/* 32 bit int. See that the received value is the less or the same - than the one in the field */ +/* 32 bit int. Check that the received value is less than or equal to + the one in the field */ static int get_int32_le(QEMUFile *f, void *pv, size_t size) { - int32_t *old = pv; - int32_t new; - qemu_get_sbe32s(f, &new); + int32_t *cur = pv; + int32_t loaded; + qemu_get_sbe32s(f, &loaded); - if (*old <= new) { + if (loaded <= *cur) { + *cur = loaded; return 0; } return -EINVAL; } const VMStateInfo vmstate_info_int32_le = { - .name = "int32 equal", + .name = "int32 le", .get = get_int32_le, .put = put_int32, };