diff --git a/balloon.c b/balloon.c index 5d69e8a00b..0f45d1b5c4 100644 --- a/balloon.c +++ b/balloon.c @@ -36,6 +36,17 @@ static QEMUBalloonEvent *balloon_event_fn; static QEMUBalloonStatus *balloon_stat_fn; static void *balloon_opaque; +static bool balloon_inhibited; + +bool qemu_balloon_is_inhibited(void) +{ + return balloon_inhibited; +} + +void qemu_balloon_inhibit(bool state) +{ + balloon_inhibited = state; +} static bool have_balloon(Error **errp) { diff --git a/docs/migration.txt b/docs/migration.txt index f6df4beb2a..fda8d61d69 100644 --- a/docs/migration.txt +++ b/docs/migration.txt @@ -291,3 +291,194 @@ save/send this state when we are in the middle of a pio operation (that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is not enabled, the values on that fields are garbage and don't need to be sent. + += Return path = + +In most migration scenarios there is only a single data path that runs +from the source VM to the destination, typically along a single fd (although +possibly with another fd or similar for some fast way of throwing pages across). + +However, some uses need two way communication; in particular the Postcopy +destination needs to be able to request pages on demand from the source. + +For these scenarios there is a 'return path' from the destination to the source; +qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return +path. + + Source side + Forward path - written by migration thread + Return path - opened by main thread, read by return-path thread + + Destination side + Forward path - read by main thread + Return path - opened by main thread, written by main thread AND postcopy + thread (protected by rp_mutex) + += Postcopy = +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side or the network connection causes +the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. + +=== Enabling postcopy === + +To enable postcopy, issue this command on the monitor prior to the +start of migration: + +migrate_set_capability x-postcopy-ram on + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +migrate_start_postcopy + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Note: During the postcopy phase, the bandwidth limits set using +migrate_set_speed is ignored (to avoid delaying requested pages that +the destination is waiting for). + +=== Postcopy device transfer === + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. 
This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + Command: 'postcopy listen' + The device state + A series of sections, identical to the precopy streams device state stream + containing everything except postcopiable devices (i.e. RAM) + Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the +contents are formatted in the same way as the main migration stream. + +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. + +------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 +main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) +thread | | + | (page request) + | \___ + v \ +listen thread: --- page -- page -- page -- page -- page -- + + a b c +------------------------------------------------------------------------------ + +On receipt of CMD_PACKAGED (1) + All the data associated with the package - the ( ... ) section in the +diagram - is read into memory (into a QEMUSizedBuffer), and the main thread +recurses into qemu_loadvm_state_main to process the contents of the package (2) +which contains commands (3,6) and devices (4...) + +On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) +a new thread (a) is started that takes over servicing the migration stream, +while the main thread carries on loading the package. It loads normal +background page data (b) but if during a device load a fault happens (5) the +returned page (c) is loaded by the listen thread allowing the main threads +device load to carry on. + +The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination +CPUs start running. +At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour +and is no longer used by migration, while the listen thread carries +on servicing page data until the end of migration. + +=== Postcopy states === + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + Advise: Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. 
+ (Triggered by reception of POSTCOPY_ADVISE command) + + Discard: Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + Listen: The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + Running: POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + End: The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. + +=== Source side page maps === + +The source side keeps two bitmaps during postcopy; 'the migration bitmap' +and 'unsent map'. The 'migration bitmap' is basically the same as in +the precopy case, and holds a bit to indicate that page is 'dirty' - +i.e. needs sending. During the precopy phase this is updated as the CPU +dirties pages, however during postcopy the CPUs are stopped and nothing +should dirty anything any more. + +The 'unsent map' is used for the transition to postcopy. It is a bitmap that +has a bit cleared whenever a page is sent to the destination, however during +the transition to postcopy mode it is combined with the migration bitmap +to form a set of pages that: + a) Have been sent but then redirtied (which must be discarded) + b) Have not yet been sent - which also must be discarded to cause any + transparent huge pages built during precopy to be broken. + +Note that the contents of the unsentmap are sacrificed during the calculation +of the discard set and thus aren't valid once in postcopy. The dirtymap +is still valid and is used to ensure that no page is sent more than once. Any +request for a page that has already been sent is ignored. Duplicate requests +such as this can happen as a page is sent at about the same time the +destination accesses it. + diff --git a/exec.c b/exec.c index a028961587..b09f18b2a4 100644 --- a/exec.c +++ b/exec.c @@ -1377,6 +1377,11 @@ static RAMBlock *find_ram_block(ram_addr_t addr) return NULL; } +const char *qemu_ram_get_idstr(RAMBlock *rb) +{ + return rb->idstr; +} + /* Called with iothread lock held. 
*/ void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev) { @@ -1447,7 +1452,7 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp) assert(block); - newsize = TARGET_PAGE_ALIGN(newsize); + newsize = HOST_PAGE_ALIGN(newsize); if (block->used_length == newsize) { return 0; @@ -1591,7 +1596,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, return -1; } - size = TARGET_PAGE_ALIGN(size); + size = HOST_PAGE_ALIGN(size); new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->used_length = size; @@ -1627,8 +1632,8 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, ram_addr_t addr; Error *local_err = NULL; - size = TARGET_PAGE_ALIGN(size); - max_size = TARGET_PAGE_ALIGN(max_size); + size = HOST_PAGE_ALIGN(size); + max_size = HOST_PAGE_ALIGN(max_size); new_block = g_malloc0(sizeof(*new_block)); new_block->mr = mr; new_block->resized = resized; @@ -1877,8 +1882,16 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size) } } -/* Some of the softmmu routines need to translate from a host pointer - * (typically a TLB entry) back to a ram offset. +/* + * Translates a host ptr back to a RAMBlock, a ram_addr and an offset + * in that RAMBlock. + * + * ptr: Host pointer to look up + * round_offset: If true round the result offset down to a page boundary + * *ram_addr: set to result ram_addr + * *offset: set to result offset within the RAMBlock + * + * Returns: RAMBlock (or NULL if not found) * * By the time this function returns, the returned pointer is not protected * by RCU anymore. If the caller is not within an RCU critical section and @@ -1886,18 +1899,22 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size) * pointer, such as a reference to the region that includes the incoming * ram_addr_t. */ -MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) +RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, + ram_addr_t *ram_addr, + ram_addr_t *offset) { RAMBlock *block; uint8_t *host = ptr; - MemoryRegion *mr; if (xen_enabled()) { rcu_read_lock(); *ram_addr = xen_ram_addr_from_mapcache(ptr); - mr = qemu_get_ram_block(*ram_addr)->mr; + block = qemu_get_ram_block(*ram_addr); + if (block) { + *offset = (host - block->host); + } rcu_read_unlock(); - return mr; + return block; } rcu_read_lock(); @@ -1920,10 +1937,49 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) return NULL; found: - *ram_addr = block->offset + (host - block->host); - mr = block->mr; + *offset = (host - block->host); + if (round_offset) { + *offset &= TARGET_PAGE_MASK; + } + *ram_addr = block->offset + *offset; rcu_read_unlock(); - return mr; + return block; +} + +/* + * Finds the named RAMBlock + * + * name: The name of RAMBlock to find + * + * Returns: RAMBlock (or NULL if not found) + */ +RAMBlock *qemu_ram_block_by_name(const char *name) +{ + RAMBlock *block; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + if (!strcmp(name, block->idstr)) { + return block; + } + } + + return NULL; +} + +/* Some of the softmmu routines need to translate from a host pointer + (typically a TLB entry) back to a ram offset. 
*/ +MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr) +{ + RAMBlock *block; + ram_addr_t offset; /* Not used */ + + block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset); + + if (!block) { + return NULL; + } + + return block->mr; } static void notdirty_mem_write(void *opaque, hwaddr ram_addr, @@ -3502,6 +3558,16 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, } return 0; } + +/* + * Allows code that needs to deal with migration bitmaps etc to still be built + * target independent. + */ +size_t qemu_target_page_bits(void) +{ + return TARGET_PAGE_BITS; +} + #endif /* diff --git a/hmp-commands.hx b/hmp-commands.hx index 3a4ae3950a..8939b9838a 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1005,6 +1005,21 @@ STEXI @item migrate_set_parameter @var{parameter} @var{value} @findex migrate_set_parameter Set the parameter @var{parameter} for migration. +ETEXI + + { + .name = "migrate_start_postcopy", + .args_type = "", + .params = "", + .help = "Switch migration to postcopy mode", + .mhandler.cmd = hmp_migrate_start_postcopy, + }, + +STEXI +@item migrate_start_postcopy +@findex migrate_start_postcopy +Switch in-progress migration to postcopy mode. Ignored after the end of +migration (or once already in postcopy). ETEXI { diff --git a/hmp.c b/hmp.c index a15d00c18c..e1f854aefe 100644 --- a/hmp.c +++ b/hmp.c @@ -1293,6 +1293,13 @@ void hmp_client_migrate_info(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } +void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict) +{ + Error *err = NULL; + qmp_migrate_start_postcopy(&err); + hmp_handle_error(mon, &err); +} + void hmp_set_password(Monitor *mon, const QDict *qdict) { const char *protocol = qdict_get_str(qdict, "protocol"); diff --git a/hmp.h b/hmp.h index 81656c3d82..a8c5b5a9a6 100644 --- a/hmp.h +++ b/hmp.h @@ -69,6 +69,7 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict); void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict); void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict); void hmp_client_migrate_info(Monitor *mon, const QDict *qdict); +void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict); void hmp_set_password(Monitor *mon, const QDict *qdict); void hmp_expire_password(Monitor *mon, const QDict *qdict); void hmp_eject(Monitor *mon, const QDict *qdict); diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 0ed8527969..37d071e4d4 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -1588,7 +1588,7 @@ static int htab_load(QEMUFile *f, void *opaque, int version_id) static SaveVMHandlers savevm_htab_handlers = { .save_live_setup = htab_save_setup, .save_live_iterate = htab_save_iterate, - .save_live_complete = htab_save_complete, + .save_live_complete_precopy = htab_save_complete, .load_state = htab_load, }; diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c index c419b17143..9671635e63 100644 --- a/hw/virtio/virtio-balloon.c +++ b/hw/virtio/virtio-balloon.c @@ -37,9 +37,11 @@ static void balloon_page(void *addr, int deflate) { #if defined(__linux__) - if (!kvm_enabled() || kvm_has_sync_mmu()) + if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || + kvm_has_sync_mmu())) { qemu_madvise(addr, TARGET_PAGE_SIZE, deflate ? 
QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); + } #endif } diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 9fb1d541d4..85aa4033e7 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -64,8 +64,12 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr); void qemu_ram_remap(ram_addr_t addr, ram_addr_t length); /* This should not be used by devices. */ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr); +RAMBlock *qemu_ram_block_by_name(const char *name); +RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, + ram_addr_t *ram_addr, ram_addr_t *offset); void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev); void qemu_ram_unset_idstr(ram_addr_t addr); +const char *qemu_ram_get_idstr(RAMBlock *rb); void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf, int len, int is_write); diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h index b07de109fb..d900b0d078 100644 --- a/include/exec/exec-all.h +++ b/include/exec/exec-all.h @@ -72,7 +72,6 @@ void restore_state_to_opc(CPUArchState *env, struct TranslationBlock *tb, void cpu_gen_init(void); bool cpu_restore_state(CPUState *cpu, uintptr_t searched_pc); -void page_size_init(void); void QEMU_NORETURN cpu_resume_from_signal(CPUState *cpu, void *puc); void QEMU_NORETURN cpu_io_recompile(CPUState *cpu, uintptr_t retaddr); diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h index 3360ac5fde..7115154bc1 100644 --- a/include/exec/ram_addr.h +++ b/include/exec/ram_addr.h @@ -22,8 +22,6 @@ #ifndef CONFIG_USER_ONLY #include "hw/xen/xen.h" -typedef struct RAMBlock RAMBlock; - struct RAMBlock { struct rcu_head rcu; struct MemoryRegion *mr; diff --git a/include/migration/migration.h b/include/migration/migration.h index 83346210b1..fd018b74a2 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -35,6 +35,7 @@ #define QEMU_VM_SUBSECTION 0x05 #define QEMU_VM_VMDESCRIPTION 0x06 #define QEMU_VM_CONFIGURATION 0x07 +#define QEMU_VM_COMMAND 0x08 #define QEMU_VM_SECTION_FOOTER 0x7e struct MigrationParams { @@ -42,13 +43,67 @@ struct MigrationParams { bool shared; }; -typedef struct MigrationState MigrationState; +/* Messages sent on the return path from destination to source */ +enum mig_rp_message_type { + MIG_RP_MSG_INVALID = 0, /* Must be 0 */ + MIG_RP_MSG_SHUT, /* sibling will not send any more RP messages */ + MIG_RP_MSG_PONG, /* Response to a PING; data (seq: be32 ) */ + + MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */ + MIG_RP_MSG_REQ_PAGES, /* data (start: be64, len: be32) */ + + MIG_RP_MSG_MAX +}; typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head; +/* The current postcopy state is read/set by postcopy_state_get/set + * which update it atomically. + * The state is updated as postcopy messages are received, and + * in general only one thread should be writing to the state at any one + * time, initially the main thread and then the listen thread; + * Corner cases are where either thread finishes early and/or errors. + * The state is checked as messages are received to ensure that + * the source is sending us messages in the correct order. + * The state is also used by the RAM reception code to know if it + * has to place pages atomically, and the cleanup code at the end of + * the main thread to know if it has to delay cleanup until the end + * of postcopy. 
+ */ +typedef enum { + POSTCOPY_INCOMING_NONE = 0, /* Initial state - no postcopy */ + POSTCOPY_INCOMING_ADVISE, + POSTCOPY_INCOMING_DISCARD, + POSTCOPY_INCOMING_LISTENING, + POSTCOPY_INCOMING_RUNNING, + POSTCOPY_INCOMING_END +} PostcopyState; + /* State for the incoming migration */ struct MigrationIncomingState { - QEMUFile *file; + QEMUFile *from_src_file; + + /* + * Free at the start of the main state load, set as the main thread finishes + * loading state. + */ + QemuEvent main_thread_load_event; + + bool have_fault_thread; + QemuThread fault_thread; + QemuSemaphore fault_thread_sem; + + bool have_listen_thread; + QemuThread listen_thread; + QemuSemaphore listen_thread_sem; + + /* For the kernel to send us notifications */ + int userfault_fd; + /* To tell the fault_thread to quit */ + int userfault_quit_fd; + QEMUFile *to_src_file; + QemuMutex rp_mutex; /* We send replies from multiple threads */ + void *postcopy_tmp_page; /* See savevm.c */ LoadStateEntry_Head loadvm_handlers; @@ -58,6 +113,18 @@ MigrationIncomingState *migration_incoming_get_current(void); MigrationIncomingState *migration_incoming_state_new(QEMUFile *f); void migration_incoming_state_destroy(void); +/* + * An outstanding page request, on the source, having been received + * and queued + */ +struct MigrationSrcPageRequest { + RAMBlock *rb; + hwaddr offset; + hwaddr len; + + QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req; +}; + struct MigrationState { int64_t bandwidth_limit; @@ -70,6 +137,14 @@ struct MigrationState int state; MigrationParams params; + + /* State related to return path */ + struct { + QEMUFile *from_dst_file; + QemuThread rp_thread; + bool error; + } rp_state; + double mbps; int64_t total_time; int64_t downtime; @@ -80,6 +155,18 @@ struct MigrationState int64_t xbzrle_cache_size; int64_t setup_time; int64_t dirty_sync_count; + + /* Flag set once the migration has been asked to enter postcopy */ + bool start_postcopy; + + /* Flag set once the migration thread is running (and needs joining) */ + bool migration_thread_running; + + /* Queue of outstanding page requests from the destination */ + QemuMutex src_page_req_mutex; + QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests; + /* The RAMBlock used in the last src_page_request */ + RAMBlock *last_req_rb; }; void process_incoming_migration(QEMUFile *f); @@ -116,9 +203,12 @@ int migrate_fd_close(MigrationState *s); void add_migration_state_change_notifier(Notifier *notify); void remove_migration_state_change_notifier(Notifier *notify); +MigrationState *migrate_init(const MigrationParams *params); bool migration_in_setup(MigrationState *); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); +/* True if outgoing migration has entered postcopy phase */ +bool migration_in_postcopy(MigrationState *); MigrationState *migrate_get_current(void); void migrate_compress_threads_create(void); @@ -145,6 +235,13 @@ uint64_t xbzrle_mig_pages_cache_miss(void); double xbzrle_mig_cache_miss_rate(void); void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); +void ram_debug_dump_bitmap(unsigned long *todump, bool expected); +/* For outgoing discard bitmap */ +int ram_postcopy_send_discard_bitmap(MigrationState *ms); +/* For incoming postcopy discard */ +int ram_discard_range(MigrationIncomingState *mis, const char *block_name, + uint64_t start, size_t length); +int ram_postcopy_incoming_init(MigrationIncomingState *mis); /** * @migrate_add_blocker - prevent migration from proceeding @@ -160,6 
+257,7 @@ void migrate_add_blocker(Error *reason); */ void migrate_del_blocker(Error *reason); +bool migrate_postcopy_ram(void); bool migrate_zero_blocks(void); bool migrate_auto_converge(void); @@ -179,6 +277,17 @@ int migrate_compress_threads(void); int migrate_decompress_threads(void); bool migrate_use_events(void); +/* Sending on the return path - generic and then for each message type */ +void migrate_send_rp_message(MigrationIncomingState *mis, + enum mig_rp_message_type message_type, + uint16_t len, void *data); +void migrate_send_rp_shut(MigrationIncomingState *mis, + uint32_t value); +void migrate_send_rp_pong(MigrationIncomingState *mis, + uint32_t value); +void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname, + ram_addr_t start, size_t len); + void ram_control_before_iterate(QEMUFile *f, uint64_t flags); void ram_control_after_iterate(QEMUFile *f, uint64_t flags); void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data); @@ -204,4 +313,12 @@ void global_state_set_optional(void); void savevm_skip_configuration(void); int global_state_store(void); void global_state_store_running(void); + +void flush_page_queue(MigrationState *ms); +int ram_save_queue_pages(MigrationState *ms, const char *rbname, + ram_addr_t start, ram_addr_t len); + +PostcopyState postcopy_state_get(void); +/* Set the state and return the old state */ +PostcopyState postcopy_state_set(PostcopyState new_state); #endif diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h new file mode 100644 index 0000000000..b6a7491f2d --- /dev/null +++ b/include/migration/postcopy-ram.h @@ -0,0 +1,99 @@ +/* + * Postcopy migration for RAM + * + * Copyright 2013 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#ifndef QEMU_POSTCOPY_RAM_H +#define QEMU_POSTCOPY_RAM_H + +/* Return true if the host supports everything we need to do postcopy-ram */ +bool postcopy_ram_supported_by_host(void); + +/* + * Make all of RAM sensitive to accesses to areas that haven't yet been written + * and wire up anything necessary to deal with it. + */ +int postcopy_ram_enable_notify(MigrationIncomingState *mis); + +/* + * Initialise postcopy-ram, setting the RAM to a state where we can go into + * postcopy later; must be called prior to any precopy. + * called from ram.c's similarly named ram_postcopy_incoming_init + */ +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages); + +/* + * At the end of a migration where postcopy_ram_incoming_init was called. + */ +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis); + +/* + * Discard the contents of 'length' bytes from 'start' + * We can assume that if we've been called postcopy_ram_hosttest returned true + */ +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + size_t length); + +/* + * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard + * however leaving it until after precopy means that most of the precopy + * data is still THPd + */ +int postcopy_ram_prepare_discard(MigrationIncomingState *mis); + +/* + * Called at the start of each RAMBlock by the bitmap code. + * 'offset' is the bitmap offset of the named RAMBlock in the migration + * bitmap. 
+ * Returns a new PDS + */ +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, + unsigned long offset, + const char *name); + +/* + * Called by the bitmap code for each chunk to discard. + * May send a discard message, may just leave it queued to + * be sent later. + * @start,@length: a range of pages in the migration bitmap in the + * RAM block passed to postcopy_discard_send_init() (length=1 is one page) + */ +void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds, + unsigned long start, unsigned long length); + +/* + * Called at the end of each RAMBlock by the bitmap code. + * Sends any outstanding discard messages, frees the PDS. + */ +void postcopy_discard_send_finish(MigrationState *ms, + PostcopyDiscardState *pds); + +/* + * Place a page (from) at (host) efficiently + * There are restrictions on how 'from' must be mapped, in general best + * to use other postcopy_ routines to allocate. + * returns 0 on success + */ +int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from); + +/* + * Place a zero page at (host) atomically + * returns 0 on success + */ +int postcopy_place_page_zero(MigrationIncomingState *mis, void *host); + +/* + * Allocate a page of memory that can be mapped at a later point in time + * using postcopy_place_page + * Returns: Pointer to allocated page + */ +void *postcopy_get_tmp_page(MigrationIncomingState *mis); + +#endif diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 29a338d0a9..b5d08d217d 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -88,6 +88,11 @@ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, size_t size, uint64_t *bytes_sent); +/* + * Return a QEMUFile for comms in the opposite direction + */ +typedef QEMUFile *(QEMURetPathFunc)(void *opaque); + /* * Stop any read or write (depending on flags) on the underlying * transport on the QEMUFile. 
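The discard-send calls declared above are driven per-RAMBlock by the source's bitmap code during the switch to postcopy. A minimal sketch of that calling pattern follows (not part of this patch; the per-block 'discard_bitmap' is assumed to be the combination of the dirty and unsent maps described in docs/migration.txt):

    #include "qemu/bitops.h"    /* find_next_bit() / find_next_zero_bit() */

    static void send_discards_for_block(MigrationState *ms, RAMBlock *rb,
                                        unsigned long *discard_bitmap,
                                        unsigned long bitmap_offset,
                                        unsigned long nr_pages)
    {
        PostcopyDiscardState *pds =
            postcopy_discard_send_init(ms, bitmap_offset, qemu_ram_get_idstr(rb));
        unsigned long start = find_next_bit(discard_bitmap, nr_pages, 0);

        while (start < nr_pages) {
            /* One contiguous run of pages the destination must throw away */
            unsigned long end = find_next_zero_bit(discard_bitmap, nr_pages, start);

            /* May be sent immediately or queued and coalesced by the PDS */
            postcopy_discard_send_range(ms, pds, start, end - start);
            start = find_next_bit(discard_bitmap, nr_pages, end);
        }

        postcopy_discard_send_finish(ms, pds);  /* flush anything still queued */
    }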
@@ -106,6 +111,7 @@ typedef struct QEMUFileOps { QEMURamHookFunc *after_ram_iterate; QEMURamHookFunc *hook_ram_load; QEMURamSaveFunc *save_page; + QEMURetPathFunc *get_return_path; QEMUFileShutdownFunc *shut_down; } QEMUFileOps; @@ -163,9 +169,11 @@ void qemu_put_be32(QEMUFile *f, unsigned int v); void qemu_put_be64(QEMUFile *f, uint64_t v); size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset); size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size); +size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size); ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, int level); int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src); + /* * Note that you can only peek continuous bytes from where the current pointer * is; you aren't guaranteed to be able to peak to +n bytes unless you've @@ -194,7 +202,9 @@ int64_t qemu_file_get_rate_limit(QEMUFile *f); int qemu_file_get_error(QEMUFile *f); void qemu_file_set_error(QEMUFile *f, int ret); int qemu_file_shutdown(QEMUFile *f); +QEMUFile *qemu_file_get_return_path(QEMUFile *f); void qemu_fflush(QEMUFile *f); +void qemu_file_set_blocking(QEMUFile *f, bool block); static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv) { diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h index d173b565f5..7267e38c1f 100644 --- a/include/migration/vmstate.h +++ b/include/migration/vmstate.h @@ -40,7 +40,8 @@ typedef struct SaveVMHandlers { SaveStateHandler *save_state; void (*cleanup)(void *opaque); - int (*save_live_complete)(QEMUFile *f, void *opaque); + int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque); + int (*save_live_complete_precopy)(QEMUFile *f, void *opaque); /* This runs both outside and inside the iothread lock. */ bool (*is_active)(void *opaque); @@ -54,8 +55,9 @@ typedef struct SaveVMHandlers { /* This runs outside the iothread lock! 
*/ int (*save_live_setup)(QEMUFile *f, void *opaque); - uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size); - + void (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size, + uint64_t *non_postcopiable_pending, + uint64_t *postcopiable_pending); LoadStateHandler *load_state; } SaveVMHandlers; diff --git a/include/qemu-common.h b/include/qemu-common.h index 2f74540a87..405364f2b9 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -499,5 +499,6 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len); int parse_debug_env(const char *name, int max, int initial); const char *qemu_ether_ntoa(const MACAddr *mac); +void page_size_init(void); #endif diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index ab2d5d9d31..861d84b4e4 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -139,6 +139,8 @@ void qemu_anon_ram_free(void *ptr, size_t size); #if defined(CONFIG_MADVISE) +#include + #define QEMU_MADV_WILLNEED MADV_WILLNEED #define QEMU_MADV_DONTNEED MADV_DONTNEED #ifdef MADV_DONTFORK @@ -171,6 +173,11 @@ void qemu_anon_ram_free(void *ptr, size_t size); #else #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #endif +#ifdef MADV_NOHUGEPAGE +#define QEMU_MADV_NOHUGEPAGE MADV_NOHUGEPAGE +#else +#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID +#endif #elif defined(CONFIG_POSIX_MADVISE) @@ -182,6 +189,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); #define QEMU_MADV_DODUMP QEMU_MADV_INVALID #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID +#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #else /* no-op */ @@ -193,6 +201,7 @@ void qemu_anon_ram_free(void *ptr, size_t size); #define QEMU_MADV_DODUMP QEMU_MADV_INVALID #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID +#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #endif diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 2cdce1866e..6b1093dcfc 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -44,6 +44,7 @@ typedef struct MemoryRegion MemoryRegion; typedef struct MemoryRegionSection MemoryRegionSection; typedef struct MigrationIncomingState MigrationIncomingState; typedef struct MigrationParams MigrationParams; +typedef struct MigrationState MigrationState; typedef struct Monitor Monitor; typedef struct MouseTransformInfo MouseTransformInfo; typedef struct MSIMessage MSIMessage; @@ -66,6 +67,7 @@ typedef struct PCMachineState PCMachineState; typedef struct PCMachineClass PCMachineClass; typedef struct PCMCIACardState PCMCIACardState; typedef struct PixelFormat PixelFormat; +typedef struct PostcopyDiscardState PostcopyDiscardState; typedef struct PropertyInfo PropertyInfo; typedef struct Property Property; typedef struct QEMUBH QEMUBH; @@ -79,6 +81,7 @@ typedef struct QEMUSizedBuffer QEMUSizedBuffer; typedef struct QEMUTimerListGroup QEMUTimerListGroup; typedef struct QEMUTimer QEMUTimer; typedef struct Range Range; +typedef struct RAMBlock RAMBlock; typedef struct SerialState SerialState; typedef struct SHPCDevice SHPCDevice; typedef struct SMBusDevice SMBusDevice; diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h index 17fe30070d..3f976b49e7 100644 --- a/include/sysemu/balloon.h +++ b/include/sysemu/balloon.h @@ -22,5 +22,7 @@ typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info); int qemu_add_balloon_handler(QEMUBalloonEvent *event_func, QEMUBalloonStatus *stat_func, void *opaque); void qemu_remove_balloon_handler(void *opaque); +bool 
qemu_balloon_is_inhibited(void); +void qemu_balloon_inhibit(bool state); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 5cb0f05068..f992494e10 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -70,6 +70,7 @@ void qemu_system_killed(int signal, pid_t pid); void qemu_devices_reset(void); void qemu_system_reset(bool report); void qemu_system_guest_panicked(void); +size_t qemu_target_page_bits(void); void qemu_add_exit_notifier(Notifier *notify); void qemu_remove_exit_notifier(Notifier *notify); @@ -83,14 +84,52 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict); void qemu_announce_self(void); +/* Subcommands for QEMU_VM_COMMAND */ +enum qemu_vm_cmd { + MIG_CMD_INVALID = 0, /* Must be 0 */ + MIG_CMD_OPEN_RETURN_PATH, /* Tell the dest to open the Return path */ + MIG_CMD_PING, /* Request a PONG on the RP */ + + MIG_CMD_POSTCOPY_ADVISE, /* Prior to any page transfers, just + warn we might want to do PC */ + MIG_CMD_POSTCOPY_LISTEN, /* Start listening for incoming + pages as it's running. */ + MIG_CMD_POSTCOPY_RUN, /* Start execution */ + + MIG_CMD_POSTCOPY_RAM_DISCARD, /* A list of pages to discard that + were previously sent during + precopy but are dirty. */ + MIG_CMD_PACKAGED, /* Send a wrapped stream within this stream */ + MIG_CMD_MAX +}; + +#define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24) + bool qemu_savevm_state_blocked(Error **errp); void qemu_savevm_state_begin(QEMUFile *f, const MigrationParams *params); void qemu_savevm_state_header(QEMUFile *f); -int qemu_savevm_state_iterate(QEMUFile *f); -void qemu_savevm_state_complete(QEMUFile *f); +int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy); void qemu_savevm_state_cleanup(void); -uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size); +void qemu_savevm_state_complete_postcopy(QEMUFile *f); +void qemu_savevm_state_complete_precopy(QEMUFile *f); +void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size, + uint64_t *res_non_postcopiable, + uint64_t *res_postcopiable); +void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command, + uint16_t len, uint8_t *data); +void qemu_savevm_send_ping(QEMUFile *f, uint32_t value); +void qemu_savevm_send_open_return_path(QEMUFile *f); +int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb); +void qemu_savevm_send_postcopy_advise(QEMUFile *f); +void qemu_savevm_send_postcopy_listen(QEMUFile *f); +void qemu_savevm_send_postcopy_run(QEMUFile *f); + +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, + uint16_t len, + uint64_t *start_list, + uint64_t *length_list); + int qemu_loadvm_state(QEMUFile *f); typedef enum DisplayType @@ -133,6 +172,7 @@ extern int boot_menu; extern bool boot_strict; extern uint8_t *boot_splash_filedata; extern size_t boot_splash_filedata_size; +extern bool enable_mlock; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; extern const char *mem_path; diff --git a/kvm-all.c b/kvm-all.c index 1bc1273772..de3c8c48bb 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1461,7 +1461,6 @@ static int kvm_init(MachineState *ms) * page size for the system though. 
*/ assert(TARGET_PAGE_SIZE <= getpagesize()); - page_size_init(); s->sigmask_len = 8; diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h new file mode 100644 index 0000000000..9057d7af3a --- /dev/null +++ b/linux-headers/linux/userfaultfd.h @@ -0,0 +1,167 @@ +/* + * include/linux/userfaultfd.h + * + * Copyright (C) 2007 Davide Libenzi + * Copyright (C) 2015 Red Hat, Inc. + * + */ + +#ifndef _LINUX_USERFAULTFD_H +#define _LINUX_USERFAULTFD_H + +#include + +#define UFFD_API ((__u64)0xAA) +/* + * After implementing the respective features it will become: + * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \ + * UFFD_FEATURE_EVENT_FORK) + */ +#define UFFD_API_FEATURES (0) +#define UFFD_API_IOCTLS \ + ((__u64)1 << _UFFDIO_REGISTER | \ + (__u64)1 << _UFFDIO_UNREGISTER | \ + (__u64)1 << _UFFDIO_API) +#define UFFD_API_RANGE_IOCTLS \ + ((__u64)1 << _UFFDIO_WAKE | \ + (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_ZEROPAGE) + +/* + * Valid ioctl command number range with this API is from 0x00 to + * 0x3F. UFFDIO_API is the fixed number, everything else can be + * changed by implementing a different UFFD_API. If sticking to the + * same UFFD_API more ioctl can be added and userland will be aware of + * which ioctl the running kernel implements through the ioctl command + * bitmask written by the UFFDIO_API. + */ +#define _UFFDIO_REGISTER (0x00) +#define _UFFDIO_UNREGISTER (0x01) +#define _UFFDIO_WAKE (0x02) +#define _UFFDIO_COPY (0x03) +#define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_API (0x3F) + +/* userfaultfd ioctl ids */ +#define UFFDIO 0xAA +#define UFFDIO_API _IOWR(UFFDIO, _UFFDIO_API, \ + struct uffdio_api) +#define UFFDIO_REGISTER _IOWR(UFFDIO, _UFFDIO_REGISTER, \ + struct uffdio_register) +#define UFFDIO_UNREGISTER _IOR(UFFDIO, _UFFDIO_UNREGISTER, \ + struct uffdio_range) +#define UFFDIO_WAKE _IOR(UFFDIO, _UFFDIO_WAKE, \ + struct uffdio_range) +#define UFFDIO_COPY _IOWR(UFFDIO, _UFFDIO_COPY, \ + struct uffdio_copy) +#define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ + struct uffdio_zeropage) + +/* read() structure */ +struct uffd_msg { + __u8 event; + + __u8 reserved1; + __u16 reserved2; + __u32 reserved3; + + union { + struct { + __u64 flags; + __u64 address; + } pagefault; + + struct { + /* unused reserved fields */ + __u64 reserved1; + __u64 reserved2; + __u64 reserved3; + } reserved; + } arg; +} __packed; + +/* + * Start at 0x12 and not at 0 to be more strict against bugs. + */ +#define UFFD_EVENT_PAGEFAULT 0x12 +#if 0 /* not available yet */ +#define UFFD_EVENT_FORK 0x13 +#endif + +/* flags for UFFD_EVENT_PAGEFAULT */ +#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ +#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ + +struct uffdio_api { + /* userland asks for an API number and the features to enable */ + __u64 api; + /* + * Kernel answers below with the all available features for + * the API, this notifies userland of which events and/or + * which flags for each event are enabled in the current + * kernel. + * + * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE + * are to be considered implicitly always enabled in all kernels as + * long as the uffdio_api.api requested matches UFFD_API. 
+ */ +#if 0 /* not available yet */ +#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) +#define UFFD_FEATURE_EVENT_FORK (1<<1) +#endif + __u64 features; + + __u64 ioctls; +}; + +struct uffdio_range { + __u64 start; + __u64 len; +}; + +struct uffdio_register { + struct uffdio_range range; +#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) +#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) + __u64 mode; + + /* + * kernel answers which ioctl commands are available for the + * range, keep at the end as the last 8 bytes aren't read. + */ + __u64 ioctls; +}; + +struct uffdio_copy { + __u64 dst; + __u64 src; + __u64 len; + /* + * There will be a wrprotection flag later that allows to map + * pages wrprotected on the fly. And such a flag will be + * available if the wrprotection ioctl are implemented for the + * range according to the uffdio_register.ioctls. + */ +#define UFFDIO_COPY_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "copy" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 copy; +}; + +struct uffdio_zeropage { + struct uffdio_range range; +#define UFFDIO_ZEROPAGE_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * "zeropage" is written by the ioctl and must be at the end: + * the copy_from_user will not read the last 8 bytes. + */ + __s64 zeropage; +}; + +#endif /* _LINUX_USERFAULTFD_H */ diff --git a/migration/Makefile.objs b/migration/Makefile.objs index d929e969ae..0cac6d707a 100644 --- a/migration/Makefile.objs +++ b/migration/Makefile.objs @@ -1,7 +1,7 @@ common-obj-y += migration.o tcp.o common-obj-y += vmstate.o common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o -common-obj-y += xbzrle.o +common-obj-y += xbzrle.o postcopy-ram.o common-obj-$(CONFIG_RDMA) += rdma.o common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o diff --git a/migration/block.c b/migration/block.c index cf9d9f8999..310e2b36dc 100644 --- a/migration/block.c +++ b/migration/block.c @@ -748,7 +748,9 @@ static int block_save_complete(QEMUFile *f, void *opaque) return 0; } -static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) +static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, + uint64_t *non_postcopiable_pending, + uint64_t *postcopiable_pending) { /* Estimate pending number of bytes to send */ uint64_t pending; @@ -767,7 +769,8 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) qemu_mutex_unlock_iothread(); DPRINTF("Enter save live pending %" PRIu64 "\n", pending); - return pending; + /* We don't do postcopy */ + *non_postcopiable_pending += pending; } static int block_load(QEMUFile *f, void *opaque, int version_id) @@ -876,7 +879,7 @@ static SaveVMHandlers savevm_block_handlers = { .set_params = block_set_params, .save_live_setup = block_save_setup, .save_live_iterate = block_save_iterate, - .save_live_complete = block_save_complete, + .save_live_complete_precopy = block_save_complete, .save_live_pending = block_save_pending, .load_state = block_load, .cleanup = block_migration_cleanup, diff --git a/migration/migration.c b/migration/migration.c index f99d3eabf7..c5c977e737 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -21,15 +21,18 @@ #include "sysemu/sysemu.h" #include "block/block.h" #include "qapi/qmp/qerror.h" +#include "qapi/util.h" #include "qemu/sockets.h" #include "qemu/rcu.h" #include "migration/block.h" +#include "migration/postcopy-ram.h" #include "qemu/thread.h" #include "qmp-commands.h" #include "trace.h" 
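To make the userfaultfd ABI above concrete, this is roughly how a user-space process (such as the destination's fault thread) drives it. It is an illustrative sketch, not code from this patch; error handling is omitted and 'host_addr', 'ram_len', 'src_page' and 'host_page_size' are supplied by the caller:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/userfaultfd.h>

    static void handle_one_userfault(void *host_addr, uint64_t ram_len,
                                     void *src_page, uint64_t host_page_size)
    {
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);

        /* Handshake: request the base API, no optional features */
        struct uffdio_api api = { .api = UFFD_API, .features = 0 };
        ioctl(uffd, UFFDIO_API, &api);

        /* 'Sensitise' the RAM: accesses to not-yet-populated pages now
         * suspend the faulting thread and produce a message on uffd */
        struct uffdio_register reg = {
            .range = { .start = (uintptr_t)host_addr, .len = ram_len },
            .mode  = UFFDIO_REGISTER_MODE_MISSING,
        };
        ioctl(uffd, UFFDIO_REGISTER, &reg);

        struct uffd_msg msg;
        read(uffd, &msg, sizeof(msg));          /* blocks until a fault arrives */
        if (msg.event == UFFD_EVENT_PAGEFAULT) {
            struct uffdio_copy copy = {
                .dst  = msg.arg.pagefault.address & ~(host_page_size - 1),
                .src  = (uintptr_t)src_page,
                .len  = host_page_size,
                .mode = 0,
            };
            ioctl(uffd, UFFDIO_COPY, &copy);    /* place the page, wake the vCPU */
        }
    }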
-#include "qapi/util.h" #include "qapi-event.h" #include "qom/cpu.h" +#include "exec/memory.h" +#include "exec/address-spaces.h" #define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */ @@ -57,6 +60,13 @@ static NotifierList migration_state_notifiers = static bool deferred_incoming; +/* + * Current state of incoming postcopy; note this is not part of + * MigrationIncomingState since it's state is used during cleanup + * at the end as MIS is being freed. + */ +static PostcopyState incoming_postcopy_state; + /* When we add fault tolerance, we could have several migrations at once. For now we don't need to add dynamic creation of migration */ @@ -64,6 +74,7 @@ static bool deferred_incoming; /* For outgoing */ MigrationState *migrate_get_current(void) { + static bool once; static MigrationState current_migration = { .state = MIGRATION_STATUS_NONE, .bandwidth_limit = MAX_THROTTLE, @@ -81,6 +92,10 @@ MigrationState *migrate_get_current(void) DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT, }; + if (!once) { + qemu_mutex_init(¤t_migration.src_page_req_mutex); + once = true; + } return ¤t_migration; } @@ -95,14 +110,17 @@ MigrationIncomingState *migration_incoming_get_current(void) MigrationIncomingState *migration_incoming_state_new(QEMUFile* f) { mis_current = g_new0(MigrationIncomingState, 1); - mis_current->file = f; + mis_current->from_src_file = f; QLIST_INIT(&mis_current->loadvm_handlers); + qemu_mutex_init(&mis_current->rp_mutex); + qemu_event_init(&mis_current->main_thread_load_event, false); return mis_current; } void migration_incoming_state_destroy(void) { + qemu_event_destroy(&mis_current->main_thread_load_event); loadvm_free_handlers(mis_current); g_free(mis_current); mis_current = NULL; @@ -248,6 +266,35 @@ static void deferred_incoming_migration(Error **errp) deferred_incoming = true; } +/* Request a range of pages from the source VM at the given + * start address. 
+ * rbname: Name of the RAMBlock to request the page in, if NULL it's the same + * as the last request (a name must have been given previously) + * Start: Address offset within the RB + * Len: Length in bytes required - must be a multiple of pagesize + */ +void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname, + ram_addr_t start, size_t len) +{ + uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname upto 256 */ + size_t msglen = 12; /* start + len */ + + *(uint64_t *)bufc = cpu_to_be64((uint64_t)start); + *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len); + + if (rbname) { + int rbname_len = strlen(rbname); + assert(rbname_len < 256); + + bufc[msglen++] = rbname_len; + memcpy(bufc + msglen, rbname, rbname_len); + msglen += rbname_len; + migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc); + } else { + migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc); + } +} + void qemu_start_incoming_migration(const char *uri, Error **errp) { const char *p; @@ -278,12 +325,37 @@ static void process_incoming_migration_co(void *opaque) { QEMUFile *f = opaque; Error *local_err = NULL; + MigrationIncomingState *mis; + PostcopyState ps; int ret; - migration_incoming_state_new(f); + mis = migration_incoming_state_new(f); + postcopy_state_set(POSTCOPY_INCOMING_NONE); migrate_generate_event(MIGRATION_STATUS_ACTIVE); + ret = qemu_loadvm_state(f); + ps = postcopy_state_get(); + trace_process_incoming_migration_co_end(ret, ps); + if (ps != POSTCOPY_INCOMING_NONE) { + if (ps == POSTCOPY_INCOMING_ADVISE) { + /* + * Where a migration had postcopy enabled (and thus went to advise) + * but managed to complete within the precopy period, we can use + * the normal exit. + */ + postcopy_ram_incoming_cleanup(mis); + } else if (ret >= 0) { + /* + * Postcopy was started, cleanup should happen at the end of the + * postcopy thread. + */ + trace_process_incoming_migration_co_postcopy_end_main(); + return; + } + /* Else if something went wrong then just fall out of the normal exit */ + } + qemu_fclose(f); free_xbzrle_decoded_buf(); migration_incoming_state_destroy(); @@ -344,6 +416,50 @@ void process_incoming_migration(QEMUFile *f) qemu_coroutine_enter(co, f); } +/* + * Send a message on the return channel back to the source + * of the migration. + */ +void migrate_send_rp_message(MigrationIncomingState *mis, + enum mig_rp_message_type message_type, + uint16_t len, void *data) +{ + trace_migrate_send_rp_message((int)message_type, len); + qemu_mutex_lock(&mis->rp_mutex); + qemu_put_be16(mis->to_src_file, (unsigned int)message_type); + qemu_put_be16(mis->to_src_file, len); + qemu_put_buffer(mis->to_src_file, data, len); + qemu_fflush(mis->to_src_file); + qemu_mutex_unlock(&mis->rp_mutex); +} + +/* + * Send a 'SHUT' message on the return channel with the given value + * to indicate that we've finished with the RP. Non-0 value indicates + * error. + */ +void migrate_send_rp_shut(MigrationIncomingState *mis, + uint32_t value) +{ + uint32_t buf; + + buf = cpu_to_be32(value); + migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf); +} + +/* + * Send a 'PONG' message on the return channel with the given value + * (normally in response to a 'PING') + */ +void migrate_send_rp_pong(MigrationIncomingState *mis, + uint32_t value) +{ + uint32_t buf; + + buf = cpu_to_be32(value); + migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf); +} + /* amount of nanoseconds we are willing to wait for migration to be down. 
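On the destination these helpers combine naturally with the RAMBlock lookups added in exec.c: when the fault thread sees a missing page, it maps the faulting host address back to a block name and offset and asks the source for it over the return path. A rough sketch, assuming a single host-page-sized request:

    static void request_faulted_page(MigrationIncomingState *mis, void *fault_addr)
    {
        ram_addr_t ram_addr, offset;
        long host_ps = getpagesize();
        RAMBlock *rb = qemu_ram_block_from_host(fault_addr, true,
                                                &ram_addr, &offset);

        if (!rb) {
            return;    /* not guest RAM - nothing to request */
        }

        /* The source insists on whole, aligned host pages */
        offset &= ~(ram_addr_t)(host_ps - 1);
        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), offset, host_ps);
    }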
* the choice of nanoseconds is because it is the maximum resolution that * get_clock() can achieve. It is an internal measure. All user-visible @@ -399,6 +515,24 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp) return params; } +/* + * Return true if we're already in the middle of a migration + * (i.e. any of the active or setup states) + */ +static bool migration_is_setup_or_active(int state) +{ + switch (state) { + case MIGRATION_STATUS_ACTIVE: + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + case MIGRATION_STATUS_SETUP: + return true; + + default: + return false; + + } +} + static void get_xbzrle_cache_stats(MigrationInfo *info) { if (migrate_use_xbzrle()) { @@ -463,6 +597,39 @@ MigrationInfo *qmp_query_migrate(Error **errp) info->x_cpu_throttle_percentage = cpu_throttle_get_percentage(); } + get_xbzrle_cache_stats(info); + break; + case MIGRATION_STATUS_POSTCOPY_ACTIVE: + /* Mostly the same as active; TODO add some postcopy stats */ + info->has_status = true; + info->has_total_time = true; + info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + - s->total_time; + info->has_expected_downtime = true; + info->expected_downtime = s->expected_downtime; + info->has_setup_time = true; + info->setup_time = s->setup_time; + + info->has_ram = true; + info->ram = g_malloc0(sizeof(*info->ram)); + info->ram->transferred = ram_bytes_transferred(); + info->ram->remaining = ram_bytes_remaining(); + info->ram->total = ram_bytes_total(); + info->ram->duplicate = dup_mig_pages_transferred(); + info->ram->skipped = skipped_mig_pages_transferred(); + info->ram->normal = norm_mig_pages_transferred(); + info->ram->normal_bytes = norm_mig_bytes_transferred(); + info->ram->dirty_pages_rate = s->dirty_pages_rate; + info->ram->mbps = s->mbps; + + if (blk_mig_active()) { + info->has_disk = true; + info->disk = g_malloc0(sizeof(*info->disk)); + info->disk->transferred = blk_mig_bytes_transferred(); + info->disk->remaining = blk_mig_bytes_remaining(); + info->disk->total = blk_mig_bytes_total(); + } + get_xbzrle_cache_stats(info); break; case MIGRATION_STATUS_COMPLETED: @@ -506,8 +673,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, MigrationState *s = migrate_get_current(); MigrationCapabilityStatusList *cap; - if (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_SETUP) { + if (migration_is_setup_or_active(s->state)) { error_setg(errp, QERR_MIGRATION_ACTIVE); return; } @@ -515,6 +681,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, for (cap = params; cap; cap = cap->next) { s->enabled_capabilities[cap->value->capability] = cap->value->state; } + + if (migrate_postcopy_ram()) { + if (migrate_use_compression()) { + /* The decompression threads asynchronously write into RAM + * rather than use the atomic copies needed to avoid + * userfaulting. It should be possible to fix the decompression + * threads for compatibility in future. 
+ */ + error_report("Postcopy is not currently compatible with " + "compression"); + s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] = + false; + } + } } void qmp_migrate_set_parameters(bool has_compress_level, @@ -583,6 +763,28 @@ void qmp_migrate_set_parameters(bool has_compress_level, } } +void qmp_migrate_start_postcopy(Error **errp) +{ + MigrationState *s = migrate_get_current(); + + if (!migrate_postcopy_ram()) { + error_setg(errp, "Enable postcopy with migration_set_capability before" + " the start of migration"); + return; + } + + if (s->state == MIGRATION_STATUS_NONE) { + error_setg(errp, "Postcopy must be started after migration has been" + " started"); + return; + } + /* + * we don't error if migration has finished since that would be racy + * with issuing this command. + */ + atomic_set(&s->start_postcopy, true); +} + /* shared migration helpers */ static void migrate_set_state(MigrationState *s, int old_state, int new_state) @@ -600,10 +802,15 @@ static void migrate_fd_cleanup(void *opaque) qemu_bh_delete(s->cleanup_bh); s->cleanup_bh = NULL; + flush_page_queue(s); + if (s->file) { trace_migrate_fd_cleanup(); qemu_mutex_unlock_iothread(); - qemu_thread_join(&s->thread); + if (s->migration_thread_running) { + qemu_thread_join(&s->thread); + s->migration_thread_running = false; + } qemu_mutex_lock_iothread(); migrate_compress_threads_join(); @@ -611,7 +818,8 @@ static void migrate_fd_cleanup(void *opaque) s->file = NULL; } - assert(s->state != MIGRATION_STATUS_ACTIVE); + assert((s->state != MIGRATION_STATUS_ACTIVE) && + (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE)); if (s->state == MIGRATION_STATUS_CANCELLING) { migrate_set_state(s, MIGRATION_STATUS_CANCELLING, @@ -635,10 +843,14 @@ static void migrate_fd_cancel(MigrationState *s) QEMUFile *f = migrate_get_current()->file; trace_migrate_fd_cancel(); + if (s->rp_state.from_dst_file) { + /* shutdown the rp socket, so causing the rp thread to shutdown */ + qemu_file_shutdown(s->rp_state.from_dst_file); + } + do { old_state = s->state; - if (old_state != MIGRATION_STATUS_SETUP && - old_state != MIGRATION_STATUS_ACTIVE) { + if (!migration_is_setup_or_active(old_state)) { break; } migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING); @@ -682,7 +894,12 @@ bool migration_has_failed(MigrationState *s) s->state == MIGRATION_STATUS_FAILED); } -static MigrationState *migrate_init(const MigrationParams *params) +bool migration_in_postcopy(MigrationState *s) +{ + return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); +} + +MigrationState *migrate_init(const MigrationParams *params) { MigrationState *s = migrate_get_current(); int64_t bandwidth_limit = s->bandwidth_limit; @@ -719,6 +936,8 @@ static MigrationState *migrate_init(const MigrationParams *params) s->bandwidth_limit = bandwidth_limit; migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); + QSIMPLEQ_INIT(&s->src_page_requests); + s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); return s; } @@ -770,8 +989,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk, params.blk = has_blk && blk; params.shared = has_inc && inc; - if (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_SETUP || + if (migration_is_setup_or_active(s->state) || s->state == MIGRATION_STATUS_CANCELLING) { error_setg(errp, QERR_MIGRATION_ACTIVE); return; @@ -890,6 +1108,15 @@ void qmp_migrate_set_downtime(double value, Error **errp) max_downtime = (uint64_t)value; } +bool migrate_postcopy_ram(void) +{ + MigrationState *s; + + s = 
migrate_get_current(); + + return s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM]; +} + bool migrate_auto_converge(void) { MigrationState *s; @@ -971,36 +1198,376 @@ int64_t migrate_xbzrle_cache_size(void) return s->xbzrle_cache_size; } +/* migration thread support */ +/* + * Something bad happened to the RP stream, mark an error + * The caller shall print or trace something to indicate why + */ +static void mark_source_rp_bad(MigrationState *s) +{ + s->rp_state.error = true; +} + +static struct rp_cmd_args { + ssize_t len; /* -1 = variable */ + const char *name; +} rp_cmd_args[] = { + [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" }, + [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" }, + [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" }, + [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" }, + [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" }, + [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" }, +}; + +/* + * Process a request for pages received on the return path, + * We're allowed to send more than requested (e.g. to round to our page size) + * and we don't need to send pages that have already been sent. + */ +static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, + ram_addr_t start, size_t len) +{ + long our_host_ps = getpagesize(); + + trace_migrate_handle_rp_req_pages(rbname, start, len); + + /* + * Since we currently insist on matching page sizes, just sanity check + * we're being asked for whole host pages. + */ + if (start & (our_host_ps-1) || + (len & (our_host_ps-1))) { + error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT + " len: %zd", __func__, start, len); + mark_source_rp_bad(ms); + return; + } + + if (ram_save_queue_pages(ms, rbname, start, len)) { + mark_source_rp_bad(ms); + } +} + +/* + * Handles messages sent on the return path towards the source VM + * + */ +static void *source_return_path_thread(void *opaque) +{ + MigrationState *ms = opaque; + QEMUFile *rp = ms->rp_state.from_dst_file; + uint16_t header_len, header_type; + const int max_len = 512; + uint8_t buf[max_len]; + uint32_t tmp32, sibling_error; + ram_addr_t start = 0; /* =0 to silence warning */ + size_t len = 0, expected_len; + int res; + + trace_source_return_path_thread_entry(); + while (!ms->rp_state.error && !qemu_file_get_error(rp) && + migration_is_setup_or_active(ms->state)) { + trace_source_return_path_thread_loop_top(); + header_type = qemu_get_be16(rp); + header_len = qemu_get_be16(rp); + + if (header_type >= MIG_RP_MSG_MAX || + header_type == MIG_RP_MSG_INVALID) { + error_report("RP: Received invalid message 0x%04x length 0x%04x", + header_type, header_len); + mark_source_rp_bad(ms); + goto out; + } + + if ((rp_cmd_args[header_type].len != -1 && + header_len != rp_cmd_args[header_type].len) || + header_len > max_len) { + error_report("RP: Received '%s' message (0x%04x) with" + "incorrect length %d expecting %zu", + rp_cmd_args[header_type].name, header_type, header_len, + (size_t)rp_cmd_args[header_type].len); + mark_source_rp_bad(ms); + goto out; + } + + /* We know we've got a valid header by this point */ + res = qemu_get_buffer(rp, buf, header_len); + if (res != header_len) { + error_report("RP: Failed reading data for message 0x%04x" + " read %d expected %d", + header_type, res, header_len); + mark_source_rp_bad(ms); + goto out; + } + + /* OK, we have the message and the data */ + switch (header_type) { + case MIG_RP_MSG_SHUT: + sibling_error = be32_to_cpup((uint32_t *)buf); + 
trace_source_return_path_thread_shut(sibling_error); + if (sibling_error) { + error_report("RP: Sibling indicated error %d", sibling_error); + mark_source_rp_bad(ms); + } + /* + * We'll let the main thread deal with closing the RP + * we could do a shutdown(2) on it, but we're the only user + * anyway, so there's nothing gained. + */ + goto out; + + case MIG_RP_MSG_PONG: + tmp32 = be32_to_cpup((uint32_t *)buf); + trace_source_return_path_thread_pong(tmp32); + break; + + case MIG_RP_MSG_REQ_PAGES: + start = be64_to_cpup((uint64_t *)buf); + len = be32_to_cpup((uint32_t *)(buf + 8)); + migrate_handle_rp_req_pages(ms, NULL, start, len); + break; + + case MIG_RP_MSG_REQ_PAGES_ID: + expected_len = 12 + 1; /* header + termination */ + + if (header_len >= expected_len) { + start = be64_to_cpup((uint64_t *)buf); + len = be32_to_cpup((uint32_t *)(buf + 8)); + /* Now we expect an idstr */ + tmp32 = buf[12]; /* Length of the following idstr */ + buf[13 + tmp32] = '\0'; + expected_len += tmp32; + } + if (header_len != expected_len) { + error_report("RP: Req_Page_id with length %d expecting %zd", + header_len, expected_len); + mark_source_rp_bad(ms); + goto out; + } + migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len); + break; + + default: + break; + } + } + if (rp && qemu_file_get_error(rp)) { + trace_source_return_path_thread_bad_end(); + mark_source_rp_bad(ms); + } + + trace_source_return_path_thread_end(); +out: + ms->rp_state.from_dst_file = NULL; + qemu_fclose(rp); + return NULL; +} + +static int open_return_path_on_source(MigrationState *ms) +{ + + ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->file); + if (!ms->rp_state.from_dst_file) { + return -1; + } + + trace_open_return_path_on_source(); + qemu_thread_create(&ms->rp_state.rp_thread, "return path", + source_return_path_thread, ms, QEMU_THREAD_JOINABLE); + + trace_open_return_path_on_source_continue(); + + return 0; +} + +/* Returns 0 if the RP was ok, otherwise there was an error on the RP */ +static int await_return_path_close_on_source(MigrationState *ms) +{ + /* + * If this is a normal exit then the destination will send a SHUT and the + * rp_thread will exit, however if there's an error we need to cause + * it to exit. + */ + if (qemu_file_get_error(ms->file) && ms->rp_state.from_dst_file) { + /* + * shutdown(2), if we have it, will cause it to unblock if it's stuck + * waiting for the destination. 
+ */ + qemu_file_shutdown(ms->rp_state.from_dst_file); + mark_source_rp_bad(ms); + } + trace_await_return_path_close_on_source_joining(); + qemu_thread_join(&ms->rp_state.rp_thread); + trace_await_return_path_close_on_source_close(); + return ms->rp_state.error; +} + +/* + * Switch from normal iteration to postcopy + * Returns non-0 on error + */ +static int postcopy_start(MigrationState *ms, bool *old_vm_running) +{ + int ret; + const QEMUSizedBuffer *qsb; + int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + migrate_set_state(ms, MIGRATION_STATUS_ACTIVE, + MIGRATION_STATUS_POSTCOPY_ACTIVE); + + trace_postcopy_start(); + qemu_mutex_lock_iothread(); + trace_postcopy_start_set_run(); + + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); + *old_vm_running = runstate_is_running(); + global_state_store(); + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + + if (ret < 0) { + goto fail; + } + + /* + * in Finish migrate and with the io-lock held everything should + * be quiet, but we've potentially still got dirty pages and we + * need to tell the destination to throw any pages it's already received + * that are dirty + */ + if (ram_postcopy_send_discard_bitmap(ms)) { + error_report("postcopy send discard bitmap failed"); + goto fail; + } + + /* + * send rest of state - note things that are doing postcopy + * will notice we're in POSTCOPY_ACTIVE and not actually + * wrap their state up here + */ + qemu_file_set_rate_limit(ms->file, INT64_MAX); + /* Ping just for debugging, helps line traces up */ + qemu_savevm_send_ping(ms->file, 2); + + /* + * While loading the device state we may trigger page transfer + * requests and the fd must be free to process those, and thus + * the destination must read the whole device state off the fd before + * it starts processing it. Unfortunately the ad-hoc migration format + * doesn't allow the destination to know the size to read without fully + * parsing it through each devices load-state code (especially the open + * coded devices that use get/put). + * So we wrap the device state up in a package with a length at the start; + * to do this we use a qemu_buf to hold the whole of the device state. + */ + QEMUFile *fb = qemu_bufopen("w", NULL); + if (!fb) { + error_report("Failed to create buffered file"); + goto fail; + } + + /* + * Make sure the receiver can get incoming pages before we send the rest + * of the state + */ + qemu_savevm_send_postcopy_listen(fb); + + qemu_savevm_state_complete_precopy(fb); + qemu_savevm_send_ping(fb, 3); + + qemu_savevm_send_postcopy_run(fb); + + /* <><> end of stuff going into the package */ + qsb = qemu_buf_get(fb); + + /* Now send that blob */ + if (qemu_savevm_send_packaged(ms->file, qsb)) { + goto fail_closefb; + } + qemu_fclose(fb); + ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop; + + qemu_mutex_unlock_iothread(); + + /* + * Although this ping is just for debug, it could potentially be + * used for getting a better measurement of downtime at the source. + */ + qemu_savevm_send_ping(ms->file, 4); + + ret = qemu_file_get_error(ms->file); + if (ret) { + error_report("postcopy_start: Migration stream errored"); + migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_FAILED); + } + + return ret; + +fail_closefb: + qemu_fclose(fb); +fail: + migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE, + MIGRATION_STATUS_FAILED); + qemu_mutex_unlock_iothread(); + return -1; +} + /** * migration_completion: Used by migration_thread when there's not much left. 
* The caller 'breaks' the loop when this returns. * * @s: Current migration state + * @current_active_state: The migration state we expect to be in + * @*old_vm_running: Pointer to old_vm_running flag + * @*start_time: Pointer to time to update */ -static void migration_completion(MigrationState *s, bool *old_vm_running, +static void migration_completion(MigrationState *s, int current_active_state, + bool *old_vm_running, int64_t *start_time) { int ret; - qemu_mutex_lock_iothread(); - *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - *old_vm_running = runstate_is_running(); + if (s->state == MIGRATION_STATUS_ACTIVE) { + qemu_mutex_lock_iothread(); + *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); + *old_vm_running = runstate_is_running(); + ret = global_state_store(); - ret = global_state_store(); - if (!ret) { - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - if (ret >= 0) { - qemu_file_set_rate_limit(s->file, INT64_MAX); - qemu_savevm_state_complete(s->file); + if (!ret) { + ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); + if (ret >= 0) { + qemu_file_set_rate_limit(s->file, INT64_MAX); + qemu_savevm_state_complete_precopy(s->file); + } } - } - qemu_mutex_unlock_iothread(); + qemu_mutex_unlock_iothread(); - if (ret < 0) { - goto fail; + if (ret < 0) { + goto fail; + } + } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { + trace_migration_completion_postcopy_end(); + + qemu_savevm_state_complete_postcopy(s->file); + trace_migration_completion_postcopy_end_after_complete(); + } + + /* + * If rp was opened we must clean up the thread before + * cleaning everything else up (since if there are no failures + * it will wait for the destination to send its status in + * a SHUT command). + * Postcopy opens rp if enabled (even if it's not activated) + */ + if (migrate_postcopy_ram()) { + int rp_error; + trace_migration_completion_postcopy_end_before_rp(); + rp_error = await_return_path_close_on_source(s); + trace_migration_completion_postcopy_end_after_rp(rp_error); + if (rp_error) { + goto fail; + } } if (qemu_file_get_error(s->file)) { @@ -1008,18 +1575,21 @@ static void migration_completion(MigrationState *s, bool *old_vm_running, goto fail; } - migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COMPLETED); + migrate_set_state(s, current_active_state, MIGRATION_STATUS_COMPLETED); return; fail: - migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_FAILED); + migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED); } -/* migration thread support */ - +/* + * Master migration thread on the source VM. + * It drives the migration and pumps the data down the outgoing channel.
+ */ static void *migration_thread(void *opaque) { MigrationState *s = opaque; + /* Used by the bandwidth calcs, updated later */ int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST); int64_t initial_bytes = 0; @@ -1027,34 +1597,79 @@ static void *migration_thread(void *opaque) int64_t start_time = initial_time; int64_t end_time; bool old_vm_running = false; + bool entered_postcopy = false; + /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */ + enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE; rcu_register_thread(); qemu_savevm_state_header(s->file); + + if (migrate_postcopy_ram()) { + /* Now tell the dest that it should open its end so it can reply */ + qemu_savevm_send_open_return_path(s->file); + + /* And do a ping that will make stuff easier to debug */ + qemu_savevm_send_ping(s->file, 1); + + /* + * Tell the destination that we *might* want to do postcopy later; + * if the other end can't do postcopy it should fail now, nice and + * early. + */ + qemu_savevm_send_postcopy_advise(s->file); + } + qemu_savevm_state_begin(s->file, &s->params); s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; + current_active_state = MIGRATION_STATUS_ACTIVE; migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE); - while (s->state == MIGRATION_STATUS_ACTIVE) { + trace_migration_thread_setup_complete(); + + while (s->state == MIGRATION_STATUS_ACTIVE || + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { int64_t current_time; uint64_t pending_size; if (!qemu_file_rate_limit(s->file)) { - pending_size = qemu_savevm_state_pending(s->file, max_size); - trace_migrate_pending(pending_size, max_size); + uint64_t pend_post, pend_nonpost; + + qemu_savevm_state_pending(s->file, max_size, &pend_nonpost, + &pend_post); + pending_size = pend_nonpost + pend_post; + trace_migrate_pending(pending_size, max_size, + pend_post, pend_nonpost); if (pending_size && pending_size >= max_size) { - qemu_savevm_state_iterate(s->file); + /* Still a significant amount to transfer */ + + current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); + if (migrate_postcopy_ram() && + s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE && + pend_nonpost <= max_size && + atomic_read(&s->start_postcopy)) { + + if (!postcopy_start(s, &old_vm_running)) { + current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; + entered_postcopy = true; + } + + continue; + } + /* Just another iteration step */ + qemu_savevm_state_iterate(s->file, entered_postcopy); } else { trace_migration_thread_low_pending(pending_size); - migration_completion(s, &old_vm_running, &start_time); + migration_completion(s, current_active_state, + &old_vm_running, &start_time); break; } } if (qemu_file_get_error(s->file)) { - migrate_set_state(s, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); + migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED); + trace_migration_thread_file_err(); break; } current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -1085,6 +1700,7 @@ static void *migration_thread(void *opaque) } } + trace_migration_thread_after_loop(); /* If we enabled cpu throttling for auto-converge, turn it off. 
*/ cpu_throttle_stop(); end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); @@ -1094,14 +1710,16 @@ static void *migration_thread(void *opaque) if (s->state == MIGRATION_STATUS_COMPLETED) { uint64_t transferred_bytes = qemu_ftell(s->file); s->total_time = end_time - s->total_time; - s->downtime = end_time - start_time; + if (!entered_postcopy) { + s->downtime = end_time - start_time; + } if (s->total_time) { s->mbps = (((double) transferred_bytes * 8.0) / ((double) s->total_time)) / 1000; } runstate_set(RUN_STATE_POSTMIGRATE); } else { - if (old_vm_running) { + if (old_vm_running && !entered_postcopy) { vm_start(); } } @@ -1124,7 +1742,34 @@ void migrate_fd_connect(MigrationState *s) /* Notify before starting migration thread */ notifier_list_notify(&migration_state_notifiers, s); + /* + * Open the return path; currently for postcopy but other things might + * also want it. + */ + if (migrate_postcopy_ram()) { + if (open_return_path_on_source(s)) { + error_report("Unable to open return-path for postcopy"); + migrate_set_state(s, MIGRATION_STATUS_SETUP, + MIGRATION_STATUS_FAILED); + migrate_fd_cleanup(s); + return; + } + } + migrate_compress_threads_create(); qemu_thread_create(&s->thread, "migration", migration_thread, s, QEMU_THREAD_JOINABLE); + s->migration_thread_running = true; } + +PostcopyState postcopy_state_get(void) +{ + return atomic_mb_read(&incoming_postcopy_state); +} + +/* Set the state and return the old state */ +PostcopyState postcopy_state_set(PostcopyState new_state) +{ + return atomic_xchg(&incoming_postcopy_state, new_state); +} + diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c new file mode 100644 index 0000000000..22d6b18e63 --- /dev/null +++ b/migration/postcopy-ram.c @@ -0,0 +1,767 @@ +/* + * Postcopy migration for RAM + * + * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* + * Postcopy is a migration technique where the execution flips from the + * source to the destination before all the data has been copied. + */ + +#include +#include +#include + +#include "qemu-common.h" +#include "migration/migration.h" +#include "migration/postcopy-ram.h" +#include "sysemu/sysemu.h" +#include "sysemu/balloon.h" +#include "qemu/error-report.h" +#include "trace.h" + +/* Arbitrary limit on size of each discard command, + * keeps them around ~200 bytes + */ +#define MAX_DISCARDS_PER_COMMAND 12 + +struct PostcopyDiscardState { + const char *ramblock_name; + uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */ + uint16_t cur_entry; + /* + * Start and length of a discard range (bytes) + */ + uint64_t start_list[MAX_DISCARDS_PER_COMMAND]; + uint64_t length_list[MAX_DISCARDS_PER_COMMAND]; + unsigned int nsentwords; + unsigned int nsentcmds; +}; + +/* Postcopy needs to detect accesses to pages that haven't yet been copied + * across, and efficiently map new pages in, the techniques for doing this + * are target OS specific. 
+ */ +#if defined(__linux__) + +#include <poll.h> +#include <sys/eventfd.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <asm/types.h> /* for __u64 */ +#endif + +#if defined(__linux__) && defined(__NR_userfaultfd) +#include <linux/userfaultfd.h> + +static bool ufd_version_check(int ufd) +{ + struct uffdio_api api_struct; + uint64_t ioctl_mask; + + api_struct.api = UFFD_API; + api_struct.features = 0; + if (ioctl(ufd, UFFDIO_API, &api_struct)) { + error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s", + strerror(errno)); + return false; + } + + ioctl_mask = (__u64)1 << _UFFDIO_REGISTER | + (__u64)1 << _UFFDIO_UNREGISTER; + if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { + error_report("Missing userfault features: %" PRIx64, + (uint64_t)(~api_struct.ioctls & ioctl_mask)); + return false; + } + + return true; +} + +/* + * Note: This has the side effect of munlock'ing all of RAM, that's + * normally fine since if the postcopy succeeds it gets turned back on at the + * end. + */ +bool postcopy_ram_supported_by_host(void) +{ + long pagesize = getpagesize(); + int ufd = -1; + bool ret = false; /* Error unless we change it */ + void *testarea = NULL; + struct uffdio_register reg_struct; + struct uffdio_range range_struct; + uint64_t feature_mask; + + if ((1ul << qemu_target_page_bits()) > pagesize) { + error_report("Target page size bigger than host page size"); + goto out; + } + + ufd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (ufd == -1) { + error_report("%s: userfaultfd not available: %s", __func__, + strerror(errno)); + goto out; + } + + /* Version and features check */ + if (!ufd_version_check(ufd)) { + goto out; + } + + /* + * userfault and mlock don't go together; we'll put it back later if + * it was enabled. + */ + if (munlockall()) { + error_report("%s: munlockall: %s", __func__, strerror(errno)); + goto out; + } + + /* + * We need to check that the ops we need are supported on anon memory + * To do that we need to register a chunk and see the flags that + * are returned. + */ + testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (testarea == MAP_FAILED) { + error_report("%s: Failed to map test area: %s", __func__, + strerror(errno)); + goto out; + } + g_assert(((size_t)testarea & (pagesize-1)) == 0); + + reg_struct.range.start = (uintptr_t)testarea; + reg_struct.range.len = pagesize; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) { + error_report("%s userfault register: %s", __func__, strerror(errno)); + goto out; + } + + range_struct.start = (uintptr_t)testarea; + range_struct.len = pagesize; + if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) { + error_report("%s userfault unregister: %s", __func__, strerror(errno)); + goto out; + } + + feature_mask = (__u64)1 << _UFFDIO_WAKE | + (__u64)1 << _UFFDIO_COPY | + (__u64)1 << _UFFDIO_ZEROPAGE; + if ((reg_struct.ioctls & feature_mask) != feature_mask) { + error_report("Missing userfault map features: %" PRIx64, + (uint64_t)(~reg_struct.ioctls & feature_mask)); + goto out; + } + + /* Success! */ + ret = true; +out: + if (testarea) { + munmap(testarea, pagesize); + } + if (ufd != -1) { + close(ufd); + } + return ret; +} + +/** + * postcopy_ram_discard_range: Discard a range of memory. + * We can assume that if we've been called postcopy_ram_supported_by_host returned true. + * + * @mis: Current incoming migration state. + * @start, @length: range of memory to discard. + * + * returns: 0 on success.
+ */ +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + size_t length) +{ + trace_postcopy_ram_discard_range(start, length); + if (madvise(start, length, MADV_DONTNEED)) { + error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno)); + return -1; + } + + return 0; +} + +/* + * Setup an area of RAM so that it *can* be used for postcopy later; this + * must be done right at the start prior to pre-copy. + * opaque should be the MIS. + */ +static int init_range(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque) +{ + MigrationIncomingState *mis = opaque; + + trace_postcopy_init_range(block_name, host_addr, offset, length); + + /* + * We need the whole of RAM to be truly empty for postcopy, so things + * like ROMs and any data tables built during init must be zero'd + * - we're going to get the copy from the source anyway. + * (Precopy will just overwrite this data, so doesn't need the discard) + */ + if (postcopy_ram_discard_range(mis, host_addr, length)) { + return -1; + } + + return 0; +} + +/* + * At the end of migration, undo the effects of init_range + * opaque should be the MIS. + */ +static int cleanup_range(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque) +{ + MigrationIncomingState *mis = opaque; + struct uffdio_range range_struct; + trace_postcopy_cleanup_range(block_name, host_addr, offset, length); + + /* + * We turned off hugepage for the precopy stage with postcopy enabled + * we can turn it back on now. + */ + if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) { + error_report("%s HUGEPAGE: %s", __func__, strerror(errno)); + return -1; + } + + /* + * We can also turn off userfault now since we should have all the + * pages. It can be useful to leave it on to debug postcopy + * if you're not sure it's always getting every page. + */ + range_struct.start = (uintptr_t)host_addr; + range_struct.len = length; + + if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) { + error_report("%s: userfault unregister %s", __func__, strerror(errno)); + + return -1; + } + + return 0; +} + +/* + * Initialise postcopy-ram, setting the RAM to a state where we can go into + * postcopy later; must be called prior to any precopy. + * called from arch_init's similarly named ram_postcopy_incoming_init + */ +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) +{ + if (qemu_ram_foreach_block(init_range, mis)) { + return -1; + } + + return 0; +} + +/* + * At the end of a migration where postcopy_ram_incoming_init was called. 
+ */ +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) +{ + trace_postcopy_ram_incoming_cleanup_entry(); + + if (mis->have_fault_thread) { + uint64_t tmp64; + + if (qemu_ram_foreach_block(cleanup_range, mis)) { + return -1; + } + /* + * Tell the fault_thread to exit, it's an eventfd that should + * currently be at 0, we're going to increment it to 1 + */ + tmp64 = 1; + if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) { + trace_postcopy_ram_incoming_cleanup_join(); + qemu_thread_join(&mis->fault_thread); + } else { + /* Not much we can do here, but may as well report it */ + error_report("%s: incrementing userfault_quit_fd: %s", __func__, + strerror(errno)); + } + trace_postcopy_ram_incoming_cleanup_closeuf(); + close(mis->userfault_fd); + close(mis->userfault_quit_fd); + mis->have_fault_thread = false; + } + + qemu_balloon_inhibit(false); + + if (enable_mlock) { + if (os_mlock() < 0) { + error_report("mlock: %s", strerror(errno)); + /* + * It doesn't feel right to fail at this point, we have a valid + * VM state. + */ + } + } + + postcopy_state_set(POSTCOPY_INCOMING_END); + migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0); + + if (mis->postcopy_tmp_page) { + munmap(mis->postcopy_tmp_page, getpagesize()); + mis->postcopy_tmp_page = NULL; + } + trace_postcopy_ram_incoming_cleanup_exit(); + return 0; +} + +/* + * Disable huge pages on an area + */ +static int nhp_range(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, void *opaque) +{ + trace_postcopy_nhp_range(block_name, host_addr, offset, length); + + /* + * Before we do discards we need to ensure those discards really + * do delete areas of the page, even if THP thinks a hugepage would + * be a good idea, so force hugepages off. + */ + if (qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE)) { + error_report("%s: NOHUGEPAGE: %s", __func__, strerror(errno)); + return -1; + } + + return 0; +} + +/* + * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard + * however leaving it until after precopy means that most of the precopy + * data is still THPd + */ +int postcopy_ram_prepare_discard(MigrationIncomingState *mis) +{ + if (qemu_ram_foreach_block(nhp_range, mis)) { + return -1; + } + + postcopy_state_set(POSTCOPY_INCOMING_DISCARD); + + return 0; +} + +/* + * Mark the given area of RAM as requiring notification to unwritten areas + * Used as a callback on qemu_ram_foreach_block. 
+ * host_addr: Base of area to mark + * offset: Offset in the whole ram arena + * length: Length of the section + * opaque: MigrationIncomingState pointer + * Returns 0 on success + */ +static int ram_block_enable_notify(const char *block_name, void *host_addr, + ram_addr_t offset, ram_addr_t length, + void *opaque) +{ + MigrationIncomingState *mis = opaque; + struct uffdio_register reg_struct; + + reg_struct.range.start = (uintptr_t)host_addr; + reg_struct.range.len = length; + reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING; + + /* Now tell our userfault_fd that it's responsible for this area */ + if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) { + error_report("%s userfault register: %s", __func__, strerror(errno)); + return -1; + } + + return 0; +} + +/* + * Handle faults detected by the USERFAULT markings + */ +static void *postcopy_ram_fault_thread(void *opaque) +{ + MigrationIncomingState *mis = opaque; + struct uffd_msg msg; + int ret; + size_t hostpagesize = getpagesize(); + RAMBlock *rb = NULL; + RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */ + + trace_postcopy_ram_fault_thread_entry(); + qemu_sem_post(&mis->fault_thread_sem); + + while (true) { + ram_addr_t rb_offset; + ram_addr_t in_raspace; + struct pollfd pfd[2]; + + /* + * We're mainly waiting for the kernel to give us a faulting HVA, + * however we can be told to quit via userfault_quit_fd which is + * an eventfd + */ + pfd[0].fd = mis->userfault_fd; + pfd[0].events = POLLIN; + pfd[0].revents = 0; + pfd[1].fd = mis->userfault_quit_fd; + pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */ + pfd[1].revents = 0; + + if (poll(pfd, 2, -1 /* Wait forever */) == -1) { + error_report("%s: userfault poll: %s", __func__, strerror(errno)); + break; + } + + if (pfd[1].revents) { + trace_postcopy_ram_fault_thread_quit(); + break; + } + + ret = read(mis->userfault_fd, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + if (errno == EAGAIN) { + /* + * if a wake up happens on the other thread just after + * the poll, there is nothing to read.
+ */ + continue; + } + if (ret < 0) { + error_report("%s: Failed to read full userfault message: %s", + __func__, strerror(errno)); + break; + } else { + error_report("%s: Read %d bytes from userfaultfd expected %zd", + __func__, ret, sizeof(msg)); + break; /* Lost alignment, don't know what we'd read next */ + } + } + if (msg.event != UFFD_EVENT_PAGEFAULT) { + error_report("%s: Read unexpected event %ud from userfaultfd", + __func__, msg.event); + continue; /* It's not a page fault, shouldn't happen */ + } + + rb = qemu_ram_block_from_host( + (void *)(uintptr_t)msg.arg.pagefault.address, + true, &in_raspace, &rb_offset); + if (!rb) { + error_report("postcopy_ram_fault_thread: Fault outside guest: %" + PRIx64, (uint64_t)msg.arg.pagefault.address); + break; + } + + rb_offset &= ~(hostpagesize - 1); + trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address, + qemu_ram_get_idstr(rb), + rb_offset); + + /* + * Send the request to the source - we want to request one + * of our host page sizes (which is >= TPS) + */ + if (rb != last_rb) { + last_rb = rb; + migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), + rb_offset, hostpagesize); + } else { + /* Save some space */ + migrate_send_rp_req_pages(mis, NULL, + rb_offset, hostpagesize); + } + } + trace_postcopy_ram_fault_thread_exit(); + return NULL; +} + +int postcopy_ram_enable_notify(MigrationIncomingState *mis) +{ + /* Open the fd for the kernel to give us userfaults */ + mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (mis->userfault_fd == -1) { + error_report("%s: Failed to open userfault fd: %s", __func__, + strerror(errno)); + return -1; + } + + /* + * Although the host check already tested the API, we need to + * do the check again as an ABI handshake on the new fd. + */ + if (!ufd_version_check(mis->userfault_fd)) { + return -1; + } + + /* Now an eventfd we use to tell the fault-thread to quit */ + mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC); + if (mis->userfault_quit_fd == -1) { + error_report("%s: Opening userfault_quit_fd: %s", __func__, + strerror(errno)); + close(mis->userfault_fd); + return -1; + } + + qemu_sem_init(&mis->fault_thread_sem, 0); + qemu_thread_create(&mis->fault_thread, "postcopy/fault", + postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE); + qemu_sem_wait(&mis->fault_thread_sem); + qemu_sem_destroy(&mis->fault_thread_sem); + mis->have_fault_thread = true; + + /* Mark so that we get notified of accesses to unwritten areas */ + if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) { + return -1; + } + + /* + * Ballooning can mark pages as absent while we're postcopying + * that would cause false userfaults. + */ + qemu_balloon_inhibit(true); + + trace_postcopy_ram_enable_notify(); + + return 0; +} + +/* + * Place a host page (from) at (host) atomically + * returns 0 on success + */ +int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from) +{ + struct uffdio_copy copy_struct; + + copy_struct.dst = (uint64_t)(uintptr_t)host; + copy_struct.src = (uint64_t)(uintptr_t)from; + copy_struct.len = getpagesize(); + copy_struct.mode = 0; + + /* copy also acks to the kernel waking the stalled thread up + * TODO: We can inhibit that ack and only do it if it was requested + * which would be slightly cheaper, but we'd have to be careful + * of the order of updating our page state. 
+ */ + if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) { + int e = errno; + error_report("%s: %s copy host: %p from: %p", + __func__, strerror(e), host, from); + + return -e; + } + + trace_postcopy_place_page(host); + return 0; +} + +/* + * Place a zero page at (host) atomically + * returns 0 on success + */ +int postcopy_place_page_zero(MigrationIncomingState *mis, void *host) +{ + struct uffdio_zeropage zero_struct; + + zero_struct.range.start = (uint64_t)(uintptr_t)host; + zero_struct.range.len = getpagesize(); + zero_struct.mode = 0; + + if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) { + int e = errno; + error_report("%s: %s zero host: %p", + __func__, strerror(e), host); + + return -e; + } + + trace_postcopy_place_page_zero(host); + return 0; +} + +/* + * Returns a target page of memory that can be mapped at a later point in time + * using postcopy_place_page + * The same address is used repeatedly, postcopy_place_page just takes the + * backing page away. + * Returns: Pointer to allocated page + * + */ +void *postcopy_get_tmp_page(MigrationIncomingState *mis) +{ + if (!mis->postcopy_tmp_page) { + mis->postcopy_tmp_page = mmap(NULL, getpagesize(), + PROT_READ | PROT_WRITE, MAP_PRIVATE | + MAP_ANONYMOUS, -1, 0); + if (mis->postcopy_tmp_page == MAP_FAILED) { + mis->postcopy_tmp_page = NULL; + error_report("%s: %s", __func__, strerror(errno)); + return NULL; + } + } + + return mis->postcopy_tmp_page; +} + +#else +/* No target OS support, stubs just fail */ +bool postcopy_ram_supported_by_host(void) +{ + error_report("%s: No OS support", __func__); + return false; +} + +int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) +{ + error_report("postcopy_ram_incoming_init: No OS support"); + return -1; +} + +int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) +{ + assert(0); + return -1; +} + +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + size_t length) +{ + assert(0); + return -1; +} + +int postcopy_ram_prepare_discard(MigrationIncomingState *mis) +{ + assert(0); + return -1; +} + +int postcopy_ram_enable_notify(MigrationIncomingState *mis) +{ + assert(0); + return -1; +} + +int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from) +{ + assert(0); + return -1; +} + +int postcopy_place_page_zero(MigrationIncomingState *mis, void *host) +{ + assert(0); + return -1; +} + +void *postcopy_get_tmp_page(MigrationIncomingState *mis) +{ + assert(0); + return NULL; +} + +#endif + +/* ------------------------------------------------------------------------- */ + +/** + * postcopy_discard_send_init: Called at the start of each RAMBlock before + * asking to discard individual ranges. + * + * @ms: The current migration state. + * @offset: the bitmap offset of the named RAMBlock in the migration + * bitmap. + * @name: RAMBlock that discards will operate on. + * + * returns: a new PDS. + */ +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, + unsigned long offset, + const char *name) +{ + PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState)); + + if (res) { + res->ramblock_name = name; + res->offset = offset; + } + + return res; +} + +/** + * postcopy_discard_send_range: Called by the bitmap code for each chunk to + * discard. May send a discard message, may just leave it queued to + * be sent later. + * + * @ms: Current migration state. + * @pds: Structure initialised by postcopy_discard_send_init().
+ * @start,@length: a range of pages in the migration bitmap in the + * RAM block passed to postcopy_discard_send_init() (length=1 is one page) + */ +void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds, + unsigned long start, unsigned long length) +{ + size_t tp_bits = qemu_target_page_bits(); + /* Convert to byte offsets within the RAM block */ + pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits; + pds->length_list[pds->cur_entry] = length << tp_bits; + trace_postcopy_discard_send_range(pds->ramblock_name, start, length); + pds->cur_entry++; + pds->nsentwords++; + + if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { + /* Full set, ship it! */ + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name, + pds->cur_entry, + pds->start_list, + pds->length_list); + pds->nsentcmds++; + pds->cur_entry = 0; + } +} + +/** + * postcopy_discard_send_finish: Called at the end of each RAMBlock by the + * bitmap code. Sends any outstanding discard messages, frees the PDS + * + * @ms: Current migration state. + * @pds: Structure initialised by postcopy_discard_send_init(). + */ +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) +{ + /* Anything unsent? */ + if (pds->cur_entry) { + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name, + pds->cur_entry, + pds->start_list, + pds->length_list); + pds->nsentcmds++; + } + + trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords, + pds->nsentcmds); + + g_free(pds); +} diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c index 809bf070d7..c503b027a9 100644 --- a/migration/qemu-file-unix.c +++ b/migration/qemu-file-unix.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ #include "qemu-common.h" +#include "qemu/error-report.h" #include "qemu/iov.h" #include "qemu/sockets.h" #include "qemu/coroutine.h" @@ -39,12 +40,43 @@ static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, QEMUFileSocket *s = opaque; ssize_t len; ssize_t size = iov_size(iov, iovcnt); + ssize_t offset = 0; + int err; - len = iov_send(s->fd, iov, iovcnt, 0, size); - if (len < size) { - len = -socket_error(); - } - return len; + while (size > 0) { + len = iov_send(s->fd, iov, iovcnt, offset, size); + + if (len > 0) { + size -= len; + offset += len; + } + + if (size > 0) { + err = socket_error(); + + if (err != EAGAIN && err != EWOULDBLOCK) { + error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)", + err, (size_t)size, (size_t)len); + /* + * If I've already sent some but only just got the error, I + * could return the amount validly sent so far and wait for the + * next call to report the error, but I'd rather flag the error + * immediately. + */ + return -err; + } + + /* Emulate blocking */ + GPollFD pfd; + + pfd.fd = s->fd; + pfd.events = G_IO_OUT | G_IO_ERR; + pfd.revents = 0; + g_poll(&pfd, 1 /* 1 fd */, -1 /* no timeout */); + } + } + + return offset; } static int socket_get_fd(void *opaque) @@ -97,6 +129,56 @@ static int socket_shutdown(void *opaque, bool rd, bool wr) } } +static int socket_return_close(void *opaque) +{ + QEMUFileSocket *s = opaque; + /* + * Note: We don't close the socket, that should be done by the forward + * path. 
+ */ + g_free(s); + return 0; +} + +static const QEMUFileOps socket_return_read_ops = { + .get_fd = socket_get_fd, + .get_buffer = socket_get_buffer, + .close = socket_return_close, + .shut_down = socket_shutdown, +}; + +static const QEMUFileOps socket_return_write_ops = { + .get_fd = socket_get_fd, + .writev_buffer = socket_writev_buffer, + .close = socket_return_close, + .shut_down = socket_shutdown, +}; + +/* + * Give a QEMUFile* off the same socket but data in the opposite + * direction. + */ +static QEMUFile *socket_get_return_path(void *opaque) +{ + QEMUFileSocket *forward = opaque; + QEMUFileSocket *reverse; + + if (qemu_file_get_error(forward->file)) { + /* If the forward file is in error, don't try and open a return */ + return NULL; + } + + reverse = g_malloc0(sizeof(QEMUFileSocket)); + reverse->fd = forward->fd; + /* I don't think there's a better way to tell which direction 'this' is */ + if (forward->file->ops->get_buffer != NULL) { + /* being called from the read side, so we need to be able to write */ + return qemu_fopen_ops(reverse, &socket_return_write_ops); + } else { + return qemu_fopen_ops(reverse, &socket_return_read_ops); + } +} + static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, int64_t pos) { @@ -206,18 +288,19 @@ QEMUFile *qemu_fdopen(int fd, const char *mode) } static const QEMUFileOps socket_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = socket_get_buffer, - .close = socket_close, - .shut_down = socket_shutdown - + .get_fd = socket_get_fd, + .get_buffer = socket_get_buffer, + .close = socket_close, + .shut_down = socket_shutdown, + .get_return_path = socket_get_return_path }; static const QEMUFileOps socket_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = socket_writev_buffer, - .close = socket_close, - .shut_down = socket_shutdown + .get_fd = socket_get_fd, + .writev_buffer = socket_writev_buffer, + .close = socket_close, + .shut_down = socket_shutdown, + .get_return_path = socket_get_return_path }; QEMUFile *qemu_fopen_socket(int fd, const char *mode) diff --git a/migration/qemu-file.c b/migration/qemu-file.c index df49023ed8..0bbd2574a8 100644 --- a/migration/qemu-file.c +++ b/migration/qemu-file.c @@ -44,6 +44,18 @@ int qemu_file_shutdown(QEMUFile *f) return f->ops->shut_down(f->opaque, true, true); } +/* + * Result: QEMUFile* for a 'return path' for comms in the opposite direction + * NULL if not available + */ +QEMUFile *qemu_file_get_return_path(QEMUFile *f) +{ + if (!f->ops->get_return_path) { + return NULL; + } + return f->ops->get_return_path(f->opaque); +} + bool qemu_file_mode_is_not_valid(const char *mode) { if (mode == NULL || @@ -433,6 +445,43 @@ size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size) return done; } +/* + * Read 'size' bytes of data from the file. + * 'size' can be larger than the internal buffer. + * + * The data: + * may be held on an internal buffer (in which case *buf is updated + * to point to it) that is valid until the next qemu_file operation. + * OR + * will be copied to the *buf that was passed in. + * + * The code tries to avoid the copy if possible. + * + * It will return size bytes unless there was an error, in which case it will + * return as many as it managed to read (assuming blocking fd's which + * all current QEMUFile are) + * + * Note: Since **buf may get changed, the caller should take care to + * keep a pointer to the original buffer if it needs to deallocate it. 
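+ *
+ * A minimal usage sketch (hypothetical caller; process_data() is only a
+ * stand-in for whatever consumes the bytes):
+ *
+ *   uint8_t *orig = g_malloc(size);
+ *   uint8_t *ptr = orig;          // may be redirected to the internal buffer
+ *   if (qemu_get_buffer_in_place(f, &ptr, size) == size) {
+ *       process_data(ptr, size);  // use via 'ptr' before the next file op
+ *   }
+ *   g_free(orig);                 // free the original allocation, not 'ptr'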
+ */ +size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size) +{ + if (size < IO_BUF_SIZE) { + size_t res; + uint8_t *src; + + res = qemu_peek_buffer(f, &src, size, 0); + + if (res == size) { + qemu_file_skip(f, res); + *buf = src; + return res; + } + } + + return qemu_get_buffer(f, *buf, size); +} + /* * Peeks a single byte from the buffer; this isn't guaranteed to work if * offset leaves a gap after the previous read/peeked data. @@ -611,3 +660,18 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256]) return res == len ? res : 0; } + +/* + * Set the blocking state of the QEMUFile. + * Note: On some transports the OS only keeps a single blocking state for + * both directions, and thus changing the blocking on the main + * QEMUFile can also affect the return path. + */ +void qemu_file_set_blocking(QEMUFile *f, bool block) +{ + if (block) { + qemu_set_block(qemu_get_fd(f)); + } else { + qemu_set_nonblock(qemu_get_fd(f)); + } +} diff --git a/migration/ram.c b/migration/ram.c index df3df9e3bf..62cf42bfdb 100644 --- a/migration/ram.c +++ b/migration/ram.c @@ -32,6 +32,7 @@ #include "qemu/timer.h" #include "qemu/main-loop.h" #include "migration/migration.h" +#include "migration/postcopy-ram.h" #include "exec/address-spaces.h" #include "migration/page_cache.h" #include "qemu/error-report.h" @@ -237,7 +238,14 @@ typedef struct PageSearchStatus PageSearchStatus; static struct BitmapRcu { struct rcu_head rcu; + /* Main migration bitmap */ unsigned long *bmap; + /* bitmap of pages that haven't been sent even once + * only maintained and used in postcopy at the moment + * where it's used to send the dirtymap at the start + * of the postcopy phase + */ + unsigned long *unsentmap; } *migration_bitmap_rcu; struct CompressParam { @@ -531,10 +539,18 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, return 1; } -/* Called with rcu_read_lock() to protect migration_bitmap */ +/* Called with rcu_read_lock() to protect migration_bitmap + * rb: The RAMBlock to search for dirty pages in + * start: Start address (typically so we can continue from previous page) + * ram_addr_abs: Pointer into which to store the address of the dirty page + * within the global ram_addr space + * + * Returns: byte offset within memory region of the start of a dirty page + */ static inline -ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb, - ram_addr_t start) +ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb, + ram_addr_t start, + ram_addr_t *ram_addr_abs) { unsigned long base = rb->offset >> TARGET_PAGE_BITS; unsigned long nr = base + (start >> TARGET_PAGE_BITS); @@ -551,14 +567,24 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb, next = find_next_bit(bitmap, size, nr); } - if (next < size) { - clear_bit(next, bitmap); - migration_dirty_pages--; - } + *ram_addr_abs = next << TARGET_PAGE_BITS; return (next - base) << TARGET_PAGE_BITS; } -/* Called with rcu_read_lock() to protect migration_bitmap */ +static inline bool migration_bitmap_clear_dirty(ram_addr_t addr) +{ + bool ret; + int nr = addr >> TARGET_PAGE_BITS; + unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + + ret = test_and_clear_bit(nr, bitmap); + + if (ret) { + migration_dirty_pages--; + } + return ret; +} + static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) { unsigned long *bitmap; @@ -951,12 +977,14 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block, * @f: Current migration stream. 
* @pss: Data about the state of the current dirty page scan. * @*again: Set to false if the search has scanned the whole of RAM + * *ram_addr_abs: Pointer into which to store the address of the dirty page + * within the global ram_addr space */ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, - bool *again) + bool *again, ram_addr_t *ram_addr_abs) { - pss->offset = migration_bitmap_find_and_reset_dirty(pss->block, - pss->offset); + pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset, + ram_addr_abs); if (pss->complete_round && pss->block == last_seen_block && pss->offset >= last_offset) { /* @@ -995,6 +1023,276 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, } } +/* + * Helper for 'get_queued_page' - gets a page off the queue + * ms: MigrationState in + * *offset: Used to return the offset within the RAMBlock + * ram_addr_abs: global offset in the dirty/sent bitmaps + * + * Returns: block (or NULL if none available) + */ +static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, + ram_addr_t *ram_addr_abs) +{ + RAMBlock *block = NULL; + + qemu_mutex_lock(&ms->src_page_req_mutex); + if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) { + struct MigrationSrcPageRequest *entry = + QSIMPLEQ_FIRST(&ms->src_page_requests); + block = entry->rb; + *offset = entry->offset; + *ram_addr_abs = (entry->offset + entry->rb->offset) & + TARGET_PAGE_MASK; + + if (entry->len > TARGET_PAGE_SIZE) { + entry->len -= TARGET_PAGE_SIZE; + entry->offset += TARGET_PAGE_SIZE; + } else { + memory_region_unref(block->mr); + QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); + g_free(entry); + } + } + qemu_mutex_unlock(&ms->src_page_req_mutex); + + return block; +} + +/* + * Unqueue a page from the queue fed by postcopy page requests; skips pages + * that are already sent (!dirty) + * + * ms: MigrationState in + * pss: PageSearchStatus structure updated with found block/offset + * ram_addr_abs: global offset in the dirty/sent bitmaps + * + * Returns: true if a queued page is found + */ +static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, + ram_addr_t *ram_addr_abs) +{ + RAMBlock *block; + ram_addr_t offset; + bool dirty; + + do { + block = unqueue_page(ms, &offset, ram_addr_abs); + /* + * We're sending this page, and since it's postcopy nothing else + * will dirty it, and we must make sure it doesn't get sent again + * even if this queue request was received after the background + * search already sent it. + */ + if (block) { + unsigned long *bitmap; + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap); + if (!dirty) { + trace_get_queued_page_not_dirty( + block->idstr, (uint64_t)offset, + (uint64_t)*ram_addr_abs, + test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, + atomic_rcu_read(&migration_bitmap_rcu)->unsentmap)); + } else { + trace_get_queued_page(block->idstr, + (uint64_t)offset, + (uint64_t)*ram_addr_abs); + } + } + + } while (block && !dirty); + + if (block) { + /* + * As soon as we start servicing pages out of order, then we have + * to kill the bulk stage, since the bulk stage assumes + * in (migration_bitmap_find_and_reset_dirty) that every page is + * dirty, that's no longer true. + */ + ram_bulk_stage = false; + + /* + * We want the background search to continue from the queued page + * since the guest is likely to want other pages near to the page + * it just requested. 
+ */ + pss->block = block; + pss->offset = offset; + } + + return !!block; +} + +/** + * flush_page_queue: Flush any remaining pages in the ram request queue + * it should be empty at the end anyway, but in error cases there may be + * some left. + * + * ms: MigrationState + */ +void flush_page_queue(MigrationState *ms) +{ + struct MigrationSrcPageRequest *mspr, *next_mspr; + /* This queue generally should be empty - but in the case of a failed + * migration might have some droppings in. + */ + rcu_read_lock(); + QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) { + memory_region_unref(mspr->rb->mr); + QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); + g_free(mspr); + } + rcu_read_unlock(); +} + +/** + * Queue the pages for transmission, e.g. a request from postcopy destination + * ms: MigrationStatus in which the queue is held + * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last) + * start: Offset from the start of the RAMBlock + * len: Length (in bytes) to send + * Return: 0 on success + */ +int ram_save_queue_pages(MigrationState *ms, const char *rbname, + ram_addr_t start, ram_addr_t len) +{ + RAMBlock *ramblock; + + rcu_read_lock(); + if (!rbname) { + /* Reuse last RAMBlock */ + ramblock = ms->last_req_rb; + + if (!ramblock) { + /* + * Shouldn't happen, we can't reuse the last RAMBlock if + * it's the 1st request. + */ + error_report("ram_save_queue_pages no previous block"); + goto err; + } + } else { + ramblock = qemu_ram_block_by_name(rbname); + + if (!ramblock) { + /* We shouldn't be asked for a non-existent RAMBlock */ + error_report("ram_save_queue_pages no block '%s'", rbname); + goto err; + } + ms->last_req_rb = ramblock; + } + trace_ram_save_queue_pages(ramblock->idstr, start, len); + if (start+len > ramblock->used_length) { + error_report("%s request overrun start=%zx len=%zx blocklen=%zx", + __func__, start, len, ramblock->used_length); + goto err; + } + + struct MigrationSrcPageRequest *new_entry = + g_malloc0(sizeof(struct MigrationSrcPageRequest)); + new_entry->rb = ramblock; + new_entry->offset = start; + new_entry->len = len; + + memory_region_ref(ramblock->mr); + qemu_mutex_lock(&ms->src_page_req_mutex); + QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req); + qemu_mutex_unlock(&ms->src_page_req_mutex); + rcu_read_unlock(); + + return 0; + +err: + rcu_read_unlock(); + return -1; +} + +/** + * ram_save_target_page: Save one target page + * + * + * @f: QEMUFile where to send the data + * @block: pointer to block that contains the page we want to send + * @offset: offset inside the block for the page; + * @last_stage: if we are at the completion stage + * @bytes_transferred: increase it with the number of transferred bytes + * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space + * + * Returns: Number of pages written. 
+ */ +static int ram_save_target_page(MigrationState *ms, QEMUFile *f, + RAMBlock *block, ram_addr_t offset, + bool last_stage, + uint64_t *bytes_transferred, + ram_addr_t dirty_ram_abs) +{ + int res = 0; + + /* Check the pages is dirty and if it is send it */ + if (migration_bitmap_clear_dirty(dirty_ram_abs)) { + unsigned long *unsentmap; + if (compression_switch && migrate_use_compression()) { + res = ram_save_compressed_page(f, block, offset, + last_stage, + bytes_transferred); + } else { + res = ram_save_page(f, block, offset, last_stage, + bytes_transferred); + } + + if (res < 0) { + return res; + } + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + if (unsentmap) { + clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap); + } + } + + return res; +} + +/** + * ram_save_host_page: Starting at *offset send pages upto the end + * of the current host page. It's valid for the initial + * offset to point into the middle of a host page + * in which case the remainder of the hostpage is sent. + * Only dirty target pages are sent. + * + * Returns: Number of pages written. + * + * @f: QEMUFile where to send the data + * @block: pointer to block that contains the page we want to send + * @offset: offset inside the block for the page; updated to last target page + * sent + * @last_stage: if we are at the completion stage + * @bytes_transferred: increase it with the number of transferred bytes + * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space + */ +static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block, + ram_addr_t *offset, bool last_stage, + uint64_t *bytes_transferred, + ram_addr_t dirty_ram_abs) +{ + int tmppages, pages = 0; + do { + tmppages = ram_save_target_page(ms, f, block, *offset, last_stage, + bytes_transferred, dirty_ram_abs); + if (tmppages < 0) { + return tmppages; + } + + pages += tmppages; + *offset += TARGET_PAGE_SIZE; + dirty_ram_abs += TARGET_PAGE_SIZE; + } while (*offset & (qemu_host_page_size - 1)); + + /* The offset we leave with is the last one we looked at */ + *offset -= TARGET_PAGE_SIZE; + return pages; +} + /** * ram_find_and_save_block: Finds a dirty page and sends it to f * @@ -1006,14 +1304,20 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, * @f: QEMUFile where to send the data * @last_stage: if we are at the completion stage * @bytes_transferred: increase it with the number of transferred bytes + * + * On systems where host-page-size > target-page-size it will send all the + * pages in a host page that are dirty. 
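+ *
+ * For example, assuming a (hypothetical) 64KiB host page size and 4KiB
+ * target pages, i.e. 16 target pages per host page: once a dirty target
+ * page is found, ram_save_host_page carries on from that page to the end
+ * of its host page, sending each target page in that range that is still
+ * marked dirty.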
*/ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, uint64_t *bytes_transferred) { PageSearchStatus pss; + MigrationState *ms = migrate_get_current(); int pages = 0; bool again, found; + ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in + ram_addr_t space */ pss.block = last_seen_block; pss.offset = last_offset; @@ -1024,22 +1328,18 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, } do { - found = find_dirty_block(f, &pss, &again); + again = true; + found = get_queued_page(ms, &pss, &dirty_ram_abs); + + if (!found) { + /* priority queue empty, so just search for something dirty */ + found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); + } if (found) { - if (compression_switch && migrate_use_compression()) { - pages = ram_save_compressed_page(f, pss.block, pss.offset, - last_stage, - bytes_transferred); - } else { - pages = ram_save_page(f, pss.block, pss.offset, last_stage, - bytes_transferred); - } - - /* if page is unmodified, continue to the next */ - if (pages > 0) { - last_sent_block = pss.block; - } + pages = ram_save_host_page(ms, f, pss.block, &pss.offset, + last_stage, bytes_transferred, + dirty_ram_abs); } } while (!pages && again); @@ -1097,6 +1397,7 @@ void free_xbzrle_decoded_buf(void) static void migration_bitmap_free(struct BitmapRcu *bmap) { g_free(bmap->bmap); + g_free(bmap->unsentmap); g_free(bmap); } @@ -1153,6 +1454,13 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) qemu_mutex_lock(&migration_bitmap_mutex); bitmap_copy(bitmap->bmap, old_bitmap->bmap, old); bitmap_set(bitmap->bmap, old, new - old); + + /* We don't have a way to safely extend the sentmap + * with RCU; so mark it as missing, entry to postcopy + * will fail. + */ + bitmap->unsentmap = NULL; + atomic_rcu_set(&migration_bitmap_rcu, bitmap); qemu_mutex_unlock(&migration_bitmap_mutex); migration_dirty_pages += new - old; @@ -1160,6 +1468,394 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) } } +/* + * 'expected' is the value you expect the bitmap mostly to be full + * of; it won't bother printing lines that are all this value. + * If 'todump' is null the migration bitmap is dumped. + */ +void ram_debug_dump_bitmap(unsigned long *todump, bool expected) +{ + int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; + + int64_t cur; + int64_t linelen = 128; + char linebuf[129]; + + if (!todump) { + todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + } + + for (cur = 0; cur < ram_pages; cur += linelen) { + int64_t curb; + bool found = false; + /* + * Last line; catch the case where the line length + * is longer than remaining ram + */ + if (cur + linelen > ram_pages) { + linelen = ram_pages - cur; + } + for (curb = 0; curb < linelen; curb++) { + bool thisbit = test_bit(cur + curb, todump); + linebuf[curb] = thisbit ? '1' : '.'; + found = found || (thisbit != expected); + } + if (found) { + linebuf[curb] = '\0'; + fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); + } + } +} + +/* **** functions for postcopy ***** */ + +/* + * Callback from postcopy_each_ram_send_discard for each RAMBlock + * Note: At this point the 'unsentmap' is the processed bitmap combined + * with the dirtymap; so a '1' means it's either dirty or unsent. 
+ * start,length: Indexes into the bitmap for the first bit + * representing the named block and length in target-pages + */ +static int postcopy_send_discard_bm_ram(MigrationState *ms, + PostcopyDiscardState *pds, + unsigned long start, + unsigned long length) +{ + unsigned long end = start + length; /* one after the end */ + unsigned long current; + unsigned long *unsentmap; + + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + for (current = start; current < end; ) { + unsigned long one = find_next_bit(unsentmap, end, current); + + if (one <= end) { + unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); + unsigned long discard_length; + + if (zero >= end) { + discard_length = end - one; + } else { + discard_length = zero - one; + } + postcopy_discard_send_range(ms, pds, one, discard_length); + current = one + discard_length; + } else { + current = one; + } + } + + return 0; +} + +/* + * Utility for the outgoing postcopy code. + * Calls postcopy_send_discard_bm_ram for each RAMBlock + * passing it bitmap indexes and name. + * Returns: 0 on success + * (qemu_ram_foreach_block ends up passing unscaled lengths + * which would mean postcopy code would have to deal with target page) + */ +static int postcopy_each_ram_send_discard(MigrationState *ms) +{ + struct RAMBlock *block; + int ret; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + PostcopyDiscardState *pds = postcopy_discard_send_init(ms, + first, + block->idstr); + + /* + * Postcopy sends chunks of bitmap over the wire, but it + * just needs indexes at this point, avoids it having + * target page specific code. + */ + ret = postcopy_send_discard_bm_ram(ms, pds, first, + block->used_length >> TARGET_PAGE_BITS); + postcopy_discard_send_finish(ms, pds); + if (ret) { + return ret; + } + } + + return 0; +} + +/* + * Helper for postcopy_chunk_hostpages; it's called twice to cleanup + * the two bitmaps, that are similar, but one is inverted. + * + * We search for runs of target-pages that don't start or end on a + * host page boundary; + * unsent_pass=true: Cleans up partially unsent host pages by searching + * the unsentmap + * unsent_pass=false: Cleans up partially dirty host pages by searching + * the main migration bitmap + * + */ +static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, + RAMBlock *block, + PostcopyDiscardState *pds) +{ + unsigned long *bitmap; + unsigned long *unsentmap; + unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE; + unsigned long first = block->offset >> TARGET_PAGE_BITS; + unsigned long len = block->used_length >> TARGET_PAGE_BITS; + unsigned long last = first + (len - 1); + unsigned long run_start; + + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + + if (unsent_pass) { + /* Find a sent page */ + run_start = find_next_zero_bit(unsentmap, last + 1, first); + } else { + /* Find a dirty page */ + run_start = find_next_bit(bitmap, last + 1, first); + } + + while (run_start <= last) { + bool do_fixup = false; + unsigned long fixup_start_addr; + unsigned long host_offset; + + /* + * If the start of this run of pages is in the middle of a host + * page, then we need to fixup this host page. 
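+ *
+ * Worked example, assuming 64KiB host pages and 4KiB target pages
+ * (host_ratio = 16): run_start = 35 gives host_offset = 3, so the
+ * run is pulled back to target page 32 and the fixup below covers
+ * the whole host page, i.e. target pages 32..47.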
+ */ + host_offset = run_start % host_ratio; + if (host_offset) { + do_fixup = true; + run_start -= host_offset; + fixup_start_addr = run_start; + /* For the next pass */ + run_start = run_start + host_ratio; + } else { + /* Find the end of this run */ + unsigned long run_end; + if (unsent_pass) { + run_end = find_next_bit(unsentmap, last + 1, run_start + 1); + } else { + run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1); + } + /* + * If the end isn't at the start of a host page, then the + * run doesn't finish at the end of a host page + * and we need to discard. + */ + host_offset = run_end % host_ratio; + if (host_offset) { + do_fixup = true; + fixup_start_addr = run_end - host_offset; + /* + * This host page has gone, the next loop iteration starts + * from after the fixup + */ + run_start = fixup_start_addr + host_ratio; + } else { + /* + * No discards on this iteration, next loop starts from + * next sent/dirty page + */ + run_start = run_end + 1; + } + } + + if (do_fixup) { + unsigned long page; + + /* Tell the destination to discard this page */ + if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { + /* For the unsent_pass we: + * discard partially sent pages + * For the !unsent_pass (dirty) we: + * discard partially dirty pages that were sent + * (any partially sent pages were already discarded + * by the previous unsent_pass) + */ + postcopy_discard_send_range(ms, pds, fixup_start_addr, + host_ratio); + } + + /* Clean up the bitmap */ + for (page = fixup_start_addr; + page < fixup_start_addr + host_ratio; page++) { + /* All pages in this host page are now not sent */ + set_bit(page, unsentmap); + + /* + * Remark them as dirty, updating the count for any pages + * that weren't previously dirty. + */ + migration_dirty_pages += !test_and_set_bit(page, bitmap); + } + } + + if (unsent_pass) { + /* Find the next sent page for the next iteration */ + run_start = find_next_zero_bit(unsentmap, last + 1, + run_start); + } else { + /* Find the next dirty page for the next iteration */ + run_start = find_next_bit(bitmap, last + 1, run_start); + } + } +} + +/* + * Utility for the outgoing postcopy code. + * + * Discard any partially sent host-page size chunks, mark any partially + * dirty host-page size chunks as all dirty. + * + * Returns: 0 on success + */ +static int postcopy_chunk_hostpages(MigrationState *ms) +{ + struct RAMBlock *block; + + if (qemu_host_page_size == TARGET_PAGE_SIZE) { + /* Easy case - TPS==HPS - nothing to be done */ + return 0; + } + + /* Easiest way to make sure we don't resume in the middle of a host-page */ + last_seen_block = NULL; + last_sent_block = NULL; + last_offset = 0; + + QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + + PostcopyDiscardState *pds = + postcopy_discard_send_init(ms, first, block->idstr); + + /* First pass: Discard all partially sent host pages */ + postcopy_chunk_hostpages_pass(ms, true, block, pds); + /* + * Second pass: Ensure that all partially dirty host pages are made + * fully dirty. 
+ */ + postcopy_chunk_hostpages_pass(ms, false, block, pds); + + postcopy_discard_send_finish(ms, pds); + } /* ram_list loop */ + + return 0; +} + +/* + * Transmit the set of pages to be discarded after precopy to the target + * these are pages that: + * a) Have been previously transmitted but are now dirty again + * b) Pages that have never been transmitted, this ensures that + * any pages on the destination that have been mapped by background + * tasks get discarded (transparent huge pages is the specific concern) + * Hopefully this is pretty sparse + */ +int ram_postcopy_send_discard_bitmap(MigrationState *ms) +{ + int ret; + unsigned long *bitmap, *unsentmap; + + rcu_read_lock(); + + /* This should be our last sync, the src is now paused */ + migration_bitmap_sync(); + + unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; + if (!unsentmap) { + /* We don't have a safe way to resize the sentmap, so + * if the bitmap was resized it will be NULL at this + * point. + */ + error_report("migration ram resized during precopy phase"); + rcu_read_unlock(); + return -EINVAL; + } + + /* Deal with TPS != HPS */ + ret = postcopy_chunk_hostpages(ms); + if (ret) { + rcu_read_unlock(); + return ret; + } + + /* + * Update the unsentmap to be unsentmap = unsentmap | dirty + */ + bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; + bitmap_or(unsentmap, unsentmap, bitmap, + last_ram_offset() >> TARGET_PAGE_BITS); + + + trace_ram_postcopy_send_discard_bitmap(); +#ifdef DEBUG_POSTCOPY + ram_debug_dump_bitmap(unsentmap, true); +#endif + + ret = postcopy_each_ram_send_discard(ms); + rcu_read_unlock(); + + return ret; +} + +/* + * At the start of the postcopy phase of migration, any now-dirty + * precopied pages are discarded. + * + * start, length describe a byte address range within the RAMBlock + * + * Returns 0 on success. + */ +int ram_discard_range(MigrationIncomingState *mis, + const char *block_name, + uint64_t start, size_t length) +{ + int ret = -1; + + rcu_read_lock(); + RAMBlock *rb = qemu_ram_block_by_name(block_name); + + if (!rb) { + error_report("ram_discard_range: Failed to find block '%s'", + block_name); + goto err; + } + + uint8_t *host_startaddr = rb->host + start; + + if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) { + error_report("ram_discard_range: Unaligned start address: %p", + host_startaddr); + goto err; + } + + if ((start + length) <= rb->used_length) { + uint8_t *host_endaddr = host_startaddr + length; + if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) { + error_report("ram_discard_range: Unaligned end address: %p", + host_endaddr); + goto err; + } + ret = postcopy_ram_discard_range(mis, host_startaddr, length); + } else { + error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 + "/%zu/%zu)", + block_name, start, length, rb->used_length); + } + +err: + rcu_read_unlock(); + + return ret; +} + + /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has * long-running RCU critical section. 
When rcu-reclaims in the code * start to become numerous it will be necessary to reduce the @@ -1214,10 +1910,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) reset_ram_globals(); ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; - migration_bitmap_rcu = g_new(struct BitmapRcu, 1); + migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); + if (migrate_postcopy_ram()) { + migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); + } + /* * Count the total number of pages used by ram blocks not including any * gaps due to alignment or unplugs. @@ -1317,7 +2018,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque) { rcu_read_lock(); - migration_bitmap_sync(); + if (!migration_in_postcopy(migrate_get_current())) { + migration_bitmap_sync(); + } ram_control_before_iterate(f, RAM_CONTROL_FINISH); @@ -1344,13 +2047,16 @@ static int ram_save_complete(QEMUFile *f, void *opaque) return 0; } -static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) +static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, + uint64_t *non_postcopiable_pending, + uint64_t *postcopiable_pending) { uint64_t remaining_size; remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; - if (remaining_size < max_size) { + if (!migration_in_postcopy(migrate_get_current()) && + remaining_size < max_size) { qemu_mutex_lock_iothread(); rcu_read_lock(); migration_bitmap_sync(); @@ -1358,7 +2064,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) qemu_mutex_unlock_iothread(); remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; } - return remaining_size; + + /* We can do postcopy, and all the data is postcopiable */ + *postcopiable_pending += remaining_size; } static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) @@ -1399,6 +2107,14 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) /* Must be called from within a rcu critical section. * Returns a pointer from within the RCU-protected ram_list. 
*/ +/* + * Read a RAMBlock ID from the stream f, find the host address of the + * start of that block and add on 'offset' + * + * f: Stream to read from + * offset: Offset within the block + * flags: Page flags (mostly to see if it's a continuation of previous block) + */ static inline void *host_from_stream_offset(QEMUFile *f, ram_addr_t offset, int flags) @@ -1420,14 +2136,12 @@ static inline void *host_from_stream_offset(QEMUFile *f, qemu_get_buffer(f, (uint8_t *)id, len); id[len] = 0; - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - if (!strncmp(id, block->idstr, sizeof(id)) && - block->max_length > offset) { - return block->host + offset; - } + block = qemu_ram_block_by_name(id); + if (block && block->max_length > offset) { + return block->host + offset; } - error_report("Can't find block %s!", id); + error_report("Can't find block %s", id); return NULL; } @@ -1535,11 +2249,148 @@ static void decompress_data_with_multi_threads(uint8_t *compbuf, } } +/* + * Allocate data structures etc needed by incoming migration with postcopy-ram + * postcopy-ram's similarly names postcopy_ram_incoming_init does the work + */ +int ram_postcopy_incoming_init(MigrationIncomingState *mis) +{ + size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; + + return postcopy_ram_incoming_init(mis, ram_pages); +} + +/* + * Called in postcopy mode by ram_load(). + * rcu_read_lock is taken prior to this being called. + */ +static int ram_load_postcopy(QEMUFile *f) +{ + int flags = 0, ret = 0; + bool place_needed = false; + bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE; + MigrationIncomingState *mis = migration_incoming_get_current(); + /* Temporary page that is later 'placed' */ + void *postcopy_host_page = postcopy_get_tmp_page(mis); + void *last_host = NULL; + + while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { + ram_addr_t addr; + void *host = NULL; + void *page_buffer = NULL; + void *place_source = NULL; + uint8_t ch; + bool all_zero = false; + + addr = qemu_get_be64(f); + flags = addr & ~TARGET_PAGE_MASK; + addr &= TARGET_PAGE_MASK; + + trace_ram_load_postcopy_loop((uint64_t)addr, flags); + place_needed = false; + if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) { + host = host_from_stream_offset(f, addr, flags); + if (!host) { + error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); + ret = -EINVAL; + break; + } + page_buffer = host; + /* + * Postcopy requires that we place whole host pages atomically. + * To make it atomic, the data is read into a temporary page + * that's moved into place later. + * The migration protocol uses, possibly smaller, target-pages + * however the source ensures it always sends all the components + * of a host page in order. 
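 * (Editorial illustration, not part of this patch; the page sizes are assumed.
 *  With 4KiB target pages and a 16KiB host page there are four target pages
 *  per host page. Each incoming target page is copied into the temporary
 *  postcopy_host_page at offset (host & ~qemu_host_page_mask), i.e. 0x0000,
 *  0x1000, 0x2000 or 0x3000. Only once the target page at offset 0x3000,
 *  the last one in the host page, has been read does place_needed become
 *  true, and the whole 16KiB page is then placed atomically at the
 *  host-page-aligned destination address.)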
+ */ + page_buffer = postcopy_host_page + + ((uintptr_t)host & ~qemu_host_page_mask); + /* If all TP are zero then we can optimise the place */ + if (!((uintptr_t)host & ~qemu_host_page_mask)) { + all_zero = true; + } else { + /* not the 1st TP within the HP */ + if (host != (last_host + TARGET_PAGE_SIZE)) { + error_report("Non-sequential target page %p/%p\n", + host, last_host); + ret = -EINVAL; + break; + } + } + + + /* + * If it's the last part of a host page then we place the host + * page + */ + place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & + ~qemu_host_page_mask) == 0; + place_source = postcopy_host_page; + } + last_host = host; + + switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { + case RAM_SAVE_FLAG_COMPRESS: + ch = qemu_get_byte(f); + memset(page_buffer, ch, TARGET_PAGE_SIZE); + if (ch) { + all_zero = false; + } + break; + + case RAM_SAVE_FLAG_PAGE: + all_zero = false; + if (!place_needed || !matching_page_sizes) { + qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); + } else { + /* Avoids the qemu_file copy during postcopy, which is + * going to do a copy later; can only do it when we + * do this read in one go (matching page sizes) + */ + qemu_get_buffer_in_place(f, (uint8_t **)&place_source, + TARGET_PAGE_SIZE); + } + break; + case RAM_SAVE_FLAG_EOS: + /* normal exit */ + break; + default: + error_report("Unknown combination of migration flags: %#x" + " (postcopy mode)", flags); + ret = -EINVAL; + } + + if (place_needed) { + /* This gets called at the last target page in the host page */ + if (all_zero) { + ret = postcopy_place_page_zero(mis, + host + TARGET_PAGE_SIZE - + qemu_host_page_size); + } else { + ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE - + qemu_host_page_size, + place_source); + } + } + if (!ret) { + ret = qemu_file_get_error(f); + } + } + + return ret; +} + static int ram_load(QEMUFile *f, void *opaque, int version_id) { int flags = 0, ret = 0; static uint64_t seq_iter; int len = 0; + /* + * If system is running in postcopy mode, page inserts to host memory must + * be atomic + */ + bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; seq_iter++; @@ -1553,15 +2404,30 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) * critical section. 
*/ rcu_read_lock(); - while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) { + + if (postcopy_running) { + ret = ram_load_postcopy(f); + } + + while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { ram_addr_t addr, total_ram_bytes; - void *host; + void *host = NULL; uint8_t ch; addr = qemu_get_be64(f); flags = addr & ~TARGET_PAGE_MASK; addr &= TARGET_PAGE_MASK; + if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE | + RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { + host = host_from_stream_offset(f, addr, flags); + if (!host) { + error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); + ret = -EINVAL; + break; + } + } + switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { case RAM_SAVE_FLAG_MEM_SIZE: /* Synchronize RAM block list */ @@ -1576,23 +2442,20 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) id[len] = 0; length = qemu_get_be64(f); - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - if (!strncmp(id, block->idstr, sizeof(id))) { - if (length != block->used_length) { - Error *local_err = NULL; + block = qemu_ram_block_by_name(id); + if (block) { + if (length != block->used_length) { + Error *local_err = NULL; - ret = qemu_ram_resize(block->offset, length, &local_err); - if (local_err) { - error_report_err(local_err); - } + ret = qemu_ram_resize(block->offset, length, + &local_err); + if (local_err) { + error_report_err(local_err); } - ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, - block->idstr); - break; } - } - - if (!block) { + ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, + block->idstr); + } else { error_report("Unknown ramblock \"%s\", cannot " "accept migration", id); ret = -EINVAL; @@ -1601,33 +2464,17 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) total_ram_bytes -= length; } break; + case RAM_SAVE_FLAG_COMPRESS: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } ch = qemu_get_byte(f); ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); break; + case RAM_SAVE_FLAG_PAGE: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } qemu_get_buffer(f, host, TARGET_PAGE_SIZE); break; - case RAM_SAVE_FLAG_COMPRESS_PAGE: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Invalid RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } + case RAM_SAVE_FLAG_COMPRESS_PAGE: len = qemu_get_be32(f); if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { error_report("Invalid compressed data length: %d", len); @@ -1637,13 +2484,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) qemu_get_buffer(f, compressed_data_buf, len); decompress_data_with_multi_threads(compressed_data_buf, host, len); break; + case RAM_SAVE_FLAG_XBZRLE: - host = host_from_stream_offset(f, addr, flags); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } if (load_xbzrle(f, addr, host) < 0) { error_report("Failed to decompress XBZRLE page at " RAM_ADDR_FMT, addr); @@ -1677,7 +2519,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) static SaveVMHandlers savevm_ram_handlers = { .save_live_setup = ram_save_setup, .save_live_iterate = ram_save_iterate, - .save_live_complete = ram_save_complete, + .save_live_complete_postcopy = ram_save_complete, + .save_live_complete_precopy = ram_save_complete, .save_live_pending = ram_save_pending, .load_state = ram_load, 
.cleanup = ram_migration_cleanup, diff --git a/migration/savevm.c b/migration/savevm.c index e05158d7ba..be52314a12 100644 --- a/migration/savevm.c +++ b/migration/savevm.c @@ -37,6 +37,7 @@ #include "qemu/timer.h" #include "audio/audio.h" #include "migration/migration.h" +#include "migration/postcopy-ram.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/sockets.h" @@ -45,6 +46,7 @@ #include "exec/memory.h" #include "qmp-commands.h" #include "trace.h" +#include "qemu/bitops.h" #include "qemu/iov.h" #include "block/snapshot.h" #include "block/qapi.h" @@ -57,8 +59,26 @@ #define ARP_PTYPE_IP 0x0800 #define ARP_OP_REQUEST_REV 0x3 +const unsigned int postcopy_ram_discard_version = 0; + static bool skip_section_footers; +static struct mig_cmd_args { + ssize_t len; /* -1 = variable */ + const char *name; +} mig_cmd_args[] = { + [MIG_CMD_INVALID] = { .len = -1, .name = "INVALID" }, + [MIG_CMD_OPEN_RETURN_PATH] = { .len = 0, .name = "OPEN_RETURN_PATH" }, + [MIG_CMD_PING] = { .len = sizeof(uint32_t), .name = "PING" }, + [MIG_CMD_POSTCOPY_ADVISE] = { .len = 16, .name = "POSTCOPY_ADVISE" }, + [MIG_CMD_POSTCOPY_LISTEN] = { .len = 0, .name = "POSTCOPY_LISTEN" }, + [MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" }, + [MIG_CMD_POSTCOPY_RAM_DISCARD] = { + .len = -1, .name = "POSTCOPY_RAM_DISCARD" }, + [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, + [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, +}; + static int announce_self_create(uint8_t *buf, uint8_t *mac_addr) { @@ -694,6 +714,156 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se) } } +/** + * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the + * command and associated data. + * + * @f: File to send command on + * @command: Command type to send + * @len: Length of associated data + * @data: Data associated with command. + */ +void qemu_savevm_command_send(QEMUFile *f, + enum qemu_vm_cmd command, + uint16_t len, + uint8_t *data) +{ + trace_savevm_command_send(command, len); + qemu_put_byte(f, QEMU_VM_COMMAND); + qemu_put_be16(f, (uint16_t)command); + qemu_put_be16(f, len); + qemu_put_buffer(f, data, len); + qemu_fflush(f); +} + +void qemu_savevm_send_ping(QEMUFile *f, uint32_t value) +{ + uint32_t buf; + + trace_savevm_send_ping(value); + buf = cpu_to_be32(value); + qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf); +} + +void qemu_savevm_send_open_return_path(QEMUFile *f) +{ + trace_savevm_send_open_return_path(); + qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL); +} + +/* We have a buffer of data to send; we don't want that all to be loaded + * by the command itself, so the command contains just the length of the + * extra buffer that we then send straight after it. 
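 * (Editorial illustration, not part of this patch: using the framing written
 *  by qemu_savevm_command_send() above, a package of N bytes appears on the
 *  wire as
 *      1 byte   QEMU_VM_COMMAND
 *      2 bytes  be16 MIG_CMD_PACKAGED
 *      2 bytes  be16 4               <- length of the command's own data
 *      4 bytes  be32 N               <- length of the package that follows
 *      N bytes  the packaged migration stream itself
 *  so only the 4-byte length travels inside the command, as described above.)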
+ * TODO: Must be a better way to organise that + * + * Returns: + * 0 on success + * -ve on error + */ +int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb) +{ + size_t cur_iov; + size_t len = qsb_get_length(qsb); + uint32_t tmp; + + if (len > MAX_VM_CMD_PACKAGED_SIZE) { + error_report("%s: Unreasonably large packaged state: %zu", + __func__, len); + return -1; + } + + tmp = cpu_to_be32(len); + + trace_qemu_savevm_send_packaged(); + qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp); + + /* all the data follows (concatinating the iov's) */ + for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) { + /* The iov entries are partially filled */ + size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len); + len -= towrite; + + if (!towrite) { + break; + } + + qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite); + } + + return 0; +} + +/* Send prior to any postcopy transfer */ +void qemu_savevm_send_postcopy_advise(QEMUFile *f) +{ + uint64_t tmp[2]; + tmp[0] = cpu_to_be64(getpagesize()); + tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits()); + + trace_qemu_savevm_send_postcopy_advise(); + qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp); +} + +/* Sent prior to starting the destination running in postcopy, discard pages + * that have already been sent but redirtied on the source. + * CMD_POSTCOPY_RAM_DISCARD consist of: + * byte version (0) + * byte Length of name field (not including 0) + * n x byte RAM block name + * byte 0 terminator (just for safety) + * n x Byte ranges within the named RAMBlock + * be64 Start of the range + * be64 Length + * + * name: RAMBlock name that these entries are part of + * len: Number of page entries + * start_list: 'len' addresses + * length_list: 'len' addresses + * + */ +void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, + uint16_t len, + uint64_t *start_list, + uint64_t *length_list) +{ + uint8_t *buf; + uint16_t tmplen; + uint16_t t; + size_t name_len = strlen(name); + + trace_qemu_savevm_send_postcopy_ram_discard(name, len); + assert(name_len < 256); + buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len); + buf[0] = postcopy_ram_discard_version; + buf[1] = name_len; + memcpy(buf + 2, name, name_len); + tmplen = 2 + name_len; + buf[tmplen++] = '\0'; + + for (t = 0; t < len; t++) { + cpu_to_be64w((uint64_t *)(buf + tmplen), start_list[t]); + tmplen += 8; + cpu_to_be64w((uint64_t *)(buf + tmplen), length_list[t]); + tmplen += 8; + } + qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf); + g_free(buf); +} + +/* Get the destination into a state where it can receive postcopy data. 
*/ +void qemu_savevm_send_postcopy_listen(QEMUFile *f) +{ + trace_savevm_send_postcopy_listen(); + qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL); +} + +/* Kick the destination into running */ +void qemu_savevm_send_postcopy_run(QEMUFile *f) +{ + trace_savevm_send_postcopy_run(); + qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL); +} + bool qemu_savevm_state_blocked(Error **errp) { SaveStateEntry *se; @@ -713,6 +883,12 @@ void qemu_savevm_state_header(QEMUFile *f) trace_savevm_state_header(); qemu_put_be32(f, QEMU_VM_FILE_MAGIC); qemu_put_be32(f, QEMU_VM_FILE_VERSION); + + if (!savevm_state.skip_configuration) { + qemu_put_byte(f, QEMU_VM_CONFIGURATION); + vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); + } + } void qemu_savevm_state_begin(QEMUFile *f, @@ -729,11 +905,6 @@ void qemu_savevm_state_begin(QEMUFile *f, se->ops->set_params(params, se->opaque); } - if (!savevm_state.skip_configuration) { - qemu_put_byte(f, QEMU_VM_CONFIGURATION); - vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); - } - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || !se->ops->save_live_setup) { continue; @@ -760,7 +931,7 @@ void qemu_savevm_state_begin(QEMUFile *f, * 0 : We haven't finished, caller have to go again * 1 : We have finished, we can go to complete phase */ -int qemu_savevm_state_iterate(QEMUFile *f) +int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) { SaveStateEntry *se; int ret = 1; @@ -775,6 +946,15 @@ int qemu_savevm_state_iterate(QEMUFile *f) continue; } } + /* + * In the postcopy phase, any device that doesn't know how to + * do postcopy should have saved it's state in the _complete + * call that's already run, it might get confused if we call + * iterate afterwards. + */ + if (postcopy && !se->ops->save_live_complete_postcopy) { + continue; + } if (qemu_file_rate_limit(f)) { return 0; } @@ -803,22 +983,65 @@ int qemu_savevm_state_iterate(QEMUFile *f) static bool should_send_vmdesc(void) { MachineState *machine = MACHINE(qdev_get_machine()); - return !machine->suppress_vmdesc; + bool in_postcopy = migration_in_postcopy(migrate_get_current()); + return !machine->suppress_vmdesc && !in_postcopy; } -void qemu_savevm_state_complete(QEMUFile *f) +/* + * Calls the save_live_complete_postcopy methods + * causing the last few pages to be sent immediately and doing any associated + * cleanup. + * Note postcopy also calls qemu_savevm_state_complete_precopy to complete + * all the other devices, but that happens at the point we switch to postcopy. 
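 * (Editorial summary, derived from qemu_savevm_state_complete_precopy() below
 *  and from savevm_ram_handlers in migration/ram.c; it describes, not adds,
 *  behaviour:
 *    - migration finishes in precopy: qemu_savevm_state_complete_precopy()
 *      runs save_live_complete_precopy for every device, RAM included.
 *    - migration switches to postcopy: qemu_savevm_state_complete_precopy()
 *      skips devices that provide save_live_complete_postcopy (i.e. RAM) and
 *      completes everything else; RAM is finished later by this function,
 *      once the postcopy phase has drained the remaining pages.)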
+ */ +void qemu_savevm_state_complete_postcopy(QEMUFile *f) +{ + SaveStateEntry *se; + int ret; + + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { + if (!se->ops || !se->ops->save_live_complete_postcopy) { + continue; + } + if (se->ops && se->ops->is_active) { + if (!se->ops->is_active(se->opaque)) { + continue; + } + } + trace_savevm_section_start(se->idstr, se->section_id); + /* Section type */ + qemu_put_byte(f, QEMU_VM_SECTION_END); + qemu_put_be32(f, se->section_id); + + ret = se->ops->save_live_complete_postcopy(f, se->opaque); + trace_savevm_section_end(se->idstr, se->section_id, ret); + save_section_footer(f, se); + if (ret < 0) { + qemu_file_set_error(f, ret); + return; + } + } + + qemu_put_byte(f, QEMU_VM_EOF); + qemu_fflush(f); +} + +void qemu_savevm_state_complete_precopy(QEMUFile *f) { QJSON *vmdesc; int vmdesc_len; SaveStateEntry *se; int ret; + bool in_postcopy = migration_in_postcopy(migrate_get_current()); - trace_savevm_state_complete(); + trace_savevm_state_complete_precopy(); cpu_synchronize_all_states(); QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_complete) { + if (!se->ops || + (in_postcopy && se->ops->save_live_complete_postcopy) || + !se->ops->save_live_complete_precopy) { continue; } if (se->ops && se->ops->is_active) { @@ -830,7 +1053,7 @@ void qemu_savevm_state_complete(QEMUFile *f) save_section_header(f, se, QEMU_VM_SECTION_END); - ret = se->ops->save_live_complete(f, se->opaque); + ret = se->ops->save_live_complete_precopy(f, se->opaque); trace_savevm_section_end(se->idstr, se->section_id, ret); save_section_footer(f, se); if (ret < 0) { @@ -867,7 +1090,10 @@ void qemu_savevm_state_complete(QEMUFile *f) save_section_footer(f, se); } - qemu_put_byte(f, QEMU_VM_EOF); + if (!in_postcopy) { + /* Postcopy stream will still be going */ + qemu_put_byte(f, QEMU_VM_EOF); + } json_end_array(vmdesc); qjson_finish(vmdesc); @@ -883,10 +1109,19 @@ void qemu_savevm_state_complete(QEMUFile *f) qemu_fflush(f); } -uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size) +/* Give an estimate of the amount left to be transferred, + * the result is split into the amount for units that can and + * for units that can't do postcopy. 
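 * (Editorial illustration, not part of this patch; the numbers are invented.
 *  If RAM still has 300 MiB of dirty pages and a hypothetical device that
 *  cannot be postcopied estimates 2 MiB of outstanding state, this function
 *  returns *res_postcopiable = 300 MiB, added by ram_save_pending() above,
 *  which marks all of its data as postcopiable, and
 *  *res_non_postcopiable = 2 MiB, so the caller can treat the two totals
 *  differently when deciding whether to finish in precopy or to switch to
 *  postcopy.)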
+ */ +void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size, + uint64_t *res_non_postcopiable, + uint64_t *res_postcopiable) { SaveStateEntry *se; - uint64_t ret = 0; + + *res_non_postcopiable = 0; + *res_postcopiable = 0; + QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { if (!se->ops || !se->ops->save_live_pending) { @@ -897,9 +1132,9 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size) continue; } } - ret += se->ops->save_live_pending(f, se->opaque, max_size); + se->ops->save_live_pending(f, se->opaque, max_size, + res_non_postcopiable, res_postcopiable); } - return ret; } void qemu_savevm_state_cleanup(void) @@ -921,6 +1156,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) .blk = 0, .shared = 0 }; + MigrationState *ms = migrate_init(¶ms); + ms->file = f; if (qemu_savevm_state_blocked(errp)) { return -EINVAL; @@ -932,18 +1169,18 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) qemu_mutex_lock_iothread(); while (qemu_file_get_error(f) == 0) { - if (qemu_savevm_state_iterate(f) > 0) { + if (qemu_savevm_state_iterate(f, false) > 0) { break; } } ret = qemu_file_get_error(f); if (ret == 0) { - qemu_savevm_state_complete(f); + qemu_savevm_state_complete_precopy(f); ret = qemu_file_get_error(f); } + qemu_savevm_state_cleanup(); if (ret != 0) { - qemu_savevm_state_cleanup(); error_setg_errno(errp, -ret, "Error while writing VM state"); } return ret; @@ -1001,6 +1238,420 @@ static SaveStateEntry *find_se(const char *idstr, int instance_id) return NULL; } +enum LoadVMExitCodes { + /* Allow a command to quit all layers of nested loadvm loops */ + LOADVM_QUIT = 1, +}; + +static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); + +/* ------ incoming postcopy messages ------ */ +/* 'advise' arrives before any transfers just to tell us that a postcopy + * *might* happen - it might be skipped if precopy transferred everything + * quickly. + */ +static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis) +{ + PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE); + uint64_t remote_hps, remote_tps; + + trace_loadvm_postcopy_handle_advise(); + if (ps != POSTCOPY_INCOMING_NONE) { + error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps); + return -1; + } + + if (!postcopy_ram_supported_by_host()) { + return -1; + } + + remote_hps = qemu_get_be64(mis->from_src_file); + if (remote_hps != getpagesize()) { + /* + * Some combinations of mismatch are probably possible but it gets + * a bit more complicated. In particular we need to place whole + * host pages on the dest at once, and we need to ensure that we + * handle dirtying to make sure we never end up sending part of + * a hostpage on it's own. + */ + error_report("Postcopy needs matching host page sizes (s=%d d=%d)", + (int)remote_hps, getpagesize()); + return -1; + } + + remote_tps = qemu_get_be64(mis->from_src_file); + if (remote_tps != (1ul << qemu_target_page_bits())) { + /* + * Again, some differences could be dealt with, but for now keep it + * simple. + */ + error_report("Postcopy needs matching target page sizes (s=%d d=%d)", + (int)remote_tps, 1 << qemu_target_page_bits()); + return -1; + } + + if (ram_postcopy_incoming_init(mis)) { + return -1; + } + + postcopy_state_set(POSTCOPY_INCOMING_ADVISE); + + return 0; +} + +/* After postcopy we will be told to throw some pages away since they're + * dirty and will have to be demand fetched. Must happen before CPU is + * started. + * There can be 0..many of these messages, each encoding multiple pages. 
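 * (Editorial illustration, not part of this patch; the block name and range
 *  are made up. A discard for a RAMBlock named "pc.ram" covering one range
 *  of 0x4000 bytes starting at offset 0x200000 arrives as the byte sequence
 *      00                           version
 *      06                           length of the name
 *      70 63 2e 72 61 6d            "pc.ram"
 *      00                           terminator
 *      00 00 00 00 00 20 00 00      be64 start  = 0x200000
 *      00 00 00 00 00 00 40 00      be64 length = 0x4000
 *  giving len = 25; the handler below strips the 9 header bytes
 *  (3 + strlen("pc.ram")) and the remaining 16 bytes are a whole number of
 *  16-byte ranges, so the length checks pass.)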
+ */ +static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, + uint16_t len) +{ + int tmp; + char ramid[256]; + PostcopyState ps = postcopy_state_get(); + + trace_loadvm_postcopy_ram_handle_discard(); + + switch (ps) { + case POSTCOPY_INCOMING_ADVISE: + /* 1st discard */ + tmp = postcopy_ram_prepare_discard(mis); + if (tmp) { + return tmp; + } + break; + + case POSTCOPY_INCOMING_DISCARD: + /* Expected state */ + break; + + default: + error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)", + ps); + return -1; + } + /* We're expecting a + * Version (0) + * a RAM ID string (length byte, name, 0 term) + * then at least 1 16 byte chunk + */ + if (len < (1 + 1 + 1 + 1 + 2 * 8)) { + error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); + return -1; + } + + tmp = qemu_get_byte(mis->from_src_file); + if (tmp != postcopy_ram_discard_version) { + error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp); + return -1; + } + + if (!qemu_get_counted_string(mis->from_src_file, ramid)) { + error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID"); + return -1; + } + tmp = qemu_get_byte(mis->from_src_file); + if (tmp != 0) { + error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp); + return -1; + } + + len -= 3 + strlen(ramid); + if (len % 16) { + error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); + return -1; + } + trace_loadvm_postcopy_ram_handle_discard_header(ramid, len); + while (len) { + uint64_t start_addr, block_length; + start_addr = qemu_get_be64(mis->from_src_file); + block_length = qemu_get_be64(mis->from_src_file); + + len -= 16; + int ret = ram_discard_range(mis, ramid, start_addr, + block_length); + if (ret) { + return ret; + } + } + trace_loadvm_postcopy_ram_handle_discard_end(); + + return 0; +} + +/* + * Triggered by a postcopy_listen command; this thread takes over reading + * the input stream, leaving the main thread free to carry on loading the rest + * of the device state (from RAM). + * (TODO:This could do with being in a postcopy file - but there again it's + * just another input loop, not that postcopy specific) + */ +static void *postcopy_ram_listen_thread(void *opaque) +{ + QEMUFile *f = opaque; + MigrationIncomingState *mis = migration_incoming_get_current(); + int load_res; + + qemu_sem_post(&mis->listen_thread_sem); + trace_postcopy_ram_listen_thread_start(); + + /* + * Because we're a thread and not a coroutine we can't yield + * in qemu_file, and thus we must be blocking now. + */ + qemu_file_set_blocking(f, true); + load_res = qemu_loadvm_state_main(f, mis); + /* And non-blocking again so we don't block in any cleanup */ + qemu_file_set_blocking(f, false); + + trace_postcopy_ram_listen_thread_exit(); + if (load_res < 0) { + error_report("%s: loadvm failed: %d", __func__, load_res); + qemu_file_set_error(f, load_res); + } else { + /* + * This looks good, but it's possible that the device loading in the + * main thread hasn't finished yet, and so we might not be in 'RUN' + * state yet; wait for the end of the main thread. + */ + qemu_event_wait(&mis->main_thread_load_event); + } + postcopy_ram_incoming_cleanup(mis); + /* + * If everything has worked fine, then the main thread has waited + * for us to start, and we're the last use of the mis. + * (If something broke then qemu will have to exit anyway since it's + * got a bad migration state). 
+ */ + migration_incoming_state_destroy(); + + if (load_res < 0) { + /* + * If something went wrong then we have a bad state so exit; + * depending how far we got it might be possible at this point + * to leave the guest running and fire MCEs for pages that never + * arrived as a desperate recovery step. + */ + exit(EXIT_FAILURE); + } + + return NULL; +} + +/* After this message we must be able to immediately receive postcopy data */ +static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) +{ + PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING); + trace_loadvm_postcopy_handle_listen(); + if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) { + error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps); + return -1; + } + if (ps == POSTCOPY_INCOMING_ADVISE) { + /* + * A rare case, we entered listen without having to do any discards, + * so do the setup that's normally done at the time of the 1st discard. + */ + postcopy_ram_prepare_discard(mis); + } + + /* + * Sensitise RAM - can now generate requests for blocks that don't exist + * However, at this point the CPU shouldn't be running, and the IO + * shouldn't be doing anything yet so don't actually expect requests + */ + if (postcopy_ram_enable_notify(mis)) { + return -1; + } + + if (mis->have_listen_thread) { + error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread"); + return -1; + } + + mis->have_listen_thread = true; + /* Start up the listening thread and wait for it to signal ready */ + qemu_sem_init(&mis->listen_thread_sem, 0); + qemu_thread_create(&mis->listen_thread, "postcopy/listen", + postcopy_ram_listen_thread, mis->from_src_file, + QEMU_THREAD_JOINABLE); + qemu_sem_wait(&mis->listen_thread_sem); + qemu_sem_destroy(&mis->listen_thread_sem); + + return 0; +} + +/* After all discards we can start running and asking for pages */ +static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) +{ + PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING); + Error *local_err = NULL; + + trace_loadvm_postcopy_handle_run(); + if (ps != POSTCOPY_INCOMING_LISTENING) { + error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps); + return -1; + } + + /* TODO we should move all of this lot into postcopy_ram.c or a shared code + * in migration.c + */ + cpu_synchronize_all_post_init(); + + qemu_announce_self(); + + /* Make sure all file formats flush their mutable metadata */ + bdrv_invalidate_cache_all(&local_err); + if (local_err) { + error_report_err(local_err); + return -1; + } + + trace_loadvm_postcopy_handle_run_cpu_sync(); + cpu_synchronize_all_post_init(); + + trace_loadvm_postcopy_handle_run_vmstart(); + + if (autostart) { + /* Hold onto your hats, starting the CPU */ + vm_start(); + } else { + /* leave it paused and let management decide when to start the CPU */ + runstate_set(RUN_STATE_PAUSED); + } + + /* We need to finish reading the stream from the package + * and also stop reading anything more from the stream that loaded the + * package (since it's now being read by the listener thread). + * LOADVM_QUIT will quit all the layers of nested loadvm loops. + */ + return LOADVM_QUIT; +} + +/** + * Immediately following this command is a blob of data containing an embedded + * chunk of migration stream; read it and load it. 
+ * + * @mis: Incoming state + * @length: Length of packaged data to read + * + * Returns: Negative values on error + * + */ +static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) +{ + int ret; + uint8_t *buffer; + uint32_t length; + QEMUSizedBuffer *qsb; + + length = qemu_get_be32(mis->from_src_file); + trace_loadvm_handle_cmd_packaged(length); + + if (length > MAX_VM_CMD_PACKAGED_SIZE) { + error_report("Unreasonably large packaged state: %u", length); + return -1; + } + buffer = g_malloc0(length); + ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length); + if (ret != length) { + g_free(buffer); + error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d\n", + ret, length); + return (ret < 0) ? ret : -EAGAIN; + } + trace_loadvm_handle_cmd_packaged_received(ret); + + /* Setup a dummy QEMUFile that actually reads from the buffer */ + qsb = qsb_create(buffer, length); + g_free(buffer); /* Because qsb_create copies */ + if (!qsb) { + error_report("Unable to create qsb"); + } + QEMUFile *packf = qemu_bufopen("r", qsb); + + ret = qemu_loadvm_state_main(packf, mis); + trace_loadvm_handle_cmd_packaged_main(ret); + qemu_fclose(packf); + qsb_free(qsb); + + return ret; +} + +/* + * Process an incoming 'QEMU_VM_COMMAND' + * 0 just a normal return + * LOADVM_QUIT All good, but exit the loop + * <0 Error + */ +static int loadvm_process_command(QEMUFile *f) +{ + MigrationIncomingState *mis = migration_incoming_get_current(); + uint16_t cmd; + uint16_t len; + uint32_t tmp32; + + cmd = qemu_get_be16(f); + len = qemu_get_be16(f); + + trace_loadvm_process_command(cmd, len); + if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) { + error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len); + return -EINVAL; + } + + if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) { + error_report("%s received with bad length - expecting %zu, got %d", + mig_cmd_args[cmd].name, + (size_t)mig_cmd_args[cmd].len, len); + return -ERANGE; + } + + switch (cmd) { + case MIG_CMD_OPEN_RETURN_PATH: + if (mis->to_src_file) { + error_report("CMD_OPEN_RETURN_PATH called when RP already open"); + /* Not really a problem, so don't give up */ + return 0; + } + mis->to_src_file = qemu_file_get_return_path(f); + if (!mis->to_src_file) { + error_report("CMD_OPEN_RETURN_PATH failed"); + return -1; + } + break; + + case MIG_CMD_PING: + tmp32 = qemu_get_be32(f); + trace_loadvm_process_command_ping(tmp32); + if (!mis->to_src_file) { + error_report("CMD_PING (0x%x) received with no return path", + tmp32); + return -1; + } + migrate_send_rp_pong(mis, tmp32); + break; + + case MIG_CMD_PACKAGED: + return loadvm_handle_cmd_packaged(mis); + + case MIG_CMD_POSTCOPY_ADVISE: + return loadvm_postcopy_handle_advise(mis); + + case MIG_CMD_POSTCOPY_LISTEN: + return loadvm_postcopy_handle_listen(mis); + + case MIG_CMD_POSTCOPY_RUN: + return loadvm_postcopy_handle_run(mis); + + case MIG_CMD_POSTCOPY_RAM_DISCARD: + return loadvm_postcopy_ram_handle_discard(mis, len); + } + + return 0; +} + struct LoadStateEntry { QLIST_ENTRY(LoadStateEntry) entry; SaveStateEntry *se; @@ -1053,14 +1704,113 @@ void loadvm_free_handlers(MigrationIncomingState *mis) } } +static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) +{ + uint8_t section_type; + int ret; + + while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { + uint32_t instance_id, version_id, section_id; + SaveStateEntry *se; + LoadStateEntry *le; + char idstr[256]; + + trace_qemu_loadvm_state_section(section_type); + switch (section_type) { + case 
QEMU_VM_SECTION_START: + case QEMU_VM_SECTION_FULL: + /* Read section start */ + section_id = qemu_get_be32(f); + if (!qemu_get_counted_string(f, idstr)) { + error_report("Unable to read ID string for section %u", + section_id); + return -EINVAL; + } + instance_id = qemu_get_be32(f); + version_id = qemu_get_be32(f); + + trace_qemu_loadvm_state_section_startfull(section_id, idstr, + instance_id, version_id); + /* Find savevm section */ + se = find_se(idstr, instance_id); + if (se == NULL) { + error_report("Unknown savevm section or instance '%s' %d", + idstr, instance_id); + return -EINVAL; + } + + /* Validate version */ + if (version_id > se->version_id) { + error_report("savevm: unsupported version %d for '%s' v%d", + version_id, idstr, se->version_id); + return -EINVAL; + } + + /* Add entry */ + le = g_malloc0(sizeof(*le)); + + le->se = se; + le->section_id = section_id; + le->version_id = version_id; + QLIST_INSERT_HEAD(&mis->loadvm_handlers, le, entry); + + ret = vmstate_load(f, le->se, le->version_id); + if (ret < 0) { + error_report("error while loading state for instance 0x%x of" + " device '%s'", instance_id, idstr); + return ret; + } + if (!check_section_footer(f, le)) { + return -EINVAL; + } + break; + case QEMU_VM_SECTION_PART: + case QEMU_VM_SECTION_END: + section_id = qemu_get_be32(f); + + trace_qemu_loadvm_state_section_partend(section_id); + QLIST_FOREACH(le, &mis->loadvm_handlers, entry) { + if (le->section_id == section_id) { + break; + } + } + if (le == NULL) { + error_report("Unknown savevm section %d", section_id); + return -EINVAL; + } + + ret = vmstate_load(f, le->se, le->version_id); + if (ret < 0) { + error_report("error while loading state section id %d(%s)", + section_id, le->se->idstr); + return ret; + } + if (!check_section_footer(f, le)) { + return -EINVAL; + } + break; + case QEMU_VM_COMMAND: + ret = loadvm_process_command(f); + trace_qemu_loadvm_state_section_command(ret); + if ((ret < 0) || (ret & LOADVM_QUIT)) { + return ret; + } + break; + default: + error_report("Unknown savevm section type %d", section_type); + return -EINVAL; + } + } + + return 0; +} + int qemu_loadvm_state(QEMUFile *f) { MigrationIncomingState *mis = migration_incoming_get_current(); Error *local_err = NULL; - uint8_t section_type; unsigned int v; int ret; - int file_error_after_eof = -1; if (qemu_savevm_state_blocked(&local_err)) { error_report_err(local_err); @@ -1095,99 +1845,19 @@ int qemu_loadvm_state(QEMUFile *f) } } - while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { - uint32_t instance_id, version_id, section_id; - SaveStateEntry *se; - LoadStateEntry *le; - char idstr[256]; + ret = qemu_loadvm_state_main(f, mis); + qemu_event_set(&mis->main_thread_load_event); - trace_qemu_loadvm_state_section(section_type); - switch (section_type) { - case QEMU_VM_SECTION_START: - case QEMU_VM_SECTION_FULL: - /* Read section start */ - section_id = qemu_get_be32(f); - if (!qemu_get_counted_string(f, idstr)) { - error_report("Unable to read ID string for section %u", - section_id); - return -EINVAL; - } - instance_id = qemu_get_be32(f); - version_id = qemu_get_be32(f); + trace_qemu_loadvm_state_post_main(ret); - trace_qemu_loadvm_state_section_startfull(section_id, idstr, - instance_id, version_id); - /* Find savevm section */ - se = find_se(idstr, instance_id); - if (se == NULL) { - error_report("Unknown savevm section or instance '%s' %d", - idstr, instance_id); - ret = -EINVAL; - goto out; - } - - /* Validate version */ - if (version_id > se->version_id) { - error_report("savevm: 
unsupported version %d for '%s' v%d", - version_id, idstr, se->version_id); - ret = -EINVAL; - goto out; - } - - /* Add entry */ - le = g_malloc0(sizeof(*le)); - - le->se = se; - le->section_id = section_id; - le->version_id = version_id; - QLIST_INSERT_HEAD(&mis->loadvm_handlers, le, entry); - - ret = vmstate_load(f, le->se, le->version_id); - if (ret < 0) { - error_report("error while loading state for instance 0x%x of" - " device '%s'", instance_id, idstr); - goto out; - } - if (!check_section_footer(f, le)) { - ret = -EINVAL; - goto out; - } - break; - case QEMU_VM_SECTION_PART: - case QEMU_VM_SECTION_END: - section_id = qemu_get_be32(f); - - trace_qemu_loadvm_state_section_partend(section_id); - QLIST_FOREACH(le, &mis->loadvm_handlers, entry) { - if (le->section_id == section_id) { - break; - } - } - if (le == NULL) { - error_report("Unknown savevm section %d", section_id); - ret = -EINVAL; - goto out; - } - - ret = vmstate_load(f, le->se, le->version_id); - if (ret < 0) { - error_report("error while loading state section id %d(%s)", - section_id, le->se->idstr); - goto out; - } - if (!check_section_footer(f, le)) { - ret = -EINVAL; - goto out; - } - break; - default: - error_report("Unknown savevm section type %d", section_type); - ret = -EINVAL; - goto out; - } + if (mis->have_listen_thread) { + /* Listen thread still going, can't clean up yet */ + return ret; } - file_error_after_eof = qemu_file_get_error(f); + if (ret == 0) { + ret = qemu_file_get_error(f); + } /* * Try to read in the VMDESC section as well, so that dumping tools that @@ -1199,10 +1869,10 @@ int qemu_loadvm_state(QEMUFile *f) * We also mustn't read data that isn't there; some transports (RDMA) * will stall waiting for that data when the source has already closed. */ - if (should_send_vmdesc()) { + if (ret == 0 && should_send_vmdesc()) { uint8_t *buf; uint32_t size; - section_type = qemu_get_byte(f); + uint8_t section_type = qemu_get_byte(f); if (section_type != QEMU_VM_VMDESCRIPTION) { error_report("Expected vmdescription section, but got %d", @@ -1226,14 +1896,6 @@ int qemu_loadvm_state(QEMUFile *f) cpu_synchronize_all_post_init(); - ret = 0; - -out: - if (ret == 0) { - /* We may not have a VMDESC section, so ignore relative errors */ - ret = file_error_after_eof; - } - return ret; } diff --git a/qapi-schema.json b/qapi-schema.json index e18f14c88e..8c3a42a1ac 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -430,6 +430,8 @@ # # @active: in the process of doing migration. # +# @postcopy-active: like active, but now in postcopy mode. (since 2.5) +# # @completed: migration is finished. # # @failed: some error occurred during migration process. @@ -439,7 +441,7 @@ ## { 'enum': 'MigrationStatus', 'data': [ 'none', 'setup', 'cancelling', 'cancelled', - 'active', 'completed', 'failed' ] } + 'active', 'postcopy-active', 'completed', 'failed' ] } ## # @MigrationInfo @@ -540,11 +542,15 @@ # @auto-converge: If enabled, QEMU will automatically throttle down the guest # to speed up convergence of RAM migration. (since 1.6) # +# @x-postcopy-ram: Start executing on the migration target before all of RAM has +# been migrated, pulling the remaining pages along as needed. NOTE: If +# the migration fails during postcopy the VM will fail. 
(since 2.5) +# # Since: 1.2 ## { 'enum': 'MigrationCapability', 'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks', - 'compress', 'events'] } + 'compress', 'events', 'x-postcopy-ram'] } ## # @MigrationCapabilityStatus @@ -697,6 +703,14 @@ 'data': { 'protocol': 'str', 'hostname': 'str', '*port': 'int', '*tls-port': 'int', '*cert-subject': 'str' } } +## +# @migrate-start-postcopy +# +# Switch migration to postcopy mode +# +# Since: 2.5 +{ 'command': 'migrate-start-postcopy' } + ## # @MouseInfo: # diff --git a/qmp-commands.hx b/qmp-commands.hx index d7cf0ff264..7f85d4046c 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -717,6 +717,25 @@ Example: <- { "return": {} } EQMP + { + .name = "migrate-start-postcopy", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_migrate_start_postcopy, + }, + +SQMP +migrate-start-postcopy +---------------------- + +Switch an in-progress migration to postcopy mode. Ignored after the end of +migration (or once already in postcopy). + +Example: +-> { "execute": "migrate-start-postcopy" } +<- { "return": {} } + +EQMP + { .name = "query-migrate-cache-size", .args_type = "", diff --git a/qtest.c b/qtest.c index 8e10340c7e..05cefd2800 100644 --- a/qtest.c +++ b/qtest.c @@ -657,7 +657,6 @@ void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error **errp) inbuf = g_string_new(""); qtest_chr = chr; - page_size_init(); } bool qtest_driver(void) diff --git a/trace-events b/trace-events index ea2d32e362..ef6bc41a56 100644 --- a/trace-events +++ b/trace-events @@ -1202,16 +1202,43 @@ virtio_gpu_fence_resp(uint64_t fence) "fence 0x%" PRIx64 # migration/savevm.c qemu_loadvm_state_section(unsigned int section_type) "%d" +qemu_loadvm_state_section_command(int ret) "%d" qemu_loadvm_state_section_partend(uint32_t section_id) "%u" +qemu_loadvm_state_main(void) "" +qemu_loadvm_state_main_quit_parent(void) "" +qemu_loadvm_state_post_main(int ret) "%d" qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u" +qemu_savevm_send_packaged(void) "" +loadvm_handle_cmd_packaged(unsigned int length) "%u" +loadvm_handle_cmd_packaged_main(int ret) "%d" +loadvm_handle_cmd_packaged_received(int ret) "%d" +loadvm_postcopy_handle_advise(void) "" +loadvm_postcopy_handle_listen(void) "" +loadvm_postcopy_handle_run(void) "" +loadvm_postcopy_handle_run_cpu_sync(void) "" +loadvm_postcopy_handle_run_vmstart(void) "" +loadvm_postcopy_ram_handle_discard(void) "" +loadvm_postcopy_ram_handle_discard_end(void) "" +loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud" +loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d" +loadvm_process_command_ping(uint32_t val) "%x" +postcopy_ram_listen_thread_exit(void) "" +postcopy_ram_listen_thread_start(void) "" +qemu_savevm_send_postcopy_advise(void) "" +qemu_savevm_send_postcopy_ram_discard(const char *id, uint16_t len) "%s: %ud" +savevm_command_send(uint16_t command, uint16_t len) "com=0x%x len=%d" savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u" savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d" savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u" +savevm_send_open_return_path(void) "" +savevm_send_ping(uint32_t val) "%x" +savevm_send_postcopy_listen(void) "" +savevm_send_postcopy_run(void) "" savevm_state_begin(void) "" savevm_state_header(void) "" savevm_state_iterate(void) "" -savevm_state_complete(void) 
"" savevm_state_cleanup(void) "" +savevm_state_complete_precopy(void) "" vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s" vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s" qemu_announce_self_iter(const char *mac) "%s" @@ -1229,9 +1256,14 @@ vmstate_subsection_load_good(const char *parent) "%s" qemu_file_fclose(void) "" # migration/ram.c +get_queued_page(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr) "%s/%" PRIx64 " ram_addr=%" PRIx64 +get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr, int sent) "%s/%" PRIx64 " ram_addr=%" PRIx64 " (sent=%d)" migration_bitmap_sync_start(void) "" migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64"" migration_throttle(void) "" +ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x" +ram_postcopy_send_discard_bitmap(void) "" +ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx" # hw/display/qxl.c disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d" @@ -1421,17 +1453,40 @@ flic_no_device_api(int err) "flic: no Device Contral API support %d" flic_reset_failed(int err) "flic: reset failed %d" # migration.c +await_return_path_close_on_source_close(void) "" +await_return_path_close_on_source_joining(void) "" migrate_set_state(int new_state) "new state %d" migrate_fd_cleanup(void) "" migrate_fd_error(void) "" migrate_fd_cancel(void) "" -migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64 -migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 -migrate_state_too_big(void) "" +migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at %zx len %zx" +migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")" +migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d" +migration_completion_file_err(void) "" +migration_completion_postcopy_end(void) "" +migration_completion_postcopy_end_after_complete(void) "" +migration_completion_postcopy_end_before_rp(void) "" +migration_completion_postcopy_end_after_rp(int rp_error) "%d" +migration_thread_after_loop(void) "" +migration_thread_file_err(void) "" +migration_thread_setup_complete(void) "" +open_return_path_on_source(void) "" +open_return_path_on_source_continue(void) "" +postcopy_start(void) "" +postcopy_start_set_run(void) "" +source_return_path_thread_bad_end(void) "" +source_return_path_thread_end(void) "" +source_return_path_thread_entry(void) "" +source_return_path_thread_loop_top(void) "" +source_return_path_thread_pong(uint32_t val) "%x" +source_return_path_thread_shut(uint32_t val) "%x" migrate_global_state_post_load(const char *state) "loaded state: %s" migrate_global_state_pre_save(const char *state) "saved state: %s" -migration_completion_file_err(void) "" migration_thread_low_pending(uint64_t pending) "%" PRIu64 +migrate_state_too_big(void) "" +migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64 +process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d" +process_incoming_migration_co_postcopy_end_main(void) "" # migration/rdma.c qemu_rdma_accept_incoming_migration(void) "" @@ -1497,6 +1552,25 @@ 
 rdma_start_incoming_migration_after_rdma_listen(void) ""
 rdma_start_outgoing_migration_after_rdma_connect(void) ""
 rdma_start_outgoing_migration_after_rdma_source_init(void) ""
+# migration/postcopy-ram.c
+postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
+postcopy_discard_send_range(const char *ramblock, unsigned long start, unsigned long length) "%s:%lx/%lx"
+postcopy_ram_discard_range(void *start, size_t length) "%p,+%zx"
+postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_place_page(void *host_addr) "host=%p"
+postcopy_place_page_zero(void *host_addr) "host=%p"
+postcopy_ram_enable_notify(void) ""
+postcopy_ram_fault_thread_entry(void) ""
+postcopy_ram_fault_thread_exit(void) ""
+postcopy_ram_fault_thread_quit(void) ""
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
+postcopy_ram_incoming_cleanup_closeuf(void) ""
+postcopy_ram_incoming_cleanup_entry(void) ""
+postcopy_ram_incoming_cleanup_exit(void) ""
+postcopy_ram_incoming_cleanup_join(void) ""
+
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
 kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
diff --git a/vl.c b/vl.c
index 21e8876a57..7d993a5243 100644
--- a/vl.c
+++ b/vl.c
@@ -4285,6 +4285,7 @@ int main(int argc, char **argv, char **envp)
         exit(1);
     }
 
+    page_size_init();
     socket_init();
 
     if (qemu_opts_foreach(qemu_find_opts("object"),