Merge remote-tracking branch 'quintela/migration.next' into staging

# By Michael R. Hines (8) and others
# Via Juan Quintela
* quintela/migration.next:
  migration: add autoconvergence documentation
  Fix real mode guest segments dpl value in savevm
  Fix real mode guest migration
  rdma: account for the time spent in MIG_STATE_SETUP through QMP
  rdma: introduce MIG_STATE_NONE and change MIG_STATE_SETUP state transition
  rdma: allow state transitions between other states besides ACTIVE
  rdma: send pc.ram
  rdma: core logic
  rdma: introduce ram_handle_compressed()
  rdma: bugfix: ram_control_save_page()
  rdma: update documentation to reflect new unpin support

Message-id: 1374590725-14144-1-git-send-email-quintela@redhat.com
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
Anthony Liguori 2013-07-23 10:57:23 -05:00
commit f03d07d468
11 changed files with 3467 additions and 47 deletions

Makefile.objs

@ -51,6 +51,7 @@ common-obj-$(CONFIG_POSIX) += os-posix.o
common-obj-$(CONFIG_LINUX) += fsdev/
common-obj-y += migration.o migration-tcp.o
common-obj-$(CONFIG_RDMA) += migration-rdma.o
common-obj-y += qemu-char.o #aio.o
common-obj-y += block-migration.o
common-obj-y += page_cache.o xbzrle.o

arch_init.c

@ -118,6 +118,7 @@ static void check_guest_throttling(void);
#define RAM_SAVE_FLAG_EOS 0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE 0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
static struct defconfig_file {
@ -475,6 +476,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
ram_bulk_stage = false;
}
} else {
int ret;
uint8_t *p;
int cont = (block == last_sent_block) ?
RAM_SAVE_FLAG_CONTINUE : 0;
@ -483,7 +485,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
/* If in doubt, send the page as normal */
bytes_sent = -1;
if (is_zero_page(p)) {
ret = ram_control_save_page(f, block->offset,
offset, TARGET_PAGE_SIZE, &bytes_sent);
if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
if (ret != RAM_SAVE_CONTROL_DELAYED) {
if (bytes_sent > 0) {
acct_info.norm_pages++;
} else if (bytes_sent == 0) {
acct_info.dup_pages++;
}
}
} else if (is_zero_page(p)) {
acct_info.dup_pages++;
bytes_sent = save_block_hdr(f, block, offset, cont,
RAM_SAVE_FLAG_COMPRESS);
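
The caller above relies on a three-way return contract from ram_control_save_page(). A minimal sketch of a transport hook honoring that contract, assuming the constants used in this series (the body is illustrative, not the RDMA implementation):

    /* Illustrative stub only. ram_save_block() above expects:
     *  - RAM_SAVE_CONTROL_NOT_SUPP: no transport hook installed,
     *    fall through to the normal zero-page/compress/send path;
     *  - RAM_SAVE_CONTROL_DELAYED: sent asynchronously, do not touch
     *    the dup/norm counters yet;
     *  - anything else: *bytes_sent reports what was transmitted
     *    (> 0 counts as a normal page, == 0 as a duplicate page). */
    static size_t example_save_page(QEMUFile *f, ram_addr_t block_offset,
                                    ram_addr_t offset, size_t size,
                                    int *bytes_sent)
    {
        if (!transport_hook_installed) {   /* hypothetical condition */
            return RAM_SAVE_CONTROL_NOT_SUPP;
        }
        *bytes_sent = size;                /* pretend the page went out */
        return 0;                          /* synchronous completion */
    }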
@ -635,6 +648,10 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
}
qemu_mutex_unlock_ramlist();
ram_control_before_iterate(f, RAM_CONTROL_SETUP);
ram_control_after_iterate(f, RAM_CONTROL_SETUP);
qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
return 0;
@ -653,6 +670,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
reset_ram_globals();
}
ram_control_before_iterate(f, RAM_CONTROL_ROUND);
t0 = qemu_get_clock_ns(rt_clock);
i = 0;
while ((ret = qemu_file_rate_limit(f)) == 0) {
@ -684,6 +703,12 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
qemu_mutex_unlock_ramlist();
/*
* Must occur before EOS (or any QEMUFile operation)
* because of RDMA protocol.
*/
ram_control_after_iterate(f, RAM_CONTROL_ROUND);
if (ret < 0) {
bytes_transferred += total_sent;
return ret;
@ -701,6 +726,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
qemu_mutex_lock_ramlist();
migration_bitmap_sync();
ram_control_before_iterate(f, RAM_CONTROL_FINISH);
/* try transferring iterative blocks of memory */
/* flush all remaining blocks regardless of rate limiting */
@ -714,6 +741,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
}
bytes_transferred += bytes_sent;
}
ram_control_after_iterate(f, RAM_CONTROL_FINISH);
migration_end();
qemu_mutex_unlock_ramlist();
@ -808,6 +837,24 @@ static inline void *host_from_stream_offset(QEMUFile *f,
return NULL;
}
/*
* If a page (or a whole RDMA chunk) has been
* determined to be zero, then zap it.
*/
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
if (ch != 0 || !is_zero_page(host)) {
memset(host, ch, size);
#ifndef _WIN32
if (ch == 0 &&
(!kvm_enabled() || kvm_has_sync_mmu()) &&
getpagesize() <= TARGET_PAGE_SIZE) {
qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
}
#endif
}
}
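
The helper above factors the zero-page zap out of ram_load() (rewired below) so the RDMA receiver can reuse it for whole chunks; a hedged usage sketch, where chunk_host and chunk_len stand in for a hypothetical RDMA chunk flagged as all-zero by the source:

    /* Illustrative: zap an entire chunk the source reported as zero. */
    ram_handle_compressed(chunk_host, 0, chunk_len);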
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
ram_addr_t addr;
@ -879,16 +926,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
}
ch = qemu_get_byte(f);
if (ch != 0 || !is_zero_page(host)) {
memset(host, ch, TARGET_PAGE_SIZE);
#ifndef _WIN32
if (ch == 0 &&
(!kvm_enabled() || kvm_has_sync_mmu()) &&
getpagesize() <= TARGET_PAGE_SIZE) {
qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED);
}
#endif
}
ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
} else if (flags & RAM_SAVE_FLAG_PAGE) {
void *host;
@ -908,6 +946,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
ret = -EINVAL;
goto done;
}
} else if (flags & RAM_SAVE_FLAG_HOOK) {
ram_control_load_hook(f, flags);
}
error = qemu_file_get_error(f);
if (error) {

configure

@ -180,6 +180,7 @@ xfs=""
vhost_net="no"
vhost_scsi="no"
kvm="no"
rdma=""
gprof="no"
debug_tcg="no"
debug="no"
@ -937,6 +938,10 @@ for opt do
;;
--enable-gtk) gtk="yes"
;;
--enable-rdma) rdma="yes"
;;
--disable-rdma) rdma="no"
;;
--with-gtkabi=*) gtkabi="$optarg"
;;
--enable-tpm) tpm="yes"
@ -1095,6 +1100,8 @@ echo " --enable-bluez enable bluez stack connectivity"
echo " --disable-slirp disable SLIRP userspace network connectivity"
echo " --disable-kvm disable KVM acceleration support"
echo " --enable-kvm enable KVM acceleration support"
echo " --disable-rdma disable RDMA-based migration support"
echo " --enable-rdma enable RDMA-based migration support"
echo " --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)"
echo " --disable-nptl disable usermode NPTL support"
echo " --enable-nptl enable usermode NPTL support"
@ -1801,6 +1808,30 @@ EOF
libs_softmmu="$sdl_libs $libs_softmmu"
fi
##########################################
# RDMA needs OpenFabrics libraries
if test "$rdma" != "no" ; then
cat > $TMPC <<EOF
#include <rdma/rdma_cma.h>
int main(void) { return 0; }
EOF
rdma_libs="-lrdmacm -libverbs"
if compile_prog "" "$rdma_libs" ; then
rdma="yes"
libs_softmmu="$libs_softmmu $rdma_libs"
else
if test "$rdma" = "yes" ; then
error_exit \
" OpenFabrics librdmacm/libibverbs not present." \
" Your options:" \
" (1) Fast: Install infiniband packages from your distro." \
" (2) Cleanest: Install libraries from www.openfabrics.org" \
" (3) Also: Install softiwarp if you don't have RDMA hardware"
fi
rdma="no"
fi
fi
##########################################
# VNC TLS/WS detection
if test "$vnc" = "yes" -a \( "$vnc_tls" != "no" -o "$vnc_ws" != "no" \) ; then
@ -3558,6 +3589,7 @@ echo "Linux AIO support $linux_aio"
echo "ATTR/XATTR support $attr"
echo "Install blobs $blobs"
echo "KVM support $kvm"
echo "RDMA support $rdma"
echo "TCG interpreter $tcg_interpreter"
echo "fdt support $fdt"
echo "preadv support $preadv"
@ -4046,6 +4078,10 @@ if test "$trace_default" = "yes"; then
echo "CONFIG_TRACE_DEFAULT=y" >> $config_host_mak
fi
if test "$rdma" = "yes" ; then
echo "CONFIG_RDMA=y" >> $config_host_mak
fi
if test "$tcg_interpreter" = "yes"; then
QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
elif test "$ARCH" = "sparc64" ; then
@ -4485,6 +4521,10 @@ if [ "$pixman" = "internal" ]; then
echo "config-host.h: subdir-pixman" >> $config_host_mak
fi
if test "$rdma" = "yes" ; then
echo "CONFIG_RDMA=y" >> $config_host_mak
fi
if [ "$dtc_internal" = "yes" ]; then
echo "config-host.h: subdir-dtc" >> $config_host_mak
fi

docs/rdma.txt

@ -35,7 +35,7 @@ memory tracked during each live migration iteration round cannot keep pace
with the rate of dirty memory produced by the workload.
RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA
over Convered Ethernet) as well as Infiniband-based. This implementation of
over Converged Ethernet) as well as Infiniband-based. This implementation of
migration using RDMA is capable of using both technologies because of
the use of the OpenFabrics OFED software stack that abstracts out the
programming model irrespective of the underlying hardware.
@ -188,9 +188,9 @@ header portion and a data portion (but together are transmitted
as a single SEND message).
Header:
* Length (of the data portion, uint32, network byte order)
* Type (what command to perform, uint32, network byte order)
* Repeat (Number of commands in data portion, same type only)
* Length (of the data portion, uint32, network byte order)
* Type (what command to perform, uint32, network byte order)
* Repeat (Number of commands in data portion, same type only)
The 'Repeat' field is here to support future multiple page registrations
in a single message without any need to change the protocol itself
@ -202,17 +202,19 @@ The maximum number of repeats is hard-coded to 4096. This is a conservative
limit based on the maximum size of a SEND message along with empirical
observations on the maximum future benefit of simultaneous page registrations.
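
A rough C rendering of that header (struct and field names are assumptions for illustration; the actual definitions live in migration-rdma.c):

    #include <stdint.h>

    /* Assumed layout of the control-channel header described above;
     * every field travels in network byte order. */
    typedef struct {
        uint32_t len;     /* length of the data portion */
        uint32_t type;    /* which command to perform */
        uint32_t repeat;  /* number of same-type commands in the data portion */
    } RDMAControlHeader;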
The 'type' field has 10 different command values:
1. Unused
2. Error (sent to the source during bad things)
3. Ready (control-channel is available)
4. QEMU File (for sending non-live device state)
5. RAM Blocks request (used right after connection setup)
6. RAM Blocks result (used right after connection setup)
7. Compress page (zap zero page and skip registration)
8. Register request (dynamic chunk registration)
9. Register result ('rkey' to be used by sender)
10. Register finished (registration for current iteration finished)
The 'type' field has 12 different command values:
1. Unused
2. Error (sent to the source during bad things)
3. Ready (control-channel is available)
4. QEMU File (for sending non-live device state)
5. RAM Blocks request (used right after connection setup)
6. RAM Blocks result (used right after connection setup)
7. Compress page (zap zero page and skip registration)
8. Register request (dynamic chunk registration)
9. Register result ('rkey' to be used by sender)
10. Register finished (registration for current iteration finished)
11. Unregister request (unpin previously registered memory)
12. Unregister finished (confirmation that unpin completed)
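
Rendered as a C enum (value names are illustrative assumptions; only the ordering and meanings come from the list above, assuming consecutive values from zero):

    enum {
        RDMA_CONTROL_NONE = 0,            /* 1: unused */
        RDMA_CONTROL_ERROR,               /* 2: sent to the source during bad things */
        RDMA_CONTROL_READY,               /* 3: control-channel is available */
        RDMA_CONTROL_QEMU_FILE,           /* 4: non-live device state */
        RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* 5: right after connection setup */
        RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* 6: right after connection setup */
        RDMA_CONTROL_COMPRESS,            /* 7: zap zero page, skip registration */
        RDMA_CONTROL_REGISTER_REQUEST,    /* 8: dynamic chunk registration */
        RDMA_CONTROL_REGISTER_RESULT,     /* 9: 'rkey' to be used by sender */
        RDMA_CONTROL_REGISTER_FINISHED,   /* 10: registration finished for this round */
        RDMA_CONTROL_UNREGISTER_REQUEST,  /* 11: unpin previously registered memory */
        RDMA_CONTROL_UNREGISTER_FINISHED, /* 12: unpin completed */
    };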
A single control message, as hinted above, can contain within the data
portion an array of many commands of the same type. If there is more than
@ -243,7 +245,7 @@ qemu_rdma_exchange_send(header, data, optional response header & data):
from the receiver to tell us that the receiver
is *ready* for us to transmit some new bytes.
2. Optionally: if we are expecting a response from the command
(that we have no yet transmitted), let's post an RQ
(that we have not yet transmitted), let's post an RQ
work request to receive that data a few moments later.
3. When the READY arrives, librdmacm will
unblock us and we immediately post a RQ work request
@ -293,8 +295,10 @@ librdmacm provides the user with a 'private data' area to be exchanged
at connection-setup time before any infiniband traffic is generated.
Header:
* Version (protocol version validated before send/recv occurs), uint32, network byte order
* Flags (bitwise OR of each capability), uint32, network byte order
* Version (protocol version validated before send/recv occurs),
uint32, network byte order
* Flags (bitwise OR of each capability),
uint32, network byte order
There is no data portion of this header right now, so there is
no length field. The maximum size of the 'private data' section
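
As a sketch, the private-data header might be expressed in C as follows (struct and field names are illustrative assumptions):

    /* Assumed shape of the 'private data' capability negotiation header;
     * both fields travel in network byte order. */
    typedef struct {
        uint32_t version;  /* validated before any send/recv occurs */
        uint32_t flags;    /* bitwise OR of each capability */
    } RDMACapabilities;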
@ -313,7 +317,7 @@ If the version is invalid, we throw an error.
If the version is new, we only negotiate the capabilities that the
requested version is able to perform and ignore the rest.
Currently there is only *one* capability in Version #1: dynamic page registration
Currently there is only one capability in Version #1: dynamic page registration
Finally: Negotiation happens with the Flags field: If the primary-VM
sets a flag, but the destination does not support this capability, it
@ -326,8 +330,8 @@ QEMUFileRDMA Interface:
QEMUFileRDMA introduces a couple of new functions:
1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops)
2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops)
These two functions are very short and simply use the protocol
described above to deliver bytes without changing the upper-level
@ -413,3 +417,8 @@ TODO:
the use of KSM and ballooning while using RDMA.
4. Also, some form of balloon-device usage tracking would also
help alleviate some issues.
5. Move UNREGISTER requests to a separate thread.
6. Use LRU to provide more fine-grained direction of UNREGISTER
requests for unpinning memory in an overcommitted environment.
7. Expose UNREGISTER support to the user by way of workload-specific
hints about application behavior.

hmp.c

@ -164,6 +164,10 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
monitor_printf(mon, "downtime: %" PRIu64 " milliseconds\n",
info->downtime);
}
if (info->has_setup_time) {
monitor_printf(mon, "setup: %" PRIu64 " milliseconds\n",
info->setup_time);
}
}
if (info->has_ram) {

include/migration/migration.h

@ -49,6 +49,7 @@ struct MigrationState
int64_t dirty_bytes_rate;
bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
int64_t xbzrle_cache_size;
int64_t setup_time;
};
void process_incoming_migration(QEMUFile *f);
@ -77,6 +78,10 @@ void fd_start_incoming_migration(const char *path, Error **errp);
void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp);
void rdma_start_outgoing_migration(void *opaque, const char *host_port, Error **errp);
void rdma_start_incoming_migration(const char *host_port, Error **errp);
void migrate_fd_error(MigrationState *s);
void migrate_fd_connect(MigrationState *s);
@ -109,6 +114,8 @@ uint64_t xbzrle_mig_pages_transferred(void);
uint64_t xbzrle_mig_pages_overflow(void);
uint64_t xbzrle_mig_pages_cache_miss(void);
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
/**
* @migrate_add_blocker - prevent migration from proceeding
*

migration-rdma.c (new file; diff suppressed because it is too large)

migration.c

@ -36,7 +36,8 @@
#endif
enum {
MIG_STATE_ERROR,
MIG_STATE_ERROR = -1,
MIG_STATE_NONE,
MIG_STATE_SETUP,
MIG_STATE_CANCELLED,
MIG_STATE_ACTIVE,
@ -63,7 +64,7 @@ static NotifierList migration_state_notifiers =
MigrationState *migrate_get_current(void)
{
static MigrationState current_migration = {
.state = MIG_STATE_SETUP,
.state = MIG_STATE_NONE,
.bandwidth_limit = MAX_THROTTLE,
.xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
.mbps = -1,
@ -78,6 +79,10 @@ void qemu_start_incoming_migration(const char *uri, Error **errp)
if (strstart(uri, "tcp:", &p))
tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
else if (strstart(uri, "x-rdma:", &p))
rdma_start_incoming_migration(p, errp);
#endif
#if !defined(WIN32)
else if (strstart(uri, "exec:", &p))
exec_start_incoming_migration(p, errp);
@ -180,9 +185,14 @@ MigrationInfo *qmp_query_migrate(Error **errp)
MigrationState *s = migrate_get_current();
switch (s->state) {
case MIG_STATE_SETUP:
case MIG_STATE_NONE:
/* no migration has happened ever */
break;
case MIG_STATE_SETUP:
info->has_status = true;
info->status = g_strdup("setup");
info->has_total_time = false;
break;
case MIG_STATE_ACTIVE:
info->has_status = true;
info->status = g_strdup("active");
@ -191,6 +201,8 @@ MigrationInfo *qmp_query_migrate(Error **errp)
- s->total_time;
info->has_expected_downtime = true;
info->expected_downtime = s->expected_downtime;
info->has_setup_time = true;
info->setup_time = s->setup_time;
info->has_ram = true;
info->ram = g_malloc0(sizeof(*info->ram));
@ -222,6 +234,8 @@ MigrationInfo *qmp_query_migrate(Error **errp)
info->total_time = s->total_time;
info->has_downtime = true;
info->downtime = s->downtime;
info->has_setup_time = true;
info->setup_time = s->setup_time;
info->has_ram = true;
info->ram = g_malloc0(sizeof(*info->ram));
@ -253,7 +267,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
MigrationState *s = migrate_get_current();
MigrationCapabilityStatusList *cap;
if (s->state == MIG_STATE_ACTIVE) {
if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
error_set(errp, QERR_MIGRATION_ACTIVE);
return;
}
@ -291,9 +305,9 @@ static void migrate_fd_cleanup(void *opaque)
notifier_list_notify(&migration_state_notifiers, s);
}
static void migrate_finish_set_state(MigrationState *s, int new_state)
static void migrate_set_state(MigrationState *s, int old_state, int new_state)
{
if (atomic_cmpxchg(&s->state, MIG_STATE_ACTIVE, new_state) == new_state) {
if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) {
trace_migrate_set_state(new_state);
}
}
@ -311,7 +325,7 @@ static void migrate_fd_cancel(MigrationState *s)
{
DPRINTF("cancelling migration\n");
migrate_finish_set_state(s, MIG_STATE_CANCELLED);
migrate_set_state(s, s->state, MIG_STATE_CANCELLED);
}
void add_migration_state_change_notifier(Notifier *notify)
@ -388,7 +402,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
params.blk = blk;
params.shared = inc;
if (s->state == MIG_STATE_ACTIVE) {
if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
error_set(errp, QERR_MIGRATION_ACTIVE);
return;
}
@ -406,6 +420,10 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
if (strstart(uri, "tcp:", &p)) {
tcp_start_outgoing_migration(s, p, &local_err);
#ifdef CONFIG_RDMA
} else if (strstart(uri, "x-rdma:", &p)) {
rdma_start_outgoing_migration(s, p, &local_err);
#endif
#if !defined(WIN32)
} else if (strstart(uri, "exec:", &p)) {
exec_start_outgoing_migration(s, p, &local_err);
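
With the incoming and outgoing x-rdma hooks both wired up, an RDMA migration is driven like any other transport; a usage sketch (host address and port are illustrative):

    # destination, listening for the RDMA connection
    qemu-system-x86_64 ... -incoming x-rdma:192.168.0.2:4444

    # source monitor
    (qemu) migrate -d x-rdma:192.168.0.2:4444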
@ -526,6 +544,7 @@ static void *migration_thread(void *opaque)
{
MigrationState *s = opaque;
int64_t initial_time = qemu_get_clock_ms(rt_clock);
int64_t setup_start = qemu_get_clock_ms(host_clock);
int64_t initial_bytes = 0;
int64_t max_size = 0;
int64_t start_time = initial_time;
@ -534,6 +553,11 @@ static void *migration_thread(void *opaque)
DPRINTF("beginning savevm\n");
qemu_savevm_state_begin(s->file, &s->params);
s->setup_time = qemu_get_clock_ms(host_clock) - setup_start;
migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
DPRINTF("setup complete\n");
while (s->state == MIG_STATE_ACTIVE) {
int64_t current_time;
uint64_t pending_size;
@ -561,19 +585,19 @@ static void *migration_thread(void *opaque)
qemu_mutex_unlock_iothread();
if (ret < 0) {
migrate_finish_set_state(s, MIG_STATE_ERROR);
migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
break;
}
if (!qemu_file_get_error(s->file)) {
migrate_finish_set_state(s, MIG_STATE_COMPLETED);
migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
break;
}
}
}
if (qemu_file_get_error(s->file)) {
migrate_finish_set_state(s, MIG_STATE_ERROR);
migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
break;
}
current_time = qemu_get_clock_ms(rt_clock);
@ -624,8 +648,8 @@ static void *migration_thread(void *opaque)
void migrate_fd_connect(MigrationState *s)
{
s->state = MIG_STATE_ACTIVE;
trace_migrate_set_state(MIG_STATE_ACTIVE);
s->state = MIG_STATE_SETUP;
trace_migrate_set_state(MIG_STATE_SETUP);
/* This is a best 1st approximation. ns to ms */
s->expected_downtime = max_downtime/1000000;

qapi-schema.json

@ -578,6 +578,12 @@
# expected downtime in milliseconds for the guest in last walk
# of the dirty bitmap. (since 1.3)
#
# @setup-time: #optional amount of setup time in milliseconds _before_ the
# iterations begin but _after_ the QMP command is issued. This is designed
# to provide an accounting of any activities (such as RDMA pinning) which
# may be expensive, but do not actually occur during the iterative
# migration rounds themselves. (since 1.6)
#
# Since: 0.14.0
##
{ 'type': 'MigrationInfo',
@ -586,7 +592,8 @@
'*xbzrle-cache': 'XBZRLECacheStats',
'*total-time': 'int',
'*expected-downtime': 'int',
'*downtime': 'int'} }
'*downtime': 'int',
'*setup-time': 'int'} }
##
# @query-migrate
@ -619,6 +626,9 @@
# to enable the capability on the source VM. The feature is disabled by
# default. (since 1.6)
#
# @auto-converge: If enabled, QEMU will automatically throttle down the guest
# to speed up convergence of RAM migration. (since 1.6)
#
# Since: 1.2
##
{ 'enum': 'MigrationCapability',

savevm.c

@ -662,7 +662,7 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
offset, size, bytes_sent);
if (ret != RAM_SAVE_CONTROL_DELAYED) {
if (*bytes_sent > 0) {
if (bytes_sent && *bytes_sent > 0) {
qemu_update_position(f, *bytes_sent);
} else if (ret < 0) {
qemu_file_set_error(f, ret);

target-i386/machine.c

@ -252,6 +252,24 @@ static void cpu_pre_save(void *opaque)
}
env->fpregs_format_vmstate = 0;
/*
* Real mode guest segment registers' DPL should be zero.
* Older KVM versions were setting it wrongly.
* Fixing it allows live migration to hosts with unrestricted guest
* support (otherwise the migration would fail with an invalid guest
* state error).
*/
if (!(env->cr[0] & CR0_PE_MASK) &&
(env->segs[R_CS].flags >> DESC_DPL_SHIFT & 3) != 0) {
env->segs[R_CS].flags &= ~(env->segs[R_CS].flags & DESC_DPL_MASK);
env->segs[R_DS].flags &= ~(env->segs[R_DS].flags & DESC_DPL_MASK);
env->segs[R_ES].flags &= ~(env->segs[R_ES].flags & DESC_DPL_MASK);
env->segs[R_FS].flags &= ~(env->segs[R_FS].flags & DESC_DPL_MASK);
env->segs[R_GS].flags &= ~(env->segs[R_GS].flags & DESC_DPL_MASK);
env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
}
}
static int cpu_post_load(void *opaque, int version_id)
@ -260,6 +278,24 @@ static int cpu_post_load(void *opaque, int version_id)
CPUX86State *env = &cpu->env;
int i;
/*
* Real mode guest segment registers' DPL should be zero.
* Older KVM versions were setting it wrongly.
* Fixing it allows live migration from hosts that don't have
* unrestricted guest support to hosts that do
* (otherwise the migration would fail with an invalid guest
* state error).
*/
if (!(env->cr[0] & CR0_PE_MASK) &&
(env->segs[R_CS].flags >> DESC_DPL_SHIFT & 3) != 0) {
env->segs[R_CS].flags &= ~(env->segs[R_CS].flags & DESC_DPL_MASK);
env->segs[R_DS].flags &= ~(env->segs[R_DS].flags & DESC_DPL_MASK);
env->segs[R_ES].flags &= ~(env->segs[R_ES].flags & DESC_DPL_MASK);
env->segs[R_FS].flags &= ~(env->segs[R_FS].flags & DESC_DPL_MASK);
env->segs[R_GS].flags &= ~(env->segs[R_GS].flags & DESC_DPL_MASK);
env->segs[R_SS].flags &= ~(env->segs[R_SS].flags & DESC_DPL_MASK);
}
/* XXX: restore FPU round state */
env->fpstt = (env->fpus_vmstate >> 11) & 7;
env->fpus = env->fpus_vmstate & ~0x3800;