Update version for 2.11.2 release

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
usb/dev-mtp: Fix use of uninitialized values
2018-06-26 22:27:39 -05:00 · 2018-06-21 10:18:22 -05:00 · 2018-06-21 10:18:16 -05:00 · 2018-06-21 10:18:10 -05:00 · 2018-06-21 10:16:52 -05:00 · 2018-06-21 04:22:15 -04:00
200 changed files with 4207 additions and 1379 deletions
--- a/6
+++ b/6
@ -1680,6 +1680,12 @@ F: include/sysemu/replay.h
 F: docs/replay.txt
 F: stubs/replay.c

+IOVA Tree
+M: Peter Xu <peterx@redhat.com>
+S: Maintained
+F: include/qemu/iova-tree.h
+F: util/iova-tree.c
+
 Usermode Emulation
 ------------------
 Overall
--- a/2
+++ b/2
@ -1 +1 @@
-2.11.0
+2.11.2
--- a/block.c
+++ b/block.c
@ -1596,13 +1596,24 @@ static int bdrv_reopen_get_flags(BlockReopenQueue *q, BlockDriverState *bs)

 /* Returns whether the image file can be written to after the reopen queue @q
 * has been successfully applied, or right now if @q is NULL. */
-static bool bdrv_is_writable(BlockDriverState *bs, BlockReopenQueue *q)
+static bool bdrv_is_writable_after_reopen(BlockDriverState *bs,
+                                          BlockReopenQueue *q)
 {
    int flags = bdrv_reopen_get_flags(q, bs);

    return (flags & (BDRV_O_RDWR | BDRV_O_INACTIVE)) == BDRV_O_RDWR;
 }

+/*
+ * Return whether the BDS can be written to.  This is not necessarily
+ * the same as !bdrv_is_read_only(bs), as inactivated images may not
+ * be written to but do not count as read-only images.
+ */
+bool bdrv_is_writable(BlockDriverState *bs)
+{
+    return bdrv_is_writable_after_reopen(bs, NULL);
+}
+
 static void bdrv_child_perm(BlockDriverState *bs, BlockDriverState *child_bs,
                            BdrvChild *c, const BdrvChildRole *role,
                            BlockReopenQueue *reopen_queue,
@ -1640,7 +1651,7 @@ static int bdrv_check_perm(BlockDriverState *bs, BlockReopenQueue *q,

    /* Write permissions never work with read-only images */
    if ((cumulative_perms & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) &&
-        !bdrv_is_writable(bs, q))
+        !bdrv_is_writable_after_reopen(bs, q))
    {
        error_setg(errp, "Block node is read-only");
        return -EPERM;
@ -1930,7 +1941,7 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
                                  &perm, &shared);

        /* Format drivers may touch metadata even if the guest doesn't write */
-        if (bdrv_is_writable(bs, reopen_queue)) {
+        if (bdrv_is_writable_after_reopen(bs, reopen_queue)) {
            perm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
        }

@ -4593,10 +4604,11 @@ void bdrv_img_create(const char *filename, const char *fmt,
        back_flags = flags;
        back_flags &= ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING);

+        backing_options = qdict_new();
        if (backing_fmt) {
-            backing_options = qdict_new();
            qdict_put_str(backing_options, "driver", backing_fmt);
        }
+        qdict_put_bool(backing_options, BDRV_OPT_FORCE_SHARE, true);

        bs = bdrv_open(full_backing, NULL, backing_options, back_flags,
                       &local_err);
--- a/block/file-posix.c
+++ b/block/file-posix.c
@ -1694,6 +1694,7 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc,
    case PREALLOC_MODE_FULL:
    {
        int64_t num = 0, left = offset - current_length;
+        off_t seek_result;

        /*
         * Knowing the final size from the beginning could allow the file
@ -1708,8 +1709,8 @@ static int raw_regular_truncate(int fd, int64_t offset, PreallocMode prealloc,

        buf = g_malloc0(65536);

-        result = lseek(fd, current_length, SEEK_SET);
-        if (result < 0) {
+        seek_result = lseek(fd, current_length, SEEK_SET);
+        if (seek_result < 0) {
            result = -errno;
            error_setg_errno(errp, -result,
                             "Failed to seek to the old end of file");
--- a/block/gluster.c
+++ b/block/gluster.c
@ -164,7 +164,12 @@ static QemuOptsList runtime_unix_opts = {
        {
            .name = GLUSTER_OPT_SOCKET,
            .type = QEMU_OPT_STRING,
-            .help = "socket file path)",
+            .help = "socket file path (legacy)",
+        },
+        {
+            .name = GLUSTER_OPT_PATH,
+            .type = QEMU_OPT_STRING,
+            .help = "socket file path (QAPI)",
        },
        { /* end of list */ }
    },
@ -612,10 +617,18 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
                goto out;
            }

-            ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET);
+            ptr = qemu_opt_get(opts, GLUSTER_OPT_PATH);
+            if (!ptr) {
+                ptr = qemu_opt_get(opts, GLUSTER_OPT_SOCKET);
+            } else if (qemu_opt_get(opts, GLUSTER_OPT_SOCKET)) {
+                error_setg(&local_err,
+                           "Conflicting parameters 'path' and 'socket'");
+                error_append_hint(&local_err, GERR_INDEX_HINT, i);
+                goto out;
+            }
            if (!ptr) {
                error_setg(&local_err, QERR_MISSING_PARAMETER,
-                           GLUSTER_OPT_SOCKET);
+                           GLUSTER_OPT_PATH);
                error_append_hint(&local_err, GERR_INDEX_HINT, i);
                goto out;
            }
@ -680,7 +693,7 @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
                             "file.server.0.host=1.2.3.4,"
                             "file.server.0.port=24007,"
                             "file.server.1.transport=unix,"
-                             "file.server.1.socket=/var/run/glusterd.socket ..."
+                             "file.server.1.path=/var/run/glusterd.socket ..."
                             "\n");
            errno = -ret;
            return NULL;
--- a/block/io.c
+++ b/block/io.c
@ -175,8 +175,10 @@ static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
    bdrv_wakeup(bs);
 }

+/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 {
+    BdrvChild *child, *tmp;
    BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};

    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
@ -187,6 +189,10 @@ static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
    bdrv_coroutine_enter(bs, data.co);
    BDRV_POLL_WHILE(bs, !data.done);
+
+    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
+        bdrv_drain_invoke(child->bs, begin);
+    }
 }

 static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
@ -194,9 +200,6 @@ static bool bdrv_drain_recurse(BlockDriverState *bs, bool begin)
    BdrvChild *child, *tmp;
    bool waited;

-    /* Ensure any pending metadata writes are submitted to bs->file.  */
-    bdrv_drain_invoke(bs, begin);
-
    /* Wait for drained requests to finish */
    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);

@ -279,6 +282,7 @@ void bdrv_drained_begin(BlockDriverState *bs)
        bdrv_parent_drained_begin(bs);
    }

+    bdrv_drain_invoke(bs, true);
    bdrv_drain_recurse(bs, true);
 }

@ -294,6 +298,7 @@ void bdrv_drained_end(BlockDriverState *bs)
    }

    bdrv_parent_drained_end(bs);
+    bdrv_drain_invoke(bs, false);
    bdrv_drain_recurse(bs, false);
    aio_enable_external(bdrv_get_aio_context(bs));
 }
@ -350,6 +355,7 @@ void bdrv_drain_all_begin(void)
        aio_context_acquire(aio_context);
        bdrv_parent_drained_begin(bs);
        aio_disable_external(aio_context);
+        bdrv_drain_invoke(bs, true);
        aio_context_release(aio_context);

        if (!g_slist_find(aio_ctxs, aio_context)) {
@ -393,6 +399,7 @@ void bdrv_drain_all_end(void)
        aio_context_acquire(aio_context);
        aio_enable_external(aio_context);
        bdrv_parent_drained_end(bs);
+        bdrv_drain_invoke(bs, false);
        bdrv_drain_recurse(bs, false);
        aio_context_release(aio_context);
    }
--- a/block/iscsi.c
+++ b/block/iscsi.c
@ -2,7 +2,7 @@
 * QEMU Block driver for iSCSI images
 *
 * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com>
- * Copyright (c) 2012-2016 Peter Lieven <pl@kamp.de>
+ * Copyright (c) 2012-2017 Peter Lieven <pl@kamp.de>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@ -1128,6 +1128,9 @@ retry:
        goto retry;
    }

+    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
+                               bytes >> BDRV_SECTOR_BITS);
+
    if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
        /* the target might fail with a check condition if it
           is not happy with the alignment of the UNMAP request
@ -1140,9 +1143,6 @@ retry:
        goto out_unlock;
    }

-    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                               bytes >> BDRV_SECTOR_BITS);
-
 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
    return r;
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@ -846,9 +846,6 @@ int nbd_client_init(BlockDriverState *bs,
    if (client->info.flags & NBD_FLAG_SEND_WRITE_ZEROES) {
        bs->supported_zero_flags |= BDRV_REQ_MAY_UNMAP;
    }
-    if (client->info.min_block > bs->bl.request_alignment) {
-        bs->bl.request_alignment = client->info.min_block;
-    }

    qemu_co_mutex_init(&client->send_mutex);
    qemu_co_queue_init(&client->free_sema);
--- a/block/nbd.c
+++ b/block/nbd.c
@ -388,6 +388,7 @@ static QemuOptsList nbd_runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "ID of the TLS credentials to use",
        },
+        { /* end of list */ }
    },
 };

@ -473,8 +474,10 @@ static int nbd_co_flush(BlockDriverState *bs)
 static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
 {
    NBDClientSession *s = nbd_get_client_session(bs);
+    uint32_t min = s->info.min_block;
    uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);

+    bs->bl.request_alignment = min ? min : BDRV_SECTOR_SIZE;
    bs->bl.max_pdiscard = max;
    bs->bl.max_pwrite_zeroes = max;
    bs->bl.max_transfer = max;
--- a/block/qcow2.c
+++ b/block/qcow2.c
@ -4235,7 +4235,7 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
    char *message;
    va_list ap;

-    fatal = fatal && !bs->read_only;
+    fatal = fatal && bdrv_is_writable(bs);

    if (s->signaled_corruption &&
        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
--- a/block/raw-format.c
+++ b/block/raw-format.c
@ -167,16 +167,37 @@ static void raw_reopen_abort(BDRVReopenState *state)
    state->opaque = NULL;
 }

+/* Check and adjust the offset, against 'offset' and 'size' options. */
+static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset,
+                                    uint64_t bytes, bool is_write)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) {
+        /* There's not enough space for the write, or the read request is
+         * out-of-range. Don't read/write anything to prevent leaking out of
+         * the size specified in options. */
+        return is_write ? -ENOSPC : -EINVAL;;
+    }
+
+    if (*offset > INT64_MAX - s->offset) {
+        return -EINVAL;
+    }
+    *offset += s->offset;
+
+    return 0;
+}
+
 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
                                      uint64_t bytes, QEMUIOVector *qiov,
                                      int flags)
 {
-    BDRVRawState *s = bs->opaque;
+    int ret;

-    if (offset > UINT64_MAX - s->offset) {
-        return -EINVAL;
+    ret = raw_adjust_offset(bs, &offset, bytes, false);
+    if (ret) {
+        return ret;
    }
-    offset += s->offset;

    BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
@ -186,23 +207,11 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
                                       uint64_t bytes, QEMUIOVector *qiov,
                                       int flags)
 {
-    BDRVRawState *s = bs->opaque;
    void *buf = NULL;
    BlockDriver *drv;
    QEMUIOVector local_qiov;
    int ret;

-    if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
-        /* There's not enough space for the data. Don't write anything and just
-         * fail to prevent leaking out of the size specified in options. */
-        return -ENOSPC;
-    }
-
-    if (offset > UINT64_MAX - s->offset) {
-        ret = -EINVAL;
-        goto fail;
-    }
-
    if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
        /* Handling partial writes would be a pain - so we just
         * require that guests have 512-byte request alignment if
@ -237,7 +246,10 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
        qiov = &local_qiov;
    }

-    offset += s->offset;
+    ret = raw_adjust_offset(bs, &offset, bytes, true);
+    if (ret) {
+        goto fail;
+    }

    BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
    ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
@ -267,22 +279,24 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
                                             int64_t offset, int bytes,
                                             BdrvRequestFlags flags)
 {
-    BDRVRawState *s = bs->opaque;
-    if (offset > UINT64_MAX - s->offset) {
-        return -EINVAL;
+    int ret;
+
+    ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
+    if (ret) {
+        return ret;
    }
-    offset += s->offset;
    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 }

 static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
                                        int64_t offset, int bytes)
 {
-    BDRVRawState *s = bs->opaque;
-    if (offset > UINT64_MAX - s->offset) {
-        return -EINVAL;
+    int ret;
+
+    ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
+    if (ret) {
+        return ret;
    }
-    offset += s->offset;
    return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
 }

--- a/block/rbd.c
+++ b/block/rbd.c
@ -265,13 +265,14 @@ static int qemu_rbd_set_keypairs(rados_t cluster, const char *keypairs_json,
        key = qstring_get_str(name);

        ret = rados_conf_set(cluster, key, qstring_get_str(value));
-        QDECREF(name);
        QDECREF(value);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "invalid conf option %s", key);
+            QDECREF(name);
            ret = -EINVAL;
            break;
        }
+        QDECREF(name);
    }

    QDECREF(keypairs);
--- a/block/ssh.c
+++ b/block/ssh.c
@ -556,6 +556,7 @@ static QemuOptsList ssh_runtime_opts = {
            .type = QEMU_OPT_STRING,
            .help = "Defines how and what to check the host key against",
        },
+        { /* end of list */ }
    },
 };

--- a/block/throttle.c
+++ b/block/throttle.c
@ -35,9 +35,12 @@ static QemuOptsList throttle_opts = {
    },
 };

-static int throttle_configure_tgm(BlockDriverState *bs,
-                                  ThrottleGroupMember *tgm,
-                                  QDict *options, Error **errp)
+/*
+ * If this function succeeds then the throttle group name is stored in
+ * @group and must be freed by the caller.
+ * If there's an error then @group remains unmodified.
+ */
+static int throttle_parse_options(QDict *options, char **group, Error **errp)
 {
    int ret;
    const char *group_name;
@ -62,8 +65,7 @@ static int throttle_configure_tgm(BlockDriverState *bs,
        goto fin;
    }

-    /* Register membership to group with name group_name */
-    throttle_group_register_tgm(tgm, group_name, bdrv_get_aio_context(bs));
+    *group = g_strdup(group_name);
    ret = 0;
 fin:
    qemu_opts_del(opts);
@ -74,6 +76,8 @@ static int throttle_open(BlockDriverState *bs, QDict *options,
                         int flags, Error **errp)
 {
    ThrottleGroupMember *tgm = bs->opaque;
+    char *group;
+    int ret;

    bs->file = bdrv_open_child(NULL, options, "file", bs,
                               &child_file, false, errp);
@ -83,7 +87,14 @@ static int throttle_open(BlockDriverState *bs, QDict *options,
    bs->supported_write_flags = bs->file->bs->supported_write_flags;
    bs->supported_zero_flags = bs->file->bs->supported_zero_flags;

-    return throttle_configure_tgm(bs, tgm, options, errp);
+    ret = throttle_parse_options(options, &group, errp);
+    if (ret == 0) {
+        /* Register membership to group with name group_name */
+        throttle_group_register_tgm(tgm, group, bdrv_get_aio_context(bs));
+        g_free(group);
+    }
+
+    return ret;
 }

 static void throttle_close(BlockDriverState *bs)
@ -159,35 +170,36 @@ static void throttle_attach_aio_context(BlockDriverState *bs,
 static int throttle_reopen_prepare(BDRVReopenState *reopen_state,
                                   BlockReopenQueue *queue, Error **errp)
 {
-    ThrottleGroupMember *tgm;
+    int ret;
+    char *group = NULL;

    assert(reopen_state != NULL);
    assert(reopen_state->bs != NULL);

-    reopen_state->opaque = g_new0(ThrottleGroupMember, 1);
-    tgm = reopen_state->opaque;
-
-    return throttle_configure_tgm(reopen_state->bs, tgm, reopen_state->options,
-            errp);
+    ret = throttle_parse_options(reopen_state->options, &group, errp);
+    reopen_state->opaque = group;
+    return ret;
 }

 static void throttle_reopen_commit(BDRVReopenState *reopen_state)
 {
-    ThrottleGroupMember *old_tgm = reopen_state->bs->opaque;
-    ThrottleGroupMember *new_tgm = reopen_state->opaque;
+    BlockDriverState *bs = reopen_state->bs;
+    ThrottleGroupMember *tgm = bs->opaque;
+    char *group = reopen_state->opaque;

-    throttle_group_unregister_tgm(old_tgm);
-    g_free(old_tgm);
-    reopen_state->bs->opaque = new_tgm;
+    assert(group);
+
+    if (strcmp(group, throttle_group_get_name(tgm))) {
+        throttle_group_unregister_tgm(tgm);
+        throttle_group_register_tgm(tgm, group, bdrv_get_aio_context(bs));
+    }
+    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
 }

 static void throttle_reopen_abort(BDRVReopenState *reopen_state)
 {
-    ThrottleGroupMember *tgm = reopen_state->opaque;
-
-    throttle_group_unregister_tgm(tgm);
-    g_free(tgm);
+    g_free(reopen_state->opaque);
    reopen_state->opaque = NULL;
 }

--- a/5
+++ b/5
@ -930,6 +930,8 @@ for opt do
  ;;
  --firmwarepath=*) firmwarepath="$optarg"
  ;;
+  --host=*|--build=*|\
+  --disable-dependency-tracking|\
  --sbindir=*|--sharedstatedir=*|\
  --oldincludedir=*|--datarootdir=*|--infodir=*|--localedir=*|\
  --htmldir=*|--dvidir=*|--pdfdir=*|--psdir=*)
@ -2788,6 +2790,7 @@ if test "$sdl" != "no" ; then
 int main( void ) { return SDL_Init (SDL_INIT_VIDEO); }
 EOF
  sdl_cflags=$($sdlconfig --cflags 2>/dev/null)
+  sdl_cflags="$sdl_cflags -Wno-undef"  # workaround 2.0.8 bug
  if test "$static" = "yes" ; then
    if $pkg_config $sdlname --exists; then
      sdl_libs=$($pkg_config $sdlname --static --libs 2>/dev/null)
@ -3920,7 +3923,7 @@ fi
 # check if memfd is supported
 memfd=no
 cat > $TMPC << EOF
-#include <sys/memfd.h>
+#include <sys/mman.h>

 int main(void)
 {
--- a/cpus.c
+++ b/cpus.c
@ -843,11 +843,19 @@ void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
        return;
    }

-    if (!qemu_in_vcpu_thread() && first_cpu) {
+    if (qemu_in_vcpu_thread()) {
+        /* A CPU is currently running; kick it back out to the
+         * tcg_cpu_exec() loop so it will recalculate its
+         * icount deadline immediately.
+         */
+        qemu_cpu_kick(current_cpu);
+    } else if (first_cpu) {
        /* qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false.  This way,
         * handle_icount_deadline can run.
+         * If we have no CPUs at all for some reason, we don't
+         * need to do anything.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
--- a/device_tree.c
+++ b/device_tree.c
@ -29,7 +29,7 @@

 #include <libfdt.h>

-#define FDT_MAX_SIZE  0x10000
+#define FDT_MAX_SIZE  0x100000

 void *create_device_tree(int *sizep)
 {
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@ -426,10 +426,20 @@ Standard Cluster Descriptor:

 Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)):

-    Bit  0 -  x:    Host cluster offset. This is usually _not_ aligned to a
-                    cluster boundary!
+    Bit  0 - x-1:   Host cluster offset. This is usually _not_ aligned to a
+                    cluster or sector boundary!

-       x+1 - 61:    Compressed size of the images in sectors of 512 bytes
+         x - 61:    Number of additional 512-byte sectors used for the
+                    compressed data, beyond the sector containing the offset
+                    in the previous field. Some of these sectors may reside
+                    in the next contiguous host cluster.
+
+                    Note that the compressed data does not necessarily occupy
+                    all of the bytes in the final sector; rather, decompression
+                    stops when it has produced a cluster of data.
+
+                    Another compressed cluster may map to the tail of the final
+                    sector used by this compressed cluster.

 If a cluster is unallocated, read requests shall read the data from the backing
 file (except if bit 0 in the Standard Cluster Descriptor is set). If there is
--- a/exec.c
+++ b/exec.c
@ -1455,6 +1455,7 @@ static int find_max_supported_pagesize(Object *obj, void *opaque)
        mem_path = object_property_get_str(obj, "mem-path", NULL);
        if (mem_path) {
            long hpsize = qemu_mempath_getpagesize(mem_path);
+            g_free(mem_path);
            if (hpsize < *hpsize_min) {
                *hpsize_min = hpsize;
            }
@ -2575,6 +2576,8 @@ static const MemoryRegionOps watch_mem_ops = {
    },
 };

+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+                                      MemTxAttrs attrs, uint8_t *buf, int len);
 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const uint8_t *buf, int len);
 static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
@ -3005,6 +3008,7 @@ static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
    return result;
 }

+/* Called from RCU critical section.  */
 static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const uint8_t *buf, int len)
 {
@ -3013,25 +3017,14 @@ static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
    MemoryRegion *mr;
    MemTxResult result = MEMTX_OK;

-    if (len > 0) {
-        rcu_read_lock();
-        l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, true);
-        result = flatview_write_continue(fv, addr, attrs, buf, len,
-                                         addr1, l, mr);
-        rcu_read_unlock();
-    }
+    l = len;
+    mr = flatview_translate(fv, addr, &addr1, &l, true);
+    result = flatview_write_continue(fv, addr, attrs, buf, len,
+                                     addr1, l, mr);

    return result;
 }

-MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
-                                              MemTxAttrs attrs,
-                                              const uint8_t *buf, int len)
-{
-    return flatview_write(address_space_to_flatview(as), addr, attrs, buf, len);
-}
-
 /* Called within RCU critical section.  */
 MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                   MemTxAttrs attrs, uint8_t *buf,
@ -3102,42 +3095,61 @@ MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
    return result;
 }

-MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
-                               MemTxAttrs attrs, uint8_t *buf, int len)
+/* Called from RCU critical section.  */
+static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
+                                 MemTxAttrs attrs, uint8_t *buf, int len)
 {
    hwaddr l;
    hwaddr addr1;
    MemoryRegion *mr;
+
+    l = len;
+    mr = flatview_translate(fv, addr, &addr1, &l, false);
+    return flatview_read_continue(fv, addr, attrs, buf, len,
+                                  addr1, l, mr);
+}
+
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+                                    MemTxAttrs attrs, uint8_t *buf, int len)
+{
    MemTxResult result = MEMTX_OK;
+    FlatView *fv;

    if (len > 0) {
        rcu_read_lock();
-        l = len;
-        mr = flatview_translate(fv, addr, &addr1, &l, false);
-        result = flatview_read_continue(fv, addr, attrs, buf, len,
-                                        addr1, l, mr);
+        fv = address_space_to_flatview(as);
+        result = flatview_read(fv, addr, attrs, buf, len);
        rcu_read_unlock();
    }

    return result;
 }

-static MemTxResult flatview_rw(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-                               uint8_t *buf, int len, bool is_write)
+MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
+                                MemTxAttrs attrs,
+                                const uint8_t *buf, int len)
 {
-    if (is_write) {
-        return flatview_write(fv, addr, attrs, (uint8_t *)buf, len);
-    } else {
-        return flatview_read(fv, addr, attrs, (uint8_t *)buf, len);
+    MemTxResult result = MEMTX_OK;
+    FlatView *fv;
+
+    if (len > 0) {
+        rcu_read_lock();
+        fv = address_space_to_flatview(as);
+        result = flatview_write(fv, addr, attrs, buf, len);
+        rcu_read_unlock();
    }
+
+    return result;
 }

-MemTxResult address_space_rw(AddressSpace *as, hwaddr addr,
-                             MemTxAttrs attrs, uint8_t *buf,
-                             int len, bool is_write)
+MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
+                             uint8_t *buf, int len, bool is_write)
 {
-    return flatview_rw(address_space_to_flatview(as),
-                       addr, attrs, buf, len, is_write);
+    if (is_write) {
+        return address_space_write(as, addr, attrs, buf, len);
+    } else {
+        return address_space_read_full(as, addr, attrs, buf, len);
+    }
 }

 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
@ -3303,14 +3315,12 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
    MemoryRegion *mr;
    hwaddr l, xlat;

-    rcu_read_lock();
    while (len > 0) {
        l = len;
        mr = flatview_translate(fv, addr, &xlat, &l, is_write);
        if (!memory_access_is_direct(mr, is_write)) {
            l = memory_access_size(mr, l, addr);
            if (!memory_region_access_valid(mr, xlat, l, is_write)) {
-                rcu_read_unlock();
                return false;
            }
        }
@ -3318,15 +3328,20 @@ static bool flatview_access_valid(FlatView *fv, hwaddr addr, int len,
        len -= l;
        addr += l;
    }
-    rcu_read_unlock();
    return true;
 }

 bool address_space_access_valid(AddressSpace *as, hwaddr addr,
                                int len, bool is_write)
 {
-    return flatview_access_valid(address_space_to_flatview(as),
-                                 addr, len, is_write);
+    FlatView *fv;
+    bool result;
+
+    rcu_read_lock();
+    fv = address_space_to_flatview(as);
+    result = flatview_access_valid(fv, addr, len, is_write);
+    rcu_read_unlock();
+    return result;
 }

 static hwaddr
@ -3372,7 +3387,7 @@ void *address_space_map(AddressSpace *as,
    hwaddr l, xlat;
    MemoryRegion *mr;
    void *ptr;
-    FlatView *fv = address_space_to_flatview(as);
+    FlatView *fv;

    if (len == 0) {
        return NULL;
@ -3380,6 +3395,7 @@ void *address_space_map(AddressSpace *as,

    l = len;
    rcu_read_lock();
+    fv = address_space_to_flatview(as);
    mr = flatview_translate(fv, addr, &xlat, &l, is_write);

    if (!memory_access_is_direct(mr, is_write)) {
--- a/gdbstub.c
+++ b/gdbstub.c
@ -515,6 +515,7 @@ static inline int tohex(int v)
        return v - 10 + 'a';
 }

+/* writes 2*len+1 bytes in buf */
 static void memtohex(char *buf, const uint8_t *mem, int len)
 {
    int i, c;
@ -970,8 +971,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
    const char *p;
    uint32_t thread;
    int ch, reg_size, type, res;
-    char buf[MAX_PACKET_LENGTH];
    uint8_t mem_buf[MAX_PACKET_LENGTH];
+    char buf[sizeof(mem_buf) + 1 /* trailing NUL */];
    uint8_t *registers;
    target_ulong addr, len;

--- a/hw/block/pflash_cfi01.c
+++ b/hw/block/pflash_cfi01.c
@ -90,7 +90,6 @@ struct pflash_t {
    uint16_t ident1;
    uint16_t ident2;
    uint16_t ident3;
-    uint8_t cfi_len;
    uint8_t cfi_table[0x52];
    uint64_t counter;
    unsigned int writeblock_size;
@ -153,7 +152,7 @@ static uint32_t pflash_cfi_query(pflash_t *pfl, hwaddr offset)
    boff = offset >> (ctz32(pfl->bank_width) +
                      ctz32(pfl->max_device_width) - ctz32(pfl->device_width));

-    if (boff > pfl->cfi_len) {
+    if (boff >= sizeof(pfl->cfi_table)) {
        return 0;
    }
    /* Now we will construct the CFI response generated by a single
@ -385,10 +384,10 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
                boff = boff >> 2;
            }

-            if (boff > pfl->cfi_len) {
-                ret = 0;
-            } else {
+            if (boff < sizeof(pfl->cfi_table)) {
                ret = pfl->cfi_table[boff];
+            } else {
+                ret = 0;
            }
        } else {
            /* If we have a read larger than the bank_width, combine multiple
@ -791,7 +790,6 @@ static void pflash_cfi01_realize(DeviceState *dev, Error **errp)
    pfl->cmd = 0;
    pfl->status = 0;
    /* Hardcoded CFI table */
-    pfl->cfi_len = 0x52;
    /* Standard "QRY" string */
    pfl->cfi_table[0x10] = 'Q';
    pfl->cfi_table[0x11] = 'R';
--- a/hw/block/pflash_cfi02.c
+++ b/hw/block/pflash_cfi02.c
@ -83,7 +83,6 @@ struct pflash_t {
    uint16_t ident3;
    uint16_t unlock_addr0;
    uint16_t unlock_addr1;
-    uint8_t cfi_len;
    uint8_t cfi_table[0x52];
    QEMUTimer *timer;
    /* The device replicates the flash memory across its memory space.  Emulate
@ -235,10 +234,11 @@ static uint32_t pflash_read (pflash_t *pfl, hwaddr offset,
        break;
    case 0x98:
        /* CFI query mode */
-        if (boff > pfl->cfi_len)
-            ret = 0;
-        else
+        if (boff < sizeof(pfl->cfi_table)) {
            ret = pfl->cfi_table[boff];
+        } else {
+            ret = 0;
+        }
        break;
    }

@ -663,7 +663,6 @@ static void pflash_cfi02_realize(DeviceState *dev, Error **errp)
    pfl->cmd = 0;
    pfl->status = 0;
    /* Hardcoded CFI table (mostly from SG29 Spansion flash) */
-    pfl->cfi_len = 0x52;
    /* Standard "QRY" string */
    pfl->cfi_table[0x10] = 'Q';
    pfl->cfi_table[0x11] = 'R';
--- a/hw/char/cmsdk-apb-uart.c
+++ b/hw/char/cmsdk-apb-uart.c
@ -274,6 +274,7 @@ static void uart_write(void *opaque, hwaddr offset, uint64_t value,
         * is then reflected into the intstatus value by the update function).
         */
        s->state &= ~(value & (R_INTSTATUS_TXO_MASK | R_INTSTATUS_RXO_MASK));
+        s->intstatus &= ~value;
        cmsdk_apb_uart_update(s);
        break;
    case A_BAUDDIV:
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@ -1104,20 +1104,22 @@ int rom_check_and_register_reset(void)
        if (rom->fw_file) {
            continue;
        }
-        if ((addr > rom->addr) && (as == rom->as)) {
-            fprintf(stderr, "rom: requested regions overlap "
-                    "(rom %s. free=0x" TARGET_FMT_plx
-                    ", addr=0x" TARGET_FMT_plx ")\n",
-                    rom->name, addr, rom->addr);
-            return -1;
+        if (!rom->mr) {
+            if ((addr > rom->addr) && (as == rom->as)) {
+                fprintf(stderr, "rom: requested regions overlap "
+                        "(rom %s. free=0x" TARGET_FMT_plx
+                        ", addr=0x" TARGET_FMT_plx ")\n",
+                        rom->name, addr, rom->addr);
+                return -1;
+            }
+            addr  = rom->addr;
+            addr += rom->romsize;
+            as = rom->as;
        }
-        addr  = rom->addr;
-        addr += rom->romsize;
        section = memory_region_find(rom->mr ? rom->mr : get_system_memory(),
                                     rom->addr, 1);
        rom->isrom = int128_nz(section.size) && memory_region_is_rom(section.mr);
        memory_region_unref(section.mr);
-        as = rom->as;
    }
    qemu_register_reset(rom_reset, NULL);
    roms_loaded = 1;
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@ -1140,6 +1140,30 @@ static void device_class_init(ObjectClass *class, void *data)
    dc->user_creatable = true;
 }

+void device_class_set_parent_reset(DeviceClass *dc,
+                                   DeviceReset dev_reset,
+                                   DeviceReset *parent_reset)
+{
+    *parent_reset = dc->reset;
+    dc->reset = dev_reset;
+}
+
+void device_class_set_parent_realize(DeviceClass *dc,
+                                     DeviceRealize dev_realize,
+                                     DeviceRealize *parent_realize)
+{
+    *parent_realize = dc->realize;
+    dc->realize = dev_realize;
+}
+
+void device_class_set_parent_unrealize(DeviceClass *dc,
+                                       DeviceUnrealize dev_unrealize,
+                                       DeviceUnrealize *parent_unrealize)
+{
+    *parent_unrealize = dc->unrealize;
+    dc->unrealize = dev_unrealize;
+}
+
 void device_reset(DeviceState *dev)
 {
    DeviceClass *klass = DEVICE_GET_CLASS(dev);
--- a/hw/display/qxl-render.c
+++ b/hw/display/qxl-render.c
@ -169,7 +169,8 @@ void qxl_render_update(PCIQXLDevice *qxl)

    qemu_mutex_lock(&qxl->ssd.lock);

-    if (!runstate_is_running() || !qxl->guest_primary.commands) {
+    if (!runstate_is_running() || !qxl->guest_primary.commands ||
+        qxl->mode == QXL_MODE_UNDEFINED) {
        qxl_render_update_area_unlocked(qxl);
        qemu_mutex_unlock(&qxl->ssd.lock);
        return;
--- a/hw/display/vga.c
+++ b/hw/display/vga.c
@ -1280,6 +1280,9 @@ static void vga_draw_text(VGACommonState *s, int full_update)
        cx_min = width;
        cx_max = -1;
        for(cx = 0; cx < width; cx++) {
+            if (src + sizeof(uint16_t) > s->vram_ptr + s->vram_size) {
+                break;
+            }
            ch_attr = *(uint16_t *)src;
            if (full_update || ch_attr != *ch_attr_ptr || src == cursor_ptr) {
                if (cx < cx_min)
@ -1486,6 +1489,8 @@ static void vga_draw_graphic(VGACommonState *s, int full_update)

    region_start = (s->start_addr * 4);
    region_end = region_start + (ram_addr_t)s->line_offset * height;
+    region_end += width * s->get_bpp(s) / 8; /* scanline length */
+    region_end -= s->line_offset;
    if (region_end > s->vbe_size) {
        /* wraps around (can happen with cirrus vbe modes) */
        region_start = 0;
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@ -2460,6 +2460,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker)
    AcpiDmarDeviceScope *scope = NULL;
    /* Root complex IOAPIC use one path[0] only */
    size_t ioapic_scope_size = sizeof(*scope) + sizeof(scope->path[0]);
+    IntelIOMMUState *intel_iommu = INTEL_IOMMU_DEVICE(iommu);

    assert(iommu);
    if (iommu->intr_supported) {
@ -2467,7 +2468,7 @@ build_dmar_q35(GArray *table_data, BIOSLinker *linker)
    }

    dmar = acpi_data_push(table_data, sizeof(*dmar));
-    dmar->host_address_width = VTD_HOST_ADDRESS_WIDTH - 1;
+    dmar->host_address_width = intel_iommu->aw_bits - 1;
    dmar->flags = dmar_flags;

    /* DMAR Remapping Hardware Unit Definition structure */
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@ -128,6 +128,22 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
    return new_val;
 }

+static inline void vtd_iommu_lock(IntelIOMMUState *s)
+{
+    qemu_mutex_lock(&s->iommu_lock);
+}
+
+static inline void vtd_iommu_unlock(IntelIOMMUState *s)
+{
+    qemu_mutex_unlock(&s->iommu_lock);
+}
+
+/* Whether the address space needs to notify new mappings */
+static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
+{
+    return as->notifier_flags & IOMMU_NOTIFIER_MAP;
+}
+
 /* GHashTable functions */
 static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
 {
@ -172,9 +188,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
 }

 /* Reset all the gen of VTDAddressSpace to zero and set the gen of
- * IntelIOMMUState to 1.
+ * IntelIOMMUState to 1.  Must be called with IOMMU lock held.
 */
-static void vtd_reset_context_cache(IntelIOMMUState *s)
+static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
 {
    VTDAddressSpace *vtd_as;
    VTDBus *vtd_bus;
@ -197,12 +213,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s)
    s->context_cache_gen = 1;
 }

-static void vtd_reset_iotlb(IntelIOMMUState *s)
+/* Must be called with IOMMU lock held. */
+static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
 {
    assert(s->iotlb);
    g_hash_table_remove_all(s->iotlb);
 }

+static void vtd_reset_iotlb(IntelIOMMUState *s)
+{
+    vtd_iommu_lock(s);
+    vtd_reset_iotlb_locked(s);
+    vtd_iommu_unlock(s);
+}
+
 static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
                                  uint32_t level)
 {
@ -215,6 +239,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
    return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
 }

+/* Must be called with IOMMU lock held */
 static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
                                       hwaddr addr)
 {
@ -235,6 +260,7 @@ out:
    return entry;
 }

+/* Must be with IOMMU lock held */
 static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
                             uint16_t domain_id, hwaddr addr, uint64_t slpte,
                             uint8_t access_flags, uint32_t level)
@ -246,7 +272,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
    trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
    if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
        trace_vtd_iotlb_reset("iotlb exceeds size limit");
-        vtd_reset_iotlb(s);
+        vtd_reset_iotlb_locked(s);
    }

    entry->gfn = gfn;
@ -521,9 +547,9 @@ static inline dma_addr_t vtd_ce_get_slpt_base(VTDContextEntry *ce)
    return ce->lo & VTD_CONTEXT_ENTRY_SLPTPTR;
 }

-static inline uint64_t vtd_get_slpte_addr(uint64_t slpte)
+static inline uint64_t vtd_get_slpte_addr(uint64_t slpte, uint8_t aw)
 {
-    return slpte & VTD_SL_PT_BASE_ADDR_MASK;
+    return slpte & VTD_SL_PT_BASE_ADDR_MASK(aw);
 }

 /* Whether the pte indicates the address of the page frame */
@ -608,35 +634,29 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu,
    return true;
 }

-static inline uint64_t vtd_iova_limit(VTDContextEntry *ce)
+static inline uint64_t vtd_iova_limit(VTDContextEntry *ce, uint8_t aw)
 {
    uint32_t ce_agaw = vtd_ce_get_agaw(ce);
-    return 1ULL << MIN(ce_agaw, VTD_MGAW);
+    return 1ULL << MIN(ce_agaw, aw);
 }

 /* Return true if IOVA passes range check, otherwise false. */
-static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce)
+static inline bool vtd_iova_range_check(uint64_t iova, VTDContextEntry *ce,
+                                        uint8_t aw)
 {
    /*
     * Check if @iova is above 2^X-1, where X is the minimum of MGAW
     * in CAP_REG and AW in context-entry.
     */
-    return !(iova & ~(vtd_iova_limit(ce) - 1));
+    return !(iova & ~(vtd_iova_limit(ce, aw) - 1));
 }

-static const uint64_t vtd_paging_entry_rsvd_field[] = {
-    [0] = ~0ULL,
-    /* For not large page */
-    [1] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [2] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [3] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [4] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    /* For large page */
-    [5] = 0x800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [6] = 0x1ff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [7] = 0x3ffff800ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-    [8] = 0x880ULL | ~(VTD_HAW_MASK | VTD_SL_IGN_COM),
-};
+/*
+ * Rsvd field masks for spte:
+ *     Index [1] to [4] 4k pages
+ *     Index [5] to [8] large pages
+ */
+static uint64_t vtd_paging_entry_rsvd_field[9];

 static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
 {
@ -676,7 +696,7 @@ static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num)
 */
 static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
                             uint64_t *slptep, uint32_t *slpte_level,
-                             bool *reads, bool *writes)
+                             bool *reads, bool *writes, uint8_t aw_bits)
 {
    dma_addr_t addr = vtd_ce_get_slpt_base(ce);
    uint32_t level = vtd_ce_get_level(ce);
@ -684,7 +704,7 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
    uint64_t slpte;
    uint64_t access_right_check;

-    if (!vtd_iova_range_check(iova, ce)) {
+    if (!vtd_iova_range_check(iova, ce, aw_bits)) {
        trace_vtd_err_dmar_iova_overflow(iova);
        return -VTD_FR_ADDR_BEYOND_MGAW;
    }
@ -721,29 +741,124 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
            *slpte_level = level;
            return 0;
        }
-        addr = vtd_get_slpte_addr(slpte);
+        addr = vtd_get_slpte_addr(slpte, aw_bits);
        level--;
    }
 }

 typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);

+/**
+ * Constant information used during page walking
+ *
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @notify_unmap: whether we should notify invalid entries
+ * @as: VT-d address space of the device
+ * @aw: maximum address width
+ * @domain: domain ID of the page walk
+ */
+typedef struct {
+    VTDAddressSpace *as;
+    vtd_page_walk_hook hook_fn;
+    void *private;
+    bool notify_unmap;
+    uint8_t aw;
+    uint16_t domain_id;
+} vtd_page_walk_info;
+
+static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
+{
+    VTDAddressSpace *as = info->as;
+    vtd_page_walk_hook hook_fn = info->hook_fn;
+    void *private = info->private;
+    DMAMap target = {
+        .iova = entry->iova,
+        .size = entry->addr_mask,
+        .translated_addr = entry->translated_addr,
+        .perm = entry->perm,
+    };
+    DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+    if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
+        trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+        return 0;
+    }
+
+    assert(hook_fn);
+
+    /* Update local IOVA mapped ranges */
+    if (entry->perm) {
+        if (mapped) {
+            /* If it's exactly the same translation, skip */
+            if (!memcmp(mapped, &target, sizeof(target))) {
+                trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+                                                 entry->translated_addr);
+                return 0;
+            } else {
+                /*
+                 * Translation changed.  Normally this should not
+                 * happen, but it can happen when with buggy guest
+                 * OSes.  Note that there will be a small window that
+                 * we don't have map at all.  But that's the best
+                 * effort we can do.  The ideal way to emulate this is
+                 * atomically modify the PTE to follow what has
+                 * changed, but we can't.  One example is that vfio
+                 * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+                 * interface to modify a mapping (meanwhile it seems
+                 * meaningless to even provide one).  Anyway, let's
+                 * mark this as a TODO in case one day we'll have
+                 * a better solution.
+                 */
+                IOMMUAccessFlags cache_perm = entry->perm;
+                int ret;
+
+                /* Emulate an UNMAP */
+                entry->perm = IOMMU_NONE;
+                trace_vtd_page_walk_one(info->domain_id,
+                                        entry->iova,
+                                        entry->translated_addr,
+                                        entry->addr_mask,
+                                        entry->perm);
+                ret = hook_fn(entry, private);
+                if (ret) {
+                    return ret;
+                }
+                /* Drop any existing mapping */
+                iova_tree_remove(as->iova_tree, &target);
+                /* Recover the correct permission */
+                entry->perm = cache_perm;
+            }
+        }
+        iova_tree_insert(as->iova_tree, &target);
+    } else {
+        if (!mapped) {
+            /* Skip since we didn't map this range at all */
+            trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+            return 0;
+        }
+        iova_tree_remove(as->iova_tree, &target);
+    }
+
+    trace_vtd_page_walk_one(info->domain_id, entry->iova,
+                            entry->translated_addr, entry->addr_mask,
+                            entry->perm);
+    return hook_fn(entry, private);
+}
+
 /**
 * vtd_page_walk_level - walk over specific level for IOVA range
 *
 * @addr: base GPA addr to start the walk
 * @start: IOVA range start address
 * @end: IOVA range end address (start <= addr < end)
- * @hook_fn: hook func to be called when detected page
- * @private: private data to be passed into hook func
 * @read: whether parent level has read permission
 * @write: whether parent level has write permission
- * @notify_unmap: whether we should notify invalid entries
+ * @info: constant information for the page walk
 */
 static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
-                               uint64_t end, vtd_page_walk_hook hook_fn,
-                               void *private, uint32_t level,
-                               bool read, bool write, bool notify_unmap)
+                               uint64_t end, uint32_t level, bool read,
+                               bool write, vtd_page_walk_info *info)
 {
    bool read_cur, write_cur, entry_valid;
    uint32_t offset;
@ -786,37 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
         */
        entry_valid = read_cur | write_cur;

-        if (vtd_is_last_slpte(slpte, level)) {
+        if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
+            /*
+             * This is a valid PDE (or even bigger than PDE).  We need
+             * to walk one further level.
+             */
+            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
+                                      iova, MIN(iova_next, end), level - 1,
+                                      read_cur, write_cur, info);
+        } else {
+            /*
+             * This means we are either:
+             *
+             * (1) the real page entry (either 4K page, or huge page)
+             * (2) the whole range is invalid
+             *
+             * In either case, we send an IOTLB notification down.
+             */
            entry.target_as = &address_space_memory;
            entry.iova = iova & subpage_mask;
-            /* NOTE: this is only meaningful if entry_valid == true */
-            entry.translated_addr = vtd_get_slpte_addr(slpte);
-            entry.addr_mask = ~subpage_mask;
            entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
-            if (!entry_valid && !notify_unmap) {
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
-                goto next;
-            }
-            trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
-                                    entry.addr_mask, entry.perm);
-            if (hook_fn) {
-                ret = hook_fn(&entry, private);
-                if (ret < 0) {
-                    return ret;
-                }
-            }
-        } else {
-            if (!entry_valid) {
-                trace_vtd_page_walk_skip_perm(iova, iova_next);
-                goto next;
-            }
-            ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte), iova,
-                                      MIN(iova_next, end), hook_fn, private,
-                                      level - 1, read_cur, write_cur,
-                                      notify_unmap);
-            if (ret < 0) {
-                return ret;
-            }
+            entry.addr_mask = ~subpage_mask;
+            /* NOTE: this is only meaningful if entry_valid == true */
+            entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+            ret = vtd_page_walk_one(&entry, info);
+        }
+
+        if (ret < 0) {
+            return ret;
        }

 next:
@ -832,27 +944,24 @@ next:
 * @ce: context entry to walk upon
 * @start: IOVA address to start the walk
 * @end: IOVA range end address (start <= addr < end)
- * @hook_fn: the hook that to be called for each detected area
- * @private: private data for the hook function
+ * @info: page walking information struct
 */
 static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
-                         vtd_page_walk_hook hook_fn, void *private,
-                         bool notify_unmap)
+                         vtd_page_walk_info *info)
 {
    dma_addr_t addr = vtd_ce_get_slpt_base(ce);
    uint32_t level = vtd_ce_get_level(ce);

-    if (!vtd_iova_range_check(start, ce)) {
+    if (!vtd_iova_range_check(start, ce, info->aw)) {
        return -VTD_FR_ADDR_BEYOND_MGAW;
    }

-    if (!vtd_iova_range_check(end, ce)) {
+    if (!vtd_iova_range_check(end, ce, info->aw)) {
        /* Fix end so that it reaches the maximum */
-        end = vtd_iova_limit(ce);
+        end = vtd_iova_limit(ce, info->aw);
    }

-    return vtd_page_walk_level(addr, start, end, hook_fn, private,
-                               level, true, true, notify_unmap);
+    return vtd_page_walk_level(addr, start, end, level, true, true, info);
 }

 /* Map a device to its corresponding domain (context-entry) */
@ -874,7 +983,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
        return -VTD_FR_ROOT_ENTRY_P;
    }

-    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD)) {
+    if (re.rsvd || (re.val & VTD_ROOT_ENTRY_RSVD(s->aw_bits))) {
        trace_vtd_re_invalid(re.rsvd, re.val);
        return -VTD_FR_ROOT_ENTRY_RSVD;
    }
@ -891,7 +1000,7 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
    }

    if ((ce->hi & VTD_CONTEXT_ENTRY_RSVD_HI) ||
-        (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO)) {
+               (ce->lo & VTD_CONTEXT_ENTRY_RSVD_LO(s->aw_bits))) {
        trace_vtd_ce_invalid(ce->hi, ce->lo);
        return -VTD_FR_CONTEXT_ENTRY_RSVD;
    }
@ -911,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
    return 0;
 }

+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
+                                     void *private)
+{
+    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
+    return 0;
+}
+
+/* If context entry is NULL, we'll try to fetch it on our own. */
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
+                                            VTDContextEntry *ce,
+                                            hwaddr addr, hwaddr size)
+{
+    IntelIOMMUState *s = vtd_as->iommu_state;
+    vtd_page_walk_info info = {
+        .hook_fn = vtd_sync_shadow_page_hook,
+        .private = (void *)&vtd_as->iommu,
+        .notify_unmap = true,
+        .aw = s->aw_bits,
+        .as = vtd_as,
+    };
+    VTDContextEntry ce_cache;
+    int ret;
+
+    if (ce) {
+        /* If the caller provided context entry, use it */
+        ce_cache = *ce;
+    } else {
+        /* If the caller didn't provide ce, try to fetch */
+        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+                                       vtd_as->devfn, &ce_cache);
+        if (ret) {
+            /*
+             * This should not really happen, but in case it happens,
+             * we just skip the sync for this time.  After all we even
+             * don't have the root table pointer!
+             */
+            trace_vtd_err("Detected invalid context entry when "
+                          "trying to sync shadow page table");
+            return 0;
+        }
+    }
+
+    info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
+
+    return vtd_page_walk(&ce_cache, addr, addr + size, &info);
+}
+
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
+{
+    return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
+}
+
 /*
 * Fetch translation type for specific device. Returns <0 if error
 * happens, otherwise return the shifted type to check against
@ -1092,7 +1253,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
    IntelIOMMUState *s = vtd_as->iommu_state;
    VTDContextEntry ce;
    uint8_t bus_num = pci_bus_num(bus);
-    VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry;
+    VTDContextCacheEntry *cc_entry;
    uint64_t slpte, page_mask;
    uint32_t level;
    uint16_t source_id = vtd_make_source_id(bus_num, devfn);
@ -1109,6 +1270,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
     */
    assert(!vtd_is_interrupt_addr(addr));

+    vtd_iommu_lock(s);
+
+    cc_entry = &vtd_as->context_cache_entry;
+
    /* Try to fetch slpte form IOTLB */
    iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
    if (iotlb_entry) {
@ -1168,12 +1333,12 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
         * IOMMU region can be swapped back.
         */
        vtd_pt_enable_fast_path(s, source_id);
-
+        vtd_iommu_unlock(s);
        return true;
    }

    ret_fr = vtd_iova_to_slpte(&ce, addr, is_write, &slpte, &level,
-                               &reads, &writes);
+                               &reads, &writes, s->aw_bits);
    if (ret_fr) {
        ret_fr = -ret_fr;
        if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) {
@ -1189,13 +1354,15 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
    vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
                     access_flags, level);
 out:
+    vtd_iommu_unlock(s);
    entry->iova = addr & page_mask;
-    entry->translated_addr = vtd_get_slpte_addr(slpte) & page_mask;
+    entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
    entry->addr_mask = ~page_mask;
    entry->perm = access_flags;
    return true;

 error:
+    vtd_iommu_unlock(s);
    entry->iova = 0;
    entry->translated_addr = 0;
    entry->addr_mask = 0;
@ -1207,7 +1374,7 @@ static void vtd_root_table_setup(IntelIOMMUState *s)
 {
    s->root = vtd_get_quad_raw(s, DMAR_RTADDR_REG);
    s->root_extended = s->root & VTD_RTADDR_RTT;
-    s->root &= VTD_RTADDR_ADDR_MASK;
+    s->root &= VTD_RTADDR_ADDR_MASK(s->aw_bits);

    trace_vtd_reg_dmar_root(s->root, s->root_extended);
 }
@ -1223,7 +1390,7 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
    uint64_t value = 0;
    value = vtd_get_quad_raw(s, DMAR_IRTA_REG);
    s->intr_size = 1UL << ((value & VTD_IRTA_SIZE_MASK) + 1);
-    s->intr_root = value & VTD_IRTA_ADDR_MASK;
+    s->intr_root = value & VTD_IRTA_ADDR_MASK(s->aw_bits);
    s->intr_eime = value & VTD_IRTA_EIME;

    /* Notify global invalidation */
@ -1234,20 +1401,23 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)

 static void vtd_iommu_replay_all(IntelIOMMUState *s)
 {
-    IntelIOMMUNotifierNode *node;
+    VTDAddressSpace *vtd_as;

-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        memory_region_iommu_replay_all(&node->vtd_as->iommu);
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+        vtd_sync_shadow_page_table(vtd_as);
    }
 }

 static void vtd_context_global_invalidate(IntelIOMMUState *s)
 {
    trace_vtd_inv_desc_cc_global();
+    /* Protects context cache */
+    vtd_iommu_lock(s);
    s->context_cache_gen++;
    if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
-        vtd_reset_context_cache(s);
+        vtd_reset_context_cache_locked(s);
    }
+    vtd_iommu_unlock(s);
    vtd_switch_address_space_all(s);
    /*
     * From VT-d spec 6.5.2.1, a global context entry invalidation
@ -1299,7 +1469,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
            if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
                trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
                                             VTD_PCI_FUNC(devfn_it));
+                vtd_iommu_lock(s);
                vtd_as->context_cache_entry.context_cache_gen = 0;
+                vtd_iommu_unlock(s);
                /*
                 * Do switch address space when needed, in case if the
                 * device passthrough bit is switched.
@ -1307,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
                vtd_switch_address_space(vtd_as);
                /*
                 * So a device is moving out of (or moving into) a
-                 * domain, a replay() suites here to notify all the
-                 * IOMMU_NOTIFIER_MAP registers about this change.
+                 * domain, resync the shadow page table.
                 * This won't bring bad even if we have no such
                 * notifier registered - the IOMMU notification
                 * framework will skip MAP notifications if that
                 * happened.
                 */
-                memory_region_iommu_replay_all(&vtd_as->iommu);
+                vtd_sync_shadow_page_table(vtd_as);
            }
        }
    }
@ -1358,48 +1529,60 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)

 static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
 {
-    IntelIOMMUNotifierNode *node;
    VTDContextEntry ce;
    VTDAddressSpace *vtd_as;

    trace_vtd_inv_desc_iotlb_domain(domain_id);

+    vtd_iommu_lock(s);
    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
                                &domain_id);
+    vtd_iommu_unlock(s);

-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
        if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                      vtd_as->devfn, &ce) &&
            domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
-            memory_region_iommu_replay_all(&vtd_as->iommu);
+            vtd_sync_shadow_page_table(vtd_as);
        }
    }
 }

-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
-                                           void *private)
-{
-    memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
-    return 0;
-}
-
 static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
                                           uint16_t domain_id, hwaddr addr,
                                           uint8_t am)
 {
-    IntelIOMMUNotifierNode *node;
+    VTDAddressSpace *vtd_as;
    VTDContextEntry ce;
    int ret;
+    hwaddr size = (1 << am) * VTD_PAGE_SIZE;

-    QLIST_FOREACH(node, &(s->notifiers_list), next) {
-        VTDAddressSpace *vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
        ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
                                       vtd_as->devfn, &ce);
        if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
-            vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
-                          vtd_page_invalidate_notify_hook,
-                          (void *)&vtd_as->iommu, true);
+            if (vtd_as_has_map_notifier(vtd_as)) {
+                /*
+                 * As long as we have MAP notifications registered in
+                 * any of our IOMMU notifiers, we need to sync the
+                 * shadow page table.
+                 */
+                vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
+            } else {
+                /*
+                 * For UNMAP-only notifiers, we don't need to walk the
+                 * page tables.  We just deliver the PSI down to
+                 * invalidate caches.
+                 */
+                IOMMUTLBEntry entry = {
+                    .target_as = &address_space_memory,
+                    .iova = addr,
+                    .translated_addr = 0,
+                    .addr_mask = size - 1,
+                    .perm = IOMMU_NONE,
+                };
+                memory_region_notify_iommu(&vtd_as->iommu, entry);
+            }
        }
    }
 }
@ -1415,7 +1598,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
    info.domain_id = domain_id;
    info.addr = addr;
    info.mask = ~((1 << am) - 1);
+    vtd_iommu_lock(s);
    g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+    vtd_iommu_unlock(s);
    vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
 }

@ -1479,7 +1664,7 @@ static void vtd_handle_gcmd_qie(IntelIOMMUState *s, bool en)
    trace_vtd_inv_qi_enable(en);

    if (en) {
-        s->iq = iqa_val & VTD_IQA_IQA_MASK;
+        s->iq = iqa_val & VTD_IQA_IQA_MASK(s->aw_bits);
        /* 2^(x+8) entries */
        s->iq_size = 1UL << ((iqa_val & VTD_IQA_QS) + 8);
        s->qi_enabled = true;
@ -2323,8 +2508,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
 {
    VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
    IntelIOMMUState *s = vtd_as->iommu_state;
-    IntelIOMMUNotifierNode *node = NULL;
-    IntelIOMMUNotifierNode *next_node = NULL;

    if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
        error_report("We need to set cache_mode=1 for intel-iommu to enable "
@ -2332,22 +2515,13 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
        exit(1);
    }

-    if (old == IOMMU_NOTIFIER_NONE) {
-        node = g_malloc0(sizeof(*node));
-        node->vtd_as = vtd_as;
-        QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
-        return;
-    }
+    /* Update per-address-space notifier flags */
+    vtd_as->notifier_flags = new;

-    /* update notifier node with new flags */
-    QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
-        if (node->vtd_as == vtd_as) {
-            if (new == IOMMU_NOTIFIER_NONE) {
-                QLIST_REMOVE(node, next);
-                g_free(node);
-            }
-            return;
-        }
+    if (old == IOMMU_NOTIFIER_NONE) {
+        QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
+    } else if (new == IOMMU_NOTIFIER_NONE) {
+        QLIST_REMOVE(vtd_as, next);
    }
 }

@ -2410,6 +2584,8 @@ static Property vtd_properties[] = {
    DEFINE_PROP_ON_OFF_AUTO("eim", IntelIOMMUState, intr_eim,
                            ON_OFF_AUTO_AUTO),
    DEFINE_PROP_BOOL("x-buggy-eim", IntelIOMMUState, buggy_eim, false),
+    DEFINE_PROP_UINT8("x-aw-bits", IntelIOMMUState, aw_bits,
+                      VTD_HOST_ADDRESS_WIDTH),
    DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
    DEFINE_PROP_END_OF_LIST(),
 };
@ -2714,6 +2890,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
        vtd_dev_as->devfn = (uint8_t)devfn;
        vtd_dev_as->iommu_state = s;
        vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+        vtd_dev_as->iova_tree = iova_tree_new();

        /*
         * Memory region relationships looks like (Address range shows
@ -2765,6 +2942,8 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
    hwaddr size;
    hwaddr start = n->start;
    hwaddr end = n->end;
+    IntelIOMMUState *s = as->iommu_state;
+    DMAMap map;

    /*
     * Note: all the codes in this function has a assumption that IOVA
@ -2772,12 +2951,12 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
     * VT-d spec), otherwise we need to consider overflow of 64 bits.
     */

-    if (end > VTD_ADDRESS_SIZE) {
+    if (end > VTD_ADDRESS_SIZE(s->aw_bits)) {
        /*
         * Don't need to unmap regions that is bigger than the whole
         * VT-d supported address space size
         */
-        end = VTD_ADDRESS_SIZE;
+        end = VTD_ADDRESS_SIZE(s->aw_bits);
    }

    assert(start <= end);
@ -2789,9 +2968,9 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
         * suite the minimum available mask.
         */
        int n = 64 - clz64(size);
-        if (n > VTD_MGAW) {
+        if (n > s->aw_bits) {
            /* should not happen, but in case it happens, limit it */
-            n = VTD_MGAW;
+            n = s->aw_bits;
        }
        size = 1ULL << n;
    }
@ -2809,17 +2988,19 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
                             VTD_PCI_FUNC(as->devfn),
                             entry.iova, size);

+    map.iova = entry.iova;
+    map.size = entry.addr_mask;
+    iova_tree_remove(as->iova_tree, &map);
+
    memory_region_notify_one(n, &entry);
 }

 static void vtd_address_space_unmap_all(IntelIOMMUState *s)
 {
-    IntelIOMMUNotifierNode *node;
    VTDAddressSpace *vtd_as;
    IOMMUNotifier *n;

-    QLIST_FOREACH(node, &s->notifiers_list, next) {
-        vtd_as = node->vtd_as;
+    QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
        IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
            vtd_address_space_unmap(vtd_as, n);
        }
@ -2851,7 +3032,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
                                  PCI_FUNC(vtd_as->devfn),
                                  VTD_CONTEXT_ENTRY_DID(ce.hi),
                                  ce.hi, ce.lo);
-        vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false);
+        if (vtd_as_has_map_notifier(vtd_as)) {
+            /* This is required only for MAP typed notifiers */
+            vtd_page_walk_info info = {
+                .hook_fn = vtd_replay_hook,
+                .private = (void *)n,
+                .notify_unmap = false,
+                .aw = s->aw_bits,
+                .as = vtd_as,
+                .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi),
+            };
+
+            vtd_page_walk(&ce, 0, ~0ULL, &info);
+        }
    } else {
        trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
                                    PCI_FUNC(vtd_as->devfn));
@ -2882,10 +3075,27 @@ static void vtd_init(IntelIOMMUState *s)
    s->qi_enabled = false;
    s->iq_last_desc_type = VTD_INV_DESC_NONE;
    s->next_frcd_reg = 0;
-    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MGAW |
-             VTD_CAP_SAGAW | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS;
+    s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND |
+             VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS |
+             VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits);
+    if (s->aw_bits == VTD_HOST_AW_48BIT) {
+        s->cap |= VTD_CAP_SAGAW_48bit;
+    }
    s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO;

+    /*
+     * Rsvd field masks for spte
+     */
+    vtd_paging_entry_rsvd_field[0] = ~0ULL;
+    vtd_paging_entry_rsvd_field[1] = VTD_SPTE_PAGE_L1_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[2] = VTD_SPTE_PAGE_L2_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[3] = VTD_SPTE_PAGE_L3_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[4] = VTD_SPTE_PAGE_L4_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[5] = VTD_SPTE_LPAGE_L1_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[6] = VTD_SPTE_LPAGE_L2_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[7] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits);
+    vtd_paging_entry_rsvd_field[8] = VTD_SPTE_LPAGE_L4_RSVD_MASK(s->aw_bits);
+
    if (x86_iommu->intr_supported) {
        s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV;
        if (s->intr_eim == ON_OFF_AUTO_ON) {
@ -2906,8 +3116,10 @@ static void vtd_init(IntelIOMMUState *s)
        s->cap |= VTD_CAP_CM;
    }

-    vtd_reset_context_cache(s);
-    vtd_reset_iotlb(s);
+    vtd_iommu_lock(s);
+    vtd_reset_context_cache_locked(s);
+    vtd_reset_iotlb_locked(s);
+    vtd_iommu_unlock(s);

    /* Define registers with default values and bit semantics */
    vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
@ -3021,6 +3233,14 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
        }
    }

+    /* Currently only address widths supported are 39 and 48 bits */
+    if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
+        (s->aw_bits != VTD_HOST_AW_48BIT)) {
+        error_setg(errp, "Supported values for x-aw-bits are: %d, %d",
+                   VTD_HOST_AW_39BIT, VTD_HOST_AW_48BIT);
+        return false;
+    }
+
    return true;
 }

@ -3047,7 +3267,8 @@ static void vtd_realize(DeviceState *dev, Error **errp)
        return;
    }

-    QLIST_INIT(&s->notifiers_list);
+    QLIST_INIT(&s->vtd_as_with_notifiers);
+    qemu_mutex_init(&s->iommu_lock);
    memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
    memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
                          "intel_iommu", DMAR_REG_SIZE);
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@ -131,7 +131,7 @@
 #define VTD_TLB_DID(val)            (((val) >> 32) & VTD_DOMAIN_ID_MASK)

 /* IVA_REG */
-#define VTD_IVA_ADDR(val)       ((val) & ~0xfffULL & ((1ULL << VTD_MGAW) - 1))
+#define VTD_IVA_ADDR(val)       ((val) & ~0xfffULL)
 #define VTD_IVA_AM(val)         ((val) & 0x3fULL)

 /* GCMD_REG */
@ -172,10 +172,10 @@

 /* RTADDR_REG */
 #define VTD_RTADDR_RTT              (1ULL << 11)
-#define VTD_RTADDR_ADDR_MASK        (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_RTADDR_ADDR_MASK(aw)    (VTD_HAW_MASK(aw) ^ 0xfffULL)

 /* IRTA_REG */
-#define VTD_IRTA_ADDR_MASK          (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_IRTA_ADDR_MASK(aw)      (VTD_HAW_MASK(aw) ^ 0xfffULL)
 #define VTD_IRTA_EIME               (1ULL << 11)
 #define VTD_IRTA_SIZE_MASK          (0xfULL)

@ -197,9 +197,8 @@
 #define VTD_DOMAIN_ID_SHIFT         16  /* 16-bit domain id for 64K domains */
 #define VTD_DOMAIN_ID_MASK          ((1UL << VTD_DOMAIN_ID_SHIFT) - 1)
 #define VTD_CAP_ND                  (((VTD_DOMAIN_ID_SHIFT - 4) / 2) & 7ULL)
-#define VTD_MGAW                    39  /* Maximum Guest Address Width */
-#define VTD_ADDRESS_SIZE            (1ULL << VTD_MGAW)
-#define VTD_CAP_MGAW                (((VTD_MGAW - 1) & 0x3fULL) << 16)
+#define VTD_ADDRESS_SIZE(aw)        (1ULL << (aw))
+#define VTD_CAP_MGAW(aw)            ((((aw) - 1) & 0x3fULL) << 16)
 #define VTD_MAMV                    18ULL
 #define VTD_CAP_MAMV                (VTD_MAMV << 48)
 #define VTD_CAP_PSI                 (1ULL << 39)
@ -213,13 +212,12 @@
 #define VTD_CAP_SAGAW_39bit         (0x2ULL << VTD_CAP_SAGAW_SHIFT)
 /* 48-bit AGAW, 4-level page-table */
 #define VTD_CAP_SAGAW_48bit         (0x4ULL << VTD_CAP_SAGAW_SHIFT)
-#define VTD_CAP_SAGAW               VTD_CAP_SAGAW_39bit

 /* IQT_REG */
 #define VTD_IQT_QT(val)             (((val) >> 4) & 0x7fffULL)

 /* IQA_REG */
-#define VTD_IQA_IQA_MASK            (VTD_HAW_MASK ^ 0xfffULL)
+#define VTD_IQA_IQA_MASK(aw)        (VTD_HAW_MASK(aw) ^ 0xfffULL)
 #define VTD_IQA_QS                  0x7ULL

 /* IQH_REG */
@ -252,7 +250,7 @@
 #define VTD_FRCD_SID_MASK       0xffffULL
 #define VTD_FRCD_SID(val)       ((val) & VTD_FRCD_SID_MASK)
 /* For the low 64-bit of 128-bit */
-#define VTD_FRCD_FI(val)        ((val) & (((1ULL << VTD_MGAW) - 1) ^ 0xfffULL))
+#define VTD_FRCD_FI(val)        ((val) & ~0xfffULL)

 /* DMA Remapping Fault Conditions */
 typedef enum VTDFaultReason {
@ -360,8 +358,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_IOTLB_DOMAIN       (2ULL << 4)
 #define VTD_INV_DESC_IOTLB_PAGE         (3ULL << 4)
 #define VTD_INV_DESC_IOTLB_DID(val)     (((val) >> 16) & VTD_DOMAIN_ID_MASK)
-#define VTD_INV_DESC_IOTLB_ADDR(val)    ((val) & ~0xfffULL & \
-                                         ((1ULL << VTD_MGAW) - 1))
+#define VTD_INV_DESC_IOTLB_ADDR(val)    ((val) & ~0xfffULL)
 #define VTD_INV_DESC_IOTLB_AM(val)      ((val) & 0x3fULL)
 #define VTD_INV_DESC_IOTLB_RSVD_LO      0xffffffff0000ff00ULL
 #define VTD_INV_DESC_IOTLB_RSVD_HI      0xf80ULL
@ -373,6 +370,24 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_HI 0xffeULL
 #define VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO 0xffff0000ffe0fff8

+/* Rsvd field masks for spte */
+#define VTD_SPTE_PAGE_L1_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L2_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L3_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_PAGE_L4_RSVD_MASK(aw) \
+        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L1_RSVD_MASK(aw) \
+        (0x800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L2_RSVD_MASK(aw) \
+        (0x1ff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L3_RSVD_MASK(aw) \
+        (0x3ffff800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+#define VTD_SPTE_LPAGE_L4_RSVD_MASK(aw) \
+        (0x880ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
    uint16_t domain_id;
@ -403,7 +418,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_ROOT_ENTRY_CTP          (~0xfffULL)

 #define VTD_ROOT_ENTRY_NR           (VTD_PAGE_SIZE / sizeof(VTDRootEntry))
-#define VTD_ROOT_ENTRY_RSVD         (0xffeULL | ~VTD_HAW_MASK)
+#define VTD_ROOT_ENTRY_RSVD(aw)     (0xffeULL | ~VTD_HAW_MASK(aw))

 /* Masks for struct VTDContextEntry */
 /* lo */
@ -415,7 +430,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_CONTEXT_TT_PASS_THROUGH (2ULL << 2)
 /* Second Level Page Translation Pointer*/
 #define VTD_CONTEXT_ENTRY_SLPTPTR   (~0xfffULL)
-#define VTD_CONTEXT_ENTRY_RSVD_LO   (0xff0ULL | ~VTD_HAW_MASK)
+#define VTD_CONTEXT_ENTRY_RSVD_LO(aw) (0xff0ULL | ~VTD_HAW_MASK(aw))
 /* hi */
 #define VTD_CONTEXT_ENTRY_AW        7ULL /* Adjusted guest-address-width */
 #define VTD_CONTEXT_ENTRY_DID(val)  (((val) >> 8) & VTD_DOMAIN_ID_MASK)
@ -439,7 +454,7 @@ typedef struct VTDRootEntry VTDRootEntry;
 #define VTD_SL_RW_MASK              3ULL
 #define VTD_SL_R                    1ULL
 #define VTD_SL_W                    (1ULL << 1)
-#define VTD_SL_PT_BASE_ADDR_MASK    (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK)
+#define VTD_SL_PT_BASE_ADDR_MASK(aw) (~(VTD_PAGE_SIZE - 1) & VTD_HAW_MASK(aw))
 #define VTD_SL_IGN_COM              0xbff0000000000000ULL

 #endif
--- a/hw/i386/multiboot.c
+++ b/hw/i386/multiboot.c
@ -31,12 +31,13 @@
 #include "hw/loader.h"
 #include "elf.h"
 #include "sysemu/sysemu.h"
+#include "qemu/error-report.h"

 /* Show multiboot debug output */
 //#define DEBUG_MULTIBOOT

 #ifdef DEBUG_MULTIBOOT
-#define mb_debug(a...) fprintf(stderr, ## a)
+#define mb_debug(a...) error_report(a)
 #else
 #define mb_debug(a...)
 #endif
@ -137,7 +138,7 @@ static void mb_add_mod(MultibootState *s,
    stl_p(p + MB_MOD_END,     end);
    stl_p(p + MB_MOD_CMDLINE, cmdline_phys);

-    mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx"\n",
+    mb_debug("mod%02d: "TARGET_FMT_plx" - "TARGET_FMT_plx,
             s->mb_mods_count, start, end);

    s->mb_mods_count++;
@ -179,12 +180,12 @@ int load_multiboot(FWCfgState *fw_cfg,
    if (!is_multiboot)
        return 0; /* no multiboot */

-    mb_debug("qemu: I believe we found a multiboot image!\n");
+    mb_debug("qemu: I believe we found a multiboot image!");
    memset(bootinfo, 0, sizeof(bootinfo));
    memset(&mbs, 0, sizeof(mbs));

    if (flags & 0x00000004) { /* MULTIBOOT_HEADER_HAS_VBE */
-        fprintf(stderr, "qemu: multiboot knows VBE. we don't.\n");
+        error_report("qemu: multiboot knows VBE. we don't.");
    }
    if (!(flags & 0x00010000)) { /* MULTIBOOT_HEADER_HAS_ADDR */
        uint64_t elf_entry;
@ -193,7 +194,7 @@ int load_multiboot(FWCfgState *fw_cfg,
        fclose(f);

        if (((struct elf64_hdr*)header)->e_machine == EM_X86_64) {
-            fprintf(stderr, "Cannot load x86-64 image, give a 32bit one.\n");
+            error_report("Cannot load x86-64 image, give a 32bit one.");
            exit(1);
        }

@ -201,7 +202,7 @@ int load_multiboot(FWCfgState *fw_cfg,
                               &elf_low, &elf_high, 0, I386_ELF_MACHINE,
                               0, 0);
        if (kernel_size < 0) {
-            fprintf(stderr, "Error while loading elf kernel\n");
+            error_report("Error while loading elf kernel");
            exit(1);
        }
        mh_load_addr = elf_low;
@ -210,12 +211,13 @@ int load_multiboot(FWCfgState *fw_cfg,

        mbs.mb_buf = g_malloc(mb_kernel_size);
        if (rom_copy(mbs.mb_buf, mh_load_addr, mb_kernel_size) != mb_kernel_size) {
-            fprintf(stderr, "Error while fetching elf kernel from rom\n");
+            error_report("Error while fetching elf kernel from rom");
            exit(1);
        }

-        mb_debug("qemu: loading multiboot-elf kernel (%#x bytes) with entry %#zx\n",
-                  mb_kernel_size, (size_t)mh_entry_addr);
+        mb_debug("qemu: loading multiboot-elf kernel "
+                 "(%#x bytes) with entry %#zx",
+                 mb_kernel_size, (size_t)mh_entry_addr);
    } else {
        /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_ADDR. */
        uint32_t mh_header_addr = ldl_p(header+i+12);
@ -224,7 +226,11 @@ int load_multiboot(FWCfgState *fw_cfg,

        mh_load_addr = ldl_p(header+i+16);
        if (mh_header_addr < mh_load_addr) {
-            fprintf(stderr, "invalid mh_load_addr address\n");
+            error_report("invalid load_addr address");
+            exit(1);
+        }
+        if (mh_header_addr - mh_load_addr > i) {
+            error_report("invalid header_addr address");
            exit(1);
        }

@ -233,43 +239,43 @@ int load_multiboot(FWCfgState *fw_cfg,
        mh_entry_addr = ldl_p(header+i+28);

        if (mh_load_end_addr) {
-            if (mh_bss_end_addr < mh_load_addr) {
-                fprintf(stderr, "invalid mh_bss_end_addr address\n");
-                exit(1);
-            }
-            mb_kernel_size = mh_bss_end_addr - mh_load_addr;
-
            if (mh_load_end_addr < mh_load_addr) {
-                fprintf(stderr, "invalid mh_load_end_addr address\n");
+                error_report("invalid load_end_addr address");
                exit(1);
            }
            mb_load_size = mh_load_end_addr - mh_load_addr;
        } else {
            if (kernel_file_size < mb_kernel_text_offset) {
-                fprintf(stderr, "invalid kernel_file_size\n");
+                error_report("invalid kernel_file_size");
                exit(1);
            }
-            mb_kernel_size = kernel_file_size - mb_kernel_text_offset;
-            mb_load_size = mb_kernel_size;
+            mb_load_size = kernel_file_size - mb_kernel_text_offset;
+        }
+        if (mb_load_size > UINT32_MAX - mh_load_addr) {
+            error_report("kernel does not fit in address space");
+            exit(1);
+        }
+        if (mh_bss_end_addr) {
+            if (mh_bss_end_addr < (mh_load_addr + mb_load_size)) {
+                error_report("invalid bss_end_addr address");
+                exit(1);
+            }
+            mb_kernel_size = mh_bss_end_addr - mh_load_addr;
+        } else {
+            mb_kernel_size = mb_load_size;
        }

-        /* Valid if mh_flags sets MULTIBOOT_HEADER_HAS_VBE.
-        uint32_t mh_mode_type = ldl_p(header+i+32);
-        uint32_t mh_width = ldl_p(header+i+36);
-        uint32_t mh_height = ldl_p(header+i+40);
-        uint32_t mh_depth = ldl_p(header+i+44); */
-
-        mb_debug("multiboot: mh_header_addr = %#x\n", mh_header_addr);
-        mb_debug("multiboot: mh_load_addr = %#x\n", mh_load_addr);
-        mb_debug("multiboot: mh_load_end_addr = %#x\n", mh_load_end_addr);
-        mb_debug("multiboot: mh_bss_end_addr = %#x\n", mh_bss_end_addr);
-        mb_debug("qemu: loading multiboot kernel (%#x bytes) at %#x\n",
+        mb_debug("multiboot: header_addr = %#x", mh_header_addr);
+        mb_debug("multiboot: load_addr = %#x", mh_load_addr);
+        mb_debug("multiboot: load_end_addr = %#x", mh_load_end_addr);
+        mb_debug("multiboot: bss_end_addr = %#x", mh_bss_end_addr);
+        mb_debug("qemu: loading multiboot kernel (%#x bytes) at %#x",
                 mb_load_size, mh_load_addr);

        mbs.mb_buf = g_malloc(mb_kernel_size);
        fseek(f, mb_kernel_text_offset, SEEK_SET);
        if (fread(mbs.mb_buf, 1, mb_load_size, f) != mb_load_size) {
-            fprintf(stderr, "fread() failed\n");
+            error_report("fread() failed");
            exit(1);
        }
        memset(mbs.mb_buf + mb_load_size, 0, mb_kernel_size - mb_load_size);
@ -323,10 +329,10 @@ int load_multiboot(FWCfgState *fw_cfg,
            hwaddr c = mb_add_cmdline(&mbs, tmpbuf);
            if ((next_space = strchr(tmpbuf, ' ')))
                *next_space = '\0';
-            mb_debug("multiboot loading module: %s\n", tmpbuf);
+            mb_debug("multiboot loading module: %s", tmpbuf);
            mb_mod_length = get_image_size(tmpbuf);
            if (mb_mod_length < 0) {
-                fprintf(stderr, "Failed to open file '%s'\n", tmpbuf);
+                error_report("Failed to open file '%s'", tmpbuf);
                exit(1);
            }

@ -337,7 +343,7 @@ int load_multiboot(FWCfgState *fw_cfg,
            mb_add_mod(&mbs, mbs.mb_buf_phys + offs,
                       mbs.mb_buf_phys + offs + mb_mod_length, c);

-            mb_debug("mod_start: %p\nmod_end:   %p\n  cmdline: "TARGET_FMT_plx"\n",
+            mb_debug("mod_start: %p\nmod_end:   %p\n  cmdline: "TARGET_FMT_plx,
                     (char *)mbs.mb_buf + offs,
                     (char *)mbs.mb_buf + offs + mb_mod_length, c);
            initrd_filename = next_initrd+1;
@ -365,10 +371,11 @@ int load_multiboot(FWCfgState *fw_cfg,
    stl_p(bootinfo + MBI_BOOT_DEVICE, 0x8000ffff); /* XXX: use the -boot switch? */
    stl_p(bootinfo + MBI_MMAP_ADDR,   ADDR_E820_MAP);

-    mb_debug("multiboot: mh_entry_addr = %#x\n", mh_entry_addr);
-    mb_debug("           mb_buf_phys   = "TARGET_FMT_plx"\n", mbs.mb_buf_phys);
-    mb_debug("           mod_start     = "TARGET_FMT_plx"\n", mbs.mb_buf_phys + mbs.offset_mods);
-    mb_debug("           mb_mods_count = %d\n", mbs.mb_mods_count);
+    mb_debug("multiboot: entry_addr = %#x", mh_entry_addr);
+    mb_debug("           mb_buf_phys   = "TARGET_FMT_plx, mbs.mb_buf_phys);
+    mb_debug("           mod_start     = "TARGET_FMT_plx,
+             mbs.mb_buf_phys + mbs.offset_mods);
+    mb_debug("           mb_mods_count = %d", mbs.mb_mods_count);

    /* save bootinfo off the stack */
    mb_bootinfo_data = g_memdup(bootinfo, sizeof(bootinfo));
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@ -39,9 +39,10 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry"
 vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
 vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
 vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
-vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
 vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
 vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
 vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
 vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@ -533,13 +533,6 @@ static void ahci_check_cmd_bh(void *opaque)
    qemu_bh_delete(ad->check_bh);
    ad->check_bh = NULL;

-    if ((ad->busy_slot != -1) &&
-        !(ad->port.ifs[0].status & (BUSY_STAT|DRQ_STAT))) {
-        /* no longer busy */
-        ad->port_regs.cmd_issue &= ~(1 << ad->busy_slot);
-        ad->busy_slot = -1;
-    }
-
    check_cmd(ad->hba, ad->port_no);
 }

@ -1426,6 +1419,12 @@ static void ahci_cmd_done(IDEDMA *dma)

    trace_ahci_cmd_done(ad->hba, ad->port_no);

+    /* no longer busy */
+    if (ad->busy_slot != -1) {
+        ad->port_regs.cmd_issue &= ~(1 << ad->busy_slot);
+        ad->busy_slot = -1;
+    }
+
    /* update d2h status */
    ahci_write_fis_d2h(ad);

--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@ -1261,7 +1261,8 @@ static MemTxResult gic_cpu_read(GICState *s, int cpu, int offset,
    default:
        qemu_log_mask(LOG_GUEST_ERROR,
                      "gic_cpu_read: Bad offset %x\n", (int)offset);
-        return MEMTX_ERROR;
+        *data = 0;
+        break;
    }
    return MEMTX_OK;
 }
@ -1329,7 +1330,7 @@ static MemTxResult gic_cpu_write(GICState *s, int cpu, int offset,
    default:
        qemu_log_mask(LOG_GUEST_ERROR,
                      "gic_cpu_write: Bad offset %x\n", (int)offset);
-        return MEMTX_ERROR;
+        return MEMTX_OK;
    }
    gic_update(s);
    return MEMTX_OK;
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@ -27,6 +27,7 @@
 #include "hw/intc/arm_gicv3_common.h"
 #include "gicv3_internal.h"
 #include "hw/arm/linux-boot-if.h"
+#include "sysemu/kvm.h"

 static int gicv3_pre_save(void *opaque)
 {
@ -141,6 +142,79 @@ static const VMStateDescription vmstate_gicv3_cpu = {
    }
 };

+static int gicv3_gicd_no_migration_shift_bug_pre_load(void *opaque)
+{
+    GICv3State *cs = opaque;
+
+   /*
+    * The gicd_no_migration_shift_bug flag is used for migration compatibility
+    * for old version QEMU which may have the GICD bmp shift bug under KVM mode.
+    * Strictly, what we want to know is whether the migration source is using
+    * KVM. Since we don't have any way to determine that, we look at whether the
+    * destination is using KVM; this is close enough because for the older QEMU
+    * versions with this bug KVM -> TCG migration didn't work anyway. If the
+    * source is a newer QEMU without this bug it will transmit the migration
+    * subsection which sets the flag to true; otherwise it will remain set to
+    * the value we select here.
+    */
+    if (kvm_enabled()) {
+        cs->gicd_no_migration_shift_bug = false;
+    }
+
+    return 0;
+}
+
+static int gicv3_gicd_no_migration_shift_bug_post_load(void *opaque,
+                                                       int version_id)
+{
+    GICv3State *cs = opaque;
+
+    if (cs->gicd_no_migration_shift_bug) {
+        return 0;
+    }
+
+    /* Older versions of QEMU had a bug in the handling of state save/restore
+     * to the KVM GICv3: they got the offset in the bitmap arrays wrong,
+     * so that instead of the data for external interrupts 32 and up
+     * starting at bit position 32 in the bitmap, it started at bit
+     * position 64. If we're receiving data from a QEMU with that bug,
+     * we must move the data down into the right place.
+     */
+    memmove(cs->group, (uint8_t *)cs->group + GIC_INTERNAL / 8,
+            sizeof(cs->group) - GIC_INTERNAL / 8);
+    memmove(cs->grpmod, (uint8_t *)cs->grpmod + GIC_INTERNAL / 8,
+            sizeof(cs->grpmod) - GIC_INTERNAL / 8);
+    memmove(cs->enabled, (uint8_t *)cs->enabled + GIC_INTERNAL / 8,
+            sizeof(cs->enabled) - GIC_INTERNAL / 8);
+    memmove(cs->pending, (uint8_t *)cs->pending + GIC_INTERNAL / 8,
+            sizeof(cs->pending) - GIC_INTERNAL / 8);
+    memmove(cs->active, (uint8_t *)cs->active + GIC_INTERNAL / 8,
+            sizeof(cs->active) - GIC_INTERNAL / 8);
+    memmove(cs->edge_trigger, (uint8_t *)cs->edge_trigger + GIC_INTERNAL / 8,
+            sizeof(cs->edge_trigger) - GIC_INTERNAL / 8);
+
+    /*
+     * While this new version QEMU doesn't have this kind of bug as we fix it,
+     * so it needs to set the flag to true to indicate that and it's necessary
+     * for next migration to work from this new version QEMU.
+     */
+    cs->gicd_no_migration_shift_bug = true;
+
+    return 0;
+}
+
+const VMStateDescription vmstate_gicv3_gicd_no_migration_shift_bug = {
+    .name = "arm_gicv3/gicd_no_migration_shift_bug",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .pre_load = gicv3_gicd_no_migration_shift_bug_pre_load,
+    .post_load = gicv3_gicd_no_migration_shift_bug_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(gicd_no_migration_shift_bug, GICv3State),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_gicv3 = {
    .name = "arm_gicv3",
    .version_id = 1,
@ -165,6 +239,10 @@ static const VMStateDescription vmstate_gicv3 = {
        VMSTATE_STRUCT_VARRAY_POINTER_UINT32(cpu, GICv3State, num_cpu,
                                             vmstate_gicv3_cpu, GICv3CPUState),
        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription * []) {
+        &vmstate_gicv3_gicd_no_migration_shift_bug,
+        NULL
    }
 };

@ -364,6 +442,7 @@ static void arm_gicv3_common_reset(DeviceState *dev)
            gicv3_gicd_group_set(s, i);
        }
    }
+    s->gicd_no_migration_shift_bug = true;
 }

 static void arm_gic_common_linux_init(ARMLinuxBootIf *obj,
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@ -431,7 +431,7 @@ static uint64_t icv_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
    GICv3CPUState *cs = icc_cs_from_env(env);
    int regno = ri->opc2 & 3;
-    int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+    int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;
    uint64_t value = cs->ich_apr[grp][regno];

    trace_gicv3_icv_ap_read(ri->crm & 1, regno, gicv3_redist_affid(cs), value);
@ -443,7 +443,7 @@ static void icv_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
 {
    GICv3CPUState *cs = icc_cs_from_env(env);
    int regno = ri->opc2 & 3;
-    int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+    int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;

    trace_gicv3_icv_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs), value);

@ -1465,7 +1465,7 @@ static uint64_t icc_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
    uint64_t value;

    int regno = ri->opc2 & 3;
-    int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1;
+    int grp = (ri->crm & 1) ? GICV3_G1 : GICV3_G0;

    if (icv_access(env, grp == GICV3_G0 ? HCR_FMO : HCR_IMO)) {
        return icv_ap_read(env, ri);
@ -1487,7 +1487,7 @@ static void icc_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
    GICv3CPUState *cs = icc_cs_from_env(env);

    int regno = ri->opc2 & 3;
-    int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1;
+    int grp = (ri->crm & 1) ? GICV3_G1 : GICV3_G0;

    if (icv_access(env, grp == GICV3_G0 ? HCR_FMO : HCR_IMO)) {
        icv_ap_write(env, ri, value);
@ -2296,7 +2296,7 @@ static uint64_t ich_ap_read(CPUARMState *env, const ARMCPRegInfo *ri)
 {
    GICv3CPUState *cs = icc_cs_from_env(env);
    int regno = ri->opc2 & 3;
-    int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+    int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;
    uint64_t value;

    value = cs->ich_apr[grp][regno];
@ -2309,7 +2309,7 @@ static void ich_ap_write(CPUARMState *env, const ARMCPRegInfo *ri,
 {
    GICv3CPUState *cs = icc_cs_from_env(env);
    int regno = ri->opc2 & 3;
-    int grp = ri->crm & 1 ? GICV3_G0 : GICV3_G1NS;
+    int grp = (ri->crm & 1) ? GICV3_G1NS : GICV3_G0;

    trace_gicv3_ich_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs), value);

--- a/hw/intc/arm_gicv3_dist.c
+++ b/hw/intc/arm_gicv3_dist.c
@ -817,6 +817,13 @@ MemTxResult gicv3_dist_read(void *opaque, hwaddr offset, uint64_t *data,
                      "%s: invalid guest read at offset " TARGET_FMT_plx
                      "size %u\n", __func__, offset, size);
        trace_gicv3_dist_badread(offset, size, attrs.secure);
+        /* The spec requires that reserved registers are RAZ/WI;
+         * so use MEMTX_ERROR returns from leaf functions as a way to
+         * trigger the guest-error logging but don't return it to
+         * the caller, or we'll cause a spurious guest data abort.
+         */
+        r = MEMTX_OK;
+        *data = 0;
    } else {
        trace_gicv3_dist_read(offset, *data, size, attrs.secure);
    }
@ -852,6 +859,12 @@ MemTxResult gicv3_dist_write(void *opaque, hwaddr offset, uint64_t data,
                      "%s: invalid guest write at offset " TARGET_FMT_plx
                      "size %u\n", __func__, offset, size);
        trace_gicv3_dist_badwrite(offset, data, size, attrs.secure);
+        /* The spec requires that reserved registers are RAZ/WI;
+         * so use MEMTX_ERROR returns from leaf functions as a way to
+         * trigger the guest-error logging but don't return it to
+         * the caller, or we'll cause a spurious guest data abort.
+         */
+        r = MEMTX_OK;
    } else {
        trace_gicv3_dist_write(offset, data, size, attrs.secure);
    }
--- a/hw/intc/arm_gicv3_its_common.c
+++ b/hw/intc/arm_gicv3_its_common.c
@ -67,7 +67,8 @@ static MemTxResult gicv3_its_trans_read(void *opaque, hwaddr offset,
                                        MemTxAttrs attrs)
 {
    qemu_log_mask(LOG_GUEST_ERROR, "ITS read at offset 0x%"PRIx64"\n", offset);
-    return MEMTX_ERROR;
+    *data = 0;
+    return MEMTX_OK;
 }

 static MemTxResult gicv3_its_trans_write(void *opaque, hwaddr offset,
@ -82,15 +83,12 @@ static MemTxResult gicv3_its_trans_write(void *opaque, hwaddr offset,
        if (ret <= 0) {
            qemu_log_mask(LOG_GUEST_ERROR,
                          "ITS: Error sending MSI: %s\n", strerror(-ret));
-            return MEMTX_DECODE_ERROR;
        }
-
-        return MEMTX_OK;
    } else {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "ITS write at bad offset 0x%"PRIx64"\n", offset);
-        return MEMTX_DECODE_ERROR;
    }
+    return MEMTX_OK;
 }

 static const MemoryRegionOps gicv3_its_trans_ops = {
--- a/hw/intc/arm_gicv3_kvm.c
+++ b/hw/intc/arm_gicv3_kvm.c
@ -135,7 +135,14 @@ static void kvm_dist_get_priority(GICv3State *s, uint32_t offset, uint8_t *bmp)
    uint32_t reg, *field;
    int irq;

-    field = (uint32_t *)bmp;
+    /* For the KVM GICv3, affinity routing is always enabled, and the first 8
+     * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding
+     * functionality is replaced by GICR_IPRIORITYR<n>. It doesn't need to
+     * sync them. So it needs to skip the field of GIC_INTERNAL irqs in bmp and
+     * offset.
+     */
+    field = (uint32_t *)(bmp + GIC_INTERNAL);
+    offset += (GIC_INTERNAL * 8) / 8;
    for_each_dist_irq_reg(irq, s->num_irq, 8) {
        kvm_gicd_access(s, offset, &reg, false);
        *field = reg;
@ -149,7 +156,14 @@ static void kvm_dist_put_priority(GICv3State *s, uint32_t offset, uint8_t *bmp)
    uint32_t reg, *field;
    int irq;

-    field = (uint32_t *)bmp;
+    /* For the KVM GICv3, affinity routing is always enabled, and the first 8
+     * GICD_IPRIORITYR<n> registers are always RAZ/WI. The corresponding
+     * functionality is replaced by GICR_IPRIORITYR<n>. It doesn't need to
+     * sync them. So it needs to skip the field of GIC_INTERNAL irqs in bmp and
+     * offset.
+     */
+    field = (uint32_t *)(bmp + GIC_INTERNAL);
+    offset += (GIC_INTERNAL * 8) / 8;
    for_each_dist_irq_reg(irq, s->num_irq, 8) {
        reg = *field;
        kvm_gicd_access(s, offset, &reg, true);
@ -164,6 +178,14 @@ static void kvm_dist_get_edge_trigger(GICv3State *s, uint32_t offset,
    uint32_t reg;
    int irq;

+    /* For the KVM GICv3, affinity routing is always enabled, and the first 2
+     * GICD_ICFGR<n> registers are always RAZ/WI. The corresponding
+     * functionality is replaced by GICR_ICFGR<n>. It doesn't need to sync
+     * them. So it should increase the offset to skip GIC_INTERNAL irqs.
+     * This matches the for_each_dist_irq_reg() macro which also skips the
+     * first GIC_INTERNAL irqs.
+     */
+    offset += (GIC_INTERNAL * 2) / 8;
    for_each_dist_irq_reg(irq, s->num_irq, 2) {
        kvm_gicd_access(s, offset, &reg, false);
        reg = half_unshuffle32(reg >> 1);
@ -181,6 +203,14 @@ static void kvm_dist_put_edge_trigger(GICv3State *s, uint32_t offset,
    uint32_t reg;
    int irq;

+    /* For the KVM GICv3, affinity routing is always enabled, and the first 2
+     * GICD_ICFGR<n> registers are always RAZ/WI. The corresponding
+     * functionality is replaced by GICR_ICFGR<n>. It doesn't need to sync
+     * them. So it should increase the offset to skip GIC_INTERNAL irqs.
+     * This matches the for_each_dist_irq_reg() macro which also skips the
+     * first GIC_INTERNAL irqs.
+     */
+    offset += (GIC_INTERNAL * 2) / 8;
    for_each_dist_irq_reg(irq, s->num_irq, 2) {
        reg = *gic_bmp_ptr32(bmp, irq);
        if (irq % 32 != 0) {
@ -222,6 +252,15 @@ static void kvm_dist_getbmp(GICv3State *s, uint32_t offset, uint32_t *bmp)
    uint32_t reg;
    int irq;

+    /* For the KVM GICv3, affinity routing is always enabled, and the
+     * GICD_IGROUPR0/GICD_IGRPMODR0/GICD_ISENABLER0/GICD_ISPENDR0/
+     * GICD_ISACTIVER0 registers are always RAZ/WI. The corresponding
+     * functionality is replaced by the GICR registers. It doesn't need to sync
+     * them. So it should increase the offset to skip GIC_INTERNAL irqs.
+     * This matches the for_each_dist_irq_reg() macro which also skips the
+     * first GIC_INTERNAL irqs.
+     */
+    offset += (GIC_INTERNAL * 1) / 8;
    for_each_dist_irq_reg(irq, s->num_irq, 1) {
        kvm_gicd_access(s, offset, &reg, false);
        *gic_bmp_ptr32(bmp, irq) = reg;
@ -235,6 +274,19 @@ static void kvm_dist_putbmp(GICv3State *s, uint32_t offset,
    uint32_t reg;
    int irq;

+    /* For the KVM GICv3, affinity routing is always enabled, and the
+     * GICD_IGROUPR0/GICD_IGRPMODR0/GICD_ISENABLER0/GICD_ISPENDR0/
+     * GICD_ISACTIVER0 registers are always RAZ/WI. The corresponding
+     * functionality is replaced by the GICR registers. It doesn't need to sync
+     * them. So it should increase the offset and clroffset to skip GIC_INTERNAL
+     * irqs. This matches the for_each_dist_irq_reg() macro which also skips the
+     * first GIC_INTERNAL irqs.
+     */
+    offset += (GIC_INTERNAL * 1) / 8;
+    if (clroffset != 0) {
+        clroffset += (GIC_INTERNAL * 1) / 8;
+    }
+
    for_each_dist_irq_reg(irq, s->num_irq, 1) {
        /* If this bitmap is a set/clear register pair, first write to the
         * clear-reg to clear all bits before using the set-reg to write
@ -243,6 +295,7 @@ static void kvm_dist_putbmp(GICv3State *s, uint32_t offset,
        if (clroffset != 0) {
            reg = 0;
            kvm_gicd_access(s, clroffset, &reg, true);
+            clroffset += 4;
        }
        reg = *gic_bmp_ptr32(bmp, irq);
        kvm_gicd_access(s, offset, &reg, true);
--- a/hw/intc/arm_gicv3_redist.c
+++ b/hw/intc/arm_gicv3_redist.c
@ -455,6 +455,13 @@ MemTxResult gicv3_redist_read(void *opaque, hwaddr offset, uint64_t *data,
                      "size %u\n", __func__, offset, size);
        trace_gicv3_redist_badread(gicv3_redist_affid(cs), offset,
                                   size, attrs.secure);
+        /* The spec requires that reserved registers are RAZ/WI;
+         * so use MEMTX_ERROR returns from leaf functions as a way to
+         * trigger the guest-error logging but don't return it to
+         * the caller, or we'll cause a spurious guest data abort.
+         */
+        r = MEMTX_OK;
+        *data = 0;
    } else {
        trace_gicv3_redist_read(gicv3_redist_affid(cs), offset, *data,
                                size, attrs.secure);
@ -505,6 +512,12 @@ MemTxResult gicv3_redist_write(void *opaque, hwaddr offset, uint64_t data,
                      "size %u\n", __func__, offset, size);
        trace_gicv3_redist_badwrite(gicv3_redist_affid(cs), offset, data,
                                    size, attrs.secure);
+        /* The spec requires that reserved registers are RAZ/WI;
+         * so use MEMTX_ERROR returns from leaf functions as a way to
+         * trigger the guest-error logging but don't return it to
+         * the caller, or we'll cause a spurious guest data abort.
+         */
+        r = MEMTX_OK;
    } else {
        trace_gicv3_redist_write(gicv3_redist_affid(cs), offset, data,
                                 size, attrs.secure);
--- a/hw/intc/openpic_kvm.c
+++ b/hw/intc/openpic_kvm.c
@ -124,10 +124,6 @@ static void kvm_openpic_region_add(MemoryListener *listener,
    uint64_t reg_base;
    int ret;

-    if (section->fv != address_space_to_flatview(&address_space_memory)) {
-        abort();
-    }
-
    /* Ignore events on regions that are not us */
    if (section->mr != &opp->mem) {
        return;
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@ -422,6 +422,7 @@ static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
 static void virtio_net_reset(VirtIODevice *vdev)
 {
    VirtIONet *n = VIRTIO_NET(vdev);
+    int i;

    /* Reset back to compatibility mode */
    n->promisc = 1;
@ -445,6 +446,16 @@ static void virtio_net_reset(VirtIODevice *vdev)
    memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
    qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
    memset(n->vlans, 0, MAX_VLAN >> 3);
+
+    /* Flush any async TX */
+    for (i = 0;  i < n->max_queues; i++) {
+        NetClientState *nc = qemu_get_subqueue(n->nic, i);
+
+        if (nc->peer) {
+            qemu_flush_or_purge_queued_packets(nc->peer, true);
+            assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
+        }
+    }
 }

 static void peer_test_vnet_hdr(VirtIONet *n)
--- a/hw/pci-bridge/gen_pcie_root_port.c
+++ b/hw/pci-bridge/gen_pcie_root_port.c
@ -74,8 +74,13 @@ static void gen_rp_realize(DeviceState *dev, Error **errp)
    PCIDevice *d = PCI_DEVICE(dev);
    GenPCIERootPort *grp = GEN_PCIE_ROOT_PORT(d);
    PCIERootPortClass *rpc = PCIE_ROOT_PORT_GET_CLASS(d);
+    Error *local_err = NULL;

-    rpc->parent_realize(dev, errp);
+    rpc->parent_realize(dev, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }

    int rc = pci_bridge_qemu_reserve_cap_init(d, 0, grp->bus_reserve,
            grp->io_reserve, grp->mem_reserve, grp->pref32_reserve,
--- a/hw/pci-bridge/i82801b11.c
+++ b/hw/pci-bridge/i82801b11.c
@ -98,6 +98,7 @@ static void i82801b11_bridge_class_init(ObjectClass *klass, void *data)
    k->realize = i82801b11_bridge_realize;
    k->config_write = pci_bridge_write_config;
    dc->vmsd = &i82801b11_bridge_dev_vmstate;
+    dc->reset = pci_bridge_reset;
    set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
 }

--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@ -1,7 +1,7 @@
 # shared objects
 obj-y += ppc.o ppc_booke.o fdt.o
 # IBM pSeries (sPAPR)
-obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o
+obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o spapr_events.o
 obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
 obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o
 obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@ -100,6 +100,21 @@

 #define PHANDLE_XICP            0x00001111

+/* These two functions implement the VCPU id numbering: one to compute them
+ * all and one to identify thread 0 of a VCORE. Any change to the first one
+ * is likely to have an impact on the second one, so let's keep them close.
+ */
+static int spapr_vcpu_id(sPAPRMachineState *spapr, int cpu_index)
+{
+    return
+        (cpu_index / smp_threads) * spapr->vsmt + cpu_index % smp_threads;
+}
+static bool spapr_is_thread0_in_vcore(sPAPRMachineState *spapr,
+                                      PowerPCCPU *cpu)
+{
+    return spapr_get_vcpu_id(cpu) % spapr->vsmt == 0;
+}
+
 static ICSState *spapr_ics_create(sPAPRMachineState *spapr,
                                  const char *type_ics,
                                  int nr_irqs, Error **errp)
@ -161,15 +176,14 @@ static void pre_2_10_vmstate_unregister_dummy_icp(int i)
                       (void *)(uintptr_t) i);
 }

-static inline int xics_max_server_number(void)
+static int xics_max_server_number(sPAPRMachineState *spapr)
 {
-    return DIV_ROUND_UP(max_cpus * kvmppc_smt_threads(), smp_threads);
+    return DIV_ROUND_UP(max_cpus * spapr->vsmt, smp_threads);
 }

 static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
 {
    sPAPRMachineState *spapr = SPAPR_MACHINE(machine);
-    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);

    if (kvm_enabled()) {
        if (machine_kernel_irqchip_allowed(machine) &&
@ -191,17 +205,6 @@ static void xics_system_init(MachineState *machine, int nr_irqs, Error **errp)
            return;
        }
    }
-
-    if (smc->pre_2_10_has_unused_icps) {
-        int i;
-
-        for (i = 0; i < xics_max_server_number(); i++) {
-            /* Dummy entries get deregistered when real ICPState objects
-             * are registered during CPU core hotplug.
-             */
-            pre_2_10_vmstate_register_dummy_icp(i);
-        }
-    }
 }

 static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
@ -210,7 +213,7 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,
    int i, ret = 0;
    uint32_t servers_prop[smt_threads];
    uint32_t gservers_prop[smt_threads * 2];
-    int index = spapr_vcpu_id(cpu);
+    int index = spapr_get_vcpu_id(cpu);

    if (cpu->compat_pvr) {
        ret = fdt_setprop_cell(fdt, offset, "cpu-version", cpu->compat_pvr);
@ -239,7 +242,7 @@ static int spapr_fixup_cpu_smt_dt(void *fdt, int offset, PowerPCCPU *cpu,

 static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu)
 {
-    int index = spapr_vcpu_id(cpu);
+    int index = spapr_get_vcpu_id(cpu);
    uint32_t associativity[] = {cpu_to_be32(0x5),
                                cpu_to_be32(0x0),
                                cpu_to_be32(0x0),
@ -253,7 +256,9 @@ static int spapr_fixup_cpu_numa_dt(void *fdt, int offset, PowerPCCPU *cpu)
 }

 /* Populate the "ibm,pa-features" property */
-static void spapr_populate_pa_features(PowerPCCPU *cpu, void *fdt, int offset,
+static void spapr_populate_pa_features(sPAPRMachineState *spapr,
+                                       PowerPCCPU *cpu,
+                                       void *fdt, int offset,
                                       bool legacy_guest)
 {
    CPUPPCState *env = &cpu->env;
@ -318,7 +323,7 @@ static void spapr_populate_pa_features(PowerPCCPU *cpu, void *fdt, int offset,
         */
        pa_features[3] |= 0x20;
    }
-    if (kvmppc_has_cap_htm() && pa_size > 24) {
+    if ((spapr_get_cap(spapr, SPAPR_CAP_HTM) != 0) && pa_size > 24) {
        pa_features[24] |= 0x80;    /* Transactional memory support */
    }
    if (legacy_guest && pa_size > 40) {
@ -336,16 +341,15 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
    int ret = 0, offset, cpus_offset;
    CPUState *cs;
    char cpu_model[32];
-    int smt = kvmppc_smt_threads();
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};

    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
-        int index = spapr_vcpu_id(cpu);
-        int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
+        int index = spapr_get_vcpu_id(cpu);
+        int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));

-        if ((index % smt) != 0) {
+        if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
            continue;
        }

@ -384,8 +388,8 @@ static int spapr_fixup_cpu_dt(void *fdt, sPAPRMachineState *spapr)
            return ret;
        }

-        spapr_populate_pa_features(cpu, fdt, offset,
-                                         spapr->cas_legacy_guest_workaround);
+        spapr_populate_pa_features(spapr, cpu, fdt, offset,
+                                   spapr->cas_legacy_guest_workaround);
    }
    return ret;
 }
@ -491,7 +495,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    CPUPPCState *env = &cpu->env;
    PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cs);
-    int index = spapr_vcpu_id(cpu);
+    int index = spapr_get_vcpu_id(cpu);
    uint32_t segs[] = {cpu_to_be32(28), cpu_to_be32(40),
                       0xffffffff, 0xffffffff};
    uint32_t tbfreq = kvm_enabled() ? kvmppc_get_tbfreq()
@ -501,7 +505,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
    size_t page_sizes_prop_size;
    uint32_t vcpus_per_socket = smp_threads * smp_cores;
    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
-    int compat_smt = MIN(smp_threads, ppc_compat_max_threads(cpu));
+    int compat_smt = MIN(smp_threads, ppc_compat_max_vthreads(cpu));
    sPAPRDRConnector *drc;
    int drc_index;
    uint32_t radix_AP_encodings[PPC_PAGE_SIZES_MAX_SZ];
@ -555,20 +559,22 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
                          segs, sizeof(segs))));
    }

-    /* Advertise VMX/VSX (vector extensions) if available
-     *   0 / no property == no vector extensions
+    /* Advertise VSX (vector extensions) if available
     *   1               == VMX / Altivec available
-     *   2               == VSX available */
-    if (env->insns_flags & PPC_ALTIVEC) {
-        uint32_t vmx = (env->insns_flags2 & PPC2_VSX) ? 2 : 1;
-
-        _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", vmx)));
+     *   2               == VSX available
+     *
+     * Only CPUs for which we create core types in spapr_cpu_core.c
+     * are possible, and all of those have VMX */
+    if (spapr_get_cap(spapr, SPAPR_CAP_VSX) != 0) {
+        _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 2)));
+    } else {
+        _FDT((fdt_setprop_cell(fdt, offset, "ibm,vmx", 1)));
    }

    /* Advertise DFP (Decimal Floating Point) if available
     *   0 / no property == no DFP
     *   1               == DFP available */
-    if (env->insns_flags2 & PPC2_DFP) {
+    if (spapr_get_cap(spapr, SPAPR_CAP_DFP) != 0) {
        _FDT((fdt_setprop_cell(fdt, offset, "ibm,dfp", 1)));
    }

@ -579,7 +585,7 @@ static void spapr_populate_cpu_dt(CPUState *cs, void *fdt, int offset,
                          page_sizes_prop, page_sizes_prop_size)));
    }

-    spapr_populate_pa_features(cpu, fdt, offset, false);
+    spapr_populate_pa_features(spapr, cpu, fdt, offset, false);

    _FDT((fdt_setprop_cell(fdt, offset, "ibm,chip-id",
                           cs->cpu_index / vcpus_per_socket)));
@ -610,7 +616,6 @@ static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr)
    CPUState *cs;
    int cpus_offset;
    char *nodename;
-    int smt = kvmppc_smt_threads();

    cpus_offset = fdt_add_subnode(fdt, 0, "cpus");
    _FDT(cpus_offset);
@ -624,11 +629,11 @@ static void spapr_populate_cpus_dt_node(void *fdt, sPAPRMachineState *spapr)
     */
    CPU_FOREACH_REVERSE(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);
-        int index = spapr_vcpu_id(cpu);
+        int index = spapr_get_vcpu_id(cpu);
        DeviceClass *dc = DEVICE_GET_CLASS(cs);
        int offset;

-        if ((index % smt) != 0) {
+        if (!spapr_is_thread0_in_vcore(spapr, cpu)) {
            continue;
        }

@ -1101,7 +1106,7 @@ static void *spapr_build_fdt(sPAPRMachineState *spapr,
    _FDT(fdt_setprop_cell(fdt, 0, "#size-cells", 2));

    /* /interrupt controller */
-    spapr_dt_xics(xics_max_server_number(), fdt, PHANDLE_XICP);
+    spapr_dt_xics(xics_max_server_number(spapr), fdt, PHANDLE_XICP);

    ret = spapr_populate_memory(spapr, fdt);
    if (ret < 0) {
@ -1440,7 +1445,12 @@ static void ppc_spapr_reset(void)
    /* Check for unknown sysbus devices */
    foreach_dynamic_sysbus_device(find_unknown_sysbus_device, NULL);

-    if (kvm_enabled() && kvmppc_has_cap_mmu_radix()) {
+    spapr_caps_reset(spapr);
+
+    first_ppc_cpu = POWERPC_CPU(first_cpu);
+    if (kvm_enabled() && kvmppc_has_cap_mmu_radix() &&
+        ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
+                         spapr->max_compat_pvr)) {
        /* If using KVM with radix mode available, VCPUs can be started
         * without a HPT because KVM will start them in radix mode.
         * Set the GR bit in PATB so that we know there is no HPT. */
@ -1449,6 +1459,15 @@ static void ppc_spapr_reset(void)
        spapr_setup_hpt_and_vrma(spapr);
    }

+    /* if this reset wasn't generated by CAS, we should reset our
+     * negotiated options and start from scratch */
+    if (!spapr->cas_reboot) {
+        spapr_ovec_cleanup(spapr->ov5_cas);
+        spapr->ov5_cas = spapr_ovec_new();
+
+        ppc_set_compat(first_ppc_cpu, spapr->max_compat_pvr, &error_fatal);
+    }
+
    qemu_devices_reset();

    /* DRC reset may cause a device to be unplugged. This will cause troubles
@ -1469,15 +1488,6 @@ static void ppc_spapr_reset(void)
    rtas_addr = rtas_limit - RTAS_MAX_SIZE;
    fdt_addr = rtas_addr - FDT_MAX_SIZE;

-    /* if this reset wasn't generated by CAS, we should reset our
-     * negotiated options and start from scratch */
-    if (!spapr->cas_reboot) {
-        spapr_ovec_cleanup(spapr->ov5_cas);
-        spapr->ov5_cas = spapr_ovec_new();
-
-        ppc_set_compat_all(spapr->max_compat_pvr, &error_fatal);
-    }
-
    fdt = spapr_build_fdt(spapr, rtas_addr, spapr->rtas_size);

    spapr_load_rtas(spapr, fdt, rtas_addr);
@ -1499,7 +1509,6 @@ static void ppc_spapr_reset(void)
    g_free(fdt);

    /* Set up the entry state */
-    first_ppc_cpu = POWERPC_CPU(first_cpu);
    first_ppc_cpu->env.gpr[3] = fdt_addr;
    first_ppc_cpu->env.gpr[5] = 0;
    first_cpu->halted = 0;
@ -1552,11 +1561,28 @@ static bool spapr_vga_init(PCIBus *pci_bus, Error **errp)
    }
 }

+static int spapr_pre_load(void *opaque)
+{
+    int rc;
+
+    rc = spapr_caps_pre_load(opaque);
+    if (rc) {
+        return rc;
+    }
+
+    return 0;
+}
+
 static int spapr_post_load(void *opaque, int version_id)
 {
    sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
    int err = 0;

+    err = spapr_caps_post_migration(spapr);
+    if (err) {
+        return err;
+    }
+
    if (!object_dynamic_cast(OBJECT(spapr->ics), TYPE_ICS_KVM)) {
        CPUState *cs;
        CPU_FOREACH(cs) {
@ -1588,6 +1614,18 @@ static int spapr_post_load(void *opaque, int version_id)
    return err;
 }

+static int spapr_pre_save(void *opaque)
+{
+    int rc;
+
+    rc = spapr_caps_pre_save(opaque);
+    if (rc) {
+        return rc;
+    }
+
+    return 0;
+}
+
 static bool version_before_3(void *opaque, int version_id)
 {
    return version_id < 3;
@ -1708,7 +1746,9 @@ static const VMStateDescription vmstate_spapr = {
    .name = "spapr",
    .version_id = 3,
    .minimum_version_id = 1,
+    .pre_load = spapr_pre_load,
    .post_load = spapr_post_load,
+    .pre_save = spapr_pre_save,
    .fields = (VMStateField[]) {
        /* used to be @next_irq */
        VMSTATE_UNUSED_BUFFER(version_before_3, 0, 4),
@ -1723,6 +1763,12 @@ static const VMStateDescription vmstate_spapr = {
        &vmstate_spapr_ov5_cas,
        &vmstate_spapr_patb_entry,
        &vmstate_spapr_pending_events,
+        &vmstate_spapr_cap_htm,
+        &vmstate_spapr_cap_vsx,
+        &vmstate_spapr_cap_dfp,
+        &vmstate_spapr_cap_cfpc,
+        &vmstate_spapr_cap_sbbc,
+        &vmstate_spapr_cap_ibs,
        NULL
    }
 };
@ -2152,8 +2198,8 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
 {
    MachineState *machine = MACHINE(spapr);
    MachineClass *mc = MACHINE_GET_CLASS(machine);
+    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(machine);
    const char *type = spapr_get_cpu_core_type(machine->cpu_type);
-    int smt = kvmppc_smt_threads();
    const CPUArchIdList *possible_cpus;
    int boot_cores_nr = smp_cpus / smp_threads;
    int i;
@ -2183,12 +2229,23 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
        boot_cores_nr = possible_cpus->len;
    }

+    if (smc->pre_2_10_has_unused_icps) {
+        int i;
+
+        for (i = 0; i < xics_max_server_number(spapr); i++) {
+            /* Dummy entries get deregistered when real ICPState objects
+             * are registered during CPU core hotplug.
+             */
+            pre_2_10_vmstate_register_dummy_icp(i);
+        }
+    }
+
    for (i = 0; i < possible_cpus->len; i++) {
        int core_id = i * smp_threads;

        if (mc->has_hotpluggable_cpus) {
            spapr_dr_connector_new(OBJECT(spapr), TYPE_SPAPR_DRC_CPU,
-                                   (core_id / smp_threads) * smt);
+                                   spapr_vcpu_id(spapr, core_id));
        }

        if (i < boot_cores_nr) {
@ -2237,26 +2294,43 @@ static void spapr_set_vsmt_mode(sPAPRMachineState *spapr, Error **errp)
        }
        /* In this case, spapr->vsmt has been set by the command line */
    } else {
-        /* Choose a VSMT mode that may be higher than necessary but is
-         * likely to be compatible with hosts that don't have VSMT. */
-        spapr->vsmt = MAX(kvm_smt, smp_threads);
+        /*
+         * Default VSMT value is tricky, because we need it to be as
+         * consistent as possible (for migration), but this requires
+         * changing it for at least some existing cases.  We pick 8 as
+         * the value that we'd get with KVM on POWER8, the
+         * overwhelmingly common case in production systems.
+         */
+        spapr->vsmt = MAX(8, smp_threads);
    }

    /* KVM: If necessary, set the SMT mode: */
    if (kvm_enabled() && (spapr->vsmt != kvm_smt)) {
        ret = kvmppc_set_smt_threads(spapr->vsmt);
        if (ret) {
+            /* Looks like KVM isn't able to change VSMT mode */
            error_setg(&local_err,
                       "Failed to set KVM's VSMT mode to %d (errno %d)",
                       spapr->vsmt, ret);
-            if (!vsmt_user) {
-                error_append_hint(&local_err, "On PPC, a VM with %d threads/"
-                             "core on a host with %d threads/core requires "
-                             " the use of VSMT mode %d.\n",
-                             smp_threads, kvm_smt, spapr->vsmt);
+            /* We can live with that if the default one is big enough
+             * for the number of threads, and a submultiple of the one
+             * we want.  In this case we'll waste some vcpu ids, but
+             * behaviour will be correct */
+            if ((kvm_smt >= smp_threads) && ((spapr->vsmt % kvm_smt) == 0)) {
+                warn_report_err(local_err);
+                local_err = NULL;
+                goto out;
+            } else {
+                if (!vsmt_user) {
+                    error_append_hint(&local_err,
+                                      "On PPC, a VM with %d threads/core"
+                                      " on a host with %d threads/core"
+                                      " requires the use of VSMT mode %d.\n",
+                                      smp_threads, kvm_smt, spapr->vsmt);
+                }
+                kvmppc_hint_smt_possible(&local_err);
+                goto out;
            }
-            kvmppc_hint_smt_possible(&local_err);
-            goto out;
        }
    }
    /* else TCG: nothing to do currently */
@ -2282,6 +2356,7 @@ static void ppc_spapr_init(MachineState *machine)
    long load_limit, fw_size;
    char *filename;
    Error *resize_hpt_err = NULL;
+    PowerPCCPU *first_ppc_cpu;

    msi_nonbroken = true;

@ -2374,11 +2449,6 @@ static void ppc_spapr_init(MachineState *machine)
    }

    spapr_ovec_set(spapr->ov5, OV5_FORM1_AFFINITY);
-    if (!kvm_enabled() || kvmppc_has_cap_mmu_radix()) {
-        /* KVM and TCG always allow GTSE with radix... */
-        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
-    }
-    /* ... but not with hash (currently). */

    /* advertise support for dedicated HP event source to guests */
    if (spapr->use_hotplug_event_source) {
@ -2395,6 +2465,15 @@ static void ppc_spapr_init(MachineState *machine)

    spapr_init_cpus(spapr);

+    first_ppc_cpu = POWERPC_CPU(first_cpu);
+    if ((!kvm_enabled() || kvmppc_has_cap_mmu_radix()) &&
+        ppc_check_compat(first_ppc_cpu, CPU_POWERPC_LOGICAL_3_00, 0,
+                         spapr->max_compat_pvr)) {
+        /* KVM and TCG always allow GTSE with radix... */
+        spapr_ovec_set(spapr->ov5, OV5_MMU_RADIX_GTSE);
+    }
+    /* ... but not with hash (currently). */
+
    if (kvm_enabled()) {
        /* Enable H_LOGICAL_CI_* so SLOF can talk to in-kernel devices */
        kvmppc_enable_logical_ci_hcalls();
@ -3154,7 +3233,7 @@ static void *spapr_populate_hotplug_cpu_dt(CPUState *cs, int *fdt_offset,
 {
    PowerPCCPU *cpu = POWERPC_CPU(cs);
    DeviceClass *dc = DEVICE_GET_CLASS(cs);
-    int id = spapr_vcpu_id(cpu);
+    int id = spapr_get_vcpu_id(cpu);
    void *fdt;
    int offset, fdt_size;
    char *nodename;
@ -3200,10 +3279,10 @@ static
 void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
                               Error **errp)
 {
+    sPAPRMachineState *spapr = SPAPR_MACHINE(OBJECT(hotplug_dev));
    int index;
    sPAPRDRConnector *drc;
    CPUCore *cc = CPU_CORE(dev);
-    int smt = kvmppc_smt_threads();

    if (!spapr_find_cpu_slot(MACHINE(hotplug_dev), cc->core_id, &index)) {
        error_setg(errp, "Unable to find CPU core with core-id: %d",
@ -3215,7 +3294,8 @@ void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev,
        return;
    }

-    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index * smt);
+    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
+                          spapr_vcpu_id(spapr, cc->core_id));
    g_assert(drc);

    spapr_drc_detach(drc);
@ -3234,7 +3314,6 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
    CPUState *cs = CPU(core->threads);
    sPAPRDRConnector *drc;
    Error *local_err = NULL;
-    int smt = kvmppc_smt_threads();
    CPUArchId *core_slot;
    int index;
    bool hotplugged = spapr_drc_hotplugged(dev);
@ -3245,7 +3324,8 @@ static void spapr_core_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
                   cc->core_id);
        return;
    }
-    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU, index * smt);
+    drc = spapr_drc_by_id(TYPE_SPAPR_DRC_CPU,
+                          spapr_vcpu_id(spapr, cc->core_id));

    g_assert(drc || !mc->has_hotpluggable_cpus);

@ -3578,15 +3658,27 @@ static void spapr_pic_print_info(InterruptStatsProvider *obj,
    ics_pic_print_info(spapr->ics, mon);
 }

-int spapr_vcpu_id(PowerPCCPU *cpu)
+int spapr_get_vcpu_id(PowerPCCPU *cpu)
 {
-    CPUState *cs = CPU(cpu);
+    return cpu->vcpu_id;
+}

-    if (kvm_enabled()) {
-        return kvm_arch_vcpu_id(cs);
-    } else {
-        return cs->cpu_index;
+void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp)
+{
+    sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
+    int vcpu_id;
+
+    vcpu_id = spapr_vcpu_id(spapr, cpu_index);
+
+    if (kvm_enabled() && !kvm_vcpu_id_is_valid(vcpu_id)) {
+        error_setg(errp, "Can't create CPU with id %d in KVM", vcpu_id);
+        error_append_hint(errp, "Adjust the number of cpus to %d "
+                          "or try to raise the number of threads per core\n",
+                          vcpu_id * smp_threads / spapr->vsmt);
+        return;
    }
+
+    cpu->vcpu_id = vcpu_id;
 }

 PowerPCCPU *spapr_find_cpu(int vcpu_id)
@ -3596,7 +3688,7 @@ PowerPCCPU *spapr_find_cpu(int vcpu_id)
    CPU_FOREACH(cs) {
        PowerPCCPU *cpu = POWERPC_CPU(cs);

-        if (spapr_vcpu_id(cpu) == vcpu_id) {
+        if (spapr_get_vcpu_id(cpu) == vcpu_id) {
            return cpu;
        }
    }
@ -3663,6 +3755,14 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
     * in which LMBs are represented and hot-added
     */
    mc->numa_mem_align_shift = 28;
+
+    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
+    smc->default_caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_ON;
+    smc->default_caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_ON;
+    smc->default_caps.caps[SPAPR_CAP_CFPC] = SPAPR_CAP_BROKEN;
+    smc->default_caps.caps[SPAPR_CAP_SBBC] = SPAPR_CAP_BROKEN;
+    smc->default_caps.caps[SPAPR_CAP_IBS] = SPAPR_CAP_BROKEN;
+    spapr_caps_add_properties(smc, &error_abort);
 }

 static const TypeInfo spapr_machine_info = {
@ -3713,16 +3813,38 @@ static const TypeInfo spapr_machine_info = {
    }                                                                \
    type_init(spapr_machine_register_##suffix)

+/*
+ * pseries-2.12
+ */
+static void spapr_machine_2_12_instance_options(MachineState *machine)
+{
+}
+
+static void spapr_machine_2_12_class_options(MachineClass *mc)
+{
+    /* Defaults for the latest behaviour inherited from the base class */
+}
+
+DEFINE_SPAPR_MACHINE(2_12, "2.12", false);
+
 /*
 * pseries-2.11
 */
+#define SPAPR_COMPAT_2_11                                              \
+    HW_COMPAT_2_11
+
 static void spapr_machine_2_11_instance_options(MachineState *machine)
 {
+    spapr_machine_2_12_instance_options(machine);
 }

 static void spapr_machine_2_11_class_options(MachineClass *mc)
 {
-    /* Defaults for the latest behaviour inherited from the base class */
+    sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
+
+    spapr_machine_2_12_class_options(mc);
+    smc->default_caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_ON;
+    SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_11);
 }

 DEFINE_SPAPR_MACHINE(2_11, "2.11", true);
@ -3731,10 +3853,11 @@ DEFINE_SPAPR_MACHINE(2_11, "2.11", true);
 * pseries-2.10
 */
 #define SPAPR_COMPAT_2_10                                              \
-    HW_COMPAT_2_10                                                     \
+    HW_COMPAT_2_10

 static void spapr_machine_2_10_instance_options(MachineState *machine)
 {
+    spapr_machine_2_11_instance_options(machine);
 }

 static void spapr_machine_2_10_class_options(MachineClass *mc)
--- a/hw/ppc/spapr_caps.c
+++ b/hw/ppc/spapr_caps.c
@ -0,0 +1,443 @@
+/*
+ * QEMU PowerPC pSeries Logical Partition capabilities handling
+ *
+ * Copyright (c) 2017 David Gibson, Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qapi/visitor.h"
+#include "sysemu/hw_accel.h"
+#include "target/ppc/cpu.h"
+#include "cpu-models.h"
+#include "kvm_ppc.h"
+
+#include "hw/ppc/spapr.h"
+
+typedef struct sPAPRCapabilityInfo {
+    const char *name;
+    const char *description;
+    const char *options;                        /* valid capability values */
+    int index;
+
+    /* Getter and Setter Function Pointers */
+    ObjectPropertyAccessor *get;
+    ObjectPropertyAccessor *set;
+    const char *type;
+    /* Make sure the virtual hardware can support this capability */
+    void (*apply)(sPAPRMachineState *spapr, uint8_t val, Error **errp);
+} sPAPRCapabilityInfo;
+
+static void spapr_cap_get_bool(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    sPAPRCapabilityInfo *cap = opaque;
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+    bool value = spapr_get_cap(spapr, cap->index) == SPAPR_CAP_ON;
+
+    visit_type_bool(v, name, &value, errp);
+}
+
+static void spapr_cap_set_bool(Object *obj, Visitor *v, const char *name,
+                               void *opaque, Error **errp)
+{
+    sPAPRCapabilityInfo *cap = opaque;
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+    bool value;
+    Error *local_err = NULL;
+
+    visit_type_bool(v, name, &value, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    spapr->cmd_line_caps[cap->index] = true;
+    spapr->eff.caps[cap->index] = value ? SPAPR_CAP_ON : SPAPR_CAP_OFF;
+}
+
+static void spapr_cap_get_tristate(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
+{
+    sPAPRCapabilityInfo *cap = opaque;
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+    char *val = NULL;
+    uint8_t value = spapr_get_cap(spapr, cap->index);
+
+    switch (value) {
+    case SPAPR_CAP_BROKEN:
+        val = g_strdup("broken");
+        break;
+    case SPAPR_CAP_WORKAROUND:
+        val = g_strdup("workaround");
+        break;
+    case SPAPR_CAP_FIXED:
+        val = g_strdup("fixed");
+        break;
+    default:
+        error_setg(errp, "Invalid value (%d) for cap-%s", value, cap->name);
+        return;
+    }
+
+    visit_type_str(v, name, &val, errp);
+    g_free(val);
+}
+
+static void spapr_cap_set_tristate(Object *obj, Visitor *v, const char *name,
+                                   void *opaque, Error **errp)
+{
+    sPAPRCapabilityInfo *cap = opaque;
+    sPAPRMachineState *spapr = SPAPR_MACHINE(obj);
+    char *val;
+    Error *local_err = NULL;
+    uint8_t value;
+
+    visit_type_str(v, name, &val, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (!strcasecmp(val, "broken")) {
+        value = SPAPR_CAP_BROKEN;
+    } else if (!strcasecmp(val, "workaround")) {
+        value = SPAPR_CAP_WORKAROUND;
+    } else if (!strcasecmp(val, "fixed")) {
+        value = SPAPR_CAP_FIXED;
+    } else {
+        error_setg(errp, "Invalid capability mode \"%s\" for cap-%s", val,
+                   cap->name);
+        goto out;
+    }
+
+    spapr->cmd_line_caps[cap->index] = true;
+    spapr->eff.caps[cap->index] = value;
+out:
+    g_free(val);
+}
+
+static void cap_htm_apply(sPAPRMachineState *spapr, uint8_t val, Error **errp)
+{
+    if (!val) {
+        /* TODO: We don't support disabling htm yet */
+        return;
+    }
+    if (tcg_enabled()) {
+        error_setg(errp,
+                   "No Transactional Memory support in TCG, try cap-htm=off");
+    } else if (kvm_enabled() && !kvmppc_has_cap_htm()) {
+        error_setg(errp,
+"KVM implementation does not support Transactional Memory, try cap-htm=off"
+            );
+    }
+}
+
+static void cap_vsx_apply(sPAPRMachineState *spapr, uint8_t val, Error **errp)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
+    CPUPPCState *env = &cpu->env;
+
+    if (!val) {
+        /* TODO: We don't support disabling vsx yet */
+        return;
+    }
+    /* Allowable CPUs in spapr_cpu_core.c should already have gotten
+     * rid of anything that doesn't do VMX */
+    g_assert(env->insns_flags & PPC_ALTIVEC);
+    if (!(env->insns_flags2 & PPC2_VSX)) {
+        error_setg(errp, "VSX support not available, try cap-vsx=off");
+    }
+}
+
+static void cap_dfp_apply(sPAPRMachineState *spapr, uint8_t val, Error **errp)
+{
+    PowerPCCPU *cpu = POWERPC_CPU(first_cpu);
+    CPUPPCState *env = &cpu->env;
+
+    if (!val) {
+        /* TODO: We don't support disabling dfp yet */
+        return;
+    }
+    if (!(env->insns_flags2 & PPC2_DFP)) {
+        error_setg(errp, "DFP support not available, try cap-dfp=off");
+    }
+}
+
+static void cap_safe_cache_apply(sPAPRMachineState *spapr, uint8_t val,
+                                 Error **errp)
+{
+    if (tcg_enabled() && val) {
+        /* TODO - for now only allow broken for TCG */
+        error_setg(errp, "Requested safe cache capability level not supported by tcg, try a different value for cap-cfpc");
+    } else if (kvm_enabled() && (val > kvmppc_get_cap_safe_cache())) {
+        error_setg(errp, "Requested safe cache capability level not supported by kvm, try a different value for cap-cfpc");
+    }
+}
+
+static void cap_safe_bounds_check_apply(sPAPRMachineState *spapr, uint8_t val,
+                                        Error **errp)
+{
+    if (tcg_enabled() && val) {
+        /* TODO - for now only allow broken for TCG */
+        error_setg(errp, "Requested safe bounds check capability level not supported by tcg, try a different value for cap-sbbc");
+    } else if (kvm_enabled() && (val > kvmppc_get_cap_safe_bounds_check())) {
+        error_setg(errp, "Requested safe bounds check capability level not supported by kvm, try a different value for cap-sbbc");
+    }
+}
+
+static void cap_safe_indirect_branch_apply(sPAPRMachineState *spapr,
+                                           uint8_t val, Error **errp)
+{
+    if (tcg_enabled() && val) {
+        /* TODO - for now only allow broken for TCG */
+        error_setg(errp, "Requested safe indirect branch capability level not supported by tcg, try a different value for cap-ibs");
+    } else if (kvm_enabled() && (val > kvmppc_get_cap_safe_indirect_branch())) {
+        error_setg(errp, "Requested safe indirect branch capability level not supported by kvm, try a different value for cap-ibs");
+    }
+}
+
+#define VALUE_DESC_TRISTATE     " (broken, workaround, fixed)"
+
+sPAPRCapabilityInfo capability_table[SPAPR_CAP_NUM] = {
+    [SPAPR_CAP_HTM] = {
+        .name = "htm",
+        .description = "Allow Hardware Transactional Memory (HTM)",
+        .options = "",
+        .index = SPAPR_CAP_HTM,
+        .get = spapr_cap_get_bool,
+        .set = spapr_cap_set_bool,
+        .type = "bool",
+        .apply = cap_htm_apply,
+    },
+    [SPAPR_CAP_VSX] = {
+        .name = "vsx",
+        .description = "Allow Vector Scalar Extensions (VSX)",
+        .options = "",
+        .index = SPAPR_CAP_VSX,
+        .get = spapr_cap_get_bool,
+        .set = spapr_cap_set_bool,
+        .type = "bool",
+        .apply = cap_vsx_apply,
+    },
+    [SPAPR_CAP_DFP] = {
+        .name = "dfp",
+        .description = "Allow Decimal Floating Point (DFP)",
+        .options = "",
+        .index = SPAPR_CAP_DFP,
+        .get = spapr_cap_get_bool,
+        .set = spapr_cap_set_bool,
+        .type = "bool",
+        .apply = cap_dfp_apply,
+    },
+    [SPAPR_CAP_CFPC] = {
+        .name = "cfpc",
+        .description = "Cache Flush on Privilege Change" VALUE_DESC_TRISTATE,
+        .index = SPAPR_CAP_CFPC,
+        .get = spapr_cap_get_tristate,
+        .set = spapr_cap_set_tristate,
+        .type = "string",
+        .apply = cap_safe_cache_apply,
+    },
+    [SPAPR_CAP_SBBC] = {
+        .name = "sbbc",
+        .description = "Speculation Barrier Bounds Checking" VALUE_DESC_TRISTATE,
+        .index = SPAPR_CAP_SBBC,
+        .get = spapr_cap_get_tristate,
+        .set = spapr_cap_set_tristate,
+        .type = "string",
+        .apply = cap_safe_bounds_check_apply,
+    },
+    [SPAPR_CAP_IBS] = {
+        .name = "ibs",
+        .description = "Indirect Branch Serialisation" VALUE_DESC_TRISTATE,
+        .index = SPAPR_CAP_IBS,
+        .get = spapr_cap_get_tristate,
+        .set = spapr_cap_set_tristate,
+        .type = "string",
+        .apply = cap_safe_indirect_branch_apply,
+    },
+};
+
+static sPAPRCapabilities default_caps_with_cpu(sPAPRMachineState *spapr,
+                                               CPUState *cs)
+{
+    sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    sPAPRCapabilities caps;
+
+    caps = smc->default_caps;
+
+    if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_07,
+                          0, spapr->max_compat_pvr)) {
+        caps.caps[SPAPR_CAP_HTM] = SPAPR_CAP_OFF;
+    }
+
+    if (!ppc_check_compat(cpu, CPU_POWERPC_LOGICAL_2_06,
+                          0, spapr->max_compat_pvr)) {
+        caps.caps[SPAPR_CAP_VSX] = SPAPR_CAP_OFF;
+        caps.caps[SPAPR_CAP_DFP] = SPAPR_CAP_OFF;
+    }
+
+    return caps;
+}
+
+int spapr_caps_pre_load(void *opaque)
+{
+    sPAPRMachineState *spapr = opaque;
+
+    /* Set to default so we can tell if this came in with the migration */
+    spapr->mig = spapr->def;
+    return 0;
+}
+
+int spapr_caps_pre_save(void *opaque)
+{
+    sPAPRMachineState *spapr = opaque;
+
+    spapr->mig = spapr->eff;
+    return 0;
+}
+
+/* This has to be called from the top-level spapr post_load, not the
+ * caps specific one.  Otherwise it wouldn't be called when the source
+ * caps are all defaults, which could still conflict with overridden
+ * caps on the destination */
+int spapr_caps_post_migration(sPAPRMachineState *spapr)
+{
+    int i;
+    bool ok = true;
+    sPAPRCapabilities dstcaps = spapr->eff;
+    sPAPRCapabilities srccaps;
+
+    srccaps = default_caps_with_cpu(spapr, first_cpu);
+    for (i = 0; i < SPAPR_CAP_NUM; i++) {
+        /* If not default value then assume came in with the migration */
+        if (spapr->mig.caps[i] != spapr->def.caps[i]) {
+            srccaps.caps[i] = spapr->mig.caps[i];
+        }
+    }
+
+    for (i = 0; i < SPAPR_CAP_NUM; i++) {
+        sPAPRCapabilityInfo *info = &capability_table[i];
+
+        if (srccaps.caps[i] > dstcaps.caps[i]) {
+            error_report("cap-%s higher level (%d) in incoming stream than on destination (%d)",
+                         info->name, srccaps.caps[i], dstcaps.caps[i]);
+            ok = false;
+        }
+
+        if (srccaps.caps[i] < dstcaps.caps[i]) {
+            warn_report("cap-%s lower level (%d) in incoming stream than on destination (%d)",
+                         info->name, srccaps.caps[i], dstcaps.caps[i]);
+        }
+    }
+
+    return ok ? 0 : -EINVAL;
+}
+
+/* Used to generate the migration field and needed function for a spapr cap */
+#define SPAPR_CAP_MIG_STATE(cap, ccap)                  \
+static bool spapr_cap_##cap##_needed(void *opaque)      \
+{                                                       \
+    sPAPRMachineState *spapr = opaque;                  \
+                                                        \
+    return spapr->cmd_line_caps[SPAPR_CAP_##ccap] &&    \
+           (spapr->eff.caps[SPAPR_CAP_##ccap] !=        \
+            spapr->def.caps[SPAPR_CAP_##ccap]);         \
+}                                                       \
+                                                        \
+const VMStateDescription vmstate_spapr_cap_##cap = {    \
+    .name = "spapr/cap/" #cap,                          \
+    .version_id = 1,                                    \
+    .minimum_version_id = 1,                            \
+    .needed = spapr_cap_##cap##_needed,                 \
+    .fields = (VMStateField[]) {                        \
+        VMSTATE_UINT8(mig.caps[SPAPR_CAP_##ccap],       \
+                      sPAPRMachineState),               \
+        VMSTATE_END_OF_LIST()                           \
+    },                                                  \
+}
+
+SPAPR_CAP_MIG_STATE(htm, HTM);
+SPAPR_CAP_MIG_STATE(vsx, VSX);
+SPAPR_CAP_MIG_STATE(dfp, DFP);
+SPAPR_CAP_MIG_STATE(cfpc, CFPC);
+SPAPR_CAP_MIG_STATE(sbbc, SBBC);
+SPAPR_CAP_MIG_STATE(ibs, IBS);
+
+void spapr_caps_reset(sPAPRMachineState *spapr)
+{
+    sPAPRCapabilities default_caps;
+    int i;
+
+    /* First compute the actual set of caps we're running with.. */
+    default_caps = default_caps_with_cpu(spapr, first_cpu);
+
+    for (i = 0; i < SPAPR_CAP_NUM; i++) {
+        /* Store the defaults */
+        spapr->def.caps[i] = default_caps.caps[i];
+        /* If not set on the command line then apply the default value */
+        if (!spapr->cmd_line_caps[i]) {
+            spapr->eff.caps[i] = default_caps.caps[i];
+        }
+    }
+
+    /* .. then apply those caps to the virtual hardware */
+
+    for (i = 0; i < SPAPR_CAP_NUM; i++) {
+        sPAPRCapabilityInfo *info = &capability_table[i];
+
+        /*
+         * If the apply function can't set the desired level and thinks it's
+         * fatal, it should cause that.
+         */
+        info->apply(spapr, spapr->eff.caps[i], &error_fatal);
+    }
+}
+
+void spapr_caps_add_properties(sPAPRMachineClass *smc, Error **errp)
+{
+    Error *local_err = NULL;
+    ObjectClass *klass = OBJECT_CLASS(smc);
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(capability_table); i++) {
+        sPAPRCapabilityInfo *cap = &capability_table[i];
+        const char *name = g_strdup_printf("cap-%s", cap->name);
+        char *desc;
+
+        object_class_property_add(klass, name, cap->type,
+                                  cap->get, cap->set,
+                                  NULL, cap, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+
+        desc = g_strdup_printf("%s%s", cap->description, cap->options);
+        object_class_property_set_description(klass, name, desc, &local_err);
+        g_free(desc);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+}
--- a/hw/ppc/spapr_cpu_core.c
+++ b/hw/ppc/spapr_cpu_core.c
@ -35,6 +35,13 @@ static void spapr_cpu_reset(void *opaque)
    cs->halted = 1;

    env->spr[SPR_HIOR] = 0;
+
+    /* Set compatibility mode to match the boot CPU, which was either set
+     * by the machine reset code or by CAS. This should never fail.
+     */
+    if (cs != first_cpu) {
+        ppc_set_compat(cpu, POWERPC_CPU(first_cpu)->compat_pvr, &error_abort);
+    }
 }

 static void spapr_cpu_destroy(PowerPCCPU *cpu)
@ -169,13 +176,8 @@ static void spapr_cpu_core_realize(DeviceState *dev, Error **errp)
        cs = CPU(obj);
        cpu = POWERPC_CPU(cs);
        cs->cpu_index = cc->core_id + i;
-        cpu->vcpu_id = (cc->core_id * spapr->vsmt / smp_threads) + i;
-        if (kvm_enabled() && !kvm_vcpu_id_is_valid(cpu->vcpu_id)) {
-            error_setg(&local_err, "Can't create CPU with id %d in KVM",
-                       cpu->vcpu_id);
-            error_append_hint(&local_err, "Adjust the number of cpus to %d "
-                              "or try to raise the number of threads per core\n",
-                              cpu->vcpu_id * smp_threads / spapr->vsmt);
+        spapr_set_vcpu_id(cpu, cs->cpu_index, &local_err);
+        if (local_err) {
            goto err;
        }

--- a/hw/ppc/spapr_hcall.c
+++ b/hw/ppc/spapr_hcall.c
@ -1655,6 +1655,61 @@ static target_ulong h_client_architecture_support(PowerPCCPU *cpu,
    return H_SUCCESS;
 }

+static target_ulong h_get_cpu_characteristics(PowerPCCPU *cpu,
+                                              sPAPRMachineState *spapr,
+                                              target_ulong opcode,
+                                              target_ulong *args)
+{
+    uint64_t characteristics = H_CPU_CHAR_HON_BRANCH_HINTS &
+                               ~H_CPU_CHAR_THR_RECONF_TRIG;
+    uint64_t behaviour = H_CPU_BEHAV_FAVOUR_SECURITY;
+    uint8_t safe_cache = spapr_get_cap(spapr, SPAPR_CAP_CFPC);
+    uint8_t safe_bounds_check = spapr_get_cap(spapr, SPAPR_CAP_SBBC);
+    uint8_t safe_indirect_branch = spapr_get_cap(spapr, SPAPR_CAP_IBS);
+
+    switch (safe_cache) {
+    case SPAPR_CAP_WORKAROUND:
+        characteristics |= H_CPU_CHAR_L1D_FLUSH_ORI30;
+        characteristics |= H_CPU_CHAR_L1D_FLUSH_TRIG2;
+        characteristics |= H_CPU_CHAR_L1D_THREAD_PRIV;
+        behaviour |= H_CPU_BEHAV_L1D_FLUSH_PR;
+        break;
+    case SPAPR_CAP_FIXED:
+        break;
+    default: /* broken */
+        assert(safe_cache == SPAPR_CAP_BROKEN);
+        behaviour |= H_CPU_BEHAV_L1D_FLUSH_PR;
+        break;
+    }
+
+    switch (safe_bounds_check) {
+    case SPAPR_CAP_WORKAROUND:
+        characteristics |= H_CPU_CHAR_SPEC_BAR_ORI31;
+        behaviour |= H_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+        break;
+    case SPAPR_CAP_FIXED:
+        break;
+    default: /* broken */
+        assert(safe_bounds_check == SPAPR_CAP_BROKEN);
+        behaviour |= H_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+        break;
+    }
+
+    switch (safe_indirect_branch) {
+    case SPAPR_CAP_FIXED:
+        characteristics |= H_CPU_CHAR_BCCTRL_SERIALISED;
+        break;
+    default: /* broken */
+        assert(safe_indirect_branch == SPAPR_CAP_BROKEN);
+        break;
+    }
+
+    args[0] = characteristics;
+    args[1] = behaviour;
+
+    return H_SUCCESS;
+}
+
 static spapr_hcall_fn papr_hypercall_table[(MAX_HCALL_OPCODE / 4) + 1];
 static spapr_hcall_fn kvmppc_hypercall_table[KVMPPC_HCALL_MAX - KVMPPC_HCALL_BASE + 1];

@ -1734,6 +1789,10 @@ static void hypercall_register_types(void)
    spapr_register_hypercall(H_INVALIDATE_PID, h_invalidate_pid);
    spapr_register_hypercall(H_REGISTER_PROC_TBL, h_register_process_table);

+    /* hcall-get-cpu-characteristics */
+    spapr_register_hypercall(H_GET_CPU_CHARACTERISTICS,
+                             h_get_cpu_characteristics);
+
    /* "debugger" hcalls (also used by SLOF). Note: We do -not- differenciate
     * here between the "CI" and the "CACHE" variants, they will use whatever
     * mapping attributes qemu is using. When using KVM, the kernel will
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@ -280,20 +280,6 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
    int *config_addr_key;
    Error *err = NULL;

-    switch (func) {
-    case RTAS_CHANGE_MSI_FN:
-    case RTAS_CHANGE_FN:
-        ret_intr_type = RTAS_TYPE_MSI;
-        break;
-    case RTAS_CHANGE_MSIX_FN:
-        ret_intr_type = RTAS_TYPE_MSIX;
-        break;
-    default:
-        error_report("rtas_ibm_change_msi(%u) is not implemented", func);
-        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
-        return;
-    }
-
    /* Fins sPAPRPHBState */
    phb = spapr_pci_find_phb(spapr, buid);
    if (phb) {
@ -304,6 +290,39 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, sPAPRMachineState *spapr,
        return;
    }

+    switch (func) {
+    case RTAS_CHANGE_FN:
+        if (msi_present(pdev)) {
+            ret_intr_type = RTAS_TYPE_MSI;
+        } else if (msix_present(pdev)) {
+            ret_intr_type = RTAS_TYPE_MSIX;
+        } else {
+            rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+            return;
+        }
+        break;
+    case RTAS_CHANGE_MSI_FN:
+        if (msi_present(pdev)) {
+            ret_intr_type = RTAS_TYPE_MSI;
+        } else {
+            rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+            return;
+        }
+        break;
+    case RTAS_CHANGE_MSIX_FN:
+        if (msix_present(pdev)) {
+            ret_intr_type = RTAS_TYPE_MSIX;
+        } else {
+            rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+            return;
+        }
+        break;
+    default:
+        error_report("rtas_ibm_change_msi(%u) is not implemented", func);
+        rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR);
+        return;
+    }
+
    msi = (spapr_pci_msi *) g_hash_table_lookup(phb->msi, &config_addr);

    /* Releasing MSIs */
@ -1286,13 +1305,17 @@ static void spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
    _FDT(fdt_setprop_cell(fdt, offset, "#size-cells",
                          RESOURCE_CELLS_SIZE));

-    max_msi = msi_nr_vectors_allocated(dev);
-    if (max_msi) {
-        _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi", max_msi));
+    if (msi_present(dev)) {
+        max_msi = msi_nr_vectors_allocated(dev);
+        if (max_msi) {
+            _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi", max_msi));
+        }
    }
-    max_msix = dev->msix_entries_nr;
-    if (max_msix) {
-        _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x", max_msix));
+    if (msix_present(dev)) {
+        max_msix = dev->msix_entries_nr;
+        if (max_msix) {
+            _FDT(fdt_setprop_cell(fdt, offset, "ibm,req#msi-x", max_msix));
+        }
    }

    populate_resource_props(dev, &rp);
--- a/hw/s390x/ccw-device.c
+++ b/hw/s390x/ccw-device.c
@ -40,6 +40,13 @@ static Property ccw_device_properties[] = {
    DEFINE_PROP_END_OF_LIST(),
 };

+static void ccw_device_reset(DeviceState *d)
+{
+    CcwDevice *ccw_dev = CCW_DEVICE(d);
+
+    css_reset_sch(ccw_dev->sch);
+}
+
 static void ccw_device_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
@ -48,6 +55,7 @@ static void ccw_device_class_init(ObjectClass *klass, void *data)
    k->realize = ccw_device_realize;
    k->refill_ids = ccw_device_refill_ids;
    dc->props = ccw_device_properties;
+    dc->reset = ccw_device_reset;
 }

 const VMStateDescription vmstate_ccw_dev = {
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@ -617,6 +617,14 @@ void css_inject_io_interrupt(SubchDev *sch)

 void css_conditional_io_interrupt(SubchDev *sch)
 {
+    /*
+     * If the subchannel is not enabled, it is not made status pending
+     * (see PoP p. 16-17, "Status Control").
+     */
+    if (!(sch->curr_status.pmcw.flags & PMCW_FLAGS_MASK_ENA)) {
+        return;
+    }
+
    /*
     * If the subchannel is not currently status pending, make it pending
     * with alert status.
--- a/hw/s390x/event-facility.c
+++ b/hw/s390x/event-facility.c
@ -293,10 +293,10 @@ static void write_event_mask(SCLPEventFacility *ef, SCCB *sccb)
    ef->receive_mask = be32_to_cpu(tmp_mask);

    /* return the SCLP's capability masks to the guest */
-    tmp_mask = cpu_to_be32(get_host_send_mask(ef));
+    tmp_mask = cpu_to_be32(get_host_receive_mask(ef));
    copy_mask(WEM_RECEIVE_MASK(we_mask, mask_length), (uint8_t *)&tmp_mask,
              mask_length, sizeof(tmp_mask));
-    tmp_mask = cpu_to_be32(get_host_receive_mask(ef));
+    tmp_mask = cpu_to_be32(get_host_send_mask(ef));
    copy_mask(WEM_SEND_MASK(we_mask, mask_length), (uint8_t *)&tmp_mask,
              mask_length, sizeof(tmp_mask));

--- a/hw/s390x/s390-stattrib-kvm.c
+++ b/hw/s390x/s390-stattrib-kvm.c
@ -116,7 +116,7 @@ static void kvm_s390_stattrib_synchronize(S390StAttribState *sa)
        for (cx = 0; cx + len <= max; cx += len) {
            clog.start_gfn = cx;
            clog.count = len;
-            clog.values = (uint64_t)(sas->incoming_buffer + cx * len);
+            clog.values = (uint64_t)(sas->incoming_buffer + cx);
            r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog);
            if (r) {
                error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r));
@ -126,7 +126,7 @@ static void kvm_s390_stattrib_synchronize(S390StAttribState *sa)
        if (cx < max) {
            clog.start_gfn = cx;
            clog.count = max - cx;
-            clog.values = (uint64_t)(sas->incoming_buffer + cx * len);
+            clog.values = (uint64_t)(sas->incoming_buffer + cx);
            r = kvm_vm_ioctl(kvm_state, KVM_S390_SET_CMMA_BITS, &clog);
            if (r) {
                error_report("KVM_S390_SET_CMMA_BITS failed: %s", strerror(-r));
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@ -152,14 +152,38 @@ static void virtio_ccw_register_hcalls(void)
                                   virtio_ccw_hcall_early_printk);
 }

+/*
+ * KVM does only support memory slots up to KVM_MEM_MAX_NR_PAGES pages
+ * as the dirty bitmap must be managed by bitops that take an int as
+ * position indicator. If we have a guest beyond that we will split off
+ * new subregions. The split must happen on a segment boundary (1MB).
+ */
+#define KVM_MEM_MAX_NR_PAGES ((1ULL << 31) - 1)
+#define SEG_MSK (~0xfffffULL)
+#define KVM_SLOT_MAX_BYTES ((KVM_MEM_MAX_NR_PAGES * TARGET_PAGE_SIZE) & SEG_MSK)
 static void s390_memory_init(ram_addr_t mem_size)
 {
    MemoryRegion *sysmem = get_system_memory();
-    MemoryRegion *ram = g_new(MemoryRegion, 1);
+    ram_addr_t chunk, offset = 0;
+    unsigned int number = 0;
+    gchar *name;

    /* allocate RAM for core */
-    memory_region_allocate_system_memory(ram, NULL, "s390.ram", mem_size);
-    memory_region_add_subregion(sysmem, 0, ram);
+    name = g_strdup_printf("s390.ram");
+    while (mem_size) {
+        MemoryRegion *ram = g_new(MemoryRegion, 1);
+        uint64_t size = mem_size;
+
+        /* KVM does not allow memslots >= 8 TB */
+        chunk = MIN(size, KVM_SLOT_MAX_BYTES);
+        memory_region_allocate_system_memory(ram, NULL, name, chunk);
+        memory_region_add_subregion(sysmem, offset, ram);
+        mem_size -= chunk;
+        offset += chunk;
+        g_free(name);
+        name = g_strdup_printf("s390.ram.%u", ++number);
+    }
+    g_free(name);

    /* Initialize storage key device */
    s390_skeys_init();
--- a/hw/s390x/virtio-ccw.c
+++ b/hw/s390x/virtio-ccw.c
@ -751,7 +751,7 @@ out_err:
    g_free(sch);
 }

-static int virtio_ccw_exit(VirtioCcwDevice *dev)
+static void virtio_ccw_unrealize(VirtioCcwDevice *dev, Error **errp)
 {
    CcwDevice *ccw_dev = CCW_DEVICE(dev);
    SubchDev *sch = ccw_dev->sch;
@ -759,12 +759,12 @@ static int virtio_ccw_exit(VirtioCcwDevice *dev)
    if (sch) {
        css_subch_assign(sch->cssid, sch->ssid, sch->schid, sch->devno, NULL);
        g_free(sch);
+        ccw_dev->sch = NULL;
    }
    if (dev->indicators) {
        release_indicator(&dev->routes.adapter, dev->indicators);
        dev->indicators = NULL;
    }
-    return 0;
 }

 static void virtio_ccw_net_realize(VirtioCcwDevice *ccw_dev, Error **errp)
@ -1057,10 +1057,12 @@ static void virtio_ccw_reset(DeviceState *d)
 {
    VirtioCcwDevice *dev = VIRTIO_CCW_DEVICE(d);
    VirtIODevice *vdev = virtio_bus_get_device(&dev->bus);
-    CcwDevice *ccw_dev = CCW_DEVICE(d);
+    VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_GET_CLASS(dev);

    virtio_ccw_reset_virtio(dev, vdev);
-    css_reset_sch(ccw_dev->sch);
+    if (vdc->parent_reset) {
+        vdc->parent_reset(d);
+    }
 }

 static void virtio_ccw_vmstate_change(DeviceState *d, bool running)
@ -1343,8 +1345,7 @@ static void virtio_ccw_net_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_net_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_net_properties;
    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
 }
@ -1371,8 +1372,7 @@ static void virtio_ccw_blk_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_blk_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_blk_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
 }
@ -1399,8 +1399,7 @@ static void virtio_ccw_serial_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_serial_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_serial_properties;
    set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
 }
@ -1427,8 +1426,7 @@ static void virtio_ccw_balloon_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_balloon_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_balloon_properties;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
 }
@ -1455,8 +1453,7 @@ static void virtio_ccw_scsi_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_scsi_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_scsi_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
 }
@ -1482,8 +1479,7 @@ static void vhost_ccw_scsi_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = vhost_ccw_scsi_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = vhost_ccw_scsi_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
 }
@ -1519,8 +1515,7 @@ static void virtio_ccw_rng_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_rng_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_rng_properties;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
 }
@ -1557,8 +1552,7 @@ static void virtio_ccw_crypto_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_crypto_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_crypto_properties;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
 }
@ -1595,8 +1589,7 @@ static void virtio_ccw_gpu_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_gpu_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_gpu_properties;
    dc->hotpluggable = false;
    set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
@ -1624,8 +1617,7 @@ static void virtio_ccw_input_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = virtio_ccw_input_realize;
-    k->exit = virtio_ccw_exit;
-    dc->reset = virtio_ccw_reset;
+    k->unrealize = virtio_ccw_unrealize;
    dc->props = virtio_ccw_input_properties;
    set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
 }
@ -1704,12 +1696,12 @@ static void virtio_ccw_busdev_realize(DeviceState *dev, Error **errp)
    virtio_ccw_device_realize(_dev, errp);
 }

-static int virtio_ccw_busdev_exit(DeviceState *dev)
+static void virtio_ccw_busdev_unrealize(DeviceState *dev, Error **errp)
 {
    VirtioCcwDevice *_dev = (VirtioCcwDevice *)dev;
    VirtIOCCWDeviceClass *_info = VIRTIO_CCW_DEVICE_GET_CLASS(dev);

-    return _info->exit(_dev);
+    _info->unrealize(_dev, errp);
 }

 static void virtio_ccw_busdev_unplug(HotplugHandler *hotplug_dev,
@ -1724,11 +1716,13 @@ static void virtio_ccw_device_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
    CCWDeviceClass *k = CCW_DEVICE_CLASS(dc);
+    VirtIOCCWDeviceClass *vdc = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->unplug = virtio_ccw_busdev_unplug;
    dc->realize = virtio_ccw_busdev_realize;
-    dc->exit = virtio_ccw_busdev_exit;
+    dc->unrealize = virtio_ccw_busdev_unrealize;
    dc->bus_type = TYPE_VIRTUAL_CSS_BUS;
+    device_class_set_parent_reset(dc, virtio_ccw_reset, &vdc->parent_reset);
 }

 static const TypeInfo virtio_ccw_device_info = {
@ -1803,9 +1797,8 @@ static void virtio_ccw_9p_class_init(ObjectClass *klass, void *data)
    DeviceClass *dc = DEVICE_CLASS(klass);
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

-    k->exit = virtio_ccw_exit;
+    k->unrealize = virtio_ccw_unrealize;
    k->realize = virtio_ccw_9p_realize;
-    dc->reset = virtio_ccw_reset;
    dc->props = virtio_ccw_9p_properties;
    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
 }
@ -1852,10 +1845,9 @@ static void vhost_vsock_ccw_class_init(ObjectClass *klass, void *data)
    VirtIOCCWDeviceClass *k = VIRTIO_CCW_DEVICE_CLASS(klass);

    k->realize = vhost_vsock_ccw_realize;
-    k->exit = virtio_ccw_exit;
+    k->unrealize = virtio_ccw_unrealize;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->props = vhost_vsock_ccw_properties;
-    dc->reset = virtio_ccw_reset;
 }

 static void vhost_vsock_ccw_instance_init(Object *obj)
--- a/hw/s390x/virtio-ccw.h
+++ b/hw/s390x/virtio-ccw.h
@ -76,7 +76,8 @@ typedef struct VirtioCcwDevice VirtioCcwDevice;
 typedef struct VirtIOCCWDeviceClass {
    CCWDeviceClass parent_class;
    void (*realize)(VirtioCcwDevice *dev, Error **errp);
-    int (*exit)(VirtioCcwDevice *dev);
+    void (*unrealize)(VirtioCcwDevice *dev, Error **errp);
+    void (*parent_reset)(DeviceState *dev);
 } VirtIOCCWDeviceClass;

 /* Performance improves when virtqueue kick processing is decoupled from the
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@ -224,6 +224,7 @@ static void scsi_qdev_unrealize(DeviceState *qdev, Error **errp)
 /* handle legacy '-drive if=scsi,...' cmd line args */
 SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk,
                                      int unit, bool removable, int bootindex,
+                                      bool share_rw,
                                      const char *serial, Error **errp)
 {
    const char *driver;
@ -254,6 +255,12 @@ SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk,
        object_unparent(OBJECT(dev));
        return NULL;
    }
+    object_property_set_bool(OBJECT(dev), share_rw, "share-rw", &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        object_unparent(OBJECT(dev));
+        return NULL;
+    }
    object_property_set_bool(OBJECT(dev), true, "realized", &err);
    if (err != NULL) {
        error_propagate(errp, err);
@ -288,7 +295,7 @@ void scsi_bus_legacy_handle_cmdline(SCSIBus *bus, bool deprecated)
            }
        }
        scsi_bus_legacy_add_drive(bus, blk_by_legacy_dinfo(dinfo),
-                                  unit, false, -1, NULL, &error_fatal);
+                                  unit, false, -1, false, NULL, &error_fatal);
    }
    loc_pop(&loc);
 }
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@ -1755,6 +1755,7 @@ static void scsi_write_same_complete(void *opaque, int ret)
                                       data->sector << BDRV_SECTOR_BITS,
                                       &data->qiov, 0,
                                       scsi_write_same_complete, data);
+        aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
        return;
    }

--- a/hw/sd/milkymist-memcard.c
+++ b/hw/sd/milkymist-memcard.c
@ -248,6 +248,10 @@ static void milkymist_memcard_reset(DeviceState *d)
    for (i = 0; i < R_MAX; i++) {
        s->regs[i] = 0;
    }
+    /* Since we're still using the legacy SD API the card is not plugged
+     * into any bus, and we must reset it manually.
+     */
+    device_reset(DEVICE(s->card));
 }

 static int milkymist_memcard_init(SysBusDevice *dev)
--- a/hw/sd/pl181.c
+++ b/hw/sd/pl181.c
@ -480,6 +480,10 @@ static void pl181_reset(DeviceState *d)

    /* We can assume our GPIO outputs have been wired up now */
    sd_set_cb(s->card, s->cardstatus[0], s->cardstatus[1]);
+    /* Since we're still using the legacy SD API the card is not plugged
+     * into any bus, and we must reset it manually.
+     */
+    device_reset(DEVICE(s->card));
 }

 static void pl181_init(Object *obj)
--- a/hw/sd/ssi-sd.c
+++ b/hw/sd/ssi-sd.c
@ -50,6 +50,9 @@ typedef struct {
    SDState *sd;
 } ssi_sd_state;

+#define TYPE_SSI_SD "ssi-sd"
+#define SSI_SD(obj) OBJECT_CHECK(ssi_sd_state, (obj), TYPE_SSI_SD)
+
 /* State word bits.  */
 #define SSI_SDR_LOCKED          0x0001
 #define SSI_SDR_WP_ERASE        0x0002
@ -241,7 +244,6 @@ static void ssi_sd_realize(SSISlave *d, Error **errp)
    ssi_sd_state *s = FROM_SSI_SLAVE(ssi_sd_state, d);
    DriveInfo *dinfo;

-    s->mode = SSI_SD_CMD;
    /* FIXME use a qdev drive property instead of drive_get_next() */
    dinfo = drive_get_next(IF_SD);
    s->sd = sd_init(dinfo ? blk_by_legacy_dinfo(dinfo) : NULL, true);
@ -251,6 +253,24 @@ static void ssi_sd_realize(SSISlave *d, Error **errp)
    }
 }

+static void ssi_sd_reset(DeviceState *dev)
+{
+    ssi_sd_state *s = SSI_SD(dev);
+
+    s->mode = SSI_SD_CMD;
+    s->cmd = 0;
+    memset(s->cmdarg, 0, sizeof(s->cmdarg));
+    memset(s->response, 0, sizeof(s->response));
+    s->arglen = 0;
+    s->response_pos = 0;
+    s->stopping = 0;
+
+    /* Since we're still using the legacy SD API the card is not plugged
+     * into any bus, and we must reset it manually.
+     */
+    device_reset(DEVICE(s->sd));
+}
+
 static void ssi_sd_class_init(ObjectClass *klass, void *data)
 {
    DeviceClass *dc = DEVICE_CLASS(klass);
@ -260,10 +280,11 @@ static void ssi_sd_class_init(ObjectClass *klass, void *data)
    k->transfer = ssi_sd_transfer;
    k->cs_polarity = SSI_CS_LOW;
    dc->vmsd = &vmstate_ssi_sd;
+    dc->reset = ssi_sd_reset;
 }

 static const TypeInfo ssi_sd_info = {
-    .name          = "ssi-sd",
+    .name          = TYPE_SSI_SD,
    .parent        = TYPE_SSI_SLAVE,
    .instance_size = sizeof(ssi_sd_state),
    .class_init    = ssi_sd_class_init,
--- a/hw/tpm/tpm_emulator.c
+++ b/hw/tpm/tpm_emulator.c
@ -260,7 +260,9 @@ static int tpm_emulator_check_caps(TPMEmulator *tpm_emu)
 static int tpm_emulator_startup_tpm(TPMBackend *tb)
 {
    TPMEmulator *tpm_emu = TPM_EMULATOR(tb);
-    ptm_init init;
+    ptm_init init = {
+        .u.req.init_flags = 0,
+    };
    ptm_res res;

    DPRINTF("%s", __func__);
--- a/hw/tpm/tpm_passthrough.c
+++ b/hw/tpm/tpm_passthrough.c
@ -206,7 +206,8 @@ static TPMVersion tpm_passthrough_get_tpm_version(TPMBackend *tb)
 * Unless path or file descriptor set has been provided by user,
 * determine the sysfs cancel file following kernel documentation
 * in Documentation/ABI/stable/sysfs-class-tpm.
- * From /dev/tpm0 create /sys/class/misc/tpm0/device/cancel
+ * From /dev/tpm0 create /sys/class/tpm/tpm0/device/cancel
+ * before 4.0: /sys/class/misc/tpm0/device/cancel
 */
 static int tpm_passthrough_open_sysfs_cancel(TPMPassthruState *tpm_pt)
 {
@ -217,28 +218,35 @@ static int tpm_passthrough_open_sysfs_cancel(TPMPassthruState *tpm_pt)
    if (tpm_pt->options->cancel_path) {
        fd = qemu_open(tpm_pt->options->cancel_path, O_WRONLY);
        if (fd < 0) {
-            error_report("Could not open TPM cancel path : %s",
+            error_report("tpm_passthrough: Could not open TPM cancel path: %s",
                         strerror(errno));
        }
        return fd;
    }

    dev = strrchr(tpm_pt->tpm_dev, '/');
-    if (dev) {
-        dev++;
-        if (snprintf(path, sizeof(path), "/sys/class/misc/%s/device/cancel",
-                     dev) < sizeof(path)) {
-            fd = qemu_open(path, O_WRONLY);
-            if (fd >= 0) {
-                tpm_pt->options->cancel_path = g_strdup(path);
-            } else {
-                error_report("tpm_passthrough: Could not open TPM cancel "
-                             "path %s : %s", path, strerror(errno));
+    if (!dev) {
+        error_report("tpm_passthrough: Bad TPM device path %s",
+                     tpm_pt->tpm_dev);
+        return -1;
+    }
+
+    dev++;
+    if (snprintf(path, sizeof(path), "/sys/class/tpm/%s/device/cancel",
+                 dev) < sizeof(path)) {
+        fd = qemu_open(path, O_WRONLY);
+        if (fd < 0) {
+            if (snprintf(path, sizeof(path), "/sys/class/misc/%s/device/cancel",
+                         dev) < sizeof(path)) {
+                fd = qemu_open(path, O_WRONLY);
            }
        }
+    }
+
+    if (fd < 0) {
+        error_report("tpm_passthrough: Could not guess TPM cancel path");
    } else {
-       error_report("tpm_passthrough: Bad TPM device path %s",
-                    tpm_pt->tpm_dev);
+        tpm_pt->options->cancel_path = g_strdup(path);
    }

    return fd;
--- a/hw/usb/dev-mtp.c
+++ b/hw/usb/dev-mtp.c
@ -965,12 +965,16 @@ static MTPData *usb_mtp_get_object(MTPState *s, MTPControl *c,
 static MTPData *usb_mtp_get_partial_object(MTPState *s, MTPControl *c,
                                           MTPObject *o)
 {
-    MTPData *d = usb_mtp_data_alloc(c);
+    MTPData *d;
    off_t offset;

+    if (c->argc <= 2) {
+        return NULL;
+    }
    trace_usb_mtp_op_get_partial_object(s->dev.addr, o->handle, o->path,
                                        c->argv[1], c->argv[2]);

+    d = usb_mtp_data_alloc(c);
    d->fd = open(o->path, O_RDONLY);
    if (d->fd == -1) {
        usb_mtp_data_free(d);
--- a/hw/usb/dev-smartcard-reader.c
+++ b/hw/usb/dev-smartcard-reader.c
@ -329,8 +329,8 @@ static const uint8_t qemu_ccid_descriptor[] = {
                     */
        0x07,       /* u8  bVoltageSupport; 01h - 5.0v, 02h - 3.0, 03 - 1.8 */

-        0x00, 0x00, /* u32 dwProtocols; RRRR PPPP. RRRR = 0000h.*/
-        0x01, 0x00, /* PPPP: 0001h = Protocol T=0, 0002h = Protocol T=1 */
+        0x01, 0x00, /* u32 dwProtocols; RRRR PPPP. RRRR = 0000h.*/
+        0x00, 0x00, /* PPPP: 0001h = Protocol T=0, 0002h = Protocol T=1 */
                    /* u32 dwDefaultClock; in kHZ (0x0fa0 is 4 MHz) */
        0xa0, 0x0f, 0x00, 0x00,
                    /* u32 dwMaximumClock; */
--- a/hw/usb/dev-storage.c
+++ b/hw/usb/dev-storage.c
@ -635,7 +635,8 @@ static void usb_msd_realize_storage(USBDevice *dev, Error **errp)
    scsi_bus_new(&s->bus, sizeof(s->bus), DEVICE(dev),
                 &usb_msd_scsi_info_storage, NULL);
    scsi_dev = scsi_bus_legacy_add_drive(&s->bus, blk, 0, !!s->removable,
-                                         s->conf.bootindex, dev->serial,
+                                         s->conf.bootindex, s->conf.share_rw,
+                                         dev->serial,
                                         &err);
    blk_unref(blk);
    if (!scsi_dev) {
--- a/hw/usb/host-libusb.c
+++ b/hw/usb/host-libusb.c
@ -247,7 +247,11 @@ static int usb_host_init(void)
    if (rc != 0) {
        return -1;
    }
+#if LIBUSB_API_VERSION >= 0x01000106
+    libusb_set_option(ctx, LIBUSB_OPTION_LOG_LEVEL, loglevel);
+#else
    libusb_set_debug(ctx, loglevel);
+#endif
 #ifdef CONFIG_WIN32
    /* FIXME: add support for Windows. */
 #else
--- a/hw/usb/redirect.c
+++ b/hw/usb/redirect.c
@ -795,7 +795,7 @@ static void usbredir_handle_bulk_data(USBRedirDevice *dev, USBPacket *p,
           usbredirparser_peer_has_cap(dev->parser,
                                       usb_redir_cap_32bits_bulk_length));

-    if (ep & USB_DIR_IN) {
+    if (ep & USB_DIR_IN || size == 0) {
        usbredirparser_send_bulk_packet(dev->parser, p->id,
                                        &bulk_packet, NULL, 0);
    } else {
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@ -357,11 +357,13 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp)
        if (strcmp(vbasedev->name, vcdev->vdev.name) == 0) {
            error_setg(&err, "vfio: subchannel %s has already been attached",
                       vcdev->vdev.name);
+            g_free(vcdev->vdev.name);
            goto out_device_err;
        }
    }

    if (vfio_get_device(group, cdev->mdevid, &vcdev->vdev, &err)) {
+        g_free(vcdev->vdev.name);
        goto out_device_err;
    }

--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@ -968,6 +968,7 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
            group->container = container;
            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+            vfio_kvm_device_add_group(group);
            return 0;
        }
    }
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@ -317,11 +317,14 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev,
                                     &offset);
        fd = memory_region_get_fd(mr);
        if (fd > 0) {
+            if (fd_num == VHOST_MEMORY_MAX_NREGIONS) {
+                error_report("Failed preparing vhost-user memory table msg");
+                return -1;
+            }
            msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
            msg.payload.memory.regions[fd_num].memory_size  = reg->memory_size;
            msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
            msg.payload.memory.regions[fd_num].mmap_offset = offset;
-            assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
            fds[fd_num++] = fd;
        }
    }
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@ -234,6 +234,7 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
                memory_region_is_rom(section.mr) ||
                memory_region_is_romd(section.mr)) {
                trace_virtio_balloon_bad_addr(pa);
+                memory_region_unref(section.mr);
                continue;
            }

--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@ -2469,7 +2469,7 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice *vdev, const char *fmt, ...)
    va_end(ap);

    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
-        virtio_set_status(vdev, vdev->status | VIRTIO_CONFIG_S_NEEDS_RESET);
+        vdev->status = vdev->status | VIRTIO_CONFIG_S_NEEDS_RESET;
        virtio_notify_config(vdev);
    }

--- a/include/block/block.h
+++ b/include/block/block.h
@ -437,6 +437,7 @@ bool bdrv_is_read_only(BlockDriverState *bs);
 int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
                           bool ignore_allow_rdw, Error **errp);
 int bdrv_set_read_only(BlockDriverState *bs, bool read_only, Error **errp);
+bool bdrv_is_writable(BlockDriverState *bs);
 bool bdrv_is_sg(BlockDriverState *bs);
 bool bdrv_is_inserted(BlockDriverState *bs);
 void bdrv_lock_medium(BlockDriverState *bs, bool locked);
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@ -159,8 +159,12 @@ extern unsigned long guest_base;
 extern int have_guest_base;
 extern unsigned long reserved_va;

-#define GUEST_ADDR_MAX (reserved_va ? reserved_va : \
+#if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS
+#define GUEST_ADDR_MAX (~0ul)
+#else
+#define GUEST_ADDR_MAX (reserved_va ? reserved_va - 1 : \
                                    (1ul << TARGET_VIRT_ADDR_SPACE_BITS) - 1)
+#endif
 #else

 #include "exec/hwaddr.h"
--- a/include/exec/cpu_ldst.h
+++ b/include/exec/cpu_ldst.h
@ -51,15 +51,13 @@
 /* All direct uses of g2h and h2g need to go away for usermode softmmu.  */
 #define g2h(x) ((void *)((unsigned long)(target_ulong)(x) + guest_base))

-#if HOST_LONG_BITS <= TARGET_VIRT_ADDR_SPACE_BITS
-#define h2g_valid(x) 1
-#else
-#define h2g_valid(x) ({ \
-    unsigned long __guest = (unsigned long)(x) - guest_base; \
-    (__guest < (1ul << TARGET_VIRT_ADDR_SPACE_BITS)) && \
-    (!reserved_va || (__guest < reserved_va)); \
-})
-#endif
+#define guest_addr_valid(x) ((x) <= GUEST_ADDR_MAX)
+#define h2g_valid(x) guest_addr_valid((unsigned long)(x) - guest_base)
+
+static inline int guest_range_valid(unsigned long start, unsigned long len)
+{
+    return len - 1 <= GUEST_ADDR_MAX && start <= GUEST_ADDR_MAX - len + 1;
+}

 #define h2g_nocheck(x) ({ \
    unsigned long __ret = (unsigned long)(x) - guest_base; \
--- a/include/exec/memory-internal.h
+++ b/include/exec/memory-internal.h
@ -20,7 +20,15 @@
 #define MEMORY_INTERNAL_H

 #ifndef CONFIG_USER_ONLY
-typedef struct AddressSpaceDispatch AddressSpaceDispatch;
+static inline AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv)
+{
+    return fv->dispatch;
+}
+
+static inline AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as)
+{
+    return flatview_to_dispatch(address_space_to_flatview(as));
+}

 extern const MemoryRegionOps unassigned_mem_ops;

@ -30,9 +38,6 @@ bool memory_region_access_valid(MemoryRegion *mr, hwaddr addr,
 void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section);
 AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv);
 void address_space_dispatch_compact(AddressSpaceDispatch *d);
-
-AddressSpaceDispatch *address_space_to_dispatch(AddressSpace *as);
-AddressSpaceDispatch *flatview_to_dispatch(FlatView *fv);
 void address_space_dispatch_free(AddressSpaceDispatch *d);

 void mtree_print_dispatch(fprintf_function mon, void *f,
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@ -318,7 +318,27 @@ struct AddressSpace {
    QTAILQ_ENTRY(AddressSpace) address_spaces_link;
 };

-FlatView *address_space_to_flatview(AddressSpace *as);
+typedef struct AddressSpaceDispatch AddressSpaceDispatch;
+typedef struct FlatRange FlatRange;
+
+/* Flattened global view of current active memory hierarchy.  Kept in sorted
+ * order.
+ */
+struct FlatView {
+    struct rcu_head rcu;
+    unsigned ref;
+    FlatRange *ranges;
+    unsigned nr;
+    unsigned nr_allocated;
+    struct AddressSpaceDispatch *dispatch;
+    MemoryRegion *root;
+};
+
+static inline FlatView *address_space_to_flatview(AddressSpace *as)
+{
+    return atomic_rcu_read(&as->current_map);
+}
+

 /**
 * MemoryRegionSection: describes a fragment of a #MemoryRegion
@ -1887,13 +1907,12 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,


 /* Internal functions, part of the implementation of address_space_read.  */
+MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
+                                    MemTxAttrs attrs, uint8_t *buf, int len);
 MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                   MemTxAttrs attrs, uint8_t *buf,
                                   int len, hwaddr addr1, hwaddr l,
                                   MemoryRegion *mr);
-
-MemTxResult flatview_read_full(FlatView *fv, hwaddr addr,
-                               MemTxAttrs attrs, uint8_t *buf, int len);
 void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr);

 static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
@ -1912,7 +1931,7 @@ static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 *
 * Return a MemTxResult indicating whether the operation succeeded
 * or failed (eg unassigned memory, device rejected the transaction,
- * IOMMU fault).
+ * IOMMU fault).  Called within RCU critical section.
 *
 * @as: #AddressSpace to be accessed
 * @addr: address within that address space
@ -1920,17 +1939,20 @@ static inline bool memory_access_is_direct(MemoryRegion *mr, bool is_write)
 * @buf: buffer with the data transferred
 */
 static inline __attribute__((__always_inline__))
-MemTxResult flatview_read(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
-                          uint8_t *buf, int len)
+MemTxResult address_space_read(AddressSpace *as, hwaddr addr,
+                               MemTxAttrs attrs, uint8_t *buf,
+                               int len)
 {
    MemTxResult result = MEMTX_OK;
    hwaddr l, addr1;
    void *ptr;
    MemoryRegion *mr;
+    FlatView *fv;

    if (__builtin_constant_p(len)) {
        if (len) {
            rcu_read_lock();
+            fv = address_space_to_flatview(as);
            l = len;
            mr = flatview_translate(fv, addr, &addr1, &l, false);
            if (len == l && memory_access_is_direct(mr, false)) {
@ -1943,18 +1965,11 @@ MemTxResult flatview_read(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
            rcu_read_unlock();
        }
    } else {
-        result = flatview_read_full(fv, addr, attrs, buf, len);
+        result = address_space_read_full(as, addr, attrs, buf, len);
    }
    return result;
 }

-static inline MemTxResult address_space_read(AddressSpace *as, hwaddr addr,
-                                             MemTxAttrs attrs, uint8_t *buf,
-                                             int len)
-{
-    return flatview_read(address_space_to_flatview(as), addr, attrs, buf, len);
-}
-
 /**
 * address_space_read_cached: read from a cached RAM region
 *
--- a/include/hw/compat.h
+++ b/include/hw/compat.h
@ -1,6 +1,8 @@
 #ifndef HW_COMPAT_H
 #define HW_COMPAT_H

+#define HW_COMPAT_2_11
+
 #define HW_COMPAT_2_10 \
    {\
        .driver   = "virtio-mouse-device",\
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@ -27,6 +27,7 @@
 #include "hw/i386/ioapic.h"
 #include "hw/pci/msi.h"
 #include "hw/sysbus.h"
+#include "qemu/iova-tree.h"

 #define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
 #define INTEL_IOMMU_DEVICE(obj) \
@ -46,8 +47,10 @@
 #define VTD_SID_TO_DEVFN(sid)       ((sid) & 0xff)

 #define DMAR_REG_SIZE               0x230
-#define VTD_HOST_ADDRESS_WIDTH      39
-#define VTD_HAW_MASK                ((1ULL << VTD_HOST_ADDRESS_WIDTH) - 1)
+#define VTD_HOST_AW_39BIT           39
+#define VTD_HOST_AW_48BIT           48
+#define VTD_HOST_ADDRESS_WIDTH      VTD_HOST_AW_39BIT
+#define VTD_HAW_MASK(aw)            ((1ULL << (aw)) - 1)

 #define DMAR_REPORT_F_INTR          (1)

@ -65,7 +68,6 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDIrq VTDIrq;
 typedef struct VTD_MSIMessage VTD_MSIMessage;
-typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode;

 /* Context-Entry */
 struct VTDContextEntry {
@ -91,6 +93,10 @@ struct VTDAddressSpace {
    MemoryRegion iommu_ir;      /* Interrupt region: 0xfeeXXXXX */
    IntelIOMMUState *iommu_state;
    VTDContextCacheEntry context_cache_entry;
+    QLIST_ENTRY(VTDAddressSpace) next;
+    /* Superset of notifier flags that this address space has */
+    IOMMUNotifierFlag notifier_flags;
+    IOVATree *iova_tree;          /* Traces mapped IOVA ranges */
 };

 struct VTDBus {
@ -251,11 +257,6 @@ struct VTD_MSIMessage {
 /* When IR is enabled, all MSI/MSI-X data bits should be zero */
 #define VTD_IR_MSI_DATA          (0)

-struct IntelIOMMUNotifierNode {
-    VTDAddressSpace *vtd_as;
-    QLIST_ENTRY(IntelIOMMUNotifierNode) next;
-};
-
 /* The iommu (DMAR) device state struct */
 struct IntelIOMMUState {
    X86IOMMUState x86_iommu;
@ -293,7 +294,7 @@ struct IntelIOMMUState {
    GHashTable *vtd_as_by_busptr;   /* VTDBus objects indexed by PCIBus* reference */
    VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
    /* list of registered notifiers */
-    QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list;
+    QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;

    /* interrupt remapping */
    bool intr_enabled;              /* Whether guest enabled IR */
@ -302,6 +303,13 @@ struct IntelIOMMUState {
    bool intr_eime;                 /* Extended interrupt mode enabled */
    OnOffAuto intr_eim;             /* Toggle for EIM cabability */
    bool buggy_eim;                 /* Force buggy EIM unless eim=off */
+    uint8_t aw_bits;                /* Host/IOVA address width (in bits) */
+
+    /*
+     * Protects IOMMU states in general.  Currently it protects the
+     * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.
+     */
+    QemuMutex iommu_lock;
 };

 /* Find the VTD Address space associated with the given bus pointer,
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@ -217,6 +217,7 @@ struct GICv3State {
    uint32_t revision;
    bool security_extn;
    bool irq_reset_nonsecure;
+    bool gicd_no_migration_shift_bug;

    int dev_fd; /* kvm device fd if backed by kvm vgic support */
    Error *migration_blocker;
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@ -50,6 +50,41 @@ typedef enum {
    SPAPR_RESIZE_HPT_REQUIRED,
 } sPAPRResizeHPT;

+/**
+ * Capabilities
+ */
+
+/* Hardware Transactional Memory */
+#define SPAPR_CAP_HTM                   0x00
+/* Vector Scalar Extensions */
+#define SPAPR_CAP_VSX                   0x01
+/* Decimal Floating Point */
+#define SPAPR_CAP_DFP                   0x02
+/* Cache Flush on Privilege Change */
+#define SPAPR_CAP_CFPC                  0x03
+/* Speculation Barrier Bounds Checking */
+#define SPAPR_CAP_SBBC                  0x04
+/* Indirect Branch Serialisation */
+#define SPAPR_CAP_IBS                   0x05
+/* Num Caps */
+#define SPAPR_CAP_NUM                   (SPAPR_CAP_IBS + 1)
+
+/*
+ * Capability Values
+ */
+/* Bool Caps */
+#define SPAPR_CAP_OFF                   0x00
+#define SPAPR_CAP_ON                    0x01
+/* Broken | Workaround | Fixed Caps */
+#define SPAPR_CAP_BROKEN                0x00
+#define SPAPR_CAP_WORKAROUND            0x01
+#define SPAPR_CAP_FIXED                 0x02
+
+typedef struct sPAPRCapabilities sPAPRCapabilities;
+struct sPAPRCapabilities {
+    uint8_t caps[SPAPR_CAP_NUM];
+};
+
 /**
 * sPAPRMachineClass:
 */
@ -66,6 +101,7 @@ struct sPAPRMachineClass {
                          hwaddr *mmio32, hwaddr *mmio64,
                          unsigned n_dma, uint32_t *liobns, Error **errp);
    sPAPRResizeHPT resize_hpt_default;
+    sPAPRCapabilities default_caps;
 };

 /**
@ -127,6 +163,9 @@ struct sPAPRMachineState {
    MemoryHotplugState hotplug_memory;

    const char *icp_type;
+
+    bool cmd_line_caps[SPAPR_CAP_NUM];
+    sPAPRCapabilities def, eff, mig;
 };

 #define H_SUCCESS         0
@ -266,6 +305,18 @@ struct sPAPRMachineState {
 #define H_DABRX_KERNEL     (1ULL<<(63-62))
 #define H_DABRX_USER       (1ULL<<(63-63))

+/* Values for KVM_PPC_GET_CPU_CHAR & H_GET_CPU_CHARACTERISTICS */
+#define H_CPU_CHAR_SPEC_BAR_ORI31               PPC_BIT(0)
+#define H_CPU_CHAR_BCCTRL_SERIALISED            PPC_BIT(1)
+#define H_CPU_CHAR_L1D_FLUSH_ORI30              PPC_BIT(2)
+#define H_CPU_CHAR_L1D_FLUSH_TRIG2              PPC_BIT(3)
+#define H_CPU_CHAR_L1D_THREAD_PRIV              PPC_BIT(4)
+#define H_CPU_CHAR_HON_BRANCH_HINTS             PPC_BIT(5)
+#define H_CPU_CHAR_THR_RECONF_TRIG              PPC_BIT(6)
+#define H_CPU_BEHAV_FAVOUR_SECURITY             PPC_BIT(0)
+#define H_CPU_BEHAV_L1D_FLUSH_PR                PPC_BIT(1)
+#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR           PPC_BIT(2)
+
 /* Each control block has to be on a 4K boundary */
 #define H_CB_ALIGNMENT     4096

@ -353,6 +404,7 @@ struct sPAPRMachineState {
 #define H_GET_HCA_INFO          0x1B8
 #define H_GET_PERF_COUNT        0x1BC
 #define H_MANAGE_TRACE          0x1C0
+#define H_GET_CPU_CHARACTERISTICS 0x1C8
 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4
 #define H_QUERY_INT_STATE       0x1E4
 #define H_POLL_PENDING          0x1D8
@ -704,7 +756,30 @@ void spapr_do_system_reset_on_cpu(CPUState *cs, run_on_cpu_data arg);

 #define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))

-int spapr_vcpu_id(PowerPCCPU *cpu);
+int spapr_get_vcpu_id(PowerPCCPU *cpu);
+void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp);
 PowerPCCPU *spapr_find_cpu(int vcpu_id);

+int spapr_caps_pre_load(void *opaque);
+int spapr_caps_pre_save(void *opaque);
+
+/*
+ * Handling of optional capabilities
+ */
+extern const VMStateDescription vmstate_spapr_cap_htm;
+extern const VMStateDescription vmstate_spapr_cap_vsx;
+extern const VMStateDescription vmstate_spapr_cap_dfp;
+extern const VMStateDescription vmstate_spapr_cap_cfpc;
+extern const VMStateDescription vmstate_spapr_cap_sbbc;
+extern const VMStateDescription vmstate_spapr_cap_ibs;
+
+static inline uint8_t spapr_get_cap(sPAPRMachineState *spapr, int cap)
+{
+    return spapr->eff.caps[cap];
+}
+
+void spapr_caps_reset(sPAPRMachineState *spapr);
+void spapr_caps_add_properties(sPAPRMachineClass *smc, Error **errp);
+int spapr_caps_post_migration(sPAPRMachineState *spapr);
+
 #endif /* HW_SPAPR_H */
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@ -32,9 +32,9 @@ typedef enum DeviceCategory {

 typedef int (*qdev_initfn)(DeviceState *dev);
 typedef int (*qdev_event)(DeviceState *dev);
-typedef void (*qdev_resetfn)(DeviceState *dev);
 typedef void (*DeviceRealize)(DeviceState *dev, Error **errp);
 typedef void (*DeviceUnrealize)(DeviceState *dev, Error **errp);
+typedef void (*DeviceReset)(DeviceState *dev);
 typedef void (*BusRealize)(BusState *bus, Error **errp);
 typedef void (*BusUnrealize)(BusState *bus, Error **errp);

@ -117,7 +117,7 @@ typedef struct DeviceClass {
    bool hotpluggable;

    /* callbacks */
-    void (*reset)(DeviceState *dev);
+    DeviceReset reset;
    DeviceRealize realize;
    DeviceUnrealize unrealize;

@ -381,6 +381,16 @@ void qdev_machine_init(void);
 */
 void device_reset(DeviceState *dev);

+void device_class_set_parent_reset(DeviceClass *dc,
+                                   DeviceReset dev_reset,
+                                   DeviceReset *parent_reset);
+void device_class_set_parent_realize(DeviceClass *dc,
+                                     DeviceRealize dev_realize,
+                                     DeviceRealize *parent_realize);
+void device_class_set_parent_unrealize(DeviceClass *dc,
+                                       DeviceUnrealize dev_unrealize,
+                                       DeviceUnrealize *parent_unrealize);
+
 const struct VMStateDescription *qdev_get_vmsd(DeviceState *dev);

 const char *qdev_fw_name(DeviceState *dev);
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@ -151,6 +151,7 @@ static inline SCSIBus *scsi_bus_from_device(SCSIDevice *d)

 SCSIDevice *scsi_bus_legacy_add_drive(SCSIBus *bus, BlockBackend *blk,
                                      int unit, bool removable, int bootindex,
+                                      bool share_rw,
                                      const char *serial, Error **errp);
 void scsi_bus_legacy_handle_cmdline(SCSIBus *bus, bool deprecated);
 void scsi_legacy_handle_cmdline(void);
--- a/include/net/net.h
+++ b/include/net/net.h
@ -156,6 +156,7 @@ ssize_t qemu_send_packet_async(NetClientState *nc, const uint8_t *buf,
                               int size, NetPacketSent *sent_cb);
 void qemu_purge_queued_packets(NetClientState *nc);
 void qemu_flush_queued_packets(NetClientState *nc);
+void qemu_flush_or_purge_queued_packets(NetClientState *nc, bool purge);
 void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]);
 bool qemu_has_ufo(NetClientState *nc);
 bool qemu_has_vnet_hdr(NetClientState *nc);
--- a/include/qemu/iova-tree.h
+++ b/include/qemu/iova-tree.h
@ -0,0 +1,134 @@
+/*
+ * An very simplified iova tree implementation based on GTree.
+ *
+ * Copyright 2018 Red Hat, Inc.
+ *
+ * Authors:
+ *  Peter Xu <peterx@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ */
+#ifndef IOVA_TREE_H
+#define IOVA_TREE_H
+
+/*
+ * Currently the iova tree will only allow to keep ranges
+ * information, and no extra user data is allowed for each element.  A
+ * benefit is that we can merge adjacent ranges internally within the
+ * tree.  It can save a lot of memory when the ranges are splitted but
+ * mostly continuous.
+ *
+ * Note that current implementation does not provide any thread
+ * protections.  Callers of the iova tree should be responsible
+ * for the thread safety issue.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/memory.h"
+#include "exec/hwaddr.h"
+
+#define  IOVA_OK           (0)
+#define  IOVA_ERR_INVALID  (-1) /* Invalid parameters */
+#define  IOVA_ERR_OVERLAP  (-2) /* IOVA range overlapped */
+
+typedef struct IOVATree IOVATree;
+typedef struct DMAMap {
+    hwaddr iova;
+    hwaddr translated_addr;
+    hwaddr size;                /* Inclusive */
+    IOMMUAccessFlags perm;
+} QEMU_PACKED DMAMap;
+typedef gboolean (*iova_tree_iterator)(DMAMap *map);
+
+/**
+ * iova_tree_new:
+ *
+ * Create a new iova tree.
+ *
+ * Returns: the tree pointer when succeeded, or NULL if error.
+ */
+IOVATree *iova_tree_new(void);
+
+/**
+ * iova_tree_insert:
+ *
+ * @tree: the iova tree to insert
+ * @map: the mapping to insert
+ *
+ * Insert an iova range to the tree.  If there is overlapped
+ * ranges, IOVA_ERR_OVERLAP will be returned.
+ *
+ * Return: 0 if succeeded, or <0 if error.
+ */
+int iova_tree_insert(IOVATree *tree, DMAMap *map);
+
+/**
+ * iova_tree_remove:
+ *
+ * @tree: the iova tree to remove range from
+ * @map: the map range to remove
+ *
+ * Remove mappings from the tree that are covered by the map range
+ * provided.  The range does not need to be exactly what has inserted,
+ * all the mappings that are included in the provided range will be
+ * removed from the tree.  Here map->translated_addr is meaningless.
+ *
+ * Return: 0 if succeeded, or <0 if error.
+ */
+int iova_tree_remove(IOVATree *tree, DMAMap *map);
+
+/**
+ * iova_tree_find:
+ *
+ * @tree: the iova tree to search from
+ * @map: the mapping to search
+ *
+ * Search for a mapping in the iova tree that overlaps with the
+ * mapping range specified.  Only the first found mapping will be
+ * returned.
+ *
+ * Return: DMAMap pointer if found, or NULL if not found.  Note that
+ * the returned DMAMap pointer is maintained internally.  User should
+ * only read the content but never modify or free the content.  Also,
+ * user is responsible to make sure the pointer is valid (say, no
+ * concurrent deletion in progress).
+ */
+DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map);
+
+/**
+ * iova_tree_find_address:
+ *
+ * @tree: the iova tree to search from
+ * @iova: the iova address to find
+ *
+ * Similar to iova_tree_find(), but it tries to find mapping with
+ * range iova=iova & size=0.
+ *
+ * Return: same as iova_tree_find().
+ */
+DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova);
+
+/**
+ * iova_tree_foreach:
+ *
+ * @tree: the iova tree to iterate on
+ * @iterator: the interator for the mappings, return true to stop
+ *
+ * Iterate over the iova tree.
+ *
+ * Return: 1 if found any overlap, 0 if not, <0 if error.
+ */
+void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
+
+/**
+ * iova_tree_destroy:
+ *
+ * @tree: the iova tree to destroy
+ *
+ * Destroy an existing iova tree.
+ *
+ * Return: None.
+ */
+void iova_tree_destroy(IOVATree *tree);
+
+#endif
--- a/include/scsi/utils.h
+++ b/include/scsi/utils.h
@ -76,7 +76,11 @@ extern const struct SCSISense sense_code_LUN_FAILURE;
 extern const struct SCSISense sense_code_LUN_COMM_FAILURE;
 /* Command aborted, Overlapped Commands Attempted */
 extern const struct SCSISense sense_code_OVERLAPPED_COMMANDS;
-/* LUN not ready, Capacity data has changed */
+/* Medium error, Unrecovered read error */
+extern const struct SCSISense sense_code_READ_ERROR;
+/* LUN not ready, Cause not reportable */
+extern const struct SCSISense sense_code_NOT_READY;
+/* Unit attention, Capacity data has changed */
 extern const struct SCSISense sense_code_CAPACITY_CHANGED;
 /* Unit attention, SCSI bus reset */
 extern const struct SCSISense sense_code_SCSI_BUS_RESET;
--- a/include/standard-headers/asm-s390/virtio-ccw.h
+++ b/include/standard-headers/asm-s390/virtio-ccw.h
@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
 /*
 * Definitions for virtio-ccw devices.
 *
 * Copyright IBM Corp. 2013
 *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2 only)
- * as published by the Free Software Foundation.
- *
 *  Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
 */
 #ifndef __KVM_VIRTIO_CCW_H
--- a/include/standard-headers/asm-x86/hyperv.h
+++ b/include/standard-headers/asm-x86/hyperv.h
@ -1,393 +1 @@
-#ifndef _ASM_X86_HYPERV_H
-#define _ASM_X86_HYPERV_H
-
-#include "standard-headers/linux/types.h"
-
-/*
- * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
- * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
- */
-#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS	0x40000000
-#define HYPERV_CPUID_INTERFACE			0x40000001
-#define HYPERV_CPUID_VERSION			0x40000002
-#define HYPERV_CPUID_FEATURES			0x40000003
-#define HYPERV_CPUID_ENLIGHTMENT_INFO		0x40000004
-#define HYPERV_CPUID_IMPLEMENT_LIMITS		0x40000005
-
-#define HYPERV_HYPERVISOR_PRESENT_BIT		0x80000000
-#define HYPERV_CPUID_MIN			0x40000005
-#define HYPERV_CPUID_MAX			0x4000ffff
-
-/*
- * Feature identification. EAX indicates which features are available
- * to the partition based upon the current partition privileges.
- */
-
-/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
-#define HV_X64_MSR_VP_RUNTIME_AVAILABLE		(1 << 0)
-/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
-#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE	(1 << 1)
-/* Partition reference TSC MSR is available */
-#define HV_X64_MSR_REFERENCE_TSC_AVAILABLE              (1 << 9)
-
-/* A partition's reference time stamp counter (TSC) page */
-#define HV_X64_MSR_REFERENCE_TSC		0x40000021
-
-/*
- * There is a single feature flag that signifies if the partition has access
- * to MSRs with local APIC and TSC frequencies.
- */
-#define HV_X64_ACCESS_FREQUENCY_MSRS		(1 << 11)
-
-/*
- * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
- * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
- */
-#define HV_X64_MSR_SYNIC_AVAILABLE		(1 << 2)
-/*
- * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
- * HV_X64_MSR_STIMER3_COUNT) available
- */
-#define HV_X64_MSR_SYNTIMER_AVAILABLE		(1 << 3)
-/*
- * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
- * are available
- */
-#define HV_X64_MSR_APIC_ACCESS_AVAILABLE	(1 << 4)
-/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
-#define HV_X64_MSR_HYPERCALL_AVAILABLE		(1 << 5)
-/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
-#define HV_X64_MSR_VP_INDEX_AVAILABLE		(1 << 6)
-/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
-#define HV_X64_MSR_RESET_AVAILABLE		(1 << 7)
- /*
-  * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
-  * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
-  * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
-  */
-#define HV_X64_MSR_STAT_PAGES_AVAILABLE		(1 << 8)
-
-/* Frequency MSRs available */
-#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE	(1 << 8)
-
-/* Crash MSR available */
-#define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10)
-
-/*
- * Feature identification: EBX indicates which flags were specified at
- * partition creation. The format is the same as the partition creation
- * flag structure defined in section Partition Creation Flags.
- */
-#define HV_X64_CREATE_PARTITIONS		(1 << 0)
-#define HV_X64_ACCESS_PARTITION_ID		(1 << 1)
-#define HV_X64_ACCESS_MEMORY_POOL		(1 << 2)
-#define HV_X64_ADJUST_MESSAGE_BUFFERS		(1 << 3)
-#define HV_X64_POST_MESSAGES			(1 << 4)
-#define HV_X64_SIGNAL_EVENTS			(1 << 5)
-#define HV_X64_CREATE_PORT			(1 << 6)
-#define HV_X64_CONNECT_PORT			(1 << 7)
-#define HV_X64_ACCESS_STATS			(1 << 8)
-#define HV_X64_DEBUGGING			(1 << 11)
-#define HV_X64_CPU_POWER_MANAGEMENT		(1 << 12)
-#define HV_X64_CONFIGURE_PROFILER		(1 << 13)
-
-/*
- * Feature identification. EDX indicates which miscellaneous features
- * are available to the partition.
- */
-/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
-#define HV_X64_MWAIT_AVAILABLE				(1 << 0)
-/* Guest debugging support is available */
-#define HV_X64_GUEST_DEBUGGING_AVAILABLE		(1 << 1)
-/* Performance Monitor support is available*/
-#define HV_X64_PERF_MONITOR_AVAILABLE			(1 << 2)
-/* Support for physical CPU dynamic partitioning events is available*/
-#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE	(1 << 3)
-/*
- * Support for passing hypercall input parameter block via XMM
- * registers is available
- */
-#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE		(1 << 4)
-/* Support for a virtual guest idle state is available */
-#define HV_X64_GUEST_IDLE_STATE_AVAILABLE		(1 << 5)
-/* Guest crash data handler available */
-#define HV_X64_GUEST_CRASH_MSR_AVAILABLE		(1 << 10)
-
-/*
- * Implementation recommendations. Indicates which behaviors the hypervisor
- * recommends the OS implement for optimal performance.
- */
- /*
-  * Recommend using hypercall for address space switches rather
-  * than MOV to CR3 instruction
-  */
-#define HV_X64_AS_SWITCH_RECOMMENDED		(1 << 0)
-/* Recommend using hypercall for local TLB flushes rather
- * than INVLPG or MOV to CR3 instructions */
-#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED	(1 << 1)
-/*
- * Recommend using hypercall for remote TLB flushes rather
- * than inter-processor interrupts
- */
-#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED	(1 << 2)
-/*
- * Recommend using MSRs for accessing APIC registers
- * EOI, ICR and TPR rather than their memory-mapped counterparts
- */
-#define HV_X64_APIC_ACCESS_RECOMMENDED		(1 << 3)
-/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
-#define HV_X64_SYSTEM_RESET_RECOMMENDED		(1 << 4)
-/*
- * Recommend using relaxed timing for this partition. If used,
- * the VM should disable any watchdog timeouts that rely on the
- * timely delivery of external interrupts
- */
-#define HV_X64_RELAXED_TIMING_RECOMMENDED	(1 << 5)
-
-/*
- * Virtual APIC support
- */
-#define HV_X64_DEPRECATING_AEOI_RECOMMENDED	(1 << 9)
-
-/* Recommend using the newer ExProcessorMasks interface */
-#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED	(1 << 11)
-
-/*
- * Crash notification flag.
- */
-#define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63)
-
-/* MSR used to identify the guest OS. */
-#define HV_X64_MSR_GUEST_OS_ID			0x40000000
-
-/* MSR used to setup pages used to communicate with the hypervisor. */
-#define HV_X64_MSR_HYPERCALL			0x40000001
-
-/* MSR used to provide vcpu index */
-#define HV_X64_MSR_VP_INDEX			0x40000002
-
-/* MSR used to reset the guest OS. */
-#define HV_X64_MSR_RESET			0x40000003
-
-/* MSR used to provide vcpu runtime in 100ns units */
-#define HV_X64_MSR_VP_RUNTIME			0x40000010
-
-/* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT		0x40000020
-
-/* MSR used to retrieve the TSC frequency */
-#define HV_X64_MSR_TSC_FREQUENCY		0x40000022
-
-/* MSR used to retrieve the local APIC timer frequency */
-#define HV_X64_MSR_APIC_FREQUENCY		0x40000023
-
-/* Define the virtual APIC registers */
-#define HV_X64_MSR_EOI				0x40000070
-#define HV_X64_MSR_ICR				0x40000071
-#define HV_X64_MSR_TPR				0x40000072
-#define HV_X64_MSR_APIC_ASSIST_PAGE		0x40000073
-
-/* Define synthetic interrupt controller model specific registers. */
-#define HV_X64_MSR_SCONTROL			0x40000080
-#define HV_X64_MSR_SVERSION			0x40000081
-#define HV_X64_MSR_SIEFP			0x40000082
-#define HV_X64_MSR_SIMP				0x40000083
-#define HV_X64_MSR_EOM				0x40000084
-#define HV_X64_MSR_SINT0			0x40000090
-#define HV_X64_MSR_SINT1			0x40000091
-#define HV_X64_MSR_SINT2			0x40000092
-#define HV_X64_MSR_SINT3			0x40000093
-#define HV_X64_MSR_SINT4			0x40000094
-#define HV_X64_MSR_SINT5			0x40000095
-#define HV_X64_MSR_SINT6			0x40000096
-#define HV_X64_MSR_SINT7			0x40000097
-#define HV_X64_MSR_SINT8			0x40000098
-#define HV_X64_MSR_SINT9			0x40000099
-#define HV_X64_MSR_SINT10			0x4000009A
-#define HV_X64_MSR_SINT11			0x4000009B
-#define HV_X64_MSR_SINT12			0x4000009C
-#define HV_X64_MSR_SINT13			0x4000009D
-#define HV_X64_MSR_SINT14			0x4000009E
-#define HV_X64_MSR_SINT15			0x4000009F
-
-/*
- * Synthetic Timer MSRs. Four timers per vcpu.
- */
-#define HV_X64_MSR_STIMER0_CONFIG		0x400000B0
-#define HV_X64_MSR_STIMER0_COUNT		0x400000B1
-#define HV_X64_MSR_STIMER1_CONFIG		0x400000B2
-#define HV_X64_MSR_STIMER1_COUNT		0x400000B3
-#define HV_X64_MSR_STIMER2_CONFIG		0x400000B4
-#define HV_X64_MSR_STIMER2_COUNT		0x400000B5
-#define HV_X64_MSR_STIMER3_CONFIG		0x400000B6
-#define HV_X64_MSR_STIMER3_COUNT		0x400000B7
-
-/* Hyper-V guest crash notification MSR's */
-#define HV_X64_MSR_CRASH_P0			0x40000100
-#define HV_X64_MSR_CRASH_P1			0x40000101
-#define HV_X64_MSR_CRASH_P2			0x40000102
-#define HV_X64_MSR_CRASH_P3			0x40000103
-#define HV_X64_MSR_CRASH_P4			0x40000104
-#define HV_X64_MSR_CRASH_CTL			0x40000105
-#define HV_X64_MSR_CRASH_CTL_NOTIFY		(1ULL << 63)
-#define HV_X64_MSR_CRASH_PARAMS		\
-		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
-
-#define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
-#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
-#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
-		(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
-
-/* Declare the various hypercall operations. */
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE	0x0002
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST	0x0003
-#define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX  0x0013
-#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX   0x0014
-#define HVCALL_POST_MESSAGE			0x005c
-#define HVCALL_SIGNAL_EVENT			0x005d
-
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE		0x00000001
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT	12
-#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK	\
-		(~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
-
-#define HV_X64_MSR_TSC_REFERENCE_ENABLE		0x00000001
-#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT	12
-
-#define HV_PROCESSOR_POWER_STATE_C0		0
-#define HV_PROCESSOR_POWER_STATE_C1		1
-#define HV_PROCESSOR_POWER_STATE_C2		2
-#define HV_PROCESSOR_POWER_STATE_C3		3
-
-#define HV_FLUSH_ALL_PROCESSORS			BIT(0)
-#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES	BIT(1)
-#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY	BIT(2)
-#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT	BIT(3)
-
-enum HV_GENERIC_SET_FORMAT {
-	HV_GENERIC_SET_SPARCE_4K,
-	HV_GENERIC_SET_ALL,
-};
-
-/* hypercall status code */
-#define HV_STATUS_SUCCESS			0
-#define HV_STATUS_INVALID_HYPERCALL_CODE	2
-#define HV_STATUS_INVALID_HYPERCALL_INPUT	3
-#define HV_STATUS_INVALID_ALIGNMENT		4
-#define HV_STATUS_INSUFFICIENT_MEMORY		11
-#define HV_STATUS_INVALID_CONNECTION_ID		18
-#define HV_STATUS_INSUFFICIENT_BUFFERS		19
-
-typedef struct _HV_REFERENCE_TSC_PAGE {
-	uint32_t tsc_sequence;
-	uint32_t res1;
-	uint64_t tsc_scale;
-	int64_t tsc_offset;
-} HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
-
-/* Define the number of synthetic interrupt sources. */
-#define HV_SYNIC_SINT_COUNT		(16)
-/* Define the expected SynIC version. */
-#define HV_SYNIC_VERSION_1		(0x1)
-
-#define HV_SYNIC_CONTROL_ENABLE		(1ULL << 0)
-#define HV_SYNIC_SIMP_ENABLE		(1ULL << 0)
-#define HV_SYNIC_SIEFP_ENABLE		(1ULL << 0)
-#define HV_SYNIC_SINT_MASKED		(1ULL << 16)
-#define HV_SYNIC_SINT_AUTO_EOI		(1ULL << 17)
-#define HV_SYNIC_SINT_VECTOR_MASK	(0xFF)
-
-#define HV_SYNIC_STIMER_COUNT		(4)
-
-/* Define synthetic interrupt controller message constants. */
-#define HV_MESSAGE_SIZE			(256)
-#define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
-#define HV_MESSAGE_PAYLOAD_QWORD_COUNT	(30)
-
-/* Define hypervisor message types. */
-enum hv_message_type {
-	HVMSG_NONE			= 0x00000000,
-
-	/* Memory access messages. */
-	HVMSG_UNMAPPED_GPA		= 0x80000000,
-	HVMSG_GPA_INTERCEPT		= 0x80000001,
-
-	/* Timer notification messages. */
-	HVMSG_TIMER_EXPIRED			= 0x80000010,
-
-	/* Error messages. */
-	HVMSG_INVALID_VP_REGISTER_VALUE	= 0x80000020,
-	HVMSG_UNRECOVERABLE_EXCEPTION	= 0x80000021,
-	HVMSG_UNSUPPORTED_FEATURE		= 0x80000022,
-
-	/* Trace buffer complete messages. */
-	HVMSG_EVENTLOG_BUFFERCOMPLETE	= 0x80000040,
-
-	/* Platform-specific processor intercept messages. */
-	HVMSG_X64_IOPORT_INTERCEPT		= 0x80010000,
-	HVMSG_X64_MSR_INTERCEPT		= 0x80010001,
-	HVMSG_X64_CPUID_INTERCEPT		= 0x80010002,
-	HVMSG_X64_EXCEPTION_INTERCEPT	= 0x80010003,
-	HVMSG_X64_APIC_EOI			= 0x80010004,
-	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
-};
-
-/* Define synthetic interrupt controller message flags. */
-union hv_message_flags {
-	uint8_t asu8;
-	struct {
-		uint8_t msg_pending:1;
-		uint8_t reserved:7;
-	};
-};
-
-/* Define port identifier type. */
-union hv_port_id {
-	uint32_t asu32;
-	struct {
-		uint32_t id:24;
-		uint32_t reserved:8;
-	} u;
-};
-
-/* Define synthetic interrupt controller message header. */
-struct hv_message_header {
-	uint32_t message_type;
-	uint8_t payload_size;
-	union hv_message_flags message_flags;
-	uint8_t reserved[2];
-	union {
-		uint64_t sender;
-		union hv_port_id port;
-	};
-};
-
-/* Define synthetic interrupt controller message format. */
-struct hv_message {
-	struct hv_message_header header;
-	union {
-		uint64_t payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
-	} u;
-};
-
-/* Define the synthetic interrupt message page layout. */
-struct hv_message_page {
-	struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
-};
-
-/* Define timer message payload structure. */
-struct hv_timer_message_payload {
-	uint32_t timer_index;
-	uint32_t reserved;
-	uint64_t expiration_time;	/* When the timer expired */
-	uint64_t delivery_time;	/* When the message was delivered */
-};
-
-#define HV_STIMER_ENABLE		(1ULL << 0)
-#define HV_STIMER_PERIODIC		(1ULL << 1)
-#define HV_STIMER_LAZY			(1ULL << 2)
-#define HV_STIMER_AUTOENABLE		(1ULL << 3)
-#define HV_STIMER_SINT(config)		(uint8_t)(((config) >> 16) & 0x0F)
-
-#endif
+        /* this is a temporary placeholder until kvm_para.h stops including it */
--- a/include/standard-headers/linux/input-event-codes.h
+++ b/include/standard-headers/linux/input-event-codes.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 * Input event codes
 *
@ -406,6 +407,7 @@
 #define BTN_TOOL_MOUSE		0x146
 #define BTN_TOOL_LENS		0x147
 #define BTN_TOOL_QUINTTAP	0x148	/* Five fingers on trackpad */
+#define BTN_STYLUS3		0x149
 #define BTN_TOUCH		0x14a
 #define BTN_STYLUS		0x14b
 #define BTN_STYLUS2		0x14c
--- a/include/standard-headers/linux/input.h
+++ b/include/standard-headers/linux/input.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 * Copyright (c) 1999-2002 Vojtech Pavlik
 *
--- a/include/standard-headers/linux/pci_regs.h
+++ b/include/standard-headers/linux/pci_regs.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *	pci_regs.h
 *
@ -746,6 +747,7 @@
 #define PCI_ERR_ROOT_FIRST_FATAL	0x00000010 /* First UNC is Fatal */
 #define PCI_ERR_ROOT_NONFATAL_RCV	0x00000020 /* Non-Fatal Received */
 #define PCI_ERR_ROOT_FATAL_RCV		0x00000040 /* Fatal Received */
+#define PCI_ERR_ROOT_AER_IRQ		0xf8000000 /* Advanced Error Interrupt Message Number */
 #define PCI_ERR_ROOT_ERR_SRC	52	/* Error Source Identification */

 /* Virtual Channel */
@ -939,9 +941,13 @@
 #define PCI_SATA_SIZEOF_LONG	16

 /* Resizable BARs */
+#define PCI_REBAR_CAP		4	/* capability register */
+#define  PCI_REBAR_CAP_SIZES		0x00FFFFF0  /* supported BAR sizes */
 #define PCI_REBAR_CTRL		8	/* control register */
-#define  PCI_REBAR_CTRL_NBAR_MASK	(7 << 5)	/* mask for # bars */
-#define  PCI_REBAR_CTRL_NBAR_SHIFT	5	/* shift for # bars */
+#define  PCI_REBAR_CTRL_BAR_IDX		0x00000007  /* BAR index */
+#define  PCI_REBAR_CTRL_NBAR_MASK	0x000000E0  /* # of resizable BARs */
+#define  PCI_REBAR_CTRL_NBAR_SHIFT	5  	    /* shift for # of BARs */
+#define  PCI_REBAR_CTRL_BAR_SIZE	0x00001F00  /* BAR size */

 /* Dynamic Power Allocation */
 #define PCI_DPA_CAP		4	/* capability register */
@ -960,6 +966,7 @@

 /* Downstream Port Containment */
 #define PCI_EXP_DPC_CAP			4	/* DPC Capability */
+#define PCI_EXP_DPC_IRQ			0x1f	/* DPC Interrupt Message Number */
 #define  PCI_EXP_DPC_CAP_RP_EXT		0x20	/* Root Port Extensions for DPC */
 #define  PCI_EXP_DPC_CAP_POISONED_TLP	0x40	/* Poisoned TLP Egress Blocking Supported */
 #define  PCI_EXP_DPC_CAP_SW_TRIGGER	0x80	/* Software Triggering Supported */
@ -995,19 +1002,25 @@
 #define  PCI_PTM_CTRL_ENABLE		0x00000001  /* PTM enable */
 #define  PCI_PTM_CTRL_ROOT		0x00000002  /* Root select */

-/* L1 PM Substates */
-#define PCI_L1SS_CAP		    4	/* capability register */
-#define  PCI_L1SS_CAP_PCIPM_L1_2	 1	/* PCI PM L1.2 Support */
-#define  PCI_L1SS_CAP_PCIPM_L1_1	 2	/* PCI PM L1.1 Support */
-#define  PCI_L1SS_CAP_ASPM_L1_2		 4	/* ASPM L1.2 Support */
-#define  PCI_L1SS_CAP_ASPM_L1_1		 8	/* ASPM L1.1 Support */
-#define  PCI_L1SS_CAP_L1_PM_SS		16	/* L1 PM Substates Support */
-#define PCI_L1SS_CTL1		    8	/* Control Register 1 */
-#define  PCI_L1SS_CTL1_PCIPM_L1_2	1	/* PCI PM L1.2 Enable */
-#define  PCI_L1SS_CTL1_PCIPM_L1_1	2	/* PCI PM L1.1 Support */
-#define  PCI_L1SS_CTL1_ASPM_L1_2	4	/* ASPM L1.2 Support */
-#define  PCI_L1SS_CTL1_ASPM_L1_1	8	/* ASPM L1.1 Support */
-#define  PCI_L1SS_CTL1_L1SS_MASK	0x0000000F
-#define PCI_L1SS_CTL2		    0xC	/* Control Register 2 */
+/* ASPM L1 PM Substates */
+#define PCI_L1SS_CAP		0x04	/* Capabilities Register */
+#define  PCI_L1SS_CAP_PCIPM_L1_2	0x00000001  /* PCI-PM L1.2 Supported */
+#define  PCI_L1SS_CAP_PCIPM_L1_1	0x00000002  /* PCI-PM L1.1 Supported */
+#define  PCI_L1SS_CAP_ASPM_L1_2		0x00000004  /* ASPM L1.2 Supported */
+#define  PCI_L1SS_CAP_ASPM_L1_1		0x00000008  /* ASPM L1.1 Supported */
+#define  PCI_L1SS_CAP_L1_PM_SS		0x00000010  /* L1 PM Substates Supported */
+#define  PCI_L1SS_CAP_CM_RESTORE_TIME	0x0000ff00  /* Port Common_Mode_Restore_Time */
+#define  PCI_L1SS_CAP_P_PWR_ON_SCALE	0x00030000  /* Port T_POWER_ON scale */
+#define  PCI_L1SS_CAP_P_PWR_ON_VALUE	0x00f80000  /* Port T_POWER_ON value */
+#define PCI_L1SS_CTL1		0x08	/* Control 1 Register */
+#define  PCI_L1SS_CTL1_PCIPM_L1_2	0x00000001  /* PCI-PM L1.2 Enable */
+#define  PCI_L1SS_CTL1_PCIPM_L1_1	0x00000002  /* PCI-PM L1.1 Enable */
+#define  PCI_L1SS_CTL1_ASPM_L1_2	0x00000004  /* ASPM L1.2 Enable */
+#define  PCI_L1SS_CTL1_ASPM_L1_1	0x00000008  /* ASPM L1.1 Enable */
+#define  PCI_L1SS_CTL1_L1SS_MASK	0x0000000f
+#define  PCI_L1SS_CTL1_CM_RESTORE_TIME	0x0000ff00  /* Common_Mode_Restore_Time */
+#define  PCI_L1SS_CTL1_LTR_L12_TH_VALUE	0x03ff0000  /* LTR_L1.2_THRESHOLD_Value */
+#define  PCI_L1SS_CTL1_LTR_L12_TH_SCALE	0xe0000000  /* LTR_L1.2_THRESHOLD_Scale */
+#define PCI_L1SS_CTL2		0x0c	/* Control 2 Register */

 #endif /* LINUX_PCI_REGS_H */
--- a/linux-headers/asm-arm/kvm.h
+++ b/linux-headers/asm-arm/kvm.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
@ -151,6 +152,12 @@ struct kvm_arch_memory_slot {
 	(__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64)
 #define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__)

+/* PL1 Physical Timer Registers */
+#define KVM_REG_ARM_PTIMER_CTL		ARM_CP15_REG32(0, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CNT		ARM_CP15_REG64(0, 14)
+#define KVM_REG_ARM_PTIMER_CVAL		ARM_CP15_REG64(2, 14)
+
+/* Virtual Timer Registers */
 #define KVM_REG_ARM_TIMER_CTL		ARM_CP15_REG32(0, 14, 3, 1)
 #define KVM_REG_ARM_TIMER_CNT		ARM_CP15_REG64(1, 14)
 #define KVM_REG_ARM_TIMER_CVAL		ARM_CP15_REG64(3, 14)
@ -215,6 +222,7 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_ITS_SAVE_TABLES		1
 #define   KVM_DEV_ARM_ITS_RESTORE_TABLES	2
 #define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
+#define   KVM_DEV_ARM_ITS_CTRL_RESET		4

 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
--- a/linux-headers/asm-arm/kvm_para.h
+++ b/linux-headers/asm-arm/kvm_para.h
@ -1 +1,2 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 #include <asm-generic/kvm_para.h>
--- a/linux-headers/asm-arm/unistd.h
+++ b/linux-headers/asm-arm/unistd.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 *  arch/arm/include/asm/unistd.h
 *
@ -35,5 +36,6 @@
 #define __ARM_NR_usr26			(__ARM_NR_BASE+3)
 #define __ARM_NR_usr32			(__ARM_NR_BASE+4)
 #define __ARM_NR_set_tls		(__ARM_NR_BASE+5)
+#define __ARM_NR_get_tls		(__ARM_NR_BASE+6)

 #endif /* __ASM_ARM_UNISTD_H */
--- a/linux-headers/asm-arm64/kvm.h
+++ b/linux-headers/asm-arm64/kvm.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
@ -195,6 +196,12 @@ struct kvm_arch_memory_slot {

 #define ARM64_SYS_REG(...) (__ARM64_SYS_REG(__VA_ARGS__) | KVM_REG_SIZE_U64)

+/* Physical Timer EL0 Registers */
+#define KVM_REG_ARM_PTIMER_CTL		ARM64_SYS_REG(3, 3, 14, 2, 1)
+#define KVM_REG_ARM_PTIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 2, 2)
+#define KVM_REG_ARM_PTIMER_CNT		ARM64_SYS_REG(3, 3, 14, 0, 1)
+
+/* EL0 Virtual Timer Registers */
 #define KVM_REG_ARM_TIMER_CTL		ARM64_SYS_REG(3, 3, 14, 3, 1)
 #define KVM_REG_ARM_TIMER_CNT		ARM64_SYS_REG(3, 3, 14, 3, 2)
 #define KVM_REG_ARM_TIMER_CVAL		ARM64_SYS_REG(3, 3, 14, 0, 2)
@ -227,6 +234,7 @@ struct kvm_arch_memory_slot {
 #define   KVM_DEV_ARM_ITS_SAVE_TABLES           1
 #define   KVM_DEV_ARM_ITS_RESTORE_TABLES        2
 #define   KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES	3
+#define   KVM_DEV_ARM_ITS_CTRL_RESET		4

 /* Device Control API on vcpu fd */
 #define KVM_ARM_VCPU_PMU_V3_CTRL	0
--- a/linux-headers/asm-arm64/unistd.h
+++ b/linux-headers/asm-arm64/unistd.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 * Copyright (C) 2012 ARM Ltd.
 *
--- a/linux-headers/asm-powerpc/epapr_hcalls.h
+++ b/linux-headers/asm-powerpc/epapr_hcalls.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
 /*
 * ePAPR hcall interface
 *
--- a/linux-headers/asm-powerpc/kvm.h
+++ b/linux-headers/asm-powerpc/kvm.h
@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
@ -442,6 +443,31 @@ struct kvm_ppc_rmmu_info {
 	__u32	ap_encodings[8];
 };

+/* For KVM_PPC_GET_CPU_CHAR */
+struct kvm_ppc_cpu_char {
+	__u64	character;		/* characteristics of the CPU */
+	__u64	behaviour;		/* recommended software behaviour */
+	__u64	character_mask;		/* valid bits in character */
+	__u64	behaviour_mask;		/* valid bits in behaviour */
+};
+
+/*
+ * Values for character and character_mask.
+ * These are identical to the values used by H_GET_CPU_CHARACTERISTICS.
+ */
+#define KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31		(1ULL << 63)
+#define KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED	(1ULL << 62)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30	(1ULL << 61)
+#define KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2	(1ULL << 60)
+#define KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV	(1ULL << 59)
+#define KVM_PPC_CPU_CHAR_BR_HINT_HONOURED	(1ULL << 58)
+#define KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF	(1ULL << 57)
+#define KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS	(1ULL << 56)
+
+#define KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY	(1ULL << 63)
+#define KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR		(1ULL << 62)
+#define KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ULL << 61)
+
 /* Per-vcpu XICS interrupt controller state */
 #define KVM_REG_PPC_ICP_STATE	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c)

--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .11.0
 .11.2