Block layer patches

-----BEGIN PGP SIGNATURE----- iQIcBAABAgAGBQJanYJPAAoJEH8JsnLIjy/WxjUQAJA+DTOmGXvaNpMs65BrU79K /r/iGVrzHv/RMLmrWMnqj96W9SnpMuiAP9hVLNsekqClY9q4ME4DpGcXhWfhSvF5 FC51ehvFJdfo8cPorsevcqNj60iWebjcx3lFfUq2606UOyYih3oijYxr6gSwWbRc GAgdGMqsvGYpzgqAQVEWHUhaX0La49/OzY42aR+E+LCBNfTYvlydvyoc+tUTdIpW 1eM/ASGndGsN0Cf2vxlbKgJ0/P6v+cRZuuIDhKZqre+YG+yM+pq7yZb+o7nf/P36 TPR93BsT7FSVAizRK7VFRuPIynHpiaxYygrJERCXF0sxsV4OlKjpmt/uUPamWFh+ 46Jx2NK1AuAx87BdErgmA119ObO3oAPxK0+2p981obb6SphTbbPxDj6SOlYCt4mJ mhff4JtIiwCmDSckAwd2mkBI1Tvl9qqcELrpyd2t2eU4ec2vf7fPd85EsK/Mq6Kr dbfqFvjNaaMxChoqFgkHAveYJ7zYqRFI2IY5o9c1QyZehCGPWjScxHXZZYdpDl59 YF9DkYQDOyvEX2jmMECaO1r/0nnO+BqQHu5ItJuTte9rjP9Q0do3iBISiIefewtf yji6/QNn2hFrnr1HPAwLFFC3kPgc8Mq8mIUb53j8vG/01KhVRCcnJm2K6D4IUwLZ S6ZnQJB97eE4y7YR5dNt =2axz -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging Block layer patches # gpg: Signature made Mon 05 Mar 2018 17:45:51 GMT # gpg: using RSA key 7F09B272C88F2FD6 # gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>" # Primary key fingerprint: DC3D EB15 9A9A F95D 3D74 56FE 7F09 B272 C88F 2FD6 * remotes/kevin/tags/for-upstream: (38 commits) block: Fix NULL dereference on empty drive error qcow2: Replace align_offset() with ROUND_UP() block/ssh: Add basic .bdrv_truncate() block/ssh: Make ssh_grow_file() blocking block/ssh: Pull ssh_grow_file() from ssh_create() qemu-img: Make resize error message more general qcow2: make qcow2_co_create2() a coroutine_fn block: rename .bdrv_create() to .bdrv_co_create_opts() Revert "IDE: Do not flush empty CDROM drives" block: test blk_aio_flush() with blk->root == NULL block: add BlockBackend->in_flight counter block: extract AIO_WAIT_WHILE() from BlockDriverState aio: rename aio_context_in_iothread() to in_aio_context_home_thread() docs: document how to use the l2-cache-entry-size parameter specs/qcow2: Fix documentation of the compressed cluster descriptor iotest 033: add misaligned write-zeroes test via truncate block: fix write with zero flag set and iovector provided block: Drop unused .bdrv_co_get_block_status() vvfat: Switch to .bdrv_co_block_status() vpc: Switch to .bdrv_co_block_status() ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org> # Conflicts: # include/block/block.h
2018-03-06 11:20:44 +00:00 · 2018-03-06 11:20:44 +00:00 · 58e2e17dba
parent e1ee9ee139 bfe1a14c18
commit 58e2e17dba
48 changed files with 983 additions and 613 deletions
--- a/block.c
+++ b/block.c
@ -418,7 +418,7 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque)
    CreateCo *cco = opaque;
    assert(cco->drv);

-    ret = cco->drv->bdrv_create(cco->filename, cco->opts, &local_err);
+    ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, &local_err);
    error_propagate(&cco->err, local_err);
    cco->ret = ret;
 }
@ -437,7 +437,7 @@ int bdrv_create(BlockDriver *drv, const char* filename,
        .err = NULL,
    };

-    if (!drv->bdrv_create) {
+    if (!drv->bdrv_co_create_opts) {
        error_setg(errp, "Driver '%s' does not support image creation", drv->format_name);
        ret = -ENOTSUP;
        goto out;
@ -4711,7 +4711,12 @@ out:

 AioContext *bdrv_get_aio_context(BlockDriverState *bs)
 {
-    return bs->aio_context;
+    return bs ? bs->aio_context : qemu_get_aio_context();
+}
+
+AioWait *bdrv_get_aio_wait(BlockDriverState *bs)
+{
+    return bs ? &bs->wait : NULL;
 }

 void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co)
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@ -627,15 +627,17 @@ static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
    return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
 }

-static int64_t coroutine_fn blkdebug_co_get_block_status(
-    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
-    BlockDriverState **file)
+static int coroutine_fn blkdebug_co_block_status(BlockDriverState *bs,
+                                                 bool want_zero,
+                                                 int64_t offset,
+                                                 int64_t bytes,
+                                                 int64_t *pnum,
+                                                 int64_t *map,
+                                                 BlockDriverState **file)
 {
-    assert(QEMU_IS_ALIGNED(sector_num | nb_sectors,
-                           DIV_ROUND_UP(bs->bl.request_alignment,
-                                        BDRV_SECTOR_SIZE)));
-    return bdrv_co_get_block_status_from_file(bs, sector_num, nb_sectors,
-                                              pnum, file);
+    assert(QEMU_IS_ALIGNED(offset | bytes, bs->bl.request_alignment));
+    return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
+                                          pnum, map, file);
 }

 static void blkdebug_close(BlockDriverState *bs)
@ -907,7 +909,7 @@ static BlockDriver bdrv_blkdebug = {
    .bdrv_co_flush_to_disk  = blkdebug_co_flush,
    .bdrv_co_pwrite_zeroes  = blkdebug_co_pwrite_zeroes,
    .bdrv_co_pdiscard       = blkdebug_co_pdiscard,
-    .bdrv_co_get_block_status = blkdebug_co_get_block_status,
+    .bdrv_co_block_status   = blkdebug_co_block_status,

    .bdrv_debug_event           = blkdebug_debug_event,
    .bdrv_debug_breakpoint      = blkdebug_debug_breakpoint,
--- a/block/block-backend.c
+++ b/block/block-backend.c
@ -73,6 +73,14 @@ struct BlockBackend {
    int quiesce_counter;
    VMChangeStateEntry *vmsh;
    bool force_allow_inactivate;
+
+    /* Number of in-flight aio requests.  BlockDriverState also counts
+     * in-flight requests but aio requests can exist even when blk->root is
+     * NULL, so we cannot rely on its counter for that case.
+     * Accessed with atomic ops.
+     */
+    unsigned int in_flight;
+    AioWait wait;
 };

 typedef struct BlockBackendAIOCB {
@ -1225,11 +1233,22 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
    return bdrv_make_zero(blk->root, flags);
 }

+static void blk_inc_in_flight(BlockBackend *blk)
+{
+    atomic_inc(&blk->in_flight);
+}
+
+static void blk_dec_in_flight(BlockBackend *blk)
+{
+    atomic_dec(&blk->in_flight);
+    aio_wait_kick(&blk->wait);
+}
+
 static void error_callback_bh(void *opaque)
 {
    struct BlockBackendAIOCB *acb = opaque;

-    bdrv_dec_in_flight(acb->common.bs);
+    blk_dec_in_flight(acb->blk);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
 }
@ -1240,7 +1259,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
 {
    struct BlockBackendAIOCB *acb;

-    bdrv_inc_in_flight(blk_bs(blk));
+    blk_inc_in_flight(blk);
    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
    acb->blk = blk;
    acb->ret = ret;
@ -1263,7 +1282,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = {
 static void blk_aio_complete(BlkAioEmAIOCB *acb)
 {
    if (acb->has_returned) {
-        bdrv_dec_in_flight(acb->common.bs);
+        blk_dec_in_flight(acb->rwco.blk);
        acb->common.cb(acb->common.opaque, acb->rwco.ret);
        qemu_aio_unref(acb);
    }
@ -1284,7 +1303,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
    BlkAioEmAIOCB *acb;
    Coroutine *co;

-    bdrv_inc_in_flight(blk_bs(blk));
+    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
@ -1521,14 +1540,41 @@ int blk_flush(BlockBackend *blk)

 void blk_drain(BlockBackend *blk)
 {
-    if (blk_bs(blk)) {
-        bdrv_drain(blk_bs(blk));
+    BlockDriverState *bs = blk_bs(blk);
+
+    if (bs) {
+        bdrv_drained_begin(bs);
+    }
+
+    /* We may have -ENOMEDIUM completions in flight */
+    AIO_WAIT_WHILE(&blk->wait,
+            blk_get_aio_context(blk),
+            atomic_mb_read(&blk->in_flight) > 0);
+
+    if (bs) {
+        bdrv_drained_end(bs);
    }
 }

 void blk_drain_all(void)
 {
-    bdrv_drain_all();
+    BlockBackend *blk = NULL;
+
+    bdrv_drain_all_begin();
+
+    while ((blk = blk_all_next(blk)) != NULL) {
+        AioContext *ctx = blk_get_aio_context(blk);
+
+        aio_context_acquire(ctx);
+
+        /* We may have -ENOMEDIUM completions in flight */
+        AIO_WAIT_WHILE(&blk->wait, ctx,
+                atomic_mb_read(&blk->in_flight) > 0);
+
+        aio_context_release(ctx);
+    }
+
+    bdrv_drain_all_end();
 }

 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
@ -1569,10 +1615,11 @@ static void send_qmp_error_event(BlockBackend *blk,
                                 bool is_read, int error)
 {
    IoOperationType optype;
+    BlockDriverState *bs = blk_bs(blk);

    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
-    qapi_event_send_block_io_error(blk_name(blk),
-                                   bdrv_get_node_name(blk_bs(blk)), optype,
+    qapi_event_send_block_io_error(blk_name(blk), !!bs,
+                                   bs ? bdrv_get_node_name(bs) : NULL, optype,
                                   action, blk_iostatus_is_enabled(blk),
                                   error == ENOSPC, strerror(error),
                                   &error_abort);
--- a/block/commit.c
+++ b/block/commit.c
@ -265,7 +265,7 @@ static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
 static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
-    .bdrv_co_get_block_status   = bdrv_co_get_block_status_from_backing,
+    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_close                 = bdrv_commit_top_close,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,
--- a/block/crypto.c
+++ b/block/crypto.c
@ -556,9 +556,9 @@ static int block_crypto_open_luks(BlockDriverState *bs,
                                     bs, options, flags, errp);
 }

-static int block_crypto_create_luks(const char *filename,
-                                    QemuOpts *opts,
-                                    Error **errp)
+static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename,
+                                                         QemuOpts *opts,
+                                                         Error **errp)
 {
    return block_crypto_create_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS,
                                       filename, opts, errp);
@ -617,7 +617,7 @@ BlockDriver bdrv_crypto_luks = {
    .bdrv_open          = block_crypto_open_luks,
    .bdrv_close         = block_crypto_close,
    .bdrv_child_perm    = bdrv_format_default_perms,
-    .bdrv_create        = block_crypto_create_luks,
+    .bdrv_co_create_opts = block_crypto_co_create_opts_luks,
    .bdrv_truncate      = block_crypto_truncate,
    .create_opts        = &block_crypto_create_opts_luks,

--- a/block/file-posix.c
+++ b/block/file-posix.c
@ -1982,7 +1982,8 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
    return (int64_t)st.st_blocks * 512;
 }

-static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    int fd;
    int result = 0;
@ -2131,25 +2132,24 @@ static int find_allocation(BlockDriverState *bs, off_t start,
 }

 /*
- * Returns the allocation status of the specified sectors.
+ * Returns the allocation status of the specified offset.
 *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
+ * The block layer guarantees 'offset' and 'bytes' are within bounds.
 *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
+ * 'pnum' is set to the number of bytes (including and immediately following
+ * the specified offset) that are known to be in the same
 * allocated/unallocated state.
 *
- * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
+ * 'bytes' is the max value 'pnum' should be set to.
 */
-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
-                                                    int64_t sector_num,
-                                                    int nb_sectors, int *pnum,
-                                                    BlockDriverState **file)
+static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
+                                            bool want_zero,
+                                            int64_t offset,
+                                            int64_t bytes, int64_t *pnum,
+                                            int64_t *map,
+                                            BlockDriverState **file)
 {
-    off_t start, data = 0, hole = 0;
-    int64_t total_size;
+    off_t data = 0, hole = 0;
    int ret;

    ret = fd_open(bs);
@ -2157,39 +2157,36 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
        return ret;
    }

-    start = sector_num * BDRV_SECTOR_SIZE;
-    total_size = bdrv_getlength(bs);
-    if (total_size < 0) {
-        return total_size;
-    } else if (start >= total_size) {
-        *pnum = 0;
-        return 0;
-    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
-        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
+    if (!want_zero) {
+        *pnum = bytes;
+        *map = offset;
+        *file = bs;
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

-    ret = find_allocation(bs, start, &data, &hole);
+    ret = find_allocation(bs, offset, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_DATA;
-    } else if (data == start) {
-        /* On a data extent, compute sectors to the end of the extent,
+    } else if (data == offset) {
+        /* On a data extent, compute bytes to the end of the extent,
         * possibly including a partial sector at EOF. */
-        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
+        *pnum = MIN(bytes, hole - offset);
        ret = BDRV_BLOCK_DATA;
    } else {
-        /* On a hole, compute sectors to the beginning of the next extent.  */
-        assert(hole == start);
-        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        /* On a hole, compute bytes to the beginning of the next extent.  */
+        assert(hole == offset);
+        *pnum = MIN(bytes, data - offset);
        ret = BDRV_BLOCK_ZERO;
    }
+    *map = offset;
    *file = bs;
-    return ret | BDRV_BLOCK_OFFSET_VALID | start;
+    return ret | BDRV_BLOCK_OFFSET_VALID;
 }

 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
@ -2280,9 +2277,9 @@ BlockDriver bdrv_file = {
    .bdrv_reopen_commit = raw_reopen_commit,
    .bdrv_reopen_abort = raw_reopen_abort,
    .bdrv_close = raw_close,
-    .bdrv_create = raw_create,
+    .bdrv_co_create_opts = raw_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = raw_co_get_block_status,
+    .bdrv_co_block_status = raw_co_block_status,
    .bdrv_co_pwrite_zeroes = raw_co_pwrite_zeroes,

    .bdrv_co_preadv         = raw_co_preadv,
@ -2684,8 +2681,8 @@ static coroutine_fn int hdev_co_pwrite_zeroes(BlockDriverState *bs,
    return -ENOTSUP;
 }

-static int hdev_create(const char *filename, QemuOpts *opts,
-                       Error **errp)
+static int coroutine_fn hdev_co_create_opts(const char *filename, QemuOpts *opts,
+                                            Error **errp)
 {
    int fd;
    int ret = 0;
@ -2758,7 +2755,7 @@ static BlockDriver bdrv_host_device = {
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create         = hdev_create,
+    .bdrv_co_create_opts = hdev_co_create_opts,
    .create_opts         = &raw_create_opts,
    .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,

@ -2880,7 +2877,7 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create         = hdev_create,
+    .bdrv_co_create_opts = hdev_co_create_opts,
    .create_opts         = &raw_create_opts,


@ -3011,7 +3008,7 @@ static BlockDriver bdrv_host_cdrom = {
    .bdrv_reopen_prepare = raw_reopen_prepare,
    .bdrv_reopen_commit  = raw_reopen_commit,
    .bdrv_reopen_abort   = raw_reopen_abort,
-    .bdrv_create        = hdev_create,
+    .bdrv_co_create_opts = hdev_co_create_opts,
    .create_opts        = &raw_create_opts,

    .bdrv_co_preadv         = raw_co_preadv,
--- a/block/file-win32.c
+++ b/block/file-win32.c
@ -553,7 +553,8 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs)
    return st.st_size;
 }

-static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    int fd;
    int64_t total_size = 0;
@ -599,7 +600,7 @@ BlockDriver bdrv_file = {
    .bdrv_file_open     = raw_open,
    .bdrv_refresh_limits = raw_probe_alignment,
    .bdrv_close         = raw_close,
-    .bdrv_create        = raw_create,
+    .bdrv_co_create_opts = raw_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,

    .bdrv_aio_readv     = raw_aio_readv,
--- a/block/gluster.c
+++ b/block/gluster.c
@ -1021,8 +1021,9 @@ static int qemu_gluster_do_truncate(struct glfs_fd *fd, int64_t offset,
    return 0;
 }

-static int qemu_gluster_create(const char *filename,
-                               QemuOpts *opts, Error **errp)
+static int coroutine_fn qemu_gluster_co_create_opts(const char *filename,
+                                                    QemuOpts *opts,
+                                                    Error **errp)
 {
    BlockdevOptionsGluster *gconf;
    struct glfs *glfs;
@ -1362,68 +1363,66 @@ exit:
 }

 /*
- * Returns the allocation status of the specified sectors.
+ * Returns the allocation status of the specified offset.
 *
- * If 'sector_num' is beyond the end of the disk image the return value is 0
- * and 'pnum' is set to 0.
+ * The block layer guarantees 'offset' and 'bytes' are within bounds.
 *
- * 'pnum' is set to the number of sectors (including and immediately following
- * the specified sector) that are known to be in the same
+ * 'pnum' is set to the number of bytes (including and immediately following
+ * the specified offset) that are known to be in the same
 * allocated/unallocated state.
 *
- * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
- * beyond the end of the disk image it will be clamped.
+ * 'bytes' is the max value 'pnum' should be set to.
 *
- * (Based on raw_co_get_block_status() from file-posix.c.)
+ * (Based on raw_co_block_status() from file-posix.c.)
 */
-static int64_t coroutine_fn qemu_gluster_co_get_block_status(
-        BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
-        BlockDriverState **file)
+static int coroutine_fn qemu_gluster_co_block_status(BlockDriverState *bs,
+                                                     bool want_zero,
+                                                     int64_t offset,
+                                                     int64_t bytes,
+                                                     int64_t *pnum,
+                                                     int64_t *map,
+                                                     BlockDriverState **file)
 {
    BDRVGlusterState *s = bs->opaque;
-    off_t start, data = 0, hole = 0;
-    int64_t total_size;
+    off_t data = 0, hole = 0;
    int ret = -EINVAL;

    if (!s->fd) {
        return ret;
    }

-    start = sector_num * BDRV_SECTOR_SIZE;
-    total_size = bdrv_getlength(bs);
-    if (total_size < 0) {
-        return total_size;
-    } else if (start >= total_size) {
-        *pnum = 0;
-        return 0;
-    } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) {
-        nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE);
+    if (!want_zero) {
+        *pnum = bytes;
+        *map = offset;
+        *file = bs;
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
    }

-    ret = find_allocation(bs, start, &data, &hole);
+    ret = find_allocation(bs, offset, &data, &hole);
    if (ret == -ENXIO) {
        /* Trailing hole */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_ZERO;
    } else if (ret < 0) {
        /* No info available, so pretend there are no holes */
-        *pnum = nb_sectors;
+        *pnum = bytes;
        ret = BDRV_BLOCK_DATA;
-    } else if (data == start) {
-        /* On a data extent, compute sectors to the end of the extent,
+    } else if (data == offset) {
+        /* On a data extent, compute bytes to the end of the extent,
         * possibly including a partial sector at EOF. */
-        *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE));
+        *pnum = MIN(bytes, hole - offset);
        ret = BDRV_BLOCK_DATA;
    } else {
-        /* On a hole, compute sectors to the beginning of the next extent.  */
-        assert(hole == start);
-        *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE);
+        /* On a hole, compute bytes to the beginning of the next extent.  */
+        assert(hole == offset);
+        *pnum = MIN(bytes, data - offset);
        ret = BDRV_BLOCK_ZERO;
    }

+    *map = offset;
    *file = bs;

-    return ret | BDRV_BLOCK_OFFSET_VALID | start;
+    return ret | BDRV_BLOCK_OFFSET_VALID;
 }


@ -1437,7 +1436,7 @@ static BlockDriver bdrv_gluster = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@ -1451,7 +1450,7 @@ static BlockDriver bdrv_gluster = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@ -1465,7 +1464,7 @@ static BlockDriver bdrv_gluster_tcp = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@ -1479,7 +1478,7 @@ static BlockDriver bdrv_gluster_tcp = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@ -1493,7 +1492,7 @@ static BlockDriver bdrv_gluster_unix = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@ -1507,7 +1506,7 @@ static BlockDriver bdrv_gluster_unix = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

@ -1527,7 +1526,7 @@ static BlockDriver bdrv_gluster_rdma = {
    .bdrv_reopen_commit           = qemu_gluster_reopen_commit,
    .bdrv_reopen_abort            = qemu_gluster_reopen_abort,
    .bdrv_close                   = qemu_gluster_close,
-    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_co_create_opts          = qemu_gluster_co_create_opts,
    .bdrv_getlength               = qemu_gluster_getlength,
    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
    .bdrv_truncate                = qemu_gluster_truncate,
@ -1541,7 +1540,7 @@ static BlockDriver bdrv_gluster_rdma = {
 #ifdef CONFIG_GLUSTERFS_ZEROFILL
    .bdrv_co_pwrite_zeroes        = qemu_gluster_co_pwrite_zeroes,
 #endif
-    .bdrv_co_get_block_status     = qemu_gluster_co_get_block_status,
+    .bdrv_co_block_status         = qemu_gluster_co_block_status,
    .create_opts                  = &qemu_gluster_create_opts,
 };

--- a/block/io.c
+++ b/block/io.c
@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "trace.h"
 #include "sysemu/block-backend.h"
+#include "block/aio-wait.h"
 #include "block/blockjob.h"
 #include "block/blockjob_int.h"
 #include "block/block_int.h"
@ -587,16 +588,9 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
    atomic_inc(&bs->in_flight);
 }

-static void dummy_bh_cb(void *opaque)
-{
-}
-
 void bdrv_wakeup(BlockDriverState *bs)
 {
-    /* The barrier (or an atomic op) is in the caller.  */
-    if (atomic_read(&bs->wakeup)) {
-        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
-    }
+    aio_wait_kick(bdrv_get_aio_wait(bs));
 }

 void bdrv_dec_in_flight(BlockDriverState *bs)
@ -1701,7 +1695,7 @@ int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
     */
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);

-    if (!qiov) {
+    if (flags & BDRV_REQ_ZERO_WRITE) {
        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
        goto out;
    }
@ -1868,30 +1862,34 @@ typedef struct BdrvCoBlockStatusData {
    bool done;
 } BdrvCoBlockStatusData;

-int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
-                                                        int64_t sector_num,
-                                                        int nb_sectors,
-                                                        int *pnum,
-                                                        BlockDriverState **file)
+int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
+                                                bool want_zero,
+                                                int64_t offset,
+                                                int64_t bytes,
+                                                int64_t *pnum,
+                                                int64_t *map,
+                                                BlockDriverState **file)
 {
    assert(bs->file && bs->file->bs);
-    *pnum = nb_sectors;
+    *pnum = bytes;
+    *map = offset;
    *file = bs->file->bs;
-    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }

-int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
-                                                           int64_t sector_num,
-                                                           int nb_sectors,
-                                                           int *pnum,
-                                                           BlockDriverState **file)
+int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
+                                                   bool want_zero,
+                                                   int64_t offset,
+                                                   int64_t bytes,
+                                                   int64_t *pnum,
+                                                   int64_t *map,
+                                                   BlockDriverState **file)
 {
    assert(bs->backing && bs->backing->bs);
-    *pnum = nb_sectors;
+    *pnum = bytes;
+    *map = offset;
    *file = bs->backing->bs;
-    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }

 /*
@ -1899,10 +1897,10 @@ int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
- * If 'want_zero' is true, the caller is querying for mapping purposes,
- * and the result should include BDRV_BLOCK_OFFSET_VALID and
- * BDRV_BLOCK_ZERO where possible; otherwise, the result may omit those
- * bits particularly if it allows for a larger value in 'pnum'.
+ * If 'want_zero' is true, the caller is querying for mapping
+ * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
+ * _ZERO where possible; otherwise, the result favors larger 'pnum',
+ * with a focus on accurate BDRV_BLOCK_ALLOCATED.
 *
 * If 'offset' is beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
@ -1959,7 +1957,7 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
-    if (!bs->drv->bdrv_co_get_block_status) {
+    if (!bs->drv->bdrv_co_block_status) {
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
@ -1976,44 +1974,24 @@ static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
-    /* TODO: until we have a byte-based driver callback, we also have to
-     * round out to sectors, even if that is bigger than request_alignment */
-    align = MAX(bs->bl.request_alignment, BDRV_SECTOR_SIZE);
+    align = bs->bl.request_alignment;
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

-    {
-        int count; /* sectors */
-        int64_t longret;
-
-        assert(QEMU_IS_ALIGNED(aligned_offset | aligned_bytes,
-                               BDRV_SECTOR_SIZE));
-        /*
-         * The contract allows us to return pnum smaller than bytes, even
-         * if the next query would see the same status; we truncate the
-         * request to avoid overflowing the driver's 32-bit interface.
-         */
-        longret = bs->drv->bdrv_co_get_block_status(
-            bs, aligned_offset >> BDRV_SECTOR_BITS,
-            MIN(INT_MAX, aligned_bytes) >> BDRV_SECTOR_BITS, &count,
-            &local_file);
-        if (longret < 0) {
-            assert(INT_MIN <= longret);
-            ret = longret;
-            goto out;
-        }
-        if (longret & BDRV_BLOCK_OFFSET_VALID) {
-            local_map = longret & BDRV_BLOCK_OFFSET_MASK;
-        }
-        ret = longret & ~BDRV_BLOCK_OFFSET_MASK;
-        *pnum = count * BDRV_SECTOR_SIZE;
+    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
+                                        aligned_bytes, pnum, &local_map,
+                                        &local_file);
+    if (ret < 0) {
+        *pnum = 0;
+        goto out;
    }

    /*
-     * The driver's result must be a multiple of request_alignment.
+     * The driver's result must be a non-zero multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
-    assert(QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset);
+    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
+           align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
--- a/block/iscsi.c
+++ b/block/iscsi.c
@ -86,7 +86,7 @@ typedef struct IscsiLun {
    unsigned long *allocmap;
    unsigned long *allocmap_valid;
    long allocmap_size;
-    int cluster_sectors;
+    int cluster_size;
    bool use_16_for_rw;
    bool write_protected;
    bool lbpme;
@ -430,9 +430,10 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
 {
    iscsi_allocmap_free(iscsilun);

+    assert(iscsilun->cluster_size);
    iscsilun->allocmap_size =
-        DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
-                     iscsilun->cluster_sectors);
+        DIV_ROUND_UP(iscsilun->num_blocks * iscsilun->block_size,
+                     iscsilun->cluster_size);

    iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
    if (!iscsilun->allocmap) {
@ -440,7 +441,7 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
    }

    if (open_flags & BDRV_O_NOCACHE) {
-        /* in case that cache.direct = on all allocmap entries are
+        /* when cache.direct = on all allocmap entries are
         * treated as invalid to force a relookup of the block
         * status on every read request */
        return 0;
@ -457,8 +458,8 @@ static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
 }

 static void
-iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
-                      int nb_sectors, bool allocated, bool valid)
+iscsi_allocmap_update(IscsiLun *iscsilun, int64_t offset,
+                      int64_t bytes, bool allocated, bool valid)
 {
    int64_t cl_num_expanded, nb_cls_expanded, cl_num_shrunk, nb_cls_shrunk;

@ -466,13 +467,13 @@ iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
        return;
    }
    /* expand to entirely contain all affected clusters */
-    cl_num_expanded = sector_num / iscsilun->cluster_sectors;
-    nb_cls_expanded = DIV_ROUND_UP(sector_num + nb_sectors,
-                                   iscsilun->cluster_sectors) - cl_num_expanded;
+    assert(iscsilun->cluster_size);
+    cl_num_expanded = offset / iscsilun->cluster_size;
+    nb_cls_expanded = DIV_ROUND_UP(offset + bytes,
+                                   iscsilun->cluster_size) - cl_num_expanded;
    /* shrink to touch only completely contained clusters */
-    cl_num_shrunk = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors);
-    nb_cls_shrunk = (sector_num + nb_sectors) / iscsilun->cluster_sectors
-                      - cl_num_shrunk;
+    cl_num_shrunk = DIV_ROUND_UP(offset, iscsilun->cluster_size);
+    nb_cls_shrunk = (offset + bytes) / iscsilun->cluster_size - cl_num_shrunk;
    if (allocated) {
        bitmap_set(iscsilun->allocmap, cl_num_expanded, nb_cls_expanded);
    } else {
@ -495,26 +496,26 @@ iscsi_allocmap_update(IscsiLun *iscsilun, int64_t sector_num,
 }

 static void
-iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t sector_num,
-                             int nb_sectors)
+iscsi_allocmap_set_allocated(IscsiLun *iscsilun, int64_t offset,
+                             int64_t bytes)
 {
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, true, true);
+    iscsi_allocmap_update(iscsilun, offset, bytes, true, true);
 }

 static void
-iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t sector_num,
-                               int nb_sectors)
+iscsi_allocmap_set_unallocated(IscsiLun *iscsilun, int64_t offset,
+                               int64_t bytes)
 {
    /* Note: if cache.direct=on the fifth argument to iscsi_allocmap_update
     * is ignored, so this will in effect be an iscsi_allocmap_set_invalid.
     */
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, true);
+    iscsi_allocmap_update(iscsilun, offset, bytes, false, true);
 }

-static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t sector_num,
-                                       int nb_sectors)
+static void iscsi_allocmap_set_invalid(IscsiLun *iscsilun, int64_t offset,
+                                       int64_t bytes)
 {
-    iscsi_allocmap_update(iscsilun, sector_num, nb_sectors, false, false);
+    iscsi_allocmap_update(iscsilun, offset, bytes, false, false);
 }

 static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
@ -528,28 +529,30 @@ static void iscsi_allocmap_invalidate(IscsiLun *iscsilun)
 }

 static inline bool
-iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num,
-                            int nb_sectors)
+iscsi_allocmap_is_allocated(IscsiLun *iscsilun, int64_t offset,
+                            int64_t bytes)
 {
    unsigned long size;
    if (iscsilun->allocmap == NULL) {
        return true;
    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    assert(iscsilun->cluster_size);
+    size = DIV_ROUND_UP(offset + bytes, iscsilun->cluster_size);
    return !(find_next_bit(iscsilun->allocmap, size,
-                           sector_num / iscsilun->cluster_sectors) == size);
+                           offset / iscsilun->cluster_size) == size);
 }

 static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
-                                           int64_t sector_num, int nb_sectors)
+                                           int64_t offset, int64_t bytes)
 {
    unsigned long size;
    if (iscsilun->allocmap_valid == NULL) {
        return false;
    }
-    size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors);
+    assert(iscsilun->cluster_size);
+    size = DIV_ROUND_UP(offset + bytes, iscsilun->cluster_size);
    return (find_next_zero_bit(iscsilun->allocmap_valid, size,
-                               sector_num / iscsilun->cluster_sectors) == size);
+                               offset / iscsilun->cluster_size) == size);
 }

 static int coroutine_fn
@ -631,14 +634,16 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        iscsi_allocmap_set_invalid(iscsilun, sector_num, nb_sectors);
+        iscsi_allocmap_set_invalid(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                   nb_sectors * BDRV_SECTOR_SIZE);
        error_report("iSCSI WRITE10/16 failed at lba %" PRIu64 ": %s", lba,
                     iTask.err_str);
        r = iTask.err_code;
        goto out_unlock;
    }

-    iscsi_allocmap_set_allocated(iscsilun, sector_num, nb_sectors);
+    iscsi_allocmap_set_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                 nb_sectors * BDRV_SECTOR_SIZE);

 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
@ -648,36 +653,36 @@ out_unlock:



-static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs,
-                                                  int64_t sector_num,
-                                                  int nb_sectors, int *pnum,
-                                                  BlockDriverState **file)
+static int coroutine_fn iscsi_co_block_status(BlockDriverState *bs,
+                                              bool want_zero, int64_t offset,
+                                              int64_t bytes, int64_t *pnum,
+                                              int64_t *map,
+                                              BlockDriverState **file)
 {
    IscsiLun *iscsilun = bs->opaque;
    struct scsi_get_lba_status *lbas = NULL;
    struct scsi_lba_status_descriptor *lbasd = NULL;
    struct IscsiTask iTask;
    uint64_t lba;
-    int64_t ret;
+    int ret;

    iscsi_co_init_iscsitask(iscsilun, &iTask);

-    if (!is_sector_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
-        ret = -EINVAL;
-        goto out;
-    }
+    assert(QEMU_IS_ALIGNED(offset | bytes, iscsilun->block_size));

    /* default to all sectors allocated */
-    ret = BDRV_BLOCK_DATA;
-    ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID;
-    *pnum = nb_sectors;
+    ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    if (map) {
+        *map = offset;
+    }
+    *pnum = bytes;

    /* LUN does not support logical block provisioning */
    if (!iscsilun->lbpme) {
        goto out;
    }

-    lba = sector_qemu2lun(sector_num, iscsilun);
+    lba = offset / iscsilun->block_size;

    qemu_mutex_lock(&iscsilun->mutex);
 retry:
@ -722,12 +727,12 @@ retry:

    lbasd = &lbas->descriptors[0];

-    if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) {
+    if (lba != lbasd->lba) {
        ret = -EIO;
        goto out_unlock;
    }

-    *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun);
+    *pnum = lbasd->num_blocks * iscsilun->block_size;

    if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED ||
        lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) {
@ -738,13 +743,13 @@ retry:
    }

    if (ret & BDRV_BLOCK_ZERO) {
-        iscsi_allocmap_set_unallocated(iscsilun, sector_num, *pnum);
+        iscsi_allocmap_set_unallocated(iscsilun, offset, *pnum);
    } else {
-        iscsi_allocmap_set_allocated(iscsilun, sector_num, *pnum);
+        iscsi_allocmap_set_allocated(iscsilun, offset, *pnum);
    }

-    if (*pnum > nb_sectors) {
-        *pnum = nb_sectors;
+    if (*pnum > bytes) {
+        *pnum = bytes;
    }
 out_unlock:
    qemu_mutex_unlock(&iscsilun->mutex);
@ -753,7 +758,7 @@ out:
    if (iTask.task != NULL) {
        scsi_free_scsi_task(iTask.task);
    }
-    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
+    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID && file) {
        *file = bs;
    }
    return ret;
@ -780,29 +785,37 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs,
    /* if cache.direct is off and we have a valid entry in our allocation map
     * we can skip checking the block status and directly return zeroes if
     * the request falls within an unallocated area */
-    if (iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
-        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
+    if (iscsi_allocmap_is_valid(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                nb_sectors * BDRV_SECTOR_SIZE) &&
+        !iscsi_allocmap_is_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                     nb_sectors * BDRV_SECTOR_SIZE)) {
            qemu_iovec_memset(iov, 0, 0x00, iov->size);
            return 0;
    }

    if (nb_sectors >= ISCSI_CHECKALLOC_THRES &&
-        !iscsi_allocmap_is_valid(iscsilun, sector_num, nb_sectors) &&
-        !iscsi_allocmap_is_allocated(iscsilun, sector_num, nb_sectors)) {
-        int pnum;
-        BlockDriverState *file;
+        !iscsi_allocmap_is_valid(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                 nb_sectors * BDRV_SECTOR_SIZE) &&
+        !iscsi_allocmap_is_allocated(iscsilun, sector_num * BDRV_SECTOR_SIZE,
+                                     nb_sectors * BDRV_SECTOR_SIZE)) {
+        int64_t pnum;
        /* check the block status from the beginning of the cluster
         * containing the start sector */
-        int64_t ret = iscsi_co_get_block_status(bs,
-                          sector_num - sector_num % iscsilun->cluster_sectors,
-                          BDRV_REQUEST_MAX_SECTORS, &pnum, &file);
+        int64_t head;
+        int ret;
+
+        assert(iscsilun->cluster_size);
+        head = (sector_num * BDRV_SECTOR_SIZE) % iscsilun->cluster_size;
+        ret = iscsi_co_block_status(bs, true,
+                                    sector_num * BDRV_SECTOR_SIZE - head,
+                                    BDRV_REQUEST_MAX_BYTES, &pnum, NULL, NULL);
        if (ret < 0) {
            return ret;
        }
        /* if the whole request falls into an unallocated area we can avoid
-         * to read and directly return zeroes instead */
+         * reading and directly return zeroes instead */
        if (ret & BDRV_BLOCK_ZERO &&
-            pnum >= nb_sectors + sector_num % iscsilun->cluster_sectors) {
+            pnum >= nb_sectors * BDRV_SECTOR_SIZE + head) {
            qemu_iovec_memset(iov, 0, 0x00, iov->size);
            return 0;
        }
@ -1146,8 +1159,7 @@ retry:
        goto retry;
    }

-    iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                               bytes >> BDRV_SECTOR_BITS);
+    iscsi_allocmap_set_invalid(iscsilun, offset, bytes);

    if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
        /* the target might fail with a check condition if it
@ -1260,8 +1272,7 @@ retry:
    }

    if (iTask.status != SCSI_STATUS_GOOD) {
-        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                   bytes >> BDRV_SECTOR_BITS);
+        iscsi_allocmap_set_invalid(iscsilun, offset, bytes);
        error_report("iSCSI WRITESAME10/16 failed at lba %" PRIu64 ": %s",
                     lba, iTask.err_str);
        r = iTask.err_code;
@ -1269,11 +1280,9 @@ retry:
    }

    if (flags & BDRV_REQ_MAY_UNMAP) {
-        iscsi_allocmap_set_invalid(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                   bytes >> BDRV_SECTOR_BITS);
+        iscsi_allocmap_set_invalid(iscsilun, offset, bytes);
    } else {
-        iscsi_allocmap_set_allocated(iscsilun, offset >> BDRV_SECTOR_BITS,
-                                     bytes >> BDRV_SECTOR_BITS);
+        iscsi_allocmap_set_allocated(iscsilun, offset, bytes);
    }

 out_unlock:
@ -1953,8 +1962,8 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
     * reasonable size */
    if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 &&
        iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) {
-        iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran *
-                                     iscsilun->block_size) >> BDRV_SECTOR_BITS;
+        iscsilun->cluster_size = iscsilun->bl.opt_unmap_gran *
+            iscsilun->block_size;
        if (iscsilun->lbprz) {
            ret = iscsi_allocmap_init(iscsilun, bs->open_flags);
        }
@ -2108,7 +2117,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset,
    return 0;
 }

-static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn iscsi_co_create_opts(const char *filename, QemuOpts *opts,
+                                             Error **errp)
 {
    int ret = 0;
    int64_t total_size = 0;
@ -2163,7 +2173,7 @@ static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
    IscsiLun *iscsilun = bs->opaque;
    bdi->unallocated_blocks_are_zero = iscsilun->lbprz;
-    bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE;
+    bdi->cluster_size = iscsilun->cluster_size;
    return 0;
 }

@ -2195,7 +2205,7 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_parse_filename    = iscsi_parse_filename,
    .bdrv_file_open         = iscsi_open,
    .bdrv_close             = iscsi_close,
-    .bdrv_create            = iscsi_create,
+    .bdrv_co_create_opts    = iscsi_co_create_opts,
    .create_opts            = &iscsi_create_opts,
    .bdrv_reopen_prepare    = iscsi_reopen_prepare,
    .bdrv_reopen_commit     = iscsi_reopen_commit,
@ -2206,7 +2216,7 @@ static BlockDriver bdrv_iscsi = {
    .bdrv_truncate   = iscsi_truncate,
    .bdrv_refresh_limits = iscsi_refresh_limits,

-    .bdrv_co_get_block_status = iscsi_co_get_block_status,
+    .bdrv_co_block_status  = iscsi_co_block_status,
    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
@ -2230,7 +2240,7 @@ static BlockDriver bdrv_iser = {
    .bdrv_parse_filename    = iscsi_parse_filename,
    .bdrv_file_open         = iscsi_open,
    .bdrv_close             = iscsi_close,
-    .bdrv_create            = iscsi_create,
+    .bdrv_co_create_opts    = iscsi_co_create_opts,
    .create_opts            = &iscsi_create_opts,
    .bdrv_reopen_prepare    = iscsi_reopen_prepare,
    .bdrv_reopen_commit     = iscsi_reopen_commit,
@ -2241,7 +2251,7 @@ static BlockDriver bdrv_iser = {
    .bdrv_truncate   = iscsi_truncate,
    .bdrv_refresh_limits = iscsi_refresh_limits,

-    .bdrv_co_get_block_status = iscsi_co_get_block_status,
+    .bdrv_co_block_status  = iscsi_co_block_status,
    .bdrv_co_pdiscard      = iscsi_co_pdiscard,
    .bdrv_co_pwrite_zeroes = iscsi_co_pwrite_zeroes,
    .bdrv_co_readv         = iscsi_co_readv,
--- a/block/mirror.c
+++ b/block/mirror.c
@ -1094,7 +1094,7 @@ static BlockDriver bdrv_mirror_top = {
    .bdrv_co_pwrite_zeroes      = bdrv_mirror_top_pwrite_zeroes,
    .bdrv_co_pdiscard           = bdrv_mirror_top_pdiscard,
    .bdrv_co_flush              = bdrv_mirror_top_flush,
-    .bdrv_co_get_block_status   = bdrv_co_get_block_status_from_backing,
+    .bdrv_co_block_status       = bdrv_co_block_status_from_backing,
    .bdrv_refresh_filename      = bdrv_mirror_top_refresh_filename,
    .bdrv_close                 = bdrv_mirror_top_close,
    .bdrv_child_perm            = bdrv_mirror_top_child_perm,
--- a/block/nfs.c
+++ b/block/nfs.c
@ -684,7 +684,8 @@ static QemuOptsList nfs_create_opts = {
    }
 };

-static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp)
+static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts,
+                                                Error **errp)
 {
    int64_t ret, total_size;
    NFSClient *client = g_new0(NFSClient, 1);
@ -897,7 +898,7 @@ static BlockDriver bdrv_nfs = {

    .bdrv_file_open                 = nfs_file_open,
    .bdrv_close                     = nfs_file_close,
-    .bdrv_create                    = nfs_file_create,
+    .bdrv_co_create_opts            = nfs_file_co_create_opts,
    .bdrv_reopen_prepare            = nfs_reopen_prepare,

    .bdrv_co_preadv                 = nfs_co_preadv,
--- a/block/null.c
+++ b/block/null.c
@ -223,22 +223,23 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state,
    return 0;
 }

-static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs,
-                                                     int64_t sector_num,
-                                                     int nb_sectors, int *pnum,
-                                                     BlockDriverState **file)
+static int coroutine_fn null_co_block_status(BlockDriverState *bs,
+                                             bool want_zero, int64_t offset,
+                                             int64_t bytes, int64_t *pnum,
+                                             int64_t *map,
+                                             BlockDriverState **file)
 {
    BDRVNullState *s = bs->opaque;
-    off_t start = sector_num * BDRV_SECTOR_SIZE;
+    int ret = BDRV_BLOCK_OFFSET_VALID;

-    *pnum = nb_sectors;
+    *pnum = bytes;
+    *map = offset;
    *file = bs;

    if (s->read_zeroes) {
-        return BDRV_BLOCK_OFFSET_VALID | start | BDRV_BLOCK_ZERO;
-    } else {
-        return BDRV_BLOCK_OFFSET_VALID | start;
+        ret |= BDRV_BLOCK_ZERO;
    }
+    return ret;
 }

 static void null_refresh_filename(BlockDriverState *bs, QDict *opts)
@ -270,7 +271,7 @@ static BlockDriver bdrv_null_co = {
    .bdrv_co_flush_to_disk  = null_co_flush,
    .bdrv_reopen_prepare    = null_reopen_prepare,

-    .bdrv_co_get_block_status   = null_co_get_block_status,
+    .bdrv_co_block_status   = null_co_block_status,

    .bdrv_refresh_filename  = null_refresh_filename,
 };
@ -290,7 +291,7 @@ static BlockDriver bdrv_null_aio = {
    .bdrv_aio_flush         = null_aio_flush,
    .bdrv_reopen_prepare    = null_reopen_prepare,

-    .bdrv_co_get_block_status   = null_co_get_block_status,
+    .bdrv_co_block_status   = null_co_block_status,

    .bdrv_refresh_filename  = null_refresh_filename,
 };
--- a/block/nvme.c
+++ b/block/nvme.c
@ -1072,18 +1072,6 @@ static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
    return 0;
 }

-static int64_t coroutine_fn nvme_co_get_block_status(BlockDriverState *bs,
-                                                     int64_t sector_num,
-                                                     int nb_sectors, int *pnum,
-                                                     BlockDriverState **file)
-{
-    *pnum = nb_sectors;
-    *file = bs;
-
-    return BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
-}
-
 static void nvme_refresh_filename(BlockDriverState *bs, QDict *opts)
 {
    QINCREF(opts);
@ -1183,8 +1171,6 @@ static BlockDriver bdrv_nvme = {
    .bdrv_co_flush_to_disk    = nvme_co_flush,
    .bdrv_reopen_prepare      = nvme_reopen_prepare,

-    .bdrv_co_get_block_status = nvme_co_get_block_status,
-
    .bdrv_refresh_filename    = nvme_refresh_filename,
    .bdrv_refresh_limits      = nvme_refresh_limits,

--- a/block/parallels.c
+++ b/block/parallels.c
@ -261,23 +261,31 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs)
 }


-static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn parallels_co_block_status(BlockDriverState *bs,
+                                                  bool want_zero,
+                                                  int64_t offset,
+                                                  int64_t bytes,
+                                                  int64_t *pnum,
+                                                  int64_t *map,
+                                                  BlockDriverState **file)
 {
    BDRVParallelsState *s = bs->opaque;
-    int64_t offset;
+    int count;

+    assert(QEMU_IS_ALIGNED(offset | bytes, BDRV_SECTOR_SIZE));
    qemu_co_mutex_lock(&s->lock);
-    offset = block_status(s, sector_num, nb_sectors, pnum);
+    offset = block_status(s, offset >> BDRV_SECTOR_BITS,
+                          bytes >> BDRV_SECTOR_BITS, &count);
    qemu_co_mutex_unlock(&s->lock);

+    *pnum = count * BDRV_SECTOR_SIZE;
    if (offset < 0) {
        return 0;
    }

+    *map = offset * BDRV_SECTOR_SIZE;
    *file = bs->file->bs;
-    return (offset << BDRV_SECTOR_BITS) |
-        BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }

 static coroutine_fn int parallels_co_writev(BlockDriverState *bs,
@ -467,7 +475,9 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res,
 }


-static int parallels_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn parallels_co_create_opts(const char *filename,
+                                                 QemuOpts *opts,
+                                                 Error **errp)
 {
    int64_t total_size, cl_size;
    uint8_t tmp[BDRV_SECTOR_SIZE];
@ -782,13 +792,13 @@ static BlockDriver bdrv_parallels = {
    .bdrv_open		= parallels_open,
    .bdrv_close		= parallels_close,
    .bdrv_child_perm          = bdrv_format_default_perms,
-    .bdrv_co_get_block_status = parallels_co_get_block_status,
+    .bdrv_co_block_status     = parallels_co_block_status,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
    .bdrv_co_flush_to_os      = parallels_co_flush_to_os,
    .bdrv_co_readv  = parallels_co_readv,
    .bdrv_co_writev = parallels_co_writev,
    .supports_backing = true,
-    .bdrv_create    = parallels_create,
+    .bdrv_co_create_opts = parallels_co_create_opts,
    .bdrv_check     = parallels_check,
    .create_opts    = &parallels_create_opts,
 };
--- a/block/qcow.c
+++ b/block/qcow.c
@ -524,23 +524,28 @@ static int get_cluster_offset(BlockDriverState *bs,
    return 1;
 }

-static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn qcow_co_block_status(BlockDriverState *bs,
+                                             bool want_zero,
+                                             int64_t offset, int64_t bytes,
+                                             int64_t *pnum, int64_t *map,
+                                             BlockDriverState **file)
 {
    BDRVQcowState *s = bs->opaque;
-    int index_in_cluster, n, ret;
+    int index_in_cluster, ret;
+    int64_t n;
    uint64_t cluster_offset;

    qemu_co_mutex_lock(&s->lock);
-    ret = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0, &cluster_offset);
+    ret = get_cluster_offset(bs, offset, 0, 0, 0, 0, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }
-    index_in_cluster = sector_num & (s->cluster_sectors - 1);
-    n = s->cluster_sectors - index_in_cluster;
-    if (n > nb_sectors)
-        n = nb_sectors;
+    index_in_cluster = offset & (s->cluster_size - 1);
+    n = s->cluster_size - index_in_cluster;
+    if (n > bytes) {
+        n = bytes;
+    }
    *pnum = n;
    if (!cluster_offset) {
        return 0;
@ -548,9 +553,9 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs,
    if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypto) {
        return BDRV_BLOCK_DATA;
    }
-    cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+    *map = cluster_offset | index_in_cluster;
    *file = bs->file->bs;
-    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset;
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }

 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
@ -805,7 +810,8 @@ static void qcow_close(BlockDriverState *bs)
    error_free(s->migration_blocker);
 }

-static int qcow_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn qcow_co_create_opts(const char *filename, QemuOpts *opts,
+                                            Error **errp)
 {
    int header_size, backing_filename_len, l1_size, shift, i;
    QCowHeader header;
@ -1122,13 +1128,13 @@ static BlockDriver bdrv_qcow = {
    .bdrv_close		= qcow_close,
    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_reopen_prepare    = qcow_reopen_prepare,
-    .bdrv_create            = qcow_create,
+    .bdrv_co_create_opts    = qcow_co_create_opts,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
    .supports_backing       = true,

    .bdrv_co_readv          = qcow_co_readv,
    .bdrv_co_writev         = qcow_co_writev,
-    .bdrv_co_get_block_status   = qcow_co_get_block_status,
+    .bdrv_co_block_status   = qcow_co_block_status,

    .bdrv_make_empty        = qcow_make_empty,
    .bdrv_co_pwritev_compressed = qcow_co_pwritev_compressed,
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@ -413,8 +413,8 @@ static inline void bitmap_dir_entry_to_be(Qcow2BitmapDirEntry *entry)

 static inline int calc_dir_entry_size(size_t name_size, size_t extra_data_size)
 {
-    return align_offset(sizeof(Qcow2BitmapDirEntry) +
-                        name_size + extra_data_size, 8);
+    int size = sizeof(Qcow2BitmapDirEntry) + name_size + extra_data_size;
+    return ROUND_UP(size, 8);
 }

 static inline int dir_entry_size(Qcow2BitmapDirEntry *entry)
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@ -126,11 +126,11 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,

    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
    new_l1_table = qemu_try_blockalign(bs->file->bs,
-                                       align_offset(new_l1_size2, 512));
+                                       ROUND_UP(new_l1_size2, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
-    memset(new_l1_table, 0, align_offset(new_l1_size2, 512));
+    memset(new_l1_table, 0, ROUND_UP(new_l1_size2, 512));

    if (s->l1_size) {
        memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@ -1204,7 +1204,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs,
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
-        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
+        l1_table = g_try_malloc0(ROUND_UP(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
@ -2553,7 +2553,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
    }

    /* align range to test to cluster boundaries */
-    size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
+    size = ROUND_UP(offset_into_cluster(s, offset) + size, s->cluster_size);
    offset = start_of_cluster(s, offset);

    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@ -66,7 +66,7 @@ int qcow2_read_snapshots(BlockDriverState *bs)

    for(i = 0; i < s->nb_snapshots; i++) {
        /* Read statically sized part of the snapshot header */
-        offset = align_offset(offset, 8);
+        offset = ROUND_UP(offset, 8);
        ret = bdrv_pread(bs->file, offset, &h, sizeof(h));
        if (ret < 0) {
            goto fail;
@ -155,7 +155,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
    offset = 0;
    for(i = 0; i < s->nb_snapshots; i++) {
        sn = s->snapshots + i;
-        offset = align_offset(offset, 8);
+        offset = ROUND_UP(offset, 8);
        offset += sizeof(h);
        offset += sizeof(extra);
        offset += strlen(sn->id_str);
@ -215,7 +215,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
        assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX);
        h.id_str_size = cpu_to_be16(id_str_size);
        h.name_size = cpu_to_be16(name_size);
-        offset = align_offset(offset, 8);
+        offset = ROUND_UP(offset, 8);

        ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h));
        if (ret < 0) {
@ -441,7 +441,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
    /* The VM state isn't needed any more in the active L1 table; in fact, it
     * hurts by causing expensive COW for the next snapshot. */
    qcow2_cluster_discard(bs, qcow2_vm_state_offset(s),
-                          align_offset(sn->vm_state_size, s->cluster_size),
+                          ROUND_UP(sn->vm_state_size, s->cluster_size),
                          QCOW2_DISCARD_NEVER, false);

 #ifdef DEBUG_ALLOC
@ -710,7 +710,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs,
    }
    new_l1_bytes = sn->l1_size * sizeof(uint64_t);
    new_l1_table = qemu_try_blockalign(bs->file->bs,
-                                       align_offset(new_l1_bytes, 512));
+                                       ROUND_UP(new_l1_bytes, 512));
    if (new_l1_table == NULL) {
        return -ENOMEM;
    }
--- a/block/qcow2.c
+++ b/block/qcow2.c
@ -1377,7 +1377,7 @@ static int qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,

    if (s->l1_size > 0) {
        s->l1_table = qemu_try_blockalign(bs->file->bs,
-            align_offset(s->l1_size * sizeof(uint64_t), 512));
+            ROUND_UP(s->l1_size * sizeof(uint64_t), 512));
        if (s->l1_table == NULL) {
            error_setg(errp, "Could not allocate L1 table");
            ret = -ENOMEM;
@ -1668,32 +1668,34 @@ static void qcow2_join_options(QDict *options, QDict *old_options)
    }
 }

-static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
+                                              bool want_zero,
+                                              int64_t offset, int64_t count,
+                                              int64_t *pnum, int64_t *map,
+                                              BlockDriverState **file)
 {
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_offset;
    int index_in_cluster, ret;
    unsigned int bytes;
-    int64_t status = 0;
+    int status = 0;

-    bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE);
+    bytes = MIN(INT_MAX, count);
    qemu_co_mutex_lock(&s->lock);
-    ret = qcow2_get_cluster_offset(bs, sector_num << BDRV_SECTOR_BITS, &bytes,
-                                   &cluster_offset);
+    ret = qcow2_get_cluster_offset(bs, offset, &bytes, &cluster_offset);
    qemu_co_mutex_unlock(&s->lock);
    if (ret < 0) {
        return ret;
    }

-    *pnum = bytes >> BDRV_SECTOR_BITS;
+    *pnum = bytes;

    if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
        !s->crypto) {
-        index_in_cluster = sector_num & (s->cluster_sectors - 1);
-        cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS);
+        index_in_cluster = offset & (s->cluster_size - 1);
+        *map = cluster_offset | index_in_cluster;
        *file = bs->file->bs;
-        status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset;
+        status |= BDRV_BLOCK_OFFSET_VALID;
    }
    if (ret == QCOW2_CLUSTER_ZERO_PLAIN || ret == QCOW2_CLUSTER_ZERO_ALLOC) {
        status |= BDRV_BLOCK_ZERO;
@ -2638,19 +2640,19 @@ static int64_t qcow2_calc_prealloc_size(int64_t total_size,
 {
    int64_t meta_size = 0;
    uint64_t nl1e, nl2e;
-    int64_t aligned_total_size = align_offset(total_size, cluster_size);
+    int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);

    /* header: 1 cluster */
    meta_size += cluster_size;

    /* total size of L2 tables */
    nl2e = aligned_total_size / cluster_size;
-    nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t));
+    nl2e = ROUND_UP(nl2e, cluster_size / sizeof(uint64_t));
    meta_size += nl2e * sizeof(uint64_t);

    /* total size of L1 tables */
    nl1e = nl2e * sizeof(uint64_t) / cluster_size;
-    nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t));
+    nl1e = ROUND_UP(nl1e, cluster_size / sizeof(uint64_t));
    meta_size += nl1e * sizeof(uint64_t);

    /* total size of refcount table and blocks */
@ -2721,11 +2723,12 @@ static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
    return refcount_bits;
 }

-static int qcow2_create2(const char *filename, int64_t total_size,
-                         const char *backing_file, const char *backing_format,
-                         int flags, size_t cluster_size, PreallocMode prealloc,
-                         QemuOpts *opts, int version, int refcount_order,
-                         const char *encryptfmt, Error **errp)
+static int coroutine_fn
+qcow2_co_create2(const char *filename, int64_t total_size,
+                 const char *backing_file, const char *backing_format,
+                 int flags, size_t cluster_size, PreallocMode prealloc,
+                 QemuOpts *opts, int version, int refcount_order,
+                 const char *encryptfmt, Error **errp)
 {
    QDict *options;

@ -2912,7 +2915,8 @@ out:
    return ret;
 }

-static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn qcow2_co_create_opts(const char *filename, QemuOpts *opts,
+                                             Error **errp)
 {
    char *backing_file = NULL;
    char *backing_fmt = NULL;
@ -2993,9 +2997,9 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp)

    refcount_order = ctz32(refcount_bits);

-    ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags,
-                        cluster_size, prealloc, opts, version, refcount_order,
-                        encryptfmt, &local_err);
+    ret = qcow2_co_create2(filename, size, backing_file, backing_fmt, flags,
+                           cluster_size, prealloc, opts, version, refcount_order,
+                           encryptfmt, &local_err);
    error_propagate(errp, local_err);

 finish:
@ -3704,8 +3708,8 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
    has_backing_file = !!optstr;
    g_free(optstr);

-    virtual_size = align_offset(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
-                                cluster_size);
+    virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+    virtual_size = ROUND_UP(virtual_size, cluster_size);

    /* Check that virtual disk size is valid */
    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
@ -3725,7 +3729,7 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
            goto err;
        }

-        virtual_size = align_offset(ssize, cluster_size);
+        virtual_size = ROUND_UP(ssize, cluster_size);

        if (has_backing_file) {
            /* We don't how much of the backing chain is shared by the input
@ -4348,9 +4352,9 @@ BlockDriver bdrv_qcow2 = {
    .bdrv_reopen_abort    = qcow2_reopen_abort,
    .bdrv_join_options    = qcow2_join_options,
    .bdrv_child_perm      = bdrv_format_default_perms,
-    .bdrv_create        = qcow2_create,
+    .bdrv_co_create_opts  = qcow2_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = qcow2_co_get_block_status,
+    .bdrv_co_block_status = qcow2_co_block_status,

    .bdrv_co_preadv         = qcow2_co_preadv,
    .bdrv_co_pwritev        = qcow2_co_pwritev,
--- a/block/qcow2.h
+++ b/block/qcow2.h
@ -480,12 +480,6 @@ static inline int offset_to_l2_slice_index(BDRVQcow2State *s, int64_t offset)
    return (offset >> s->cluster_bits) & (s->l2_slice_size - 1);
 }

-static inline int64_t align_offset(int64_t offset, int n)
-{
-    offset = (offset + n - 1) & ~(n - 1);
-    return offset;
-}
-
 static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s)
 {
    return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
--- a/block/qed.c
+++ b/block/qed.c
@ -638,7 +638,9 @@ out:
    return ret;
 }

-static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn bdrv_qed_co_create_opts(const char *filename,
+                                                QemuOpts *opts,
+                                                Error **errp)
 {
    uint64_t image_size = 0;
    uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE;
@ -688,74 +690,46 @@ finish:
    return ret;
 }

-typedef struct {
-    BlockDriverState *bs;
-    Coroutine *co;
-    uint64_t pos;
-    int64_t status;
-    int *pnum;
-    BlockDriverState **file;
-} QEDIsAllocatedCB;
-
-/* Called with table_lock held.  */
-static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len)
-{
-    QEDIsAllocatedCB *cb = opaque;
-    BDRVQEDState *s = cb->bs->opaque;
-    *cb->pnum = len / BDRV_SECTOR_SIZE;
-    switch (ret) {
-    case QED_CLUSTER_FOUND:
-        offset |= qed_offset_into_cluster(s, cb->pos);
-        cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
-        *cb->file = cb->bs->file->bs;
-        break;
-    case QED_CLUSTER_ZERO:
-        cb->status = BDRV_BLOCK_ZERO;
-        break;
-    case QED_CLUSTER_L2:
-    case QED_CLUSTER_L1:
-        cb->status = 0;
-        break;
-    default:
-        assert(ret < 0);
-        cb->status = ret;
-        break;
-    }
-
-    if (cb->co) {
-        aio_co_wake(cb->co);
-    }
-}
-
-static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs,
-                                                 int64_t sector_num,
-                                                 int nb_sectors, int *pnum,
+static int coroutine_fn bdrv_qed_co_block_status(BlockDriverState *bs,
+                                                 bool want_zero,
+                                                 int64_t pos, int64_t bytes,
+                                                 int64_t *pnum, int64_t *map,
                                                 BlockDriverState **file)
 {
    BDRVQEDState *s = bs->opaque;
-    size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE;
-    QEDIsAllocatedCB cb = {
-        .bs = bs,
-        .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE,
-        .status = BDRV_BLOCK_OFFSET_MASK,
-        .pnum = pnum,
-        .file = file,
-    };
+    size_t len = MIN(bytes, SIZE_MAX);
+    int status;
    QEDRequest request = { .l2_table = NULL };
    uint64_t offset;
    int ret;

    qemu_co_mutex_lock(&s->table_lock);
-    ret = qed_find_cluster(s, &request, cb.pos, &len, &offset);
-    qed_is_allocated_cb(&cb, ret, offset, len);
+    ret = qed_find_cluster(s, &request, pos, &len, &offset);

-    /* The callback was invoked immediately */
-    assert(cb.status != BDRV_BLOCK_OFFSET_MASK);
+    *pnum = len;
+    switch (ret) {
+    case QED_CLUSTER_FOUND:
+        *map = offset | qed_offset_into_cluster(s, pos);
+        status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+        *file = bs->file->bs;
+        break;
+    case QED_CLUSTER_ZERO:
+        status = BDRV_BLOCK_ZERO;
+        break;
+    case QED_CLUSTER_L2:
+    case QED_CLUSTER_L1:
+        status = 0;
+        break;
+    default:
+        assert(ret < 0);
+        status = ret;
+        break;
+    }

    qed_unref_l2_cache_entry(request.l2_table);
    qemu_co_mutex_unlock(&s->table_lock);

-    return cb.status;
+    return status;
 }

 static BDRVQEDState *acb_to_s(QEDAIOCB *acb)
@ -1592,9 +1566,9 @@ static BlockDriver bdrv_qed = {
    .bdrv_close               = bdrv_qed_close,
    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
    .bdrv_child_perm          = bdrv_format_default_perms,
-    .bdrv_create              = bdrv_qed_create,
+    .bdrv_co_create_opts      = bdrv_qed_co_create_opts,
    .bdrv_has_zero_init       = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = bdrv_qed_co_get_block_status,
+    .bdrv_co_block_status     = bdrv_qed_co_block_status,
    .bdrv_co_readv            = bdrv_qed_co_readv,
    .bdrv_co_writev           = bdrv_qed_co_writev,
    .bdrv_co_pwrite_zeroes    = bdrv_qed_co_pwrite_zeroes,
--- a/block/raw-format.c
+++ b/block/raw-format.c
@ -250,17 +250,17 @@ fail:
    return ret;
 }

-static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs,
-                                            int64_t sector_num,
-                                            int nb_sectors, int *pnum,
+static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
+                                            bool want_zero, int64_t offset,
+                                            int64_t bytes, int64_t *pnum,
+                                            int64_t *map,
                                            BlockDriverState **file)
 {
    BDRVRawState *s = bs->opaque;
-    *pnum = nb_sectors;
+    *pnum = bytes;
    *file = bs->file->bs;
-    sector_num += s->offset / BDRV_SECTOR_SIZE;
-    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-           (sector_num << BDRV_SECTOR_BITS);
+    *map = offset + s->offset;
+    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }

 static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
@ -396,7 +396,8 @@ static int raw_has_zero_init(BlockDriverState *bs)
    return bdrv_has_zero_init(bs->file->bs);
 }

-static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    return bdrv_create_file(filename, opts, errp);
 }
@ -491,12 +492,12 @@ BlockDriver bdrv_raw = {
    .bdrv_open            = &raw_open,
    .bdrv_close           = &raw_close,
    .bdrv_child_perm      = bdrv_filter_default_perms,
-    .bdrv_create          = &raw_create,
+    .bdrv_co_create_opts  = &raw_co_create_opts,
    .bdrv_co_preadv       = &raw_co_preadv,
    .bdrv_co_pwritev      = &raw_co_pwritev,
    .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
    .bdrv_co_pdiscard     = &raw_co_pdiscard,
-    .bdrv_co_get_block_status = &raw_co_get_block_status,
+    .bdrv_co_block_status = &raw_co_block_status,
    .bdrv_truncate        = &raw_truncate,
    .bdrv_getlength       = &raw_getlength,
    .has_variable_length  = true,
--- a/block/rbd.c
+++ b/block/rbd.c
@ -351,7 +351,9 @@ static QemuOptsList runtime_opts = {
    },
 };

-static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn qemu_rbd_co_create_opts(const char *filename,
+                                                QemuOpts *opts,
+                                                Error **errp)
 {
    Error *local_err = NULL;
    int64_t bytes = 0;
@ -1132,7 +1134,7 @@ static BlockDriver bdrv_rbd = {
    .bdrv_file_open         = qemu_rbd_open,
    .bdrv_close             = qemu_rbd_close,
    .bdrv_reopen_prepare    = qemu_rbd_reopen_prepare,
-    .bdrv_create            = qemu_rbd_create,
+    .bdrv_co_create_opts    = qemu_rbd_co_create_opts,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
    .bdrv_get_info          = qemu_rbd_getinfo,
    .create_opts            = &qemu_rbd_create_opts,
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@ -1959,8 +1959,8 @@ static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
    return 0;
 }

-static int sd_create(const char *filename, QemuOpts *opts,
-                     Error **errp)
+static int coroutine_fn sd_co_create_opts(const char *filename, QemuOpts *opts,
+                                          Error **errp)
 {
    Error *err = NULL;
    int ret = 0;
@ -3004,19 +3004,19 @@ static coroutine_fn int sd_co_pdiscard(BlockDriverState *bs, int64_t offset,
    return acb.ret;
 }

-static coroutine_fn int64_t
-sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-                       int *pnum, BlockDriverState **file)
+static coroutine_fn int
+sd_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
+                   int64_t bytes, int64_t *pnum, int64_t *map,
+                   BlockDriverState **file)
 {
    BDRVSheepdogState *s = bs->opaque;
    SheepdogInode *inode = &s->inode;
    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
-    uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
    unsigned long start = offset / object_size,
-                  end = DIV_ROUND_UP((sector_num + nb_sectors) *
-                                     BDRV_SECTOR_SIZE, object_size);
+                  end = DIV_ROUND_UP(offset + bytes, object_size);
    unsigned long idx;
-    int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
+    *map = offset;
+    int ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;

    for (idx = start; idx < end; idx++) {
        if (inode->data_vdi_id[idx] == 0) {
@ -3033,9 +3033,9 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
        }
    }

-    *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
-    if (*pnum > nb_sectors) {
-        *pnum = nb_sectors;
+    *pnum = (idx - start) * object_size;
+    if (*pnum > bytes) {
+        *pnum = bytes;
    }
    if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) {
        *file = bs;
@ -3103,7 +3103,7 @@ static BlockDriver bdrv_sheepdog = {
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
-    .bdrv_create                  = sd_create,
+    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
@ -3113,7 +3113,7 @@ static BlockDriver bdrv_sheepdog = {
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
-    .bdrv_co_get_block_status     = sd_co_get_block_status,
+    .bdrv_co_block_status         = sd_co_block_status,

    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
@ -3139,7 +3139,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
-    .bdrv_create                  = sd_create,
+    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
@ -3149,7 +3149,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
-    .bdrv_co_get_block_status     = sd_co_get_block_status,
+    .bdrv_co_block_status         = sd_co_block_status,

    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
@ -3175,7 +3175,7 @@ static BlockDriver bdrv_sheepdog_unix = {
    .bdrv_reopen_commit           = sd_reopen_commit,
    .bdrv_reopen_abort            = sd_reopen_abort,
    .bdrv_close                   = sd_close,
-    .bdrv_create                  = sd_create,
+    .bdrv_co_create_opts          = sd_co_create_opts,
    .bdrv_has_zero_init           = bdrv_has_zero_init_1,
    .bdrv_getlength               = sd_getlength,
    .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
@ -3185,7 +3185,7 @@ static BlockDriver bdrv_sheepdog_unix = {
    .bdrv_co_writev               = sd_co_writev,
    .bdrv_co_flush_to_disk        = sd_co_flush_to_disk,
    .bdrv_co_pdiscard             = sd_co_pdiscard,
-    .bdrv_co_get_block_status     = sd_co_get_block_status,
+    .bdrv_co_block_status         = sd_co_block_status,

    .bdrv_snapshot_create         = sd_snapshot_create,
    .bdrv_snapshot_goto           = sd_snapshot_goto,
--- a/block/ssh.c
+++ b/block/ssh.c
@ -803,6 +803,33 @@ static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags,
    return ret;
 }

+/* Note: This is a blocking operation */
+static int ssh_grow_file(BDRVSSHState *s, int64_t offset, Error **errp)
+{
+    ssize_t ret;
+    char c[1] = { '\0' };
+    int was_blocking = libssh2_session_get_blocking(s->session);
+
+    /* offset must be strictly greater than the current size so we do
+     * not overwrite anything */
+    assert(offset > 0 && offset > s->attrs.filesize);
+
+    libssh2_session_set_blocking(s->session, 1);
+
+    libssh2_sftp_seek64(s->sftp_handle, offset - 1);
+    ret = libssh2_sftp_write(s->sftp_handle, c, 1);
+
+    libssh2_session_set_blocking(s->session, was_blocking);
+
+    if (ret < 0) {
+        sftp_error_setg(errp, s, "Failed to grow file");
+        return -EIO;
+    }
+
+    s->attrs.filesize = offset;
+    return 0;
+}
+
 static QemuOptsList ssh_create_opts = {
    .name = "ssh-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(ssh_create_opts.head),
@ -816,14 +843,13 @@ static QemuOptsList ssh_create_opts = {
    }
 };

-static int ssh_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn ssh_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    int r, ret;
    int64_t total_size = 0;
    QDict *uri_options = NULL;
    BDRVSSHState s;
-    ssize_t r2;
-    char c[1] = { '\0' };

    ssh_state_init(&s);

@ -849,14 +875,10 @@ static int ssh_create(const char *filename, QemuOpts *opts, Error **errp)
    }

    if (total_size > 0) {
-        libssh2_sftp_seek64(s.sftp_handle, total_size-1);
-        r2 = libssh2_sftp_write(s.sftp_handle, c, 1);
-        if (r2 < 0) {
-            sftp_error_setg(errp, &s, "truncate failed");
-            ret = -EINVAL;
+        ret = ssh_grow_file(&s, total_size, errp);
+        if (ret < 0) {
            goto out;
        }
-        s.attrs.filesize = total_size;
    }

    ret = 0;
@ -1198,18 +1220,42 @@ static int64_t ssh_getlength(BlockDriverState *bs)
    return length;
 }

+static int ssh_truncate(BlockDriverState *bs, int64_t offset,
+                        PreallocMode prealloc, Error **errp)
+{
+    BDRVSSHState *s = bs->opaque;
+
+    if (prealloc != PREALLOC_MODE_OFF) {
+        error_setg(errp, "Unsupported preallocation mode '%s'",
+                   PreallocMode_str(prealloc));
+        return -ENOTSUP;
+    }
+
+    if (offset < s->attrs.filesize) {
+        error_setg(errp, "ssh driver does not support shrinking files");
+        return -ENOTSUP;
+    }
+
+    if (offset == s->attrs.filesize) {
+        return 0;
+    }
+
+    return ssh_grow_file(s, offset, errp);
+}
+
 static BlockDriver bdrv_ssh = {
    .format_name                  = "ssh",
    .protocol_name                = "ssh",
    .instance_size                = sizeof(BDRVSSHState),
    .bdrv_parse_filename          = ssh_parse_filename,
    .bdrv_file_open               = ssh_file_open,
-    .bdrv_create                  = ssh_create,
+    .bdrv_co_create_opts          = ssh_co_create_opts,
    .bdrv_close                   = ssh_close,
    .bdrv_has_zero_init           = ssh_has_zero_init,
    .bdrv_co_readv                = ssh_co_readv,
    .bdrv_co_writev               = ssh_co_writev,
    .bdrv_getlength               = ssh_getlength,
+    .bdrv_truncate                = ssh_truncate,
    .bdrv_co_flush_to_disk        = ssh_co_flush,
    .create_opts                  = &ssh_create_opts,
 };
--- a/block/throttle.c
+++ b/block/throttle.c
@ -240,7 +240,7 @@ static BlockDriver bdrv_throttle = {
    .bdrv_reopen_prepare                =   throttle_reopen_prepare,
    .bdrv_reopen_commit                 =   throttle_reopen_commit,
    .bdrv_reopen_abort                  =   throttle_reopen_abort,
-    .bdrv_co_get_block_status           =   bdrv_co_get_block_status_from_file,
+    .bdrv_co_block_status               =   bdrv_co_block_status_from_file,

    .bdrv_co_drain_begin                =   throttle_co_drain_begin,
    .bdrv_co_drain_end                  =   throttle_co_drain_end,
--- a/block/vdi.c
+++ b/block/vdi.c
@ -87,12 +87,18 @@
 #define DEFAULT_CLUSTER_SIZE (1 * MiB)

 #if defined(CONFIG_VDI_DEBUG)
-#define logout(fmt, ...) \
-                fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__)
+#define VDI_DEBUG 1
 #else
-#define logout(fmt, ...) ((void)0)
+#define VDI_DEBUG 0
 #endif

+#define logout(fmt, ...) \
+    do {                                                                \
+        if (VDI_DEBUG) {                                                \
+            fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__); \
+        }                                                               \
+    } while (0)
+
 /* Image signature. */
 #define VDI_SIGNATURE 0xbeda107f

@ -166,8 +172,6 @@ typedef struct {
    uint32_t *bmap;
    /* Size of block (bytes). */
    uint32_t block_size;
-    /* Size of block (sectors). */
-    uint32_t block_sectors;
    /* First sector of block map. */
    uint32_t bmap_sector;
    /* VDI header (converted to host endianness). */
@ -457,7 +461,6 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags,
    bs->total_sectors = header.disk_size / SECTOR_SIZE;

    s->block_size = header.block_size;
-    s->block_sectors = header.block_size / SECTOR_SIZE;
    s->bmap_sector = header.offset_bmap / SECTOR_SIZE;
    s->header = header;

@ -503,33 +506,29 @@ static int vdi_reopen_prepare(BDRVReopenState *state,
    return 0;
 }

-static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn vdi_co_block_status(BlockDriverState *bs,
+                                            bool want_zero,
+                                            int64_t offset, int64_t bytes,
+                                            int64_t *pnum, int64_t *map,
+                                            BlockDriverState **file)
 {
-    /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */
    BDRVVdiState *s = (BDRVVdiState *)bs->opaque;
-    size_t bmap_index = sector_num / s->block_sectors;
-    size_t sector_in_block = sector_num % s->block_sectors;
-    int n_sectors = s->block_sectors - sector_in_block;
+    size_t bmap_index = offset / s->block_size;
+    size_t index_in_block = offset % s->block_size;
    uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]);
-    uint64_t offset;
    int result;

-    logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum);
-    if (n_sectors > nb_sectors) {
-        n_sectors = nb_sectors;
-    }
-    *pnum = n_sectors;
+    logout("%p, %" PRId64 ", %" PRId64 ", %p\n", bs, offset, bytes, pnum);
+    *pnum = MIN(s->block_size - index_in_block, bytes);
    result = VDI_IS_ALLOCATED(bmap_entry);
    if (!result) {
        return 0;
    }

-    offset = s->header.offset_data +
-                              (uint64_t)bmap_entry * s->block_size +
-                              sector_in_block * SECTOR_SIZE;
+    *map = s->header.offset_data + (uint64_t)bmap_entry * s->block_size +
+        index_in_block;
    *file = bs->file->bs;
-    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
+    return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
 }

 static int coroutine_fn
@ -717,7 +716,8 @@ nonallocating_write:
    return ret;
 }

-static int vdi_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn vdi_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    int ret = 0;
    uint64_t bytes = 0;
@ -895,9 +895,9 @@ static BlockDriver bdrv_vdi = {
    .bdrv_close = vdi_close,
    .bdrv_reopen_prepare = vdi_reopen_prepare,
    .bdrv_child_perm          = bdrv_format_default_perms,
-    .bdrv_create = vdi_create,
+    .bdrv_co_create_opts = vdi_co_create_opts,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
-    .bdrv_co_get_block_status = vdi_co_get_block_status,
+    .bdrv_co_block_status = vdi_co_block_status,
    .bdrv_make_empty = vdi_make_empty,

    .bdrv_co_preadv     = vdi_co_preadv,
--- a/block/vhdx.c
+++ b/block/vhdx.c
@ -1792,7 +1792,8 @@ exit:
 *    .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------.
 *   1MB
 */
-static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn vhdx_co_create_opts(const char *filename, QemuOpts *opts,
+                                            Error **errp)
 {
    int ret = 0;
    uint64_t image_size = (uint64_t) 2 * GiB;
@ -2003,7 +2004,7 @@ static BlockDriver bdrv_vhdx = {
    .bdrv_child_perm        = bdrv_format_default_perms,
    .bdrv_co_readv          = vhdx_co_readv,
    .bdrv_co_writev         = vhdx_co_writev,
-    .bdrv_create            = vhdx_create,
+    .bdrv_co_create_opts    = vhdx_co_create_opts,
    .bdrv_get_info          = vhdx_get_info,
    .bdrv_check             = vhdx_check,
    .bdrv_has_zero_init     = bdrv_has_zero_init_1,
--- a/block/vmdk.c
+++ b/block/vmdk.c
@ -1304,33 +1304,27 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent,
    return extent_relative_offset % cluster_size;
 }

-static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent,
-                                                  int64_t sector_num)
-{
-    uint64_t offset;
-    offset = vmdk_find_offset_in_cluster(extent, sector_num * BDRV_SECTOR_SIZE);
-    return offset / BDRV_SECTOR_SIZE;
-}
-
-static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn vmdk_co_block_status(BlockDriverState *bs,
+                                             bool want_zero,
+                                             int64_t offset, int64_t bytes,
+                                             int64_t *pnum, int64_t *map,
+                                             BlockDriverState **file)
 {
    BDRVVmdkState *s = bs->opaque;
    int64_t index_in_cluster, n, ret;
-    uint64_t offset;
+    uint64_t cluster_offset;
    VmdkExtent *extent;

-    extent = find_extent(s, sector_num, NULL);
+    extent = find_extent(s, offset >> BDRV_SECTOR_BITS, NULL);
    if (!extent) {
-        return 0;
+        return -EIO;
    }
    qemu_co_mutex_lock(&s->lock);
-    ret = get_cluster_offset(bs, extent, NULL,
-                             sector_num * 512, false, &offset,
+    ret = get_cluster_offset(bs, extent, NULL, offset, false, &cluster_offset,
                             0, 0);
    qemu_co_mutex_unlock(&s->lock);

-    index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num);
+    index_in_cluster = vmdk_find_offset_in_cluster(extent, offset);
    switch (ret) {
    case VMDK_ERROR:
        ret = -EIO;
@ -1345,18 +1339,14 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs,
        ret = BDRV_BLOCK_DATA;
        if (!extent->compressed) {
            ret |= BDRV_BLOCK_OFFSET_VALID;
-            ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS))
-                    & BDRV_BLOCK_OFFSET_MASK;
+            *map = cluster_offset + index_in_cluster;
        }
        *file = extent->file->bs;
        break;
    }

-    n = extent->cluster_sectors - index_in_cluster;
-    if (n > nb_sectors) {
-        n = nb_sectors;
-    }
-    *pnum = n;
+    n = extent->cluster_sectors * BDRV_SECTOR_SIZE - index_in_cluster;
+    *pnum = MIN(n, bytes);
    return ret;
 }

@ -1892,7 +1882,8 @@ static int filename_decompose(const char *filename, char *path, char *prefix,
    return VMDK_OK;
 }

-static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn vmdk_co_create_opts(const char *filename, QemuOpts *opts,
+                                            Error **errp)
 {
    int idx = 0;
    BlockBackend *new_blk = NULL;
@ -2408,9 +2399,9 @@ static BlockDriver bdrv_vmdk = {
    .bdrv_co_pwritev_compressed   = vmdk_co_pwritev_compressed,
    .bdrv_co_pwrite_zeroes        = vmdk_co_pwrite_zeroes,
    .bdrv_close                   = vmdk_close,
-    .bdrv_create                  = vmdk_create,
+    .bdrv_co_create_opts          = vmdk_co_create_opts,
    .bdrv_co_flush_to_disk        = vmdk_co_flush,
-    .bdrv_co_get_block_status     = vmdk_co_get_block_status,
+    .bdrv_co_block_status         = vmdk_co_block_status,
    .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size,
    .bdrv_has_zero_init           = vmdk_has_zero_init,
    .bdrv_get_specific_info       = vmdk_get_specific_info,
--- a/block/vpc.c
+++ b/block/vpc.c
@ -706,53 +706,54 @@ fail:
    return ret;
 }

-static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
+static int coroutine_fn vpc_co_block_status(BlockDriverState *bs,
+                                            bool want_zero,
+                                            int64_t offset, int64_t bytes,
+                                            int64_t *pnum, int64_t *map,
+                                            BlockDriverState **file)
 {
    BDRVVPCState *s = bs->opaque;
    VHDFooter *footer = (VHDFooter*) s->footer_buf;
-    int64_t start, offset;
+    int64_t image_offset;
    bool allocated;
-    int64_t ret;
-    int n;
+    int ret;
+    int64_t n;

    if (be32_to_cpu(footer->type) == VHD_FIXED) {
-        *pnum = nb_sectors;
+        *pnum = bytes;
+        *map = offset;
        *file = bs->file->bs;
-        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
-               (sector_num << BDRV_SECTOR_BITS);
+        return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
    }

    qemu_co_mutex_lock(&s->lock);

-    offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false, NULL);
-    start = offset;
-    allocated = (offset != -1);
+    image_offset = get_image_offset(bs, offset, false, NULL);
+    allocated = (image_offset != -1);
    *pnum = 0;
    ret = 0;

    do {
        /* All sectors in a block are contiguous (without using the bitmap) */
-        n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE)
-          - sector_num;
-        n = MIN(n, nb_sectors);
+        n = ROUND_UP(offset + 1, s->block_size) - offset;
+        n = MIN(n, bytes);

        *pnum += n;
-        sector_num += n;
-        nb_sectors -= n;
+        offset += n;
+        bytes -= n;
        /* *pnum can't be greater than one block for allocated
         * sectors since there is always a bitmap in between. */
        if (allocated) {
            *file = bs->file->bs;
-            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start;
+            *map = image_offset;
+            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
            break;
        }
-        if (nb_sectors == 0) {
+        if (bytes == 0) {
            break;
        }
-        offset = get_image_offset(bs, sector_num << BDRV_SECTOR_BITS, false,
-                                  NULL);
-    } while (offset == -1);
+        image_offset = get_image_offset(bs, offset, false, NULL);
+    } while (image_offset == -1);

    qemu_co_mutex_unlock(&s->lock);
    return ret;
@ -896,7 +897,8 @@ static int create_fixed_disk(BlockBackend *blk, uint8_t *buf,
    return ret;
 }

-static int vpc_create(const char *filename, QemuOpts *opts, Error **errp)
+static int coroutine_fn vpc_co_create_opts(const char *filename, QemuOpts *opts,
+                                           Error **errp)
 {
    uint8_t buf[1024];
    VHDFooter *footer = (VHDFooter *) buf;
@ -1094,11 +1096,11 @@ static BlockDriver bdrv_vpc = {
    .bdrv_close             = vpc_close,
    .bdrv_reopen_prepare    = vpc_reopen_prepare,
    .bdrv_child_perm        = bdrv_format_default_perms,
-    .bdrv_create            = vpc_create,
+    .bdrv_co_create_opts    = vpc_co_create_opts,

    .bdrv_co_preadv             = vpc_co_preadv,
    .bdrv_co_pwritev            = vpc_co_pwritev,
-    .bdrv_co_get_block_status   = vpc_co_get_block_status,
+    .bdrv_co_block_status       = vpc_co_block_status,

    .bdrv_get_info          = vpc_get_info,

--- a/block/vvfat.c
+++ b/block/vvfat.c
@ -3088,15 +3088,13 @@ vvfat_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
    return ret;
 }

-static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *n, BlockDriverState **file)
+static int coroutine_fn vvfat_co_block_status(BlockDriverState *bs,
+                                              bool want_zero, int64_t offset,
+                                              int64_t bytes, int64_t *n,
+                                              int64_t *map,
+                                              BlockDriverState **file)
 {
-    *n = bs->total_sectors - sector_num;
-    if (*n > nb_sectors) {
-        *n = nb_sectors;
-    } else if (*n < 0) {
-        return 0;
-    }
+    *n = bytes;
    return BDRV_BLOCK_DATA;
 }

@ -3257,7 +3255,7 @@ static BlockDriver bdrv_vvfat = {

    .bdrv_co_preadv         = vvfat_co_preadv,
    .bdrv_co_pwritev        = vvfat_co_pwritev,
-    .bdrv_co_get_block_status = vvfat_co_get_block_status,
+    .bdrv_co_block_status   = vvfat_co_block_status,
 };

 static void bdrv_vvfat_init(void)
--- a/docs/interop/qcow2.txt
+++ b/docs/interop/qcow2.txt
@ -426,10 +426,20 @@ Standard Cluster Descriptor:

 Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)):

-    Bit  0 -  x:    Host cluster offset. This is usually _not_ aligned to a
-                    cluster boundary!
+    Bit  0 - x-1:   Host cluster offset. This is usually _not_ aligned to a
+                    cluster or sector boundary!

-       x+1 - 61:    Compressed size of the images in sectors of 512 bytes
+         x - 61:    Number of additional 512-byte sectors used for the
+                    compressed data, beyond the sector containing the offset
+                    in the previous field. Some of these sectors may reside
+                    in the next contiguous host cluster.
+
+                    Note that the compressed data does not necessarily occupy
+                    all of the bytes in the final sector; rather, decompression
+                    stops when it has produced a cluster of data.
+
+                    Another compressed cluster may map to the tail of the final
+                    sector used by this compressed cluster.

 If a cluster is unallocated, read requests shall read the data from the backing
 file (except if bit 0 in the Standard Cluster Descriptor is set). If there is
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@ -1,6 +1,6 @@
 qcow2 L2/refcount cache configuration
 =====================================
-Copyright (C) 2015 Igalia, S.L.
+Copyright (C) 2015, 2018 Igalia, S.L.
 Author: Alberto Garcia <berto@igalia.com>

 This work is licensed under the terms of the GNU GPL, version 2 or
@ -118,8 +118,8 @@ There are three options available, and all of them take bytes:

 There are two things that need to be taken into account:

- - Both caches must have a size that is a multiple of the cluster
-   size.
+ - Both caches must have a size that is a multiple of the cluster size
+   (or the cache entry size: see "Using smaller cache sizes" below).

 - If you only set one of the options above, QEMU will automatically
   adjust the others so that the L2 cache is 4 times bigger than the
@ -143,6 +143,46 @@ much less often than the L2 cache, so it's perfectly reasonable to
 keep it small.


+Using smaller cache entries
+---------------------------
+The qcow2 L2 cache stores complete tables by default. This means that
+if QEMU needs an entry from an L2 table then the whole table is read
+from disk and is kept in the cache. If the cache is full then a
+complete table needs to be evicted first.
+
+This can be inefficient with large cluster sizes since it results in
+more disk I/O and wastes more cache memory.
+
+Since QEMU 2.12 you can change the size of the L2 cache entry and make
+it smaller than the cluster size. This can be configured using the
+"l2-cache-entry-size" parameter:
+
+   -drive file=hd.qcow2,l2-cache-size=2097152,l2-cache-entry-size=4096
+
+Some things to take into account:
+
+ - The L2 cache entry size has the same restrictions as the cluster
+   size (power of two, at least 512 bytes).
+
+ - Smaller entry sizes generally improve the cache efficiency and make
+   disk I/O faster. This is particularly true with solid state drives
+   so it's a good idea to reduce the entry size in those cases. With
+   rotating hard drives the situation is a bit more complicated so you
+   should test it first and stay with the default size if unsure.
+
+ - Try different entry sizes to see which one gives faster performance
+   in your case. The block size of the host filesystem is generally a
+   good default (usually 4096 bytes in the case of ext4).
+
+ - Only the L2 cache can be configured this way. The refcount cache
+   always uses the cluster size as the entry size.
+
+ - If the L2 cache is big enough to hold all of the image's L2 tables
+   (as explained in the "Choosing the right cache sizes" section
+   earlier in this document) then none of this is necessary and you
+   can omit the "l2-cache-entry-size" parameter altogether.
+
+
 Reducing the memory usage
 -------------------------
 It is possible to clean unused cache entries in order to reduce the
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@ -1087,15 +1087,7 @@ static void ide_flush_cache(IDEState *s)
    s->status |= BUSY_STAT;
    ide_set_retry(s);
    block_acct_start(blk_get_stats(s->blk), &s->acct, 0, BLOCK_ACCT_FLUSH);
-
-    if (blk_bs(s->blk)) {
-        s->pio_aiocb = blk_aio_flush(s->blk, ide_flush_cb, s);
-    } else {
-        /* XXX blk_aio_flush() crashes when blk_bs(blk) is NULL, remove this
-         * temporary workaround when blk_aio_*() functions handle NULL blk_bs.
-         */
-        ide_flush_cb(s, 0);
-    }
+    s->pio_aiocb = blk_aio_flush(s->blk, ide_flush_cb, s);
 }

 static void ide_cfata_metadata_inquiry(IDEState *s)
--- a/include/block/aio-wait.h
+++ b/include/block/aio-wait.h
@ -0,0 +1,116 @@
+/*
+ * AioContext wait support
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_AIO_WAIT_H
+#define QEMU_AIO_WAIT_H
+
+#include "block/aio.h"
+
+/**
+ * AioWait:
+ *
+ * An object that facilitates synchronous waiting on a condition.  The main
+ * loop can wait on an operation running in an IOThread as follows:
+ *
+ *   AioWait *wait = ...;
+ *   AioContext *ctx = ...;
+ *   MyWork work = { .done = false };
+ *   schedule_my_work_in_iothread(ctx, &work);
+ *   AIO_WAIT_WHILE(wait, ctx, !work.done);
+ *
+ * The IOThread must call aio_wait_kick() to notify the main loop when
+ * work.done changes:
+ *
+ *   static void do_work(...)
+ *   {
+ *       ...
+ *       work.done = true;
+ *       aio_wait_kick(wait);
+ *   }
+ */
+typedef struct {
+    /* Is the main loop waiting for a kick?  Accessed with atomic ops. */
+    bool need_kick;
+} AioWait;
+
+/**
+ * AIO_WAIT_WHILE:
+ * @wait: the aio wait object
+ * @ctx: the aio context
+ * @cond: wait while this conditional expression is true
+ *
+ * Wait while a condition is true.  Use this to implement synchronous
+ * operations that require event loop activity.
+ *
+ * The caller must be sure that something calls aio_wait_kick() when the value
+ * of @cond might have changed.
+ *
+ * The caller's thread must be the IOThread that owns @ctx or the main loop
+ * thread (with @ctx acquired exactly once).  This function cannot be used to
+ * wait on conditions between two IOThreads since that could lead to deadlock,
+ * go via the main loop instead.
+ */
+#define AIO_WAIT_WHILE(wait, ctx, cond) ({                  \
+    bool waited_ = false;                                   \
+    bool busy_ = true;                                      \
+    AioWait *wait_ = (wait);                                \
+    AioContext *ctx_ = (ctx);                               \
+    if (in_aio_context_home_thread(ctx_)) {                 \
+        while ((cond) || busy_) {                           \
+            busy_ = aio_poll(ctx_, (cond));                 \
+            waited_ |= !!(cond) | busy_;                    \
+        }                                                   \
+    } else {                                                \
+        assert(qemu_get_current_aio_context() ==            \
+               qemu_get_aio_context());                     \
+        assert(!wait_->need_kick);                          \
+        /* Set wait_->need_kick before evaluating cond.  */ \
+        atomic_mb_set(&wait_->need_kick, true);             \
+        while (busy_) {                                     \
+            if ((cond)) {                                   \
+                waited_ = busy_ = true;                     \
+                aio_context_release(ctx_);                  \
+                aio_poll(qemu_get_aio_context(), true);     \
+                aio_context_acquire(ctx_);                  \
+            } else {                                        \
+                busy_ = aio_poll(ctx_, false);              \
+                waited_ |= busy_;                           \
+            }                                               \
+        }                                                   \
+        atomic_set(&wait_->need_kick, false);               \
+    }                                                       \
+    waited_; })
+
+/**
+ * aio_wait_kick:
+ * @wait: the aio wait object that should re-evaluate its condition
+ *
+ * Wake up the main thread if it is waiting on AIO_WAIT_WHILE().  During
+ * synchronous operations performed in an IOThread, the main thread lets the
+ * IOThread's event loop run, waiting for the operation to complete.  A
+ * aio_wait_kick() call will wake up the main thread.
+ */
+void aio_wait_kick(AioWait *wait);
+
+#endif /* QEMU_AIO_WAIT */
--- a/include/block/aio.h
+++ b/include/block/aio.h
@ -534,11 +534,14 @@ void aio_co_enter(AioContext *ctx, struct Coroutine *co);
 AioContext *qemu_get_current_aio_context(void);

 /**
+ * in_aio_context_home_thread:
 * @ctx: the aio context
 *
- * Return whether we are running in the I/O thread that manages @ctx.
+ * Return whether we are running in the thread that normally runs @ctx.  Note
+ * that acquiring/releasing ctx does not affect the outcome, each AioContext
+ * still only has one home thread that is responsible for running it.
 */
-static inline bool aio_context_in_iothread(AioContext *ctx)
+static inline bool in_aio_context_home_thread(AioContext *ctx)
 {
    return ctx == qemu_get_current_aio_context();
 }
--- a/include/block/block.h
+++ b/include/block/block.h
@ -3,6 +3,7 @@

 #include "block/aio.h"
 #include "qapi/qapi-types-block-core.h"
+#include "block/aio-wait.h"
 #include "qemu/iov.h"
 #include "qemu/coroutine.h"
 #include "block/accounting.h"
@ -115,19 +116,19 @@ typedef struct HDGeometry {
 * BDRV_BLOCK_ZERO: offset reads as zero
 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
- *                       layer (short for DATA || ZERO), set by block layer
- * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this layer
+ *                       layer rather than any backing, set by block layer
+ * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
+ *                 layer, set by block layer
 *
 * Internal flag:
 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
 *                 that the block layer recompute the answer from the returned
 *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
 *
- * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 (BDRV_BLOCK_OFFSET_MASK) of
- * the return value (old interface) or the entire map parameter (new
- * interface) represent the offset in the returned BDS that is allocated for
- * the corresponding raw data.  However, whether that offset actually
- * contains data also depends on BDRV_BLOCK_DATA, as follows:
+ * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
+ * host offset within the returned BDS that is allocated for the
+ * corresponding raw guest data.  However, whether that offset
+ * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
 *
 * DATA ZERO OFFSET_VALID
 *  t    t        t       sectors read as zero, returned file is zero at offset
@ -367,41 +368,14 @@ void bdrv_drain_all_begin(void);
 void bdrv_drain_all_end(void);
 void bdrv_drain_all(void);

+/* Returns NULL when bs == NULL */
+AioWait *bdrv_get_aio_wait(BlockDriverState *bs);
+
 #define BDRV_POLL_WHILE(bs, cond) ({                       \
-    bool waited_ = false;                                  \
-    bool busy_ = true;                                     \
    BlockDriverState *bs_ = (bs);                          \
-    AioContext *ctx_ = bdrv_get_aio_context(bs_);          \
-    if (aio_context_in_iothread(ctx_)) {                   \
-        while ((cond) || busy_) {                          \
-            busy_ = aio_poll(ctx_, (cond));                \
-            waited_ |= !!(cond) | busy_;                   \
-        }                                                  \
-    } else {                                               \
-        assert(qemu_get_current_aio_context() ==           \
-               qemu_get_aio_context());                    \
-        /* Ask bdrv_dec_in_flight to wake up the main      \
-         * QEMU AioContext.  Extra I/O threads never take  \
-         * other I/O threads' AioContexts (see for example \
-         * block_job_defer_to_main_loop for how to do it). \
-         */                                                \
-        assert(!bs_->wakeup);                              \
-        /* Set bs->wakeup before evaluating cond.  */      \
-        atomic_mb_set(&bs_->wakeup, true);                 \
-        while (busy_) {                                    \
-            if ((cond)) {                                  \
-                waited_ = busy_ = true;                    \
-                aio_context_release(ctx_);                 \
-                aio_poll(qemu_get_aio_context(), true);    \
-                aio_context_acquire(ctx_);                 \
-            } else {                                       \
-                busy_ = aio_poll(ctx_, false);             \
-                waited_ |= busy_;                          \
-            }                                              \
-        }                                                  \
-        atomic_set(&bs_->wakeup, false);                   \
-    }                                                      \
-    waited_; })
+    AIO_WAIT_WHILE(bdrv_get_aio_wait(bs_),                 \
+                   bdrv_get_aio_context(bs_),              \
+                   cond); })

 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes);
 int bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset, int bytes);
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@ -26,6 +26,7 @@

 #include "block/accounting.h"
 #include "block/block.h"
+#include "block/aio-wait.h"
 #include "qemu/queue.h"
 #include "qemu/coroutine.h"
 #include "qemu/stats64.h"
@ -128,7 +129,8 @@ struct BlockDriver {
    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp);
    void (*bdrv_close)(BlockDriverState *bs);
-    int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
+    int coroutine_fn (*bdrv_co_create_opts)(const char *filename, QemuOpts *opts,
+                                       Error **errp);
    int (*bdrv_make_empty)(BlockDriverState *bs);

    void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
@ -202,15 +204,22 @@ struct BlockDriver {
    /*
     * Building block for bdrv_block_status[_above] and
     * bdrv_is_allocated[_above].  The driver should answer only
-     * according to the current layer, and should not set
-     * BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW.  See block.h
-     * for the meaning of _DATA, _ZERO, and _OFFSET_VALID.  The block
-     * layer guarantees input aligned to request_alignment, as well as
-     * non-NULL pnum and file.
+     * according to the current layer, and should only need to set
+     * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID,
+     * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing
+     * layer, the result should be 0 (and not BDRV_BLOCK_ZERO).  See
+     * block.h for the overall meaning of the bits.  As a hint, the
+     * flag want_zero is true if the caller cares more about precise
+     * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for
+     * overall allocation (favor larger *pnum, perhaps by reporting
+     * _DATA instead of _ZERO).  The block layer guarantees input
+     * clamped to bdrv_getlength() and aligned to request_alignment,
+     * as well as non-NULL pnum, map, and file; in turn, the driver
+     * must return an error or set pnum to an aligned non-zero value.
     */
-    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors, int *pnum,
-        BlockDriverState **file);
+    int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs,
+        bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum,
+        int64_t *map, BlockDriverState **file);

    /*
     * Invalidate any cached meta-data.
@ -709,10 +718,8 @@ struct BlockDriverState {
    unsigned int in_flight;
    unsigned int serialising_in_flight;

-    /* Internal to BDRV_POLL_WHILE and bdrv_wakeup.  Accessed with atomic
-     * ops.
-     */
-    bool wakeup;
+    /* Kicked to signal main loop when a request completes. */
+    AioWait wait;

    /* counter for nested bdrv_io_plug.
     * Accessed with atomic ops.
@ -1031,23 +1038,27 @@ void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
                               uint64_t *nperm, uint64_t *nshared);

 /*
- * Default implementation for drivers to pass bdrv_co_get_block_status() to
+ * Default implementation for drivers to pass bdrv_co_block_status() to
 * their file.
 */
-int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
-                                                        int64_t sector_num,
-                                                        int nb_sectors,
-                                                        int *pnum,
-                                                        BlockDriverState **file);
+int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
+                                                bool want_zero,
+                                                int64_t offset,
+                                                int64_t bytes,
+                                                int64_t *pnum,
+                                                int64_t *map,
+                                                BlockDriverState **file);
 /*
- * Default implementation for drivers to pass bdrv_co_get_block_status() to
+ * Default implementation for drivers to pass bdrv_co_block_status() to
 * their backing file.
 */
-int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
-                                                           int64_t sector_num,
-                                                           int nb_sectors,
-                                                           int *pnum,
-                                                           BlockDriverState **file);
+int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
+                                                   bool want_zero,
+                                                   int64_t offset,
+                                                   int64_t bytes,
+                                                   int64_t *pnum,
+                                                   int64_t *map,
+                                                   BlockDriverState **file);
 const char *bdrv_get_parent_name(const BlockDriverState *bs);
 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
 bool blk_dev_has_removable_media(BlockBackend *blk);
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@ -3676,7 +3676,8 @@
 #
 # @node-name: node name. Note that errors may be reported for the root node
 #             that is directly attached to a guest device rather than for the
-#             node where the error occurred. (Since: 2.8)
+#             node where the error occurred. The node name is not present if
+#             the drive is empty. (Since: 2.8)
 #
 # @operation: I/O operation
 #
@ -3707,7 +3708,8 @@
 #
 ##
 { 'event': 'BLOCK_IO_ERROR',
-  'data': { 'device': 'str', 'node-name': 'str', 'operation': 'IoOperationType',
+  'data': { 'device': 'str', '*node-name': 'str',
+            'operation': 'IoOperationType',
            'action': 'BlockErrorAction', '*nospace': 'bool',
            'reason': 'str' } }

--- a/qemu-img.c
+++ b/qemu-img.c
@ -3469,7 +3469,7 @@ static int img_resize(int argc, char **argv)
        }
    }
    if (optind != argc - 1) {
-        error_exit("Expecting one image file name");
+        error_exit("Expecting image file name and size");
    }
    filename = argv[optind++];

--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@ -92,6 +92,7 @@ gcov-files-test-hbitmap-y = blockjob.c
 check-unit-y += tests/test-bdrv-drain$(EXESUF)
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
+check-unit-y += tests/test-block-backend$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
 # all code tested by test-x86-cpuid is inside topology.h
 gcov-files-test-x86-cpuid-y =
@ -622,6 +623,7 @@ tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-bdrv-drain$(EXESUF): tests/test-bdrv-drain.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
+tests/test-block-backend$(EXESUF): tests/test-block-backend.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
 tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y)
 tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y) $(test-crypto-obj-y)
--- a/tests/qemu-iotests/033
+++ b/tests/qemu-iotests/033
@ -64,6 +64,9 @@ do_test()
 	} | $QEMU_IO $IO_EXTRA_ARGS
 }

+echo
+echo "=== Test aligned and misaligned write zeroes operations ==="
+
 for write_zero_cmd in "write -z" "aio_write -z"; do
 for align in 512 4k; do
 	echo
@ -102,7 +105,33 @@ for align in 512 4k; do
 done
 done

+
+# Trigger truncate that would shrink qcow2 L1 table, which is done by
+#   clearing one entry (8 bytes) with bdrv_co_pwrite_zeroes()
+
+echo
+echo "=== Test misaligned write zeroes via truncate ==="
+echo
+
+# any size will do, but the smaller the size the smaller the required image
+CLUSTER_SIZE=$((4 * 1024))
+L2_COVERAGE=$(($CLUSTER_SIZE * $CLUSTER_SIZE / 8))
+_make_test_img $(($L2_COVERAGE * 2))
+
+do_test 512 "write -P 1 0 0x200" "$TEST_IMG" | _filter_qemu_io
+# next L2 table
+do_test 512 "write -P 1 $L2_COVERAGE 0x200" "$TEST_IMG" | _filter_qemu_io
+
+# only interested in qcow2 here; also other formats might respond with
+#  "not supported" error message
+if [ $IMGFMT = "qcow2" ]; then
+    do_test 512 "truncate $L2_COVERAGE" "$TEST_IMG" | _filter_qemu_io
+fi
+
+do_test 512 "read -P 1 0 0x200" "$TEST_IMG" | _filter_qemu_io
+
 # success, all done
+echo
 echo "*** done"
 rm -f $seq.full
 status=0
--- a/tests/qemu-iotests/033.out
+++ b/tests/qemu-iotests/033.out
@ -1,6 +1,8 @@
 QA output created by 033
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728

+=== Test aligned and misaligned write zeroes operations ===
+
 == preparing image ==
 wrote 1024/1024 bytes at offset 512
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
@ -164,4 +166,15 @@ read 512/512 bytes at offset 512
 read 3072/3072 bytes at offset 1024
 3 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

+
+=== Test misaligned write zeroes via truncate ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=4194304
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 512/512 bytes at offset 2097152
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
 *** done
--- a/tests/test-block-backend.c
+++ b/tests/test-block-backend.c
@ -0,0 +1,82 @@
+/*
+ * BlockBackend tests
+ *
+ * Copyright (c) 2017 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+
+static void test_drain_aio_error_flush_cb(void *opaque, int ret)
+{
+    bool *completed = opaque;
+
+    g_assert(ret == -ENOMEDIUM);
+    *completed = true;
+}
+
+static void test_drain_aio_error(void)
+{
+    BlockBackend *blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    BlockAIOCB *acb;
+    bool completed = false;
+
+    acb = blk_aio_flush(blk, test_drain_aio_error_flush_cb, &completed);
+    g_assert(acb != NULL);
+    g_assert(completed == false);
+
+    blk_drain(blk);
+    g_assert(completed == true);
+
+    blk_unref(blk);
+}
+
+static void test_drain_all_aio_error(void)
+{
+    BlockBackend *blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+    BlockAIOCB *acb;
+    bool completed = false;
+
+    acb = blk_aio_flush(blk, test_drain_aio_error_flush_cb, &completed);
+    g_assert(acb != NULL);
+    g_assert(completed == false);
+
+    blk_drain_all();
+    g_assert(completed == true);
+
+    blk_unref(blk);
+}
+
+int main(int argc, char **argv)
+{
+    bdrv_init();
+    qemu_init_main_loop(&error_abort);
+
+    g_test_init(&argc, &argv, NULL);
+
+    g_test_add_func("/block-backend/drain_aio_error", test_drain_aio_error);
+    g_test_add_func("/block-backend/drain_all_aio_error",
+                    test_drain_all_aio_error);
+
+    return g_test_run();
+}
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@ -1,7 +1,7 @@
 util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
 util-obj-y += bufferiszero.o
 util-obj-y += lockcnt.o
-util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
+util-obj-y += aiocb.o async.o aio-wait.o thread-pool.o qemu-timer.o
 util-obj-y += main-loop.o iohandler.o
 util-obj-$(CONFIG_POSIX) += aio-posix.o
 util-obj-$(CONFIG_POSIX) += compatfd.o
--- a/util/aio-wait.c
+++ b/util/aio-wait.c
@ -0,0 +1,40 @@
+/*
+ * AioContext wait support
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "block/aio-wait.h"
+
+static void dummy_bh_cb(void *opaque)
+{
+    /* The point is to make AIO_WAIT_WHILE()'s aio_poll() return */
+}
+
+void aio_wait_kick(AioWait *wait)
+{
+    /* The barrier (or an atomic op) is in the caller.  */
+    if (atomic_read(&wait->need_kick)) {
+        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+    }
+}