From 8d2497c3552e19a60e7a75d20976471ecb2a8e2b Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Mon, 14 Jan 2013 17:31:31 +0100 Subject: [PATCH 01/15] qcow2: Fix segfault on zero-length write One of the recent refactoring patches (commit f50f88b9) didn't take care to initialise l2meta properly, so with zero-length writes, which don't even enter the write loop, qemu just segfaulted. Signed-off-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/qcow2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/qcow2.c b/block/qcow2.c index d603f98a9c..f6abff6111 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -759,7 +759,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, QEMUIOVector hd_qiov; uint64_t bytes_done = 0; uint8_t *cluster_data = NULL; - QCowL2Meta *l2meta; + QCowL2Meta *l2meta = NULL; trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, remaining_sectors); From 029d091e4975af60ff9622717af19c5910f2f4e9 Mon Sep 17 00:00:00 2001 From: Peter Lieven Date: Fri, 11 Jan 2013 13:29:55 +0100 Subject: [PATCH 02/15] block: fix initialization in bdrv_io_limits_enable() bdrv_io_limits_enable() starts a new slice, but does not set io_base correctly for that slice. Here is how io_base is used: bytes_base = bs->nr_bytes[is_write] - bs->io_base.bytes[is_write]; bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; if (bytes_base + bytes_res <= bytes_limit) { /* no wait */ } else { /* operation needs to be throttled */ } As a result, any I/O operations that are triggered between now and bs->slice_end are incorrectly limited. If 10 MB of data has been written since the VM was started, QEMU thinks that 10 MB of data has been written in this slice. This leads to a I/O lockup in the guest. We fix this by delaying the start of a new slice to the next call of bdrv_exceed_io_limits(). Signed-off-by: Peter Lieven Reviewed-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/block.c b/block.c index b5e64ecf11..4a90dd1a78 100644 --- a/block.c +++ b/block.c @@ -155,10 +155,6 @@ void bdrv_io_limits_enable(BlockDriverState *bs) { qemu_co_queue_init(&bs->throttled_reqs); bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); - bs->slice_time = 5 * BLOCK_IO_SLICE_TIME; - bs->slice_start = qemu_get_clock_ns(vm_clock); - bs->slice_end = bs->slice_start + bs->slice_time; - memset(&bs->io_base, 0, sizeof(bs->io_base)); bs->io_limits_enabled = true; } From 3d4fa43e648f3b169e7ab5dd4e21312e510805d7 Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Mon, 14 Jan 2013 16:26:52 +0100 Subject: [PATCH 03/15] raw-posix: support discard on more filesystems Linux 2.6.38 introduced the filesystem independent interface to deallocate part of a file. As of Linux 3.7, btrfs, ext4, ocfs2, tmpfs and xfs support it. Even though the system calls here are in practice issued on Linux, the code is structured to allow plugging in alternatives for other Unix variants. EOPNOTSUPP is used unconditionally in this patch, but it is supported in both OpenBSD and Mac OS X since forever (see for example http://lists.debian.org/debian-glibc/2006/02/msg00337.html). Signed-off-by: Kusanagi Kouichi Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 26 ++++++++++++++++++++++++-- configure | 19 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index c3d7fda7b7..e8d79afe5f 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -59,6 +59,9 @@ #ifdef CONFIG_FIEMAP #include #endif +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE +#include +#endif #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) #include #include @@ -1074,15 +1077,34 @@ static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors) static coroutine_fn int raw_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { -#ifdef CONFIG_XFS + int ret = -EOPNOTSUPP; + +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_XFS) BDRVRawState *s = bs->opaque; +#ifdef CONFIG_XFS if (s->is_xfs) { return xfs_discard(s, sector_num, nb_sectors); } #endif - return 0; +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + do { + if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + sector_num << BDRV_SECTOR_BITS, + (int64_t)nb_sectors << BDRV_SECTOR_BITS) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif +#endif + + if (ret == -EOPNOTSUPP) { + return 0; + } + return ret; } static QEMUOptionParameter raw_create_options[] = { diff --git a/configure b/configure index c908f66583..40d250ccac 100755 --- a/configure +++ b/configure @@ -2581,6 +2581,22 @@ if compile_prog "" "" ; then fallocate=yes fi +# check for fallocate hole punching +fallocate_punch_hole=no +cat > $TMPC << EOF +#include +#include + +int main(void) +{ + fallocate(0, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 0); + return 0; +} +EOF +if compile_prog "" "" ; then + fallocate_punch_hole=yes +fi + # check for sync_file_range sync_file_range=no cat > $TMPC << EOF @@ -3490,6 +3506,9 @@ fi if test "$fallocate" = "yes" ; then echo "CONFIG_FALLOCATE=y" >> $config_host_mak fi +if test "$fallocate_punch_hole" = "yes" ; then + echo "CONFIG_FALLOCATE_PUNCH_HOLE=y" >> $config_host_mak +fi if test "$sync_file_range" = "yes" ; then echo "CONFIG_SYNC_FILE_RANGE=y" >> $config_host_mak fi From c85191e5c9e14d65cc4281ef3b31f480227aa6dd Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Jan 2013 16:26:53 +0100 Subject: [PATCH 04/15] raw-posix: remember whether discard failed Avoid sending system calls repeatedly if they shall fail. This does not apply to XFS: if the filesystem-specific ioctl fails, something weird is happening. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/raw-posix.c b/block/raw-posix.c index e8d79afe5f..b647cfbb2f 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -141,6 +141,7 @@ typedef struct BDRVRawState { #ifdef CONFIG_XFS bool is_xfs : 1; #endif + bool has_discard : 1; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -292,6 +293,7 @@ static int raw_open_common(BlockDriverState *bs, const char *filename, } #endif + s->has_discard = 1; #ifdef CONFIG_XFS if (platform_test_xfs_fd(s->fd)) { s->is_xfs = 1; @@ -1078,10 +1080,12 @@ static coroutine_fn int raw_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { int ret = -EOPNOTSUPP; - -#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_XFS) BDRVRawState *s = bs->opaque; + if (!s->has_discard) { + return 0; + } + #ifdef CONFIG_XFS if (s->is_xfs) { return xfs_discard(s, sector_num, nb_sectors); @@ -1098,7 +1102,6 @@ static coroutine_fn int raw_co_discard(BlockDriverState *bs, } while (errno == EINTR); ret = -errno; -#endif #endif if (ret == -EOPNOTSUPP) { From fcd9d4555252c47a337357dfce0806e5dde99d96 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Jan 2013 16:26:54 +0100 Subject: [PATCH 05/15] raw: support discard on block devices Block devices use a ioctl instead of fallocate, so add a separate implementation. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block/raw-posix.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/block/raw-posix.c b/block/raw-posix.c index b647cfbb2f..1d32139c9b 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -1345,6 +1345,40 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, return thread_pool_submit_aio(aio_worker, acb, cb, opaque); } +static coroutine_fn int hdev_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + BDRVRawState *s = bs->opaque; + int ret; + + if (s->has_discard == 0) { + return 0; + } + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + ret = -EOPNOTSUPP; +#ifdef BLKDISCARD + do { + uint64_t range[2] = { sector_num * 512, (uint64_t)nb_sectors * 512 }; + if (ioctl(s->fd, BLKDISCARD, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || + ret == -ENOTTY) { + s->has_discard = 0; + ret = 0; + } + return ret; + +} + #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static int fd_open(BlockDriverState *bs) { @@ -1413,6 +1447,8 @@ static BlockDriver bdrv_host_device = { .create_options = raw_create_options, .bdrv_has_zero_init = hdev_has_zero_init, + .bdrv_co_discard = hdev_co_discard, + .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, From 8238010b265886249f9f3d45e890788319b7736e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Jan 2013 16:26:55 +0100 Subject: [PATCH 06/15] block: make discard asynchronous This is easy with the thread pool, because we can use s->is_xfs and s->has_discard from the worker function. QEMU has a widespread assumption that each I/O operation writes less than 2^32 bytes. This patch doesn't fix it throughout of course, but it starts correcting struct RawPosixAIOData so that there is no regression with respect to the synchronous discard implementation. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block/raw-aio.h | 5 +- block/raw-posix.c | 164 ++++++++++++++++++++++++---------------------- 2 files changed, 88 insertions(+), 81 deletions(-) diff --git a/block/raw-aio.h b/block/raw-aio.h index e77f361148..c61f1595d9 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -20,11 +20,14 @@ #define QEMU_AIO_WRITE 0x0002 #define QEMU_AIO_IOCTL 0x0004 #define QEMU_AIO_FLUSH 0x0008 +#define QEMU_AIO_DISCARD 0x0010 #define QEMU_AIO_TYPE_MASK \ - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH) + (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ + QEMU_AIO_DISCARD) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 +#define QEMU_AIO_BLKDEV 0x2000 /* linux-aio.c - Linux native implementation */ diff --git a/block/raw-posix.c b/block/raw-posix.c index 1d32139c9b..679fcc5113 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -163,7 +163,7 @@ typedef struct RawPosixAIOData { void *aio_ioctl_buf; }; int aio_niov; - size_t aio_nbytes; + uint64_t aio_nbytes; #define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ off_t aio_offset; int aio_type; @@ -623,6 +623,72 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) return nbytes; } +#ifdef CONFIG_XFS +static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { + DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} +#endif + +static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) +{ + int ret = -EOPNOTSUPP; + BDRVRawState *s = aiocb->bs->opaque; + + if (s->has_discard == 0) { + return 0; + } + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKDISCARD + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } else { +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + do { + if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } + + if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || + ret == -ENOTTY) { + s->has_discard = 0; + ret = 0; + } + return ret; +} + static int aio_worker(void *arg) { RawPosixAIOData *aiocb = arg; @@ -657,6 +723,9 @@ static int aio_worker(void *arg) case QEMU_AIO_IOCTL: ret = handle_aiocb_ioctl(aiocb); break; + case QEMU_AIO_DISCARD: + ret = handle_aiocb_discard(aiocb); + break; default: fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); ret = -EINVAL; @@ -1057,57 +1126,14 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, } } -#ifdef CONFIG_XFS -static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors) +static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { - struct xfs_flock64 fl; - - memset(&fl, 0, sizeof(fl)); - fl.l_whence = SEEK_SET; - fl.l_start = sector_num << 9; - fl.l_len = (int64_t)nb_sectors << 9; - - if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { - DEBUG_BLOCK_PRINT("cannot punch hole (%s)\n", strerror(errno)); - return -errno; - } - - return 0; -} -#endif - -static coroutine_fn int raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - int ret = -EOPNOTSUPP; BDRVRawState *s = bs->opaque; - if (!s->has_discard) { - return 0; - } - -#ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_discard(s, sector_num, nb_sectors); - } -#endif - -#ifdef CONFIG_FALLOCATE_PUNCH_HOLE - do { - if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - sector_num << BDRV_SECTOR_BITS, - (int64_t)nb_sectors << BDRV_SECTOR_BITS) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = -errno; -#endif - - if (ret == -EOPNOTSUPP) { - return 0; - } - return ret; + return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + cb, opaque, QEMU_AIO_DISCARD); } static QEMUOptionParameter raw_create_options[] = { @@ -1130,12 +1156,12 @@ static BlockDriver bdrv_file = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_close = raw_close, .bdrv_create = raw_create, - .bdrv_co_discard = raw_co_discard, .bdrv_co_is_allocated = raw_co_is_allocated, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_aio_discard = raw_aio_discard, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, @@ -1345,38 +1371,17 @@ static BlockDriverAIOCB *hdev_aio_ioctl(BlockDriverState *bs, return thread_pool_submit_aio(aio_worker, acb, cb, opaque); } -static coroutine_fn int hdev_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockDriverCompletionFunc *cb, void *opaque) { BDRVRawState *s = bs->opaque; - int ret; - if (s->has_discard == 0) { - return 0; + if (fd_open(bs) < 0) { + return NULL; } - ret = fd_open(bs); - if (ret < 0) { - return ret; - } - - ret = -EOPNOTSUPP; -#ifdef BLKDISCARD - do { - uint64_t range[2] = { sector_num * 512, (uint64_t)nb_sectors * 512 }; - if (ioctl(s->fd, BLKDISCARD, range) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = -errno; -#endif - if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || - ret == -ENOTTY) { - s->has_discard = 0; - ret = 0; - } - return ret; - + return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } #elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) @@ -1447,11 +1452,10 @@ static BlockDriver bdrv_host_device = { .create_options = raw_create_options, .bdrv_has_zero_init = hdev_has_zero_init, - .bdrv_co_discard = hdev_co_discard, - .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_aio_discard = hdev_aio_discard, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, From 80bc2e8d807939bee89d1a5ca0dbe89946d39ed1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Jan 2013 16:26:56 +0100 Subject: [PATCH 07/15] ide: fix TRIM with empty range entry ATA-ACS-3 says "If the two byte range length is zero, then the LBA Range Entry shall be discarded as padding." iovecs are used as if they are linearized, so it is incorrect to discard the rest of this iovec. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- hw/ide/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index 6f1938a0a8..cb77dfcd4e 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -374,7 +374,7 @@ BlockDriverAIOCB *ide_issue_trim(BlockDriverState *bs, uint16_t count = entry >> 48; if (count == 0) { - break; + continue; } ret = bdrv_discard(bs, sector, count); From 501378c3af16e8e83a9dd500c11e594f4d1dbe79 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Jan 2013 16:26:57 +0100 Subject: [PATCH 08/15] ide: issue discard asynchronously but serialize the pieces Now that discard can take a long time, make it asynchronous. Each LBA range entry is processed separately because discard can be an expensive operation. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- hw/ide/core.c | 79 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index cb77dfcd4e..14ad0799c3 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -325,14 +325,26 @@ typedef struct TrimAIOCB { BlockDriverAIOCB common; QEMUBH *bh; int ret; + QEMUIOVector *qiov; + BlockDriverAIOCB *aiocb; + int i, j; } TrimAIOCB; static void trim_aio_cancel(BlockDriverAIOCB *acb) { TrimAIOCB *iocb = container_of(acb, TrimAIOCB, common); + /* Exit the loop in case bdrv_aio_cancel calls ide_issue_trim_cb again. */ + iocb->j = iocb->qiov->niov - 1; + iocb->i = (iocb->qiov->iov[iocb->j].iov_len / 8) - 1; + + /* Tell ide_issue_trim_cb not to trigger the completion, too. */ qemu_bh_delete(iocb->bh); iocb->bh = NULL; + + if (iocb->aiocb) { + bdrv_aio_cancel(iocb->aiocb); + } qemu_aio_release(iocb); } @@ -349,43 +361,60 @@ static void ide_trim_bh_cb(void *opaque) qemu_bh_delete(iocb->bh); iocb->bh = NULL; - qemu_aio_release(iocb); } +static void ide_issue_trim_cb(void *opaque, int ret) +{ + TrimAIOCB *iocb = opaque; + if (ret >= 0) { + while (iocb->j < iocb->qiov->niov) { + int j = iocb->j; + while (++iocb->i < iocb->qiov->iov[j].iov_len / 8) { + int i = iocb->i; + uint64_t *buffer = iocb->qiov->iov[j].iov_base; + + /* 6-byte LBA + 2-byte range per entry */ + uint64_t entry = le64_to_cpu(buffer[i]); + uint64_t sector = entry & 0x0000ffffffffffffULL; + uint16_t count = entry >> 48; + + if (count == 0) { + continue; + } + + /* Got an entry! Submit and exit. */ + iocb->aiocb = bdrv_aio_discard(iocb->common.bs, sector, count, + ide_issue_trim_cb, opaque); + return; + } + + iocb->j++; + iocb->i = -1; + } + } else { + iocb->ret = ret; + } + + iocb->aiocb = NULL; + if (iocb->bh) { + qemu_bh_schedule(iocb->bh); + } +} + BlockDriverAIOCB *ide_issue_trim(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { TrimAIOCB *iocb; - int i, j, ret; iocb = qemu_aio_get(&trim_aiocb_info, bs, cb, opaque); iocb->bh = qemu_bh_new(ide_trim_bh_cb, iocb); iocb->ret = 0; - - for (j = 0; j < qiov->niov; j++) { - uint64_t *buffer = qiov->iov[j].iov_base; - - for (i = 0; i < qiov->iov[j].iov_len / 8; i++) { - /* 6-byte LBA + 2-byte range per entry */ - uint64_t entry = le64_to_cpu(buffer[i]); - uint64_t sector = entry & 0x0000ffffffffffffULL; - uint16_t count = entry >> 48; - - if (count == 0) { - continue; - } - - ret = bdrv_discard(bs, sector, count); - if (!iocb->ret) { - iocb->ret = ret; - } - } - } - - qemu_bh_schedule(iocb->bh); - + iocb->qiov = qiov; + iocb->i = -1; + iocb->j = 0; + ide_issue_trim_cb(iocb, 0); return &iocb->common; } From df702c9b4c1d049b12d7cf2f2ee607ff32f766cb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 14 Jan 2013 16:26:58 +0100 Subject: [PATCH 09/15] block: clear dirty bitmap when discarding Note that resetting bits in the dirty bitmap is done _before_ actually processing the request. Writes, instead, set bits after the request is completed. This way, when there are concurrent write and discard requests, the outcome will always be that the blocks are marked dirty. This scenario should never happen, but it is safer to do it this way. Signed-off-by: Paolo Bonzini Signed-off-by: Stefan Hajnoczi --- block.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 4a90dd1a78..6fa7c90144 100644 --- a/block.c +++ b/block.c @@ -4170,7 +4170,13 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return -EIO; } else if (bs->read_only) { return -EROFS; - } else if (bs->drv->bdrv_co_discard) { + } + + if (bs->dirty_bitmap) { + set_dirty_bitmap(bs, sector_num, nb_sectors, 0); + } + + if (bs->drv->bdrv_co_discard) { return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors); } else if (bs->drv->bdrv_aio_discard) { BlockDriverAIOCB *acb; From 477830727821e4bc337f4ac1fd222ffe0b900e1a Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Tue, 15 Jan 2013 16:28:55 +0800 Subject: [PATCH 10/15] sheepdog: multiplex the rw FD to flush cache This will reduce sockfds connected to the sheep server to one, which simply the future hacks. Cc: MORITA Kazutaka Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 82 ++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 462c4b2d5d..04661da2dd 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -266,6 +266,7 @@ typedef struct AIOReq { enum AIOCBState { AIOCB_WRITE_UDATA, AIOCB_READ_UDATA, + AIOCB_FLUSH_CACHE, }; struct SheepdogAIOCB { @@ -299,7 +300,6 @@ typedef struct BDRVSheepdogState { char *addr; char *port; int fd; - int flush_fd; CoMutex lock; Coroutine *co_send; @@ -736,6 +736,13 @@ static void coroutine_fn aio_read_response(void *opaque) goto out; } break; + case AIOCB_FLUSH_CACHE: + if (rsp.result == SD_RES_INVALID_PARMS) { + dprintf("disable cache since the server doesn't support it\n"); + s->cache_flags = SD_FLAG_CMD_DIRECT; + rsp.result = SD_RES_SUCCESS; + } + break; } if (rsp.result != SD_RES_SUCCESS) { @@ -950,7 +957,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, { int nr_copies = s->inode.nr_copies; SheepdogObjReq hdr; - unsigned int wlen; + unsigned int wlen = 0; int ret; uint64_t oid = aio_req->oid; unsigned int datalen = aio_req->data_len; @@ -964,18 +971,23 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, memset(&hdr, 0, sizeof(hdr)); - if (aiocb_type == AIOCB_READ_UDATA) { - wlen = 0; + switch (aiocb_type) { + case AIOCB_FLUSH_CACHE: + hdr.opcode = SD_OP_FLUSH_VDI; + break; + case AIOCB_READ_UDATA: hdr.opcode = SD_OP_READ_OBJ; hdr.flags = flags; - } else if (create) { + break; + case AIOCB_WRITE_UDATA: + if (create) { + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; + } else { + hdr.opcode = SD_OP_WRITE_OBJ; + } wlen = datalen; - hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; - hdr.flags = SD_FLAG_CMD_WRITE | flags; - } else { - wlen = datalen; - hdr.opcode = SD_OP_WRITE_OBJ; hdr.flags = SD_FLAG_CMD_WRITE | flags; + break; } if (s->cache_flags) { @@ -1127,15 +1139,6 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags) s->cache_flags = SD_FLAG_CMD_DIRECT; } - if (s->cache_flags == SD_FLAG_CMD_CACHE) { - s->flush_fd = connect_to_sdog(s->addr, s->port); - if (s->flush_fd < 0) { - error_report("failed to connect"); - ret = s->flush_fd; - goto out; - } - } - if (snapid || tag[0] != '\0') { dprintf("%" PRIx32 " snapshot inode was open.\n", vid); s->is_snapshot = true; @@ -1397,9 +1400,6 @@ static void sd_close(BlockDriverState *bs) qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); closesocket(s->fd); - if (s->cache_flags) { - closesocket(s->flush_fd); - } g_free(s->addr); } @@ -1711,39 +1711,31 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) { BDRVSheepdogState *s = bs->opaque; - SheepdogObjReq hdr = { 0 }; - SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; - SheepdogInode *inode = &s->inode; + SheepdogAIOCB *acb; + AIOReq *aio_req; int ret; - unsigned int wlen = 0, rlen = 0; if (s->cache_flags != SD_FLAG_CMD_CACHE) { return 0; } - hdr.opcode = SD_OP_FLUSH_VDI; - hdr.oid = vid_to_vdi_oid(inode->vdi_id); + acb = sd_aio_setup(bs, NULL, 0, 0, NULL, NULL); + acb->aiocb_type = AIOCB_FLUSH_CACHE; + acb->aio_done_func = sd_finish_aiocb; - ret = do_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen); - if (ret) { - error_report("failed to send a request to the sheep"); + aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), + 0, 0, 0, 0, 0); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); + if (ret < 0) { + error_report("add_aio_request is failed"); + free_aio_req(s, aio_req); + qemu_aio_release(acb); return ret; } - if (rsp->result == SD_RES_INVALID_PARMS) { - dprintf("disable write cache since the server doesn't support it\n"); - - s->cache_flags = SD_FLAG_CMD_DIRECT; - closesocket(s->flush_fd); - return 0; - } - - if (rsp->result != SD_RES_SUCCESS) { - error_report("%s", sd_strerror(rsp->result)); - return -EIO; - } - - return 0; + qemu_coroutine_yield(); + return acb->ret; } static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) From f700f8e3463b5d61383121fa6f79564d6132b10d Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Mon, 14 Jan 2013 14:01:03 +0800 Subject: [PATCH 11/15] sheepdog: clean up sd_aio_setup() The last two parameters of sd_aio_setup() are never used, so remove them. Cc: MORITA Kazutaka Cc: Kevin Wolf Cc: Stefan Hajnoczi Signed-off-by: Liu Yuan Signed-off-by: Stefan Hajnoczi --- block/sheepdog.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/sheepdog.c b/block/sheepdog.c index 04661da2dd..3e49bb83bb 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -427,12 +427,11 @@ static const AIOCBInfo sd_aiocb_info = { }; static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) + int64_t sector_num, int nb_sectors) { SheepdogAIOCB *acb; - acb = qemu_aio_get(&sd_aiocb_info, bs, cb, opaque); + acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); acb->qiov = qiov; @@ -1672,7 +1671,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, bs->total_sectors = sector_num + nb_sectors; } - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); acb->aio_done_func = sd_write_done; acb->aiocb_type = AIOCB_WRITE_UDATA; @@ -1693,7 +1692,7 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, SheepdogAIOCB *acb; int ret; - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL); + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); acb->aiocb_type = AIOCB_READ_UDATA; acb->aio_done_func = sd_finish_aiocb; @@ -1719,7 +1718,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) return 0; } - acb = sd_aio_setup(bs, NULL, 0, 0, NULL, NULL); + acb = sd_aio_setup(bs, NULL, 0, 0); acb->aiocb_type = AIOCB_FLUSH_CACHE; acb->aio_done_func = sd_finish_aiocb; From 94c8ff3a01d9bd1005f066a0ee3fe43c842a43b7 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Tue, 15 Jan 2013 14:23:37 +0100 Subject: [PATCH 12/15] w32: Make qemu_vfree() accept NULL like the POSIX implementation On POSIX, qemu_vfree() accepts NULL, because it's merely wrapper around free(). As far as I can tell, the Windows implementation doesn't. Breeds bugs that bite only under Windows. Make the Windows implementation behave like the POSIX implementation. Signed-off-by: Markus Armbruster Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- util/oslib-win32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/util/oslib-win32.c b/util/oslib-win32.c index e7e283e875..640194c0cf 100644 --- a/util/oslib-win32.c +++ b/util/oslib-win32.c @@ -71,7 +71,9 @@ void *qemu_vmalloc(size_t size) void qemu_vfree(void *ptr) { trace_qemu_vfree(ptr); - VirtualFree(ptr, 0, MEM_RELEASE); + if (ptr) { + VirtualFree(ptr, 0, MEM_RELEASE); + } } /* FIXME: add proper locking */ From db4c34c3df5107ec4900ff07f70c540479a7eeca Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Tue, 15 Jan 2013 14:23:38 +0100 Subject: [PATCH 13/15] scsi-disk: qemu_vfree(NULL) is fine, simplify Signed-off-by: Markus Armbruster Acked-by: Paolo Bonzini Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- hw/scsi-disk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index f8d7ef3374..96db9a73c7 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -85,9 +85,7 @@ static void scsi_free_request(SCSIRequest *req) { SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req); - if (r->iov.iov_base) { - qemu_vfree(r->iov.iov_base); - } + qemu_vfree(r->iov.iov_base); } /* Helper function for command completion with sense. */ From 7479acdbce2ecf6cbd0b7d72b81608c8fc51b1ae Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Tue, 15 Jan 2013 14:23:39 +0100 Subject: [PATCH 14/15] win32-aio: Fix how win32_aio_process_completion() frees buffer win32_aio_submit() allocates it with qemu_blockalign(), therefore it must be freed with qemu_vfree(), not g_free(). Signed-off-by: Markus Armbruster Reviewed-by: Kevin Wolf Signed-off-by: Stefan Hajnoczi --- block/win32-aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/win32-aio.c b/block/win32-aio.c index 46a5db78cc..03833700b4 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -87,7 +87,7 @@ static void win32_aio_process_completion(QEMUWin32AIOState *s, memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len); p += qiov->iov[i].iov_len; } - g_free(waiocb->buf); + qemu_vfree(waiocb->buf); } } From 7191bf311ea9722cdcc3b2229788eff69d896bd0 Mon Sep 17 00:00:00 2001 From: Markus Armbruster Date: Tue, 15 Jan 2013 15:29:10 +0100 Subject: [PATCH 15/15] block: Fix how mirror_run() frees its buffer It allocates with qemu_blockalign(), therefore it must free with qemu_vfree(), not g_free(). Signed-off-by: Markus Armbruster Signed-off-by: Stefan Hajnoczi --- block/mirror.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index 8aeacbf12c..6180aa30e5 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -225,7 +225,7 @@ static void coroutine_fn mirror_run(void *opaque) } immediate_exit: - g_free(s->buf); + qemu_vfree(s->buf); bdrv_set_dirty_tracking(bs, false); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) {