diff --git a/aio-posix.c b/aio-posix.c
index 0467f23a63..06148a9ba3 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
 
 struct AioHandler
 {
@@ -29,6 +32,162 @@ struct AioHandler
     QLIST_ENTRY(AioHandler) node;
 };
 
+#ifdef CONFIG_EPOLL
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+static void aio_epoll_disable(AioContext *ctx)
+{
+    ctx->epoll_available = false;
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    ctx->epoll_enabled = false;
+    close(ctx->epollfd);
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static bool aio_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        int r;
+        if (node->deleted || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+    ctx->epoll_enabled = true;
+    return true;
+}
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    if (!node->pfd.events) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
+        if (r) {
+            aio_epoll_disable(ctx);
+        }
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        if (is_new) {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        } else {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        }
+    }
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    assert(npfd == 1);
+    assert(pfds[0].fd == ctx->epollfd);
+    if (timeout > 0) {
+        ret = qemu_poll_ns(pfds, npfd, timeout);
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         sizeof(events) / sizeof(events[0]),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            node = events[i].data.ptr;
+            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                (ev & EPOLLERR ? G_IO_ERR : 0);
+        }
+    }
+out:
+    return ret;
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    /* Fall back to ppoll when external clients are disabled.
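+     * The epoll set contains every registered fd, internal and external
+     * alike, and epoll_wait() cannot filter out the external ones per
+     * call the way aio_node_check() filters the pollfds array.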
+     */
+    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    if (!ctx->epoll_available) {
+        return false;
+    }
+    if (aio_epoll_enabled(ctx)) {
+        return true;
+    }
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (aio_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            aio_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+#else
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    assert(false);
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    return false;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    return false;
+}
+
+#endif
+
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
     AioHandler *node;
@@ -50,6 +209,7 @@ void aio_set_fd_handler(AioContext *ctx,
                         void *opaque)
 {
     AioHandler *node;
+    bool is_new = false;
 
     node = find_aio_handler(ctx, fd);
 
@@ -79,6 +239,7 @@ void aio_set_fd_handler(AioContext *ctx,
             QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
 
             g_source_add_poll(&ctx->source, &node->pfd);
+            is_new = true;
         }
         /* Update handler with latest information */
         node->io_read = io_read;
@@ -90,6 +251,7 @@ void aio_set_fd_handler(AioContext *ctx,
         node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
     }
 
+    aio_epoll_update(ctx, node, is_new);
     aio_notify(ctx);
 }
 
@@ -262,6 +424,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
     /* fill pollfds */
     QLIST_FOREACH(node, &ctx->aio_handlers, node) {
         if (!node->deleted && node->pfd.events
+            && !aio_epoll_enabled(ctx)
            && aio_node_check(ctx, node->is_external)) {
             add_pollfd(node);
         }
@@ -273,7 +436,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
     if (timeout) {
         aio_context_release(ctx);
     }
-    ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+        AioHandler epoll_handler;
+
+        epoll_handler.pfd.fd = ctx->epollfd;
+        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
+        npfd = 0;
+        add_pollfd(&epoll_handler);
+        ret = aio_epoll(ctx, pollfds, npfd, timeout);
+    } else {
+        ret = qemu_poll_ns(pollfds, npfd, timeout);
+    }
     if (blocking) {
         atomic_sub(&ctx->notify_me, 2);
     }
@@ -302,3 +475,16 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+#ifdef CONFIG_EPOLL
+    assert(!ctx->epollfd);
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        ctx->epoll_available = false;
+    } else {
+        ctx->epoll_available = true;
+    }
+#endif
+}
diff --git a/aio-win32.c b/aio-win32.c
index 43c4c79a75..cdc445608b 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -369,3 +369,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
     aio_context_release(ctx);
     return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+}
diff --git a/async.c b/async.c
index 9589e4bb7d..e106072a44 100644
--- a/async.c
+++ b/async.c
@@ -325,12 +325,18 @@ AioContext *aio_context_new(Error **errp)
 {
     int ret;
     AioContext *ctx;
+    Error *local_err = NULL;
+
     ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+    aio_context_setup(ctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto fail;
+    }
     ret = event_notifier_init(&ctx->notifier, false);
     if (ret < 0) {
-        g_source_destroy(&ctx->source);
         error_setg_errno(errp, -ret, "Failed to initialize event notifier");
-        return NULL;
+        goto fail;
     }
     g_source_set_can_recurse(&ctx->source, true);
     aio_set_event_notifier(ctx, &ctx->notifier,
@@ -345,6 +351,9 @@ AioContext *aio_context_new(Error **errp)
     ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
 
     return ctx;
+fail:
+    g_source_destroy(&ctx->source);
+    return NULL;
 }
 
 void aio_context_ref(AioContext *ctx)
diff --git a/blockdev.c b/blockdev.c
index 8b8bfa992c..97be42f6cd 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1120,6 +1120,9 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
     if (!strcmp(device, "all")) {
         ret = bdrv_commit_all();
     } else {
+        BlockDriverState *bs;
+        AioContext *aio_context;
+
         blk = blk_by_name(device);
         if (!blk) {
             monitor_printf(mon, "Device '%s' not found\n", device);
@@ -1129,7 +1132,14 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
             monitor_printf(mon, "Device '%s' has no medium\n", device);
             return;
         }
-        ret = bdrv_commit(blk_bs(blk));
+
+        bs = blk_bs(blk);
+        aio_context = bdrv_get_aio_context(bs);
+        aio_context_acquire(aio_context);
+
+        ret = bdrv_commit(bs);
+
+        aio_context_release(aio_context);
     }
     if (ret < 0) {
         monitor_printf(mon, "'commit' error for '%s': %s\n", device,
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 68f1994434..23f667ef4b 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -25,15 +25,30 @@
 /* vring_map can be coupled with vring_unmap or (if you still have the
  * value returned in *mr) memory_region_unref.
+ * Returns NULL on failure.
+ * Callers that can handle a partial mapping must supply a mapped_len
+ * pointer to get the actual length mapped.
+ * Passing mapped_len == NULL requires either a full mapping or a failure.
  */
-static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
+static void *vring_map(MemoryRegion **mr, hwaddr phys,
+                       hwaddr len, hwaddr *mapped_len,
                        bool is_write)
 {
     MemoryRegionSection section = memory_region_find(get_system_memory(),
                                                      phys, len);
+    uint64_t size;
 
-    if (!section.mr || int128_get64(section.size) < len) {
+    if (!section.mr) {
         goto out;
     }
+
+    size = int128_get64(section.size);
+    assert(size);
+
+    /* Passing mapped_len == NULL requires either a full mapping or a failure.
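+     * memory_region_find() clamps the section to one memory region, so a
+     * range that crosses a region boundary comes back shorter than
+     * requested even though the lookup succeeded.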
+     */
+    if (!mapped_len && size < len) {
+        goto out;
+    }
+
     if (is_write && section.readonly) {
         goto out;
     }
@@ -46,6 +61,10 @@ static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
         goto out;
     }
 
+    if (mapped_len) {
+        *mapped_len = MIN(size, len);
+    }
+
     *mr = section.mr;
     return memory_region_get_ram_ptr(section.mr) +
         section.offset_within_region;
@@ -78,7 +97,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     addr = virtio_queue_get_desc_addr(vdev, n);
     size = virtio_queue_get_desc_size(vdev, n);
     /* Map the descriptor area as read only */
-    ptr = vring_map(&vring->mr_desc, addr, size, false);
+    ptr = vring_map(&vring->mr_desc, addr, size, NULL, false);
     if (!ptr) {
         error_report("Failed to map 0x%" HWADDR_PRIx " byte for vring desc "
                      "at 0x%" HWADDR_PRIx,
@@ -92,7 +111,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     /* Add the size of the used_event_idx */
     size += sizeof(uint16_t);
     /* Map the driver area as read only */
-    ptr = vring_map(&vring->mr_avail, addr, size, false);
+    ptr = vring_map(&vring->mr_avail, addr, size, NULL, false);
     if (!ptr) {
         error_report("Failed to map 0x%" HWADDR_PRIx " byte for vring avail "
                      "at 0x%" HWADDR_PRIx,
@@ -106,7 +125,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     /* Add the size of the avail_event_idx */
     size += sizeof(uint16_t);
     /* Map the device area as read-write */
-    ptr = vring_map(&vring->mr_used, addr, size, true);
+    ptr = vring_map(&vring->mr_used, addr, size, NULL, true);
     if (!ptr) {
         error_report("Failed to map 0x%" HWADDR_PRIx " byte for vring used "
                      "at 0x%" HWADDR_PRIx,
@@ -206,6 +225,7 @@ static int get_desc(Vring *vring, VirtQueueElement *elem,
     struct iovec *iov;
     hwaddr *addr;
     MemoryRegion *mr;
+    hwaddr len;
 
     if (desc->flags & VRING_DESC_F_WRITE) {
         num = &elem->in_num;
@@ -224,26 +244,30 @@ static int get_desc(Vring *vring, VirtQueueElement *elem,
         }
     }
 
-    /* Stop for now if there are not enough iovecs available. */
-    if (*num >= VIRTQUEUE_MAX_SIZE) {
-        error_report("Invalid SG num: %u", *num);
-        return -EFAULT;
+    while (desc->len) {
+        /* Stop for now if there are not enough iovecs available. */
+        if (*num >= VIRTQUEUE_MAX_SIZE) {
+            error_report("Invalid SG num: %u", *num);
+            return -EFAULT;
+        }
+
+        iov->iov_base = vring_map(&mr, desc->addr, desc->len, &len,
+                                  desc->flags & VRING_DESC_F_WRITE);
+        if (!iov->iov_base) {
+            error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
+                         (uint64_t)desc->addr, desc->len);
+            return -EFAULT;
+        }
+
+        /* The MemoryRegion is looked up again and unref'ed later, leave the
+         * ref in place. */
+        (iov++)->iov_len = len;
+        *addr++ = desc->addr;
+        desc->len -= len;
+        desc->addr += len;
+        *num += 1;
     }
 
-    /* TODO handle non-contiguous memory across region boundaries */
-    iov->iov_base = vring_map(&mr, desc->addr, desc->len,
-                              desc->flags & VRING_DESC_F_WRITE);
-    if (!iov->iov_base) {
-        error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
-                     (uint64_t)desc->addr, desc->len);
-        return -EFAULT;
-    }
-
-    /* The MemoryRegion is looked up again and unref'ed later, leave the
-     * ref in place. */
-    iov->iov_len = desc->len;
-    *addr = desc->addr;
-    *num += 1;
     return 0;
 }
 
@@ -257,6 +281,21 @@ static void copy_in_vring_desc(VirtIODevice *vdev,
     host->next = virtio_lduw_p(vdev, &guest->next);
 }
 
+static bool read_vring_desc(VirtIODevice *vdev,
+                            hwaddr guest,
+                            struct vring_desc *host)
+{
+    if (address_space_read(&address_space_memory, guest, MEMTXATTRS_UNSPECIFIED,
+                           (uint8_t *)host, sizeof *host)) {
+        return false;
+    }
+    host->addr = virtio_tswap64(vdev, host->addr);
+    host->len = virtio_tswap32(vdev, host->len);
+    host->flags = virtio_tswap16(vdev, host->flags);
+    host->next = virtio_tswap16(vdev, host->next);
+    return true;
+}
+
 /* This is stolen from linux/drivers/vhost/vhost.c. */
 static int get_indirect(VirtIODevice *vdev, Vring *vring,
                         VirtQueueElement *elem, struct vring_desc *indirect)
@@ -284,23 +323,16 @@ static int get_indirect(VirtIODevice *vdev, Vring *vring,
     }
 
     do {
-        struct vring_desc *desc_ptr;
-        MemoryRegion *mr;
-
         /* Translate indirect descriptor */
-        desc_ptr = vring_map(&mr,
-                             indirect->addr + found * sizeof(desc),
-                             sizeof(desc), false);
-        if (!desc_ptr) {
-            error_report("Failed to map indirect descriptor "
+        if (!read_vring_desc(vdev, indirect->addr + found * sizeof(desc),
+                             &desc)) {
+            error_report("Failed to read indirect descriptor "
                          "addr %#" PRIx64 " len %zu",
                          (uint64_t)indirect->addr + found * sizeof(desc),
                          sizeof(desc));
             vring->broken = true;
             return -EFAULT;
         }
-        copy_in_vring_desc(vdev, desc_ptr, &desc);
-        memory_region_unref(mr);
 
         /* Ensure descriptor has been loaded before accessing fields */
         barrier(); /* read_barrier_depends(); */
diff --git a/include/block/aio.h b/include/block/aio.h
index 92efc5e1f1..e086e3b4ee 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -124,6 +124,11 @@ struct AioContext {
     QEMUTimerListGroup tlg;
 
     int external_disable_cnt;
+
+    /* epoll(7) state used when built with CONFIG_EPOLL */
+    int epollfd;
+    bool epoll_enabled;
+    bool epoll_available;
 };
 
 /**
@@ -405,6 +410,17 @@ static inline void aio_enable_external(AioContext *ctx)
     atomic_dec(&ctx->external_disable_cnt);
 }
 
+/**
+ * aio_external_disabled:
+ * @ctx: the aio context
+ *
+ * Return true if the external clients are disabled.
+ */
+static inline bool aio_external_disabled(AioContext *ctx)
+{
+    return atomic_read(&ctx->external_disable_cnt);
+}
+
 /**
  * aio_node_check:
  * @ctx: the aio context
@@ -418,4 +434,12 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
     return !is_external || !atomic_read(&ctx->external_disable_cnt);
 }
 
+/**
+ * aio_context_setup:
+ * @ctx: the aio context
+ *
+ * Initialize the aio context.
+ */
+void aio_context_setup(AioContext *ctx, Error **errp);
+
 #endif
diff --git a/monitor.c b/monitor.c
index 6cd747f4f9..32958407dd 100644
--- a/monitor.c
+++ b/monitor.c
@@ -3408,13 +3408,18 @@ static void vm_completion(ReadLineState *rs, const char *str)
     readline_set_completion_index(rs, len);
     while ((bs = bdrv_next(bs))) {
         SnapshotInfoList *snapshots, *snapshot;
+        AioContext *ctx = bdrv_get_aio_context(bs);
+        bool ok = false;
 
-        if (!bdrv_can_snapshot(bs)) {
-            continue;
-        }
-        if (bdrv_query_snapshot_info_list(bs, &snapshots, NULL)) {
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs)) {
+            ok = bdrv_query_snapshot_info_list(bs, &snapshots, NULL) == 0;
+        }
+        aio_context_release(ctx);
+        if (!ok) {
             continue;
         }
+
         snapshot = snapshots;
         while (snapshot) {
             char *completion = snapshot->value->name;
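
For illustration only, not part of the patch: a minimal sketch of how a
caller inside vring.c could use the new vring_map() contract to copy out
a guest buffer that crosses RAM region boundaries. The helper name
copy_from_guest() is hypothetical.

static int copy_from_guest(void *dst, hwaddr addr, hwaddr len)
{
    while (len) {
        MemoryRegion *mr;
        hwaddr mapped;
        /* Ask for everything that is left; accept a partial mapping. */
        void *p = vring_map(&mr, addr, len, &mapped, false);

        if (!p) {
            return -EFAULT;
        }
        memcpy(dst, p, mapped);
        /* vring_map() took a reference on the region; drop it. */
        memory_region_unref(mr);
        dst = (char *)dst + mapped;
        addr += mapped;
        len -= mapped;
    }
    return 0;
}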