From 86abad0fedc44554adde5e189cf7edfa5b1c948e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 22 Oct 2015 22:31:28 +0300 Subject: [PATCH 01/16] vhost-user: cleanup struct size math We are using local msg structures everywhere, use them for sizeof as well. Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index 0aa8e0d93a..dce4dd35e9 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -201,7 +201,7 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, .request = VHOST_USER_SET_LOG_BASE, .flags = VHOST_USER_VERSION, .payload.u64 = base, - .size = sizeof(m.payload.u64), + .size = sizeof(msg.payload.u64), }; if (shmfd && log->fd != -1) { @@ -265,8 +265,8 @@ static int vhost_user_set_mem_table(struct vhost_dev *dev, return -1; } - msg.size = sizeof(m.payload.memory.nregions); - msg.size += sizeof(m.payload.memory.padding); + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); msg.size += fd_num * sizeof(VhostUserMemoryRegion); vhost_user_write(dev, &msg, fds, fd_num); @@ -361,7 +361,7 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev, return -1; } - if (msg.size != sizeof(m.payload.state)) { + if (msg.size != sizeof(msg.payload.state)) { error_report("Received bad msg size."); return -1; } @@ -381,7 +381,7 @@ static int vhost_set_vring_file(struct vhost_dev *dev, .request = request, .flags = VHOST_USER_VERSION, .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK, - .size = sizeof(m.payload.u64), + .size = sizeof(msg.payload.u64), }; if (ioeventfd_enabled() && file->fd > 0) { @@ -413,7 +413,7 @@ static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64) .request = request, .flags = VHOST_USER_VERSION, .payload.u64 = u64, - .size = sizeof(m.payload.u64), + .size = sizeof(msg.payload.u64), }; vhost_user_write(dev, &msg, NULL, 0); @@ -456,7 +456,7 @@ static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64) return -1; } - if (msg.size != sizeof(m.payload.u64)) { + if (msg.size != sizeof(msg.payload.u64)) { error_report("Received bad msg size."); return -1; } @@ -592,7 +592,7 @@ static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) msg.request = VHOST_USER_SEND_RARP; msg.flags = VHOST_USER_VERSION; memcpy((char *)&msg.payload.u64, mac_addr, 6); - msg.size = sizeof(m.payload.u64); + msg.size = sizeof(msg.payload.u64); err = vhost_user_write(dev, &msg, NULL, 0); return err; From 7fc0246c0792767b732c0989e8eba24bea185feb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 22 Oct 2015 22:33:39 +0300 Subject: [PATCH 02/16] vhost-user: cleanup msg size math We are sending msg fields, use sizeof on these and not on local variables which happen to have a matching type. Signed-off-by: Michael S. Tsirkin --- hw/virtio/vhost-user.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c index dce4dd35e9..83c84f1cd6 100644 --- a/hw/virtio/vhost-user.c +++ b/hw/virtio/vhost-user.c @@ -281,7 +281,7 @@ static int vhost_user_set_vring_addr(struct vhost_dev *dev, .request = VHOST_USER_SET_VRING_ADDR, .flags = VHOST_USER_VERSION, .payload.addr = *addr, - .size = sizeof(*addr), + .size = sizeof(msg.payload.addr), }; vhost_user_write(dev, &msg, NULL, 0); @@ -304,7 +304,7 @@ static int vhost_set_vring(struct vhost_dev *dev, .request = request, .flags = VHOST_USER_VERSION, .payload.state = *ring, - .size = sizeof(*ring), + .size = sizeof(msg.payload.state), }; vhost_user_write(dev, &msg, NULL, 0); @@ -346,7 +346,7 @@ static int vhost_user_get_vring_base(struct vhost_dev *dev, .request = VHOST_USER_GET_VRING_BASE, .flags = VHOST_USER_VERSION, .payload.state = *ring, - .size = sizeof(*ring), + .size = sizeof(msg.payload.state), }; vhost_user_write(dev, &msg, NULL, 0); From 12ebf6908333a86775ef18f12ea283601fd1d2df Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 22 Oct 2015 22:28:37 +0300 Subject: [PATCH 03/16] vhost-user-test: fix up rhel6 build Build on RHEL6 fails: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=42875 Apparently unnamed unions couldn't use C99 named field initializers. Let's just name the payload union field. Signed-off-by: Michael S. Tsirkin --- tests/vhost-user-test.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c index a74c934cc0..b6dde753f8 100644 --- a/tests/vhost-user-test.c +++ b/tests/vhost-user-test.c @@ -98,7 +98,7 @@ typedef struct VhostUserMsg { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; - }; + } payload; } QEMU_PACKED VhostUserMsg; static VhostUserMsg m __attribute__ ((unused)); @@ -242,23 +242,23 @@ static void chr_read(void *opaque, const uint8_t *buf, int size) case VHOST_USER_GET_FEATURES: /* send back features to qemu */ msg.flags |= VHOST_USER_REPLY_MASK; - msg.size = sizeof(m.u64); - msg.u64 = 0x1ULL << VHOST_F_LOG_ALL | + msg.size = sizeof(m.payload.u64); + msg.payload.u64 = 0x1ULL << VHOST_F_LOG_ALL | 0x1ULL << VHOST_USER_F_PROTOCOL_FEATURES; p = (uint8_t *) &msg; qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); break; case VHOST_USER_SET_FEATURES: - g_assert_cmpint(msg.u64 & (0x1ULL << VHOST_USER_F_PROTOCOL_FEATURES), + g_assert_cmpint(msg.payload.u64 & (0x1ULL << VHOST_USER_F_PROTOCOL_FEATURES), !=, 0ULL); break; case VHOST_USER_GET_PROTOCOL_FEATURES: /* send back features to qemu */ msg.flags |= VHOST_USER_REPLY_MASK; - msg.size = sizeof(m.u64); - msg.u64 = 1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD; + msg.size = sizeof(m.payload.u64); + msg.payload.u64 = 1 << VHOST_USER_PROTOCOL_F_LOG_SHMFD; p = (uint8_t *) &msg; qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); break; @@ -266,15 +266,15 @@ static void chr_read(void *opaque, const uint8_t *buf, int size) case VHOST_USER_GET_VRING_BASE: /* send back vring base to qemu */ msg.flags |= VHOST_USER_REPLY_MASK; - msg.size = sizeof(m.state); - msg.state.num = 0; + msg.size = sizeof(m.payload.state); + msg.payload.state.num = 0; p = (uint8_t *) &msg; qemu_chr_fe_write_all(chr, p, VHOST_USER_HDR_SIZE + msg.size); break; case VHOST_USER_SET_MEM_TABLE: /* received the mem table */ - memcpy(&s->memory, &msg.memory, sizeof(msg.memory)); + memcpy(&s->memory, &msg.payload.memory, sizeof(msg.payload.memory)); s->fds_num = qemu_chr_fe_get_msgfds(chr, s->fds, G_N_ELEMENTS(s->fds)); /* signal the test that it can continue */ From 4828b10bda6a74a22a7695303e0648157d0e3ea4 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 23 Oct 2015 14:55:26 +0200 Subject: [PATCH 04/16] pc: memhp: do not emit inserting event for coldplugged DIMMs currently acpi_memory_plug_cb() sets is_inserting for cold- and hot-plugged DIMMs as result ASL MHPD.MSCN() method issues device check even for every coldplugged DIMM. There isn't much harm in it but if we try to unplug such DIMM, OSPM will issue device check intstead of device eject event. So OSPM won't eject memory module as expected and it will try to eject it only when another memory device is hot-(un)plugged. As a fix do not set 'is_inserting' event and do not issue SCI for cold-plugged DIMMs as they are enumerated and activated by OSPM during guest's boot. Signed-off-by: Igor Mammedov Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/acpi/memory_hotplug.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/acpi/memory_hotplug.c b/hw/acpi/memory_hotplug.c index 2ff0d5ce1b..ce428dfc18 100644 --- a/hw/acpi/memory_hotplug.c +++ b/hw/acpi/memory_hotplug.c @@ -238,10 +238,12 @@ void acpi_memory_plug_cb(ACPIREGS *ar, qemu_irq irq, MemHotplugState *mem_st, mdev->dimm = dev; mdev->is_enabled = true; - mdev->is_inserting = true; + if (dev->hotplugged) { + mdev->is_inserting = true; - /* do ACPI magic */ - acpi_send_gpe_event(ar, irq, ACPI_MEMORY_HOTPLUG_STATUS); + /* do ACPI magic */ + acpi_send_gpe_event(ar, irq, ACPI_MEMORY_HOTPLUG_STATUS); + } return; } From 9d4ec9370a36f8a564e1ba05519328c0bd60da13 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 25 Oct 2015 17:07:45 +0200 Subject: [PATCH 05/16] mmap-alloc: fix error handling Existing callers are checking for MAP_FAILED, so we should return that on error. Reported-by: Paolo Bonzini Signed-off-by: Michael S. Tsirkin --- util/mmap-alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c index 13942694cc..c37acbe58e 100644 --- a/util/mmap-alloc.c +++ b/util/mmap-alloc.c @@ -26,7 +26,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) void *ptr1; if (ptr == MAP_FAILED) { - return NULL; + return MAP_FAILED; } /* Make sure align is a power of 2 */ @@ -41,7 +41,7 @@ void *qemu_ram_mmap(int fd, size_t size, size_t align, bool shared) fd, 0); if (ptr1 == MAP_FAILED) { munmap(ptr, total); - return NULL; + return MAP_FAILED; } ptr += offset; From 8059feee004111534c4c0652e2f0715e9b4e0754 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 Oct 2015 10:01:44 +0200 Subject: [PATCH 06/16] virtio: introduce virtio_map virtio_map_sg currently fails if one of the entries it's mapping is contigious in GPA but not HVA address space. Introduce virtio_map which handles this by splitting sg entries. This new API generally turns out to be a good idea since it's harder to misuse: at least in one case the existing one was used incorrectly. This will still fail if there's no space left in the sg, but luckily max queue size in use is currently 256, while max sg size is 1024, so we should be OK even is all entries happen to cross a single DIMM boundary. Won't work well with very small DIMM sizes, unfortunately: e.g. this will fail with 4K DIMMs where a single request might span a large number of DIMMs. Let's hope these are uncommon - at least we are not breaking things. Note: virtio-scsi calls virtio_map_sg on data loaded from network, and validates input, asserting on failure. Copy the validating code here - it will be dropped from virtio-scsi in a follow-up patch. Reported-by: Igor Mammedov Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Igor Mammedov --- hw/virtio/virtio.c | 56 ++++++++++++++++++++++++++++++++------ include/hw/virtio/virtio.h | 1 + 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index d0bc72e0e4..943b9904bc 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -448,28 +448,66 @@ int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, return in_bytes <= in_total && out_bytes <= out_total; } -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write) +static void virtqueue_map_iovec(struct iovec *sg, hwaddr *addr, + unsigned int *num_sg, unsigned int max_size, + int is_write) { unsigned int i; hwaddr len; - if (num_sg > VIRTQUEUE_MAX_SIZE) { - error_report("virtio: map attempt out of bounds: %zd > %d", - num_sg, VIRTQUEUE_MAX_SIZE); - exit(1); - } + /* Note: this function MUST validate input, some callers + * are passing in num_sg values received over the network. + */ + /* TODO: teach all callers that this can fail, and return failure instead + * of asserting here. + * When we do, we might be able to re-enable NDEBUG below. + */ +#ifdef NDEBUG +#error building with NDEBUG is not supported +#endif + assert(*num_sg <= max_size); - for (i = 0; i < num_sg; i++) { + for (i = 0; i < *num_sg; i++) { len = sg[i].iov_len; sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write); - if (sg[i].iov_base == NULL || len != sg[i].iov_len) { + if (!sg[i].iov_base) { error_report("virtio: error trying to map MMIO memory"); exit(1); } + if (len == sg[i].iov_len) { + continue; + } + if (*num_sg >= max_size) { + error_report("virtio: memory split makes iovec too large"); + exit(1); + } + memmove(sg + i + 1, sg + i, sizeof(*sg) * (*num_sg - i)); + memmove(addr + i + 1, addr + i, sizeof(*addr) * (*num_sg - i)); + assert(len < sg[i + 1].iov_len); + sg[i].iov_len = len; + addr[i + 1] += len; + sg[i + 1].iov_len -= len; + ++*num_sg; } } +/* Deprecated: don't use in new code */ +void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, + size_t num_sg, int is_write) +{ + virtqueue_map_iovec(sg, addr, &num_sg, num_sg, is_write); +} + +void virtqueue_map(VirtQueueElement *elem) +{ + virtqueue_map_iovec(elem->in_sg, elem->in_addr, &elem->in_num, + MIN(ARRAY_SIZE(elem->in_sg), ARRAY_SIZE(elem->in_addr)), + 1); + virtqueue_map_iovec(elem->out_sg, elem->out_addr, &elem->out_num, + MIN(ARRAY_SIZE(elem->out_sg), ARRAY_SIZE(elem->out_addr)), + 0); +} + int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) { unsigned int i, head, max; diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 9d09115fab..9d9abb4689 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -153,6 +153,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, size_t num_sg, int is_write); +void virtqueue_map(VirtQueueElement *elem); int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, unsigned int out_bytes); From 13972ac5e263a7319609253edac5754129489132 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 Oct 2015 10:09:16 +0200 Subject: [PATCH 07/16] virtio: switch to virtio_map Drop use of the deprecated virtio_map_sg in virtio core. Signed-off-by: Michael S. Tsirkin Reviewed-by: Igor Mammedov --- hw/virtio/virtio.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 943b9904bc..462157044c 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -569,8 +569,7 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max); /* Now map what we have collected */ - virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1); - virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0); + virtqueue_map(elem); elem->index = head; From 3d8db153b4b4e12b6d11590ccd318fff0eafd688 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 Oct 2015 10:18:24 +0200 Subject: [PATCH 08/16] virtio-blk: convert to virtqueue_map Drop deprecated use of virtqueue_map_sg. Signed-off-by: Michael S. Tsirkin Reviewed-by: Igor Mammedov --- hw/block/virtio-blk.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c index 8beb26b4db..3e230debb8 100644 --- a/hw/block/virtio-blk.c +++ b/hw/block/virtio-blk.c @@ -839,10 +839,7 @@ static int virtio_blk_load_device(VirtIODevice *vdev, QEMUFile *f, req->next = s->rq; s->rq = req; - virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr, - req->elem.in_num, 1); - virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr, - req->elem.out_num, 0); + virtqueue_map(&req->elem); } return 0; From bff712dc223f685c684f9caf960e6460e84a96f0 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 Oct 2015 10:19:43 +0200 Subject: [PATCH 09/16] virtio-serial: convert to virtio_map This also fixes a minor bug: - virtqueue_map_sg(port->elem.out_sg, port->elem.out_addr, - port->elem.out_num, 1); is wrong: out_sg is not written so should not be marked dirty. Signed-off-by: Michael S. Tsirkin Reviewed-by: Igor Mammedov --- hw/char/virtio-serial-bus.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hw/char/virtio-serial-bus.c b/hw/char/virtio-serial-bus.c index be97058712..497b0afd9f 100644 --- a/hw/char/virtio-serial-bus.c +++ b/hw/char/virtio-serial-bus.c @@ -705,10 +705,7 @@ static int fetch_active_ports_list(QEMUFile *f, int version_id, qemu_get_buffer(f, (unsigned char *)&port->elem, sizeof(port->elem)); - virtqueue_map_sg(port->elem.in_sg, port->elem.in_addr, - port->elem.in_num, 1); - virtqueue_map_sg(port->elem.out_sg, port->elem.out_addr, - port->elem.out_num, 1); + virtqueue_map(&port->elem); /* * Port was throttled on source machine. Let's From 4ada5331895551570846e12e7eb00e06616f9152 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 Oct 2015 10:22:13 +0200 Subject: [PATCH 10/16] virtio-scsi: convert to virtqueue_map Note: virtqueue_map already validates input so virtio-scsi does not have to. Signed-off-by: Michael S. Tsirkin Reviewed-by: Igor Mammedov --- hw/scsi/virtio-scsi.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c index 20885fb7f1..46553e1e68 100644 --- a/hw/scsi/virtio-scsi.c +++ b/hw/scsi/virtio-scsi.c @@ -205,20 +205,8 @@ static void *virtio_scsi_load_request(QEMUFile *f, SCSIRequest *sreq) assert(n < vs->conf.num_queues); req = virtio_scsi_init_req(s, vs->cmd_vqs[n]); qemu_get_buffer(f, (unsigned char *)&req->elem, sizeof(req->elem)); - /* TODO: add a way for SCSIBusInfo's load_request to fail, - * and fail migration instead of asserting here. - * When we do, we might be able to re-enable NDEBUG below. - */ -#ifdef NDEBUG -#error building with NDEBUG is not supported -#endif - assert(req->elem.in_num <= ARRAY_SIZE(req->elem.in_sg)); - assert(req->elem.out_num <= ARRAY_SIZE(req->elem.out_sg)); - virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr, - req->elem.in_num, 1); - virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr, - req->elem.out_num, 0); + virtqueue_map(&req->elem); if (virtio_scsi_parse_req(req, sizeof(VirtIOSCSICmdReq) + vs->cdb_size, sizeof(VirtIOSCSICmdResp) + vs->sense_size) < 0) { From 3945ecf1ec4f6e6aa28d0c396a7f5d983c6810d8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 27 Oct 2015 10:22:59 +0200 Subject: [PATCH 11/16] virtio: drop virtqueue_map_sg Deprecated in favor of virtqueue_map. Signed-off-by: Michael S. Tsirkin Reviewed-by: Igor Mammedov --- hw/virtio/virtio.c | 7 ------- include/hw/virtio/virtio.h | 2 -- 2 files changed, 9 deletions(-) diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c index 462157044c..939f802110 100644 --- a/hw/virtio/virtio.c +++ b/hw/virtio/virtio.c @@ -491,13 +491,6 @@ static void virtqueue_map_iovec(struct iovec *sg, hwaddr *addr, } } -/* Deprecated: don't use in new code */ -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write) -{ - virtqueue_map_iovec(sg, addr, &num_sg, num_sg, is_write); -} - void virtqueue_map(VirtQueueElement *elem) { virtqueue_map_iovec(elem->in_sg, elem->in_addr, &elem->in_num, diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h index 9d9abb4689..205fadf234 100644 --- a/include/hw/virtio/virtio.h +++ b/include/hw/virtio/virtio.h @@ -151,8 +151,6 @@ void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len, unsigned int idx); -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write); void virtqueue_map(VirtQueueElement *elem); int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, From 340065e5a11a515382c8b1112424c97e86ad2a3f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 28 Oct 2015 18:54:05 +0200 Subject: [PATCH 12/16] Revert "pc: memhp: force gaps between DIMM's GPA" This reverts commit aa8580cddf011e8cedcf87f7a0fdea7549fc4704. As described in http://article.gmane.org/gmane.comp.emulators.qemu/371432 that commit causes linux guests to crash on memory hot-unplug. The original problem it's trying to solve has now been addressed within virtio. Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 6 ++---- hw/i386/pc_piix.c | 1 - hw/i386/pc_q35.c | 1 - include/hw/i386/pc.h | 1 - 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 3d958bae5b..d234caebbb 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1616,7 +1616,6 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, HotplugHandlerClass *hhc; Error *local_err = NULL; PCMachineState *pcms = PC_MACHINE(hotplug_dev); - PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms); PCDIMMDevice *dimm = PC_DIMM(dev); PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm); MemoryRegion *mr = ddc->get_memory_region(dimm); @@ -1632,8 +1631,8 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } - pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, - pcmc->inter_dimm_gap, &local_err); + pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, false, + &local_err); if (local_err) { goto out; } @@ -1953,7 +1952,6 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) PCMachineClass *pcmc = PC_MACHINE_CLASS(oc); HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(oc); - pcmc->inter_dimm_gap = true; pcmc->get_hotplug_handler = mc->get_hotplug_handler; mc->get_hotplug_handler = pc_get_hotpug_handler; mc->cpu_index_to_socket_id = pc_cpu_index_to_socket_id; diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index 9d4425a5b9..393dcc4544 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -487,7 +487,6 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m) m->alias = NULL; m->is_default = 0; pcmc->broken_reserved_end = true; - pcmc->inter_dimm_gap = false; SET_MACHINE_COMPAT(m, PC_COMPAT_2_4); } diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 3744abd397..2f8f3963c4 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -385,7 +385,6 @@ static void pc_q35_2_4_machine_options(MachineClass *m) pc_q35_2_5_machine_options(m); m->alias = NULL; pcmc->broken_reserved_end = true; - pcmc->inter_dimm_gap = false; SET_MACHINE_COMPAT(m, PC_COMPAT_2_4); } diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index 7037de044d..606dbc2854 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -60,7 +60,6 @@ struct PCMachineClass { /*< public >*/ bool broken_reserved_end; - bool inter_dimm_gap; HotplugHandler *(*get_hotplug_handler)(MachineState *machine, DeviceState *dev); }; From d6a9b0b89d27e0a688f37c1732d4dec40613669e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 28 Oct 2015 18:55:06 +0200 Subject: [PATCH 13/16] Revert "memhp: extend address auto assignment to support gaps" This reverts commit df0acded19ec4b826aa095cfc19d341bd66fafd3. There's no point to it now that the only user has been reverted. Signed-off-by: Michael S. Tsirkin --- hw/i386/pc.c | 3 +-- hw/mem/pc-dimm.c | 15 ++++++--------- hw/ppc/spapr.c | 2 +- include/hw/mem/pc-dimm.h | 7 +++---- 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index d234caebbb..0cb8afd2c2 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1631,8 +1631,7 @@ static void pc_dimm_plug(HotplugHandler *hotplug_dev, goto out; } - pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, false, - &local_err); + pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, &local_err); if (local_err) { goto out; } diff --git a/hw/mem/pc-dimm.c b/hw/mem/pc-dimm.c index 2bae994667..80f424b442 100644 --- a/hw/mem/pc-dimm.c +++ b/hw/mem/pc-dimm.c @@ -33,8 +33,7 @@ typedef struct pc_dimms_capacity { } pc_dimms_capacity; void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, - MemoryRegion *mr, uint64_t align, bool gap, - Error **errp) + MemoryRegion *mr, uint64_t align, Error **errp) { int slot; MachineState *machine = MACHINE(qdev_get_machine()); @@ -50,7 +49,7 @@ void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, addr = pc_dimm_get_free_addr(hpms->base, memory_region_size(&hpms->mr), - !addr ? NULL : &addr, align, gap, + !addr ? NULL : &addr, align, memory_region_size(mr), &local_err); if (local_err) { goto out; @@ -295,8 +294,8 @@ static int pc_dimm_built_list(Object *obj, void *opaque) uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, - uint64_t *hint, uint64_t align, bool gap, - uint64_t size, Error **errp) + uint64_t *hint, uint64_t align, uint64_t size, + Error **errp) { GSList *list = NULL, *item; uint64_t new_addr, ret = 0; @@ -341,15 +340,13 @@ uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, goto out; } - if (ranges_overlap(dimm->addr, dimm_size, new_addr, - size + (gap ? 1 : 0))) { + if (ranges_overlap(dimm->addr, dimm_size, new_addr, size)) { if (hint) { DeviceState *d = DEVICE(dimm); error_setg(errp, "address range conflicts with '%s'", d->id); goto out; } - new_addr = QEMU_ALIGN_UP(dimm->addr + dimm_size + (gap ? 1 : 0), - align); + new_addr = QEMU_ALIGN_UP(dimm->addr + dimm_size, align); } } ret = new_addr; diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index e1202cec9f..288b57e24f 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -2157,7 +2157,7 @@ static void spapr_memory_plug(HotplugHandler *hotplug_dev, DeviceState *dev, goto out; } - pc_dimm_memory_plug(dev, &ms->hotplug_memory, mr, align, false, &local_err); + pc_dimm_memory_plug(dev, &ms->hotplug_memory, mr, align, &local_err); if (local_err) { goto out; } diff --git a/include/hw/mem/pc-dimm.h b/include/hw/mem/pc-dimm.h index c1ee7b0408..d83bf30ea9 100644 --- a/include/hw/mem/pc-dimm.h +++ b/include/hw/mem/pc-dimm.h @@ -83,16 +83,15 @@ typedef struct MemoryHotplugState { uint64_t pc_dimm_get_free_addr(uint64_t address_space_start, uint64_t address_space_size, - uint64_t *hint, uint64_t align, bool gap, - uint64_t size, Error **errp); + uint64_t *hint, uint64_t align, uint64_t size, + Error **errp); int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp); int qmp_pc_dimm_device_list(Object *obj, void *opaque); uint64_t pc_existing_dimms_capacity(Error **errp); void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms, - MemoryRegion *mr, uint64_t align, bool gap, - Error **errp); + MemoryRegion *mr, uint64_t align, Error **errp); void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms, MemoryRegion *mr); #endif From 3595e2eb0a233a881789fcc71f5b1072e5aaf669 Mon Sep 17 00:00:00 2001 From: Victor Kaplansky Date: Wed, 28 Oct 2015 14:53:07 +0200 Subject: [PATCH 14/16] tests/vhost-user-bridge: add vhost-user bridge application The test existing in QEMU for vhost-user feature is good for testing the management protocol, but does not allow actual traffic. This patch proposes Vhost-User Bridge application, which can serve the QEMU community as a comprehensive test by running real internet traffic by means of vhost-user interface. Essentially the Vhost-User Bridge is a very basic vhost-user backend for QEMU. It runs as a standalone user-level process. For packet processing Vhost-User Bridge uses an additional QEMU instance with a backend configured by "-net socket" as a shared VLAN. This way another QEMU virtual machine can effectively serve as a shared bus by means of UDP communication. For a more simple setup, the another QEMU instance running the SLiRP backend can be the same QEMU instance running vhost-user client. This Vhost-User Bridge implementation is very preliminary. It is missing many features. I has been studying vhost-user protocol internals, so I've written vhost-user-bridge bit by bit as I progressed through the protocol. Most probably its internal architecture will change significantly. To run Vhost-User Bridge application: 1. Build vhost-user-bridge with a regular procedure. This will create a vhost-user-bridge executable under tests directory: $ configure; make tests/vhost-user-bridge 2. Ensure the machine has hugepages enabled in kernel with command line like: default_hugepagesz=2M hugepagesz=2M hugepages=2048 3. Run Vhost-User Bridge with: $ tests/vhost-user-bridge The above will run vhost-user server listening for connections on UNIX domain socket /tmp/vubr.sock, and will try to connect by UDP to VLAN bridge to localhost:5555, while listening on localhost:4444 Run qemu with a virtio-net backed by vhost-user: $ qemu \ -enable-kvm -m 512 -smp 2 \ -object memory-backend-file,id=mem,size=512M,mem-path=/dev/hugepages,share=on \ -numa node,memdev=mem -mem-prealloc \ -chardev socket,id=char0,path=/tmp/vubr.sock \ -netdev type=vhost-user,id=mynet1,chardev=char0,vhostforce \ -device virtio-net-pci,netdev=mynet1 \ -net none \ -net socket,vlan=0,udp=localhost:4444,localaddr=localhost:5555 \ -net user,vlan=0 \ disk.img vhost-user-bridge was tested very lightly: it's able to bringup a linux on client VM with the virtio-net driver, and execute transmits and receives to the internet. I tested with "wget redhat.com", "dig redhat.com". PS. I've consulted DPDK's code for vhost-user during Vhost-User Bridge implementation. Signed-off-by: Victor Kaplansky Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- tests/Makefile | 1 + tests/vhost-user-bridge.c | 1110 +++++++++++++++++++++++++++++++++++++ 2 files changed, 1111 insertions(+) create mode 100644 tests/vhost-user-bridge.c diff --git a/tests/Makefile b/tests/Makefile index 1c57e39c53..0739bfe1bf 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -525,6 +525,7 @@ tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o $(test-util-obj-y) tests/test-write-threshold$(EXESUF): tests/test-write-threshold.o $(test-block-obj-y) tests/test-netfilter$(EXESUF): tests/test-netfilter.o $(qtest-obj-y) tests/ivshmem-test$(EXESUF): tests/ivshmem-test.o contrib/ivshmem-server/ivshmem-server.o $(libqos-pc-obj-y) +tests/vhost-user-bridge$(EXESUF): tests/vhost-user-bridge.o ifeq ($(CONFIG_POSIX),y) LIBS += -lutil diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c new file mode 100644 index 0000000000..fa18ad55fb --- /dev/null +++ b/tests/vhost-user-bridge.c @@ -0,0 +1,1110 @@ +/* + * Vhost User Bridge + * + * Copyright (c) 2015 Red Hat, Inc. + * + * Authors: + * Victor Kaplansky + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +/* + * TODO: + * - main should get parameters from the command line. + * - implement all request handlers. + * - test for broken requests and virtqueue. + * - implement features defined by Virtio 1.0 spec. + * - support mergeable buffers and indirect descriptors. + * - implement RESET_DEVICE request. + * - implement clean shutdown. + * - implement non-blocking writes to UDP backend. + * - implement polling strategy. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "qemu/atomic.h" +#include "standard-headers/linux/virtio_net.h" +#include "standard-headers/linux/virtio_ring.h" + +#define VHOST_USER_BRIDGE_DEBUG 1 + +#define DPRINT(...) \ + do { \ + if (VHOST_USER_BRIDGE_DEBUG) { \ + printf(__VA_ARGS__); \ + } \ + } while (0) + +typedef void (*CallbackFunc)(int sock, void *ctx); + +typedef struct Event { + void *ctx; + CallbackFunc callback; +} Event; + +typedef struct Dispatcher { + int max_sock; + fd_set fdset; + Event events[FD_SETSIZE]; +} Dispatcher; + +static void +vubr_die(const char *s) +{ + perror(s); + exit(1); +} + +static int +dispatcher_init(Dispatcher *dispr) +{ + FD_ZERO(&dispr->fdset); + dispr->max_sock = -1; + return 0; +} + +static int +dispatcher_add(Dispatcher *dispr, int sock, void *ctx, CallbackFunc cb) +{ + if (sock >= FD_SETSIZE) { + fprintf(stderr, + "Error: Failed to add new event. sock %d should be less than %d\n", + sock, FD_SETSIZE); + return -1; + } + + dispr->events[sock].ctx = ctx; + dispr->events[sock].callback = cb; + + FD_SET(sock, &dispr->fdset); + if (sock > dispr->max_sock) { + dispr->max_sock = sock; + } + DPRINT("Added sock %d for watching. max_sock: %d\n", + sock, dispr->max_sock); + return 0; +} + +#if 0 +/* dispatcher_remove() is not currently in use but may be useful + * in the future. */ +static int +dispatcher_remove(Dispatcher *dispr, int sock) +{ + if (sock >= FD_SETSIZE) { + fprintf(stderr, + "Error: Failed to remove event. sock %d should be less than %d\n", + sock, FD_SETSIZE); + return -1; + } + + FD_CLR(sock, &dispr->fdset); + return 0; +} +#endif + +/* timeout in us */ +static int +dispatcher_wait(Dispatcher *dispr, uint32_t timeout) +{ + struct timeval tv; + tv.tv_sec = timeout / 1000000; + tv.tv_usec = timeout % 1000000; + + fd_set fdset = dispr->fdset; + + /* wait until some of sockets become readable. */ + int rc = select(dispr->max_sock + 1, &fdset, 0, 0, &tv); + + if (rc == -1) { + vubr_die("select"); + } + + /* Timeout */ + if (rc == 0) { + return 0; + } + + /* Now call callback for every ready socket. */ + + int sock; + for (sock = 0; sock < dispr->max_sock + 1; sock++) + if (FD_ISSET(sock, &fdset)) { + Event *e = &dispr->events[sock]; + e->callback(sock, e->ctx); + } + + return 0; +} + +typedef struct VubrVirtq { + int call_fd; + int kick_fd; + uint32_t size; + uint16_t last_avail_index; + uint16_t last_used_index; + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; +} VubrVirtq; + +/* Based on qemu/hw/virtio/vhost-user.c */ + +#define VHOST_MEMORY_MAX_NREGIONS 8 +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) + +typedef enum VhostUserRequest { + VHOST_USER_NONE = 0, + VHOST_USER_GET_FEATURES = 1, + VHOST_USER_SET_FEATURES = 2, + VHOST_USER_SET_OWNER = 3, + VHOST_USER_RESET_DEVICE = 4, + VHOST_USER_SET_MEM_TABLE = 5, + VHOST_USER_SET_LOG_BASE = 6, + VHOST_USER_SET_LOG_FD = 7, + VHOST_USER_SET_VRING_NUM = 8, + VHOST_USER_SET_VRING_ADDR = 9, + VHOST_USER_SET_VRING_BASE = 10, + VHOST_USER_GET_VRING_BASE = 11, + VHOST_USER_SET_VRING_KICK = 12, + VHOST_USER_SET_VRING_CALL = 13, + VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, + VHOST_USER_MAX +} VhostUserRequest; + +typedef struct VhostUserMemoryRegion { + uint64_t guest_phys_addr; + uint64_t memory_size; + uint64_t userspace_addr; + uint64_t mmap_offset; +} VhostUserMemoryRegion; + +typedef struct VhostUserMemory { + uint32_t nregions; + uint32_t padding; + VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; +} VhostUserMemory; + +typedef struct VhostUserMsg { + VhostUserRequest request; + +#define VHOST_USER_VERSION_MASK (0x3) +#define VHOST_USER_REPLY_MASK (0x1<<2) + uint32_t flags; + uint32_t size; /* the following payload size */ + union { +#define VHOST_USER_VRING_IDX_MASK (0xff) +#define VHOST_USER_VRING_NOFD_MASK (0x1<<8) + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + } payload; + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int fd_num; +} QEMU_PACKED VhostUserMsg; + +#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64) + +/* The version of the protocol we support */ +#define VHOST_USER_VERSION (0x1) + +#define MAX_NR_VIRTQUEUE (8) + +typedef struct VubrDevRegion { + /* Guest Physical address. */ + uint64_t gpa; + /* Memory region size. */ + uint64_t size; + /* QEMU virtual address (userspace). */ + uint64_t qva; + /* Starting offset in our mmaped space. */ + uint64_t mmap_offset; + /* Start address of mmaped space. */ + uint64_t mmap_addr; +} VubrDevRegion; + +typedef struct VubrDev { + int sock; + Dispatcher dispatcher; + uint32_t nregions; + VubrDevRegion regions[VHOST_MEMORY_MAX_NREGIONS]; + VubrVirtq vq[MAX_NR_VIRTQUEUE]; + int backend_udp_sock; + struct sockaddr_in backend_udp_dest; +} VubrDev; + +static const char *vubr_request_str[] = { + [VHOST_USER_NONE] = "VHOST_USER_NONE", + [VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES", + [VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES", + [VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER", + [VHOST_USER_RESET_DEVICE] = "VHOST_USER_RESET_DEVICE", + [VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE", + [VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE", + [VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD", + [VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM", + [VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR", + [VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE", + [VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE", + [VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK", + [VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL", + [VHOST_USER_SET_VRING_ERR] = "VHOST_USER_SET_VRING_ERR", + [VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES", + [VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES", + [VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM", + [VHOST_USER_SET_VRING_ENABLE] = "VHOST_USER_SET_VRING_ENABLE", + [VHOST_USER_SEND_RARP] = "VHOST_USER_SEND_RARP", + [VHOST_USER_MAX] = "VHOST_USER_MAX", +}; + +static void +print_buffer(uint8_t *buf, size_t len) +{ + int i; + printf("Raw buffer:\n"); + for (i = 0; i < len; i++) { + if (i % 16 == 0) { + printf("\n"); + } + if (i % 4 == 0) { + printf(" "); + } + printf("%02x ", buf[i]); + } + printf("\n............................................................\n"); +} + +/* Translate guest physical address to our virtual address. */ +static uint64_t +gpa_to_va(VubrDev *dev, uint64_t guest_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VubrDevRegion *r = &dev->regions[i]; + + if ((guest_addr >= r->gpa) && (guest_addr < (r->gpa + r->size))) { + return guest_addr - r->gpa + r->mmap_addr + r->mmap_offset; + } + } + + assert(!"address not found in regions"); + return 0; +} + +/* Translate qemu virtual address to our virtual address. */ +static uint64_t +qva_to_va(VubrDev *dev, uint64_t qemu_addr) +{ + int i; + + /* Find matching memory region. */ + for (i = 0; i < dev->nregions; i++) { + VubrDevRegion *r = &dev->regions[i]; + + if ((qemu_addr >= r->qva) && (qemu_addr < (r->qva + r->size))) { + return qemu_addr - r->qva + r->mmap_addr + r->mmap_offset; + } + } + + assert(!"address not found in regions"); + return 0; +} + +static void +vubr_message_read(int conn_fd, VhostUserMsg *vmsg) +{ + char control[CMSG_SPACE(VHOST_MEMORY_MAX_NREGIONS * sizeof(int))] = { }; + struct iovec iov = { + .iov_base = (char *)vmsg, + .iov_len = VHOST_USER_HDR_SIZE, + }; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = control, + .msg_controllen = sizeof(control), + }; + size_t fd_size; + struct cmsghdr *cmsg; + int rc; + + rc = recvmsg(conn_fd, &msg, 0); + + if (rc <= 0) { + vubr_die("recvmsg"); + } + + vmsg->fd_num = 0; + for (cmsg = CMSG_FIRSTHDR(&msg); + cmsg != NULL; + cmsg = CMSG_NXTHDR(&msg, cmsg)) + { + if (cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_RIGHTS) { + fd_size = cmsg->cmsg_len - CMSG_LEN(0); + vmsg->fd_num = fd_size / sizeof(int); + memcpy(vmsg->fds, CMSG_DATA(cmsg), fd_size); + break; + } + } + + if (vmsg->size > sizeof(vmsg->payload)) { + fprintf(stderr, + "Error: too big message request: %d, size: vmsg->size: %u, " + "while sizeof(vmsg->payload) = %lu\n", + vmsg->request, vmsg->size, sizeof(vmsg->payload)); + exit(1); + } + + if (vmsg->size) { + rc = read(conn_fd, &vmsg->payload, vmsg->size); + if (rc <= 0) { + vubr_die("recvmsg"); + } + + assert(rc == vmsg->size); + } +} + +static void +vubr_message_write(int conn_fd, VhostUserMsg *vmsg) +{ + int rc; + + do { + rc = write(conn_fd, vmsg, VHOST_USER_HDR_SIZE + vmsg->size); + } while (rc < 0 && errno == EINTR); + + if (rc < 0) { + vubr_die("write"); + } +} + +static void +vubr_backend_udp_sendbuf(VubrDev *dev, uint8_t *buf, size_t len) +{ + int slen = sizeof(struct sockaddr_in); + + if (sendto(dev->backend_udp_sock, buf, len, 0, + (struct sockaddr *) &dev->backend_udp_dest, slen) == -1) { + vubr_die("sendto()"); + } +} + +static int +vubr_backend_udp_recvbuf(VubrDev *dev, uint8_t *buf, size_t buflen) +{ + int slen = sizeof(struct sockaddr_in); + int rc; + + rc = recvfrom(dev->backend_udp_sock, buf, buflen, 0, + (struct sockaddr *) &dev->backend_udp_dest, + (socklen_t *)&slen); + if (rc == -1) { + vubr_die("recvfrom()"); + } + + return rc; +} + +static void +vubr_consume_raw_packet(VubrDev *dev, uint8_t *buf, uint32_t len) +{ + int hdrlen = sizeof(struct virtio_net_hdr_v1); + + if (VHOST_USER_BRIDGE_DEBUG) { + print_buffer(buf, len); + } + vubr_backend_udp_sendbuf(dev, buf + hdrlen, len - hdrlen); +} + +/* Kick the guest if necessary. */ +static void +vubr_virtqueue_kick(VubrVirtq *vq) +{ + if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + DPRINT("Kicking the guest...\n"); + eventfd_write(vq->call_fd, 1); + } +} + +static void +vubr_post_buffer(VubrDev *dev, VubrVirtq *vq, uint8_t *buf, int32_t len) +{ + struct vring_desc *desc = vq->desc; + struct vring_avail *avail = vq->avail; + struct vring_used *used = vq->used; + + unsigned int size = vq->size; + + uint16_t avail_index = atomic_mb_read(&avail->idx); + + /* We check the available descriptors before posting the + * buffer, so here we assume that enough available + * descriptors. */ + assert(vq->last_avail_index != avail_index); + uint16_t a_index = vq->last_avail_index % size; + uint16_t u_index = vq->last_used_index % size; + uint16_t d_index = avail->ring[a_index]; + + int i = d_index; + + DPRINT("Post packet to guest on vq:\n"); + DPRINT(" size = %d\n", vq->size); + DPRINT(" last_avail_index = %d\n", vq->last_avail_index); + DPRINT(" last_used_index = %d\n", vq->last_used_index); + DPRINT(" a_index = %d\n", a_index); + DPRINT(" u_index = %d\n", u_index); + DPRINT(" d_index = %d\n", d_index); + DPRINT(" desc[%d].addr = 0x%016"PRIx64"\n", i, desc[i].addr); + DPRINT(" desc[%d].len = %d\n", i, desc[i].len); + DPRINT(" desc[%d].flags = %d\n", i, desc[i].flags); + DPRINT(" avail->idx = %d\n", avail_index); + DPRINT(" used->idx = %d\n", used->idx); + + if (!(desc[i].flags & VRING_DESC_F_WRITE)) { + /* FIXME: we should find writable descriptor. */ + fprintf(stderr, "Error: descriptor is not writable. Exiting.\n"); + exit(1); + } + + void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr); + uint32_t chunk_len = desc[i].len; + + if (len <= chunk_len) { + memcpy(chunk_start, buf, len); + } else { + fprintf(stderr, + "Received too long packet from the backend. Dropping...\n"); + return; + } + + /* Add descriptor to the used ring. */ + used->ring[u_index].id = d_index; + used->ring[u_index].len = len; + + vq->last_avail_index++; + vq->last_used_index++; + + atomic_mb_set(&used->idx, vq->last_used_index); + + /* Kick the guest if necessary. */ + vubr_virtqueue_kick(vq); +} + +static int +vubr_process_desc(VubrDev *dev, VubrVirtq *vq) +{ + struct vring_desc *desc = vq->desc; + struct vring_avail *avail = vq->avail; + struct vring_used *used = vq->used; + + unsigned int size = vq->size; + + uint16_t a_index = vq->last_avail_index % size; + uint16_t u_index = vq->last_used_index % size; + uint16_t d_index = avail->ring[a_index]; + + uint32_t i, len = 0; + size_t buf_size = 4096; + uint8_t buf[4096]; + + DPRINT("Chunks: "); + i = d_index; + do { + void *chunk_start = (void *)gpa_to_va(dev, desc[i].addr); + uint32_t chunk_len = desc[i].len; + + if (len + chunk_len < buf_size) { + memcpy(buf + len, chunk_start, chunk_len); + DPRINT("%d ", chunk_len); + } else { + fprintf(stderr, "Error: too long packet. Dropping...\n"); + break; + } + + len += chunk_len; + + if (!(desc[i].flags & VRING_DESC_F_NEXT)) { + break; + } + + i = desc[i].next; + } while (1); + DPRINT("\n"); + + if (!len) { + return -1; + } + + /* Add descriptor to the used ring. */ + used->ring[u_index].id = d_index; + used->ring[u_index].len = len; + + vubr_consume_raw_packet(dev, buf, len); + + return 0; +} + +static void +vubr_process_avail(VubrDev *dev, VubrVirtq *vq) +{ + struct vring_avail *avail = vq->avail; + struct vring_used *used = vq->used; + + while (vq->last_avail_index != atomic_mb_read(&avail->idx)) { + vubr_process_desc(dev, vq); + vq->last_avail_index++; + vq->last_used_index++; + } + + atomic_mb_set(&used->idx, vq->last_used_index); +} + +static void +vubr_backend_recv_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *) ctx; + VubrVirtq *rx_vq = &dev->vq[0]; + uint8_t buf[4096]; + struct virtio_net_hdr_v1 *hdr = (struct virtio_net_hdr_v1 *)buf; + int hdrlen = sizeof(struct virtio_net_hdr_v1); + int buflen = sizeof(buf); + int len; + + DPRINT("\n\n *** IN UDP RECEIVE CALLBACK ***\n\n"); + + uint16_t avail_index = atomic_mb_read(&rx_vq->avail->idx); + + /* If there is no available descriptors, just do nothing. + * The buffer will be handled by next arrived UDP packet, + * or next kick on receive virtq. */ + if (rx_vq->last_avail_index == avail_index) { + DPRINT("Got UDP packet, but no available descriptors on RX virtq.\n"); + return; + } + + len = vubr_backend_udp_recvbuf(dev, buf + hdrlen, buflen - hdrlen); + + *hdr = (struct virtio_net_hdr_v1) { }; + hdr->num_buffers = 1; + vubr_post_buffer(dev, rx_vq, buf, len + hdrlen); +} + +static void +vubr_kick_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *) ctx; + eventfd_t kick_data; + ssize_t rc; + + rc = eventfd_read(sock, &kick_data); + if (rc == -1) { + vubr_die("eventfd_read()"); + } else { + DPRINT("Got kick_data: %016"PRIx64"\n", kick_data); + vubr_process_avail(dev, &dev->vq[1]); + } +} + +static int +vubr_none_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_get_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + vmsg->payload.u64 = + ((1ULL << VIRTIO_NET_F_MRG_RXBUF) | + (1ULL << VIRTIO_NET_F_CTRL_VQ) | + (1ULL << VIRTIO_NET_F_CTRL_RX) | + (1ULL << VHOST_F_LOG_ALL)); + vmsg->size = sizeof(vmsg->payload.u64); + + DPRINT("Sending back to guest u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + /* reply */ + return 1; +} + +static int +vubr_set_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_set_owner_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + return 0; +} + +static int +vubr_reset_device_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_mem_table_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + int i; + VhostUserMemory *memory = &vmsg->payload.memory; + dev->nregions = memory->nregions; + + DPRINT("Nregions: %d\n", memory->nregions); + for (i = 0; i < dev->nregions; i++) { + void *mmap_addr; + VhostUserMemoryRegion *msg_region = &memory->regions[i]; + VubrDevRegion *dev_region = &dev->regions[i]; + + DPRINT("Region %d\n", i); + DPRINT(" guest_phys_addr: 0x%016"PRIx64"\n", + msg_region->guest_phys_addr); + DPRINT(" memory_size: 0x%016"PRIx64"\n", + msg_region->memory_size); + DPRINT(" userspace_addr 0x%016"PRIx64"\n", + msg_region->userspace_addr); + DPRINT(" mmap_offset 0x%016"PRIx64"\n", + msg_region->mmap_offset); + + dev_region->gpa = msg_region->guest_phys_addr; + dev_region->size = msg_region->memory_size; + dev_region->qva = msg_region->userspace_addr; + dev_region->mmap_offset = msg_region->mmap_offset; + + /* We don't use offset argument of mmap() since the + * mapped address has to be page aligned, and we use huge + * pages. */ + mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, + PROT_READ | PROT_WRITE, MAP_SHARED, + vmsg->fds[i], 0); + + if (mmap_addr == MAP_FAILED) { + vubr_die("mmap"); + } + + dev_region->mmap_addr = (uint64_t) mmap_addr; + DPRINT(" mmap_addr: 0x%016"PRIx64"\n", dev_region->mmap_addr); + } + + return 0; +} + +static int +vubr_set_log_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_log_fd_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_vring_num_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].size = num; + return 0; +} + +static int +vubr_set_vring_addr_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + struct vhost_vring_addr *vra = &vmsg->payload.addr; + unsigned int index = vra->index; + VubrVirtq *vq = &dev->vq[index]; + + DPRINT("vhost_vring_addr:\n"); + DPRINT(" index: %d\n", vra->index); + DPRINT(" flags: %d\n", vra->flags); + DPRINT(" desc_user_addr: 0x%016llx\n", vra->desc_user_addr); + DPRINT(" used_user_addr: 0x%016llx\n", vra->used_user_addr); + DPRINT(" avail_user_addr: 0x%016llx\n", vra->avail_user_addr); + DPRINT(" log_guest_addr: 0x%016llx\n", vra->log_guest_addr); + + vq->desc = (struct vring_desc *)qva_to_va(dev, vra->desc_user_addr); + vq->used = (struct vring_used *)qva_to_va(dev, vra->used_user_addr); + vq->avail = (struct vring_avail *)qva_to_va(dev, vra->avail_user_addr); + + DPRINT("Setting virtq addresses:\n"); + DPRINT(" vring_desc at %p\n", vq->desc); + DPRINT(" vring_used at %p\n", vq->used); + DPRINT(" vring_avail at %p\n", vq->avail); + + vq->last_used_index = vq->used->idx; + return 0; +} + +static int +vubr_set_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + unsigned int index = vmsg->payload.state.index; + unsigned int num = vmsg->payload.state.num; + + DPRINT("State.index: %d\n", index); + DPRINT("State.num: %d\n", num); + dev->vq[index].last_avail_index = num; + + return 0; +} + +static int +vubr_get_vring_base_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_vring_kick_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + uint64_t u64_arg = vmsg->payload.u64; + int index = u64_arg & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + + assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); + assert(vmsg->fd_num == 1); + + dev->vq[index].kick_fd = vmsg->fds[0]; + DPRINT("Got kick_fd: %d for vq: %d\n", vmsg->fds[0], index); + + if (index % 2 == 1) { + /* TX queue. */ + dispatcher_add(&dev->dispatcher, dev->vq[index].kick_fd, + dev, vubr_kick_cb); + + DPRINT("Waiting for kicks on fd: %d for vq: %d\n", + dev->vq[index].kick_fd, index); + } + return 0; +} + +static int +vubr_set_vring_call_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + uint64_t u64_arg = vmsg->payload.u64; + int index = u64_arg & VHOST_USER_VRING_IDX_MASK; + + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + assert((u64_arg & VHOST_USER_VRING_NOFD_MASK) == 0); + assert(vmsg->fd_num == 1); + + dev->vq[index].call_fd = vmsg->fds[0]; + DPRINT("Got call_fd: %d for vq: %d\n", vmsg->fds[0], index); + + return 0; +} + +static int +vubr_set_vring_err_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_get_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + /* FIXME: unimplented */ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_set_protocol_features_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + /* FIXME: unimplented */ + DPRINT("u64: 0x%016"PRIx64"\n", vmsg->payload.u64); + return 0; +} + +static int +vubr_get_queue_num_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_set_vring_enable_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_send_rarp_exec(VubrDev *dev, VhostUserMsg *vmsg) +{ + DPRINT("Function %s() not implemented yet.\n", __func__); + return 0; +} + +static int +vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg) +{ + /* Print out generic part of the request. */ + DPRINT( + "================== Vhost user message from QEMU ==================\n"); + DPRINT("Request: %s (%d)\n", vubr_request_str[vmsg->request], + vmsg->request); + DPRINT("Flags: 0x%x\n", vmsg->flags); + DPRINT("Size: %d\n", vmsg->size); + + if (vmsg->fd_num) { + int i; + DPRINT("Fds:"); + for (i = 0; i < vmsg->fd_num; i++) { + DPRINT(" %d", vmsg->fds[i]); + } + DPRINT("\n"); + } + + switch (vmsg->request) { + case VHOST_USER_NONE: + return vubr_none_exec(dev, vmsg); + case VHOST_USER_GET_FEATURES: + return vubr_get_features_exec(dev, vmsg); + case VHOST_USER_SET_FEATURES: + return vubr_set_features_exec(dev, vmsg); + case VHOST_USER_SET_OWNER: + return vubr_set_owner_exec(dev, vmsg); + case VHOST_USER_RESET_DEVICE: + return vubr_reset_device_exec(dev, vmsg); + case VHOST_USER_SET_MEM_TABLE: + return vubr_set_mem_table_exec(dev, vmsg); + case VHOST_USER_SET_LOG_BASE: + return vubr_set_log_base_exec(dev, vmsg); + case VHOST_USER_SET_LOG_FD: + return vubr_set_log_fd_exec(dev, vmsg); + case VHOST_USER_SET_VRING_NUM: + return vubr_set_vring_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ADDR: + return vubr_set_vring_addr_exec(dev, vmsg); + case VHOST_USER_SET_VRING_BASE: + return vubr_set_vring_base_exec(dev, vmsg); + case VHOST_USER_GET_VRING_BASE: + return vubr_get_vring_base_exec(dev, vmsg); + case VHOST_USER_SET_VRING_KICK: + return vubr_set_vring_kick_exec(dev, vmsg); + case VHOST_USER_SET_VRING_CALL: + return vubr_set_vring_call_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ERR: + return vubr_set_vring_err_exec(dev, vmsg); + case VHOST_USER_GET_PROTOCOL_FEATURES: + return vubr_get_protocol_features_exec(dev, vmsg); + case VHOST_USER_SET_PROTOCOL_FEATURES: + return vubr_set_protocol_features_exec(dev, vmsg); + case VHOST_USER_GET_QUEUE_NUM: + return vubr_get_queue_num_exec(dev, vmsg); + case VHOST_USER_SET_VRING_ENABLE: + return vubr_set_vring_enable_exec(dev, vmsg); + case VHOST_USER_SEND_RARP: + return vubr_send_rarp_exec(dev, vmsg); + + case VHOST_USER_MAX: + assert(vmsg->request != VHOST_USER_MAX); + } + return 0; +} + +static void +vubr_receive_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *) ctx; + VhostUserMsg vmsg; + int reply_requested; + + vubr_message_read(sock, &vmsg); + reply_requested = vubr_execute_request(dev, &vmsg); + if (reply_requested) { + /* Set the version in the flags when sending the reply */ + vmsg.flags &= ~VHOST_USER_VERSION_MASK; + vmsg.flags |= VHOST_USER_VERSION; + vmsg.flags |= VHOST_USER_REPLY_MASK; + vubr_message_write(sock, &vmsg); + } +} + +static void +vubr_accept_cb(int sock, void *ctx) +{ + VubrDev *dev = (VubrDev *)ctx; + int conn_fd; + struct sockaddr_un un; + socklen_t len = sizeof(un); + + conn_fd = accept(sock, (struct sockaddr *) &un, &len); + if (conn_fd == -1) { + vubr_die("accept()"); + } + DPRINT("Got connection from remote peer on sock %d\n", conn_fd); + dispatcher_add(&dev->dispatcher, conn_fd, ctx, vubr_receive_cb); +} + +static VubrDev * +vubr_new(const char *path) +{ + VubrDev *dev = (VubrDev *) calloc(1, sizeof(VubrDev)); + dev->nregions = 0; + int i; + struct sockaddr_un un; + size_t len; + + for (i = 0; i < MAX_NR_VIRTQUEUE; i++) { + dev->vq[i] = (VubrVirtq) { + .call_fd = -1, .kick_fd = -1, + .size = 0, + .last_avail_index = 0, .last_used_index = 0, + .desc = 0, .avail = 0, .used = 0, + }; + } + + /* Get a UNIX socket. */ + dev->sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (dev->sock == -1) { + vubr_die("socket"); + } + + un.sun_family = AF_UNIX; + strcpy(un.sun_path, path); + len = sizeof(un.sun_family) + strlen(path); + unlink(path); + + if (bind(dev->sock, (struct sockaddr *) &un, len) == -1) { + vubr_die("bind"); + } + + if (listen(dev->sock, 1) == -1) { + vubr_die("listen"); + } + + dispatcher_init(&dev->dispatcher); + dispatcher_add(&dev->dispatcher, dev->sock, (void *)dev, + vubr_accept_cb); + + DPRINT("Waiting for connections on UNIX socket %s ...\n", path); + return dev; +} + +static void +vubr_backend_udp_setup(VubrDev *dev, + const char *local_host, + uint16_t local_port, + const char *dest_host, + uint16_t dest_port) +{ + int sock; + struct sockaddr_in si_local = { + .sin_family = AF_INET, + .sin_port = htons(local_port), + }; + + if (inet_aton(local_host, &si_local.sin_addr) == 0) { + fprintf(stderr, "inet_aton() failed.\n"); + exit(1); + } + + /* setup destination for sends */ + dev->backend_udp_dest = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(dest_port), + }; + if (inet_aton(dest_host, &dev->backend_udp_dest.sin_addr) == 0) { + fprintf(stderr, "inet_aton() failed.\n"); + exit(1); + } + + sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + if (sock == -1) { + vubr_die("socket"); + } + + if (bind(sock, (struct sockaddr *)&si_local, sizeof(si_local)) == -1) { + vubr_die("bind"); + } + + dev->backend_udp_sock = sock; + dispatcher_add(&dev->dispatcher, sock, dev, vubr_backend_recv_cb); + DPRINT("Waiting for data from udp backend on %s:%d...\n", + local_host, local_port); +} + +static void +vubr_run(VubrDev *dev) +{ + while (1) { + /* timeout 200ms */ + dispatcher_wait(&dev->dispatcher, 200000); + /* Here one can try polling strategy. */ + } +} + +int +main(int argc, char *argv[]) +{ + VubrDev *dev; + + dev = vubr_new("/tmp/vubr.sock"); + if (!dev) { + return 1; + } + + vubr_backend_udp_setup(dev, + "127.0.0.1", 4444, + "127.0.0.1", 5555); + vubr_run(dev); + return 0; +} From 0d1c7d88ad909c5b2bd86211a9fe8abf5c74993b Mon Sep 17 00:00:00 2001 From: Cao jin Date: Wed, 28 Oct 2015 14:20:30 +0800 Subject: [PATCH 15/16] remove function during multi-function hot-add In case user want to cancel the hot-add operation, should roll back, device_del the added function that still don`t work. Signed-off-by: Cao jin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pcie.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index 6e28985bd1..a72d516e4f 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -261,13 +261,31 @@ void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); } +static void pcie_unplug_device(PCIBus *bus, PCIDevice *dev, void *opaque) +{ + object_unparent(OBJECT(dev)); +} + void pcie_cap_slot_hot_unplug_request_cb(HotplugHandler *hotplug_dev, DeviceState *dev, Error **errp) { uint8_t *exp_cap; + PCIDevice *pci_dev = PCI_DEVICE(dev); + PCIBus *bus = pci_dev->bus; pcie_cap_slot_hotplug_common(PCI_DEVICE(hotplug_dev), dev, &exp_cap, errp); + /* In case user cancel the operation of multi-function hot-add, + * remove the function that is unexposed to guest individually, + * without interaction with guest. + */ + if (pci_dev->devfn && + !bus->devices[0]) { + pcie_unplug_device(bus, pci_dev, NULL); + + return; + } + pcie_cap_slot_push_attention_button(PCI_DEVICE(hotplug_dev)); } @@ -378,11 +396,6 @@ void pcie_cap_slot_reset(PCIDevice *dev) hotplug_event_update_event_status(dev); } -static void pcie_unplug_device(PCIBus *bus, PCIDevice *dev, void *opaque) -{ - object_unparent(OBJECT(dev)); -} - void pcie_cap_slot_write_config(PCIDevice *dev, uint32_t addr, uint32_t val, int len) { From 3f1e1478db2d67098d98f2c3acf5a4946b7fb643 Mon Sep 17 00:00:00 2001 From: Cao jin Date: Wed, 28 Oct 2015 14:20:31 +0800 Subject: [PATCH 16/16] enable multi-function hot-add Enable PCIe device multi-function hot-add, just ensure function 0 is added last, then driver will get the notification to scan the slot. Signed-off-by: Cao jin Reviewed-by: Michael S. Tsirkin Signed-off-by: Michael S. Tsirkin --- hw/pci/pci.c | 40 +++++++++++++++++++++++++++++++++++++++- hw/pci/pci_host.c | 15 +++++++++++++++ hw/pci/pcie.c | 18 +++++++++--------- include/hw/pci/pci.h | 1 + 4 files changed, 64 insertions(+), 10 deletions(-) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index b0bf54061f..168b9cc56b 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -847,6 +847,9 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, PCIConfigWriteFunc *config_write = pc->config_write; Error *local_err = NULL; AddressSpace *dma_as; + DeviceState *dev = DEVICE(pci_dev); + + pci_dev->bus = bus; if (devfn < 0) { for(devfn = bus->devfn_min ; devfn < ARRAY_SIZE(bus->devices); @@ -864,9 +867,17 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus, PCI_SLOT(devfn), PCI_FUNC(devfn), name, bus->devices[devfn]->name); return NULL; + } else if (dev->hotplugged && + pci_get_function_0(pci_dev)) { + error_setg(errp, "PCI: slot %d function 0 already ocuppied by %s," + " new func %s cannot be exposed to guest.", + PCI_SLOT(devfn), + bus->devices[PCI_DEVFN(PCI_SLOT(devfn), 0)]->name, + name); + + return NULL; } - pci_dev->bus = bus; pci_dev->devfn = devfn; dma_as = pci_device_iommu_address_space(pci_dev); @@ -2454,6 +2465,33 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range) pci_for_each_device_under_bus(bus, pci_dev_get_w64, range); } +static bool pcie_has_upstream_port(PCIDevice *dev) +{ + PCIDevice *parent_dev = pci_bridge_get_device(dev->bus); + + /* Device associated with an upstream port. + * As there are several types of these, it's easier to check the + * parent device: upstream ports are always connected to + * root or downstream ports. + */ + return parent_dev && + pci_is_express(parent_dev) && + parent_dev->exp.exp_cap && + (pcie_cap_get_type(parent_dev) == PCI_EXP_TYPE_ROOT_PORT || + pcie_cap_get_type(parent_dev) == PCI_EXP_TYPE_DOWNSTREAM); +} + +PCIDevice *pci_get_function_0(PCIDevice *pci_dev) +{ + if(pcie_has_upstream_port(pci_dev)) { + /* With an upstream PCIe port, we only support 1 device at slot 0 */ + return pci_dev->bus->devices[0]; + } else { + /* Other bus types might support multiple devices at slots 0-31 */ + return pci_dev->bus->devices[PCI_DEVFN(PCI_SLOT(pci_dev->devfn), 0)]; + } +} + static const TypeInfo pci_device_type_info = { .name = TYPE_PCI_DEVICE, .parent = TYPE_DEVICE, diff --git a/hw/pci/pci_host.c b/hw/pci/pci_host.c index 3e26f9256c..49f59a5dbc 100644 --- a/hw/pci/pci_host.c +++ b/hw/pci/pci_host.c @@ -20,6 +20,7 @@ #include "hw/pci/pci.h" #include "hw/pci/pci_host.h" +#include "hw/pci/pci_bus.h" #include "trace.h" /* debug PCI */ @@ -52,6 +53,13 @@ void pci_host_config_write_common(PCIDevice *pci_dev, uint32_t addr, uint32_t limit, uint32_t val, uint32_t len) { assert(len <= 4); + /* non-zero functions are only exposed when function 0 is present, + * allowing direct removal of unexposed functions. + */ + if (pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) { + return; + } + trace_pci_cfg_write(pci_dev->name, PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn), addr, val); pci_dev->config_write(pci_dev, addr, val, MIN(len, limit - addr)); @@ -63,6 +71,13 @@ uint32_t pci_host_config_read_common(PCIDevice *pci_dev, uint32_t addr, uint32_t ret; assert(len <= 4); + /* non-zero functions are only exposed when function 0 is present, + * allowing direct removal of unexposed functions. + */ + if (pci_dev->qdev.hotplugged && !pci_get_function_0(pci_dev)) { + return ~0x0; + } + ret = pci_dev->config_read(pci_dev, addr, MIN(len, limit - addr)); trace_pci_cfg_read(pci_dev->name, PCI_SLOT(pci_dev->devfn), PCI_FUNC(pci_dev->devfn), addr, ret); diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c index a72d516e4f..32c65c27a4 100644 --- a/hw/pci/pcie.c +++ b/hw/pci/pcie.c @@ -249,16 +249,16 @@ void pcie_cap_slot_hotplug_cb(HotplugHandler *hotplug_dev, DeviceState *dev, return; } - /* TODO: multifunction hot-plug. - * Right now, only a device of function = 0 is allowed to be - * hot plugged/unplugged. + /* To enable multifunction hot-plug, we just ensure the function + * 0 added last. When function 0 is added, we set the sltsta and + * inform OS via event notification. */ - assert(PCI_FUNC(pci_dev->devfn) == 0); - - pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA, - PCI_EXP_SLTSTA_PDS); - pcie_cap_slot_event(PCI_DEVICE(hotplug_dev), - PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + if (pci_get_function_0(pci_dev)) { + pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA, + PCI_EXP_SLTSTA_PDS); + pcie_cap_slot_event(PCI_DEVICE(hotplug_dev), + PCI_EXP_HP_EV_PDC | PCI_EXP_HP_EV_ABP); + } } static void pcie_unplug_device(PCIBus *bus, PCIDevice *dev, void *opaque) diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index f5e7fd818a..379b6e1a45 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -397,6 +397,7 @@ void pci_for_each_bus_depth_first(PCIBus *bus, void *(*begin)(PCIBus *bus, void *parent_state), void (*end)(PCIBus *bus, void *state), void *parent_state); +PCIDevice *pci_get_function_0(PCIDevice *pci_dev); /* Use this wrapper when specific scan order is not required. */ static inline