From 4d3fc4fdc6857e33346ed58ae55870f59391ee71 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 30 Jun 2016 13:00:22 -0600 Subject: [PATCH 1/5] vfio/pci: Fix VGA quirks Commit 2d82f8a3cdb2 ("vfio/pci: Convert all MemoryRegion to dynamic alloc and consistent functions") converted VFIOPCIDevice.vga to be dynamically allocated, negating the need for VFIOPCIDevice.has_vga. Unfortunately not all of the has_vga users were converted, nor was the field removed from the structure. Correct these oversights. Reported-by: Peter Maloney Tested-by: Peter Maloney Fixes: 2d82f8a3cdb2 ("vfio/pci: Convert all MemoryRegion to dynamic alloc and consistent functions") Fixes: https://bugs.launchpad.net/qemu/+bug/1591628 Cc: qemu-stable@nongnu.org Signed-off-by: Alex Williamson --- hw/vfio/pci-quirks.c | 8 ++++---- hw/vfio/pci.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c index 35d32b78f4..bec694c8d8 100644 --- a/hw/vfio/pci-quirks.c +++ b/hw/vfio/pci-quirks.c @@ -318,7 +318,7 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr) /* This windows doesn't seem to be used except by legacy VGA code */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || - !vdev->has_vga || nr != 4) { + !vdev->vga || nr != 4) { return; } @@ -366,7 +366,7 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr) /* Only enable on newer devices where BAR2 is 64bit */ if (!vfio_pci_is(vdev, PCI_VENDOR_ID_ATI, PCI_ANY_ID) || - !vdev->has_vga || nr != 2 || !vdev->bars[2].mem64) { + !vdev->vga || nr != 2 || !vdev->bars[2].mem64) { return; } @@ -660,7 +660,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr) VFIOConfigWindowQuirk *window; if (!vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID) || - !vdev->has_vga || nr != 5) { + !vdev->vga || nr != 5) { return; } @@ -776,7 +776,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr) 
QLIST_INSERT_HEAD(&vdev->bars[nr].quirks, quirk, next); /* The 0x1800 offset mirror only seems to get used by legacy VGA */ - if (vdev->has_vga) { + if (vdev->vga) { quirk = g_malloc0(sizeof(*quirk)); mirror = quirk->data = g_malloc0(sizeof(*mirror)); mirror->mem = quirk->mem = g_new0(MemoryRegion, 1); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index b3eb0d838e..7d482d9d21 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -135,7 +135,6 @@ typedef struct VFIOPCIDevice { int32_t bootindex; uint32_t igd_gms; uint8_t pm_cap; - bool has_vga; bool pci_aer; bool req_enabled; bool has_flr; From 325ae8d548ebeee99cbebd38e2ff0909a9081c50 Mon Sep 17 00:00:00 2001 From: Chen Fan Date: Thu, 30 Jun 2016 13:00:23 -0600 Subject: [PATCH 2/5] vfio: add pcie extended capability support For a vfio PCIe device, we can expose the extended capabilities on a PCIe bus. Because each new PCIe capability is added at the tail of the chain, we parse the extended caps from a copy of the config space, to avoid overwriting the original while we rebuild the PCIe extended config space. 
Signed-off-by: Chen Fan Tested-by: Laszlo Ersek Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 53b87b76ea..a171056b41 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1502,6 +1502,21 @@ static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos) return next - pos; } + +static uint16_t vfio_ext_cap_max_size(const uint8_t *config, uint16_t pos) +{ + uint16_t tmp, next = PCIE_CONFIG_SPACE_SIZE; + + for (tmp = PCI_CONFIG_SPACE_SIZE; tmp; + tmp = PCI_EXT_CAP_NEXT(pci_get_long(config + tmp))) { + if (tmp > pos && tmp < next) { + next = tmp; + } + } + + return next - pos; +} + static void vfio_set_word_bits(uint8_t *buf, uint16_t val, uint16_t mask) { pci_set_word(buf, (pci_get_word(buf) & ~mask) | val); @@ -1749,16 +1764,71 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos) return 0; } +static int vfio_add_ext_cap(VFIOPCIDevice *vdev) +{ + PCIDevice *pdev = &vdev->pdev; + uint32_t header; + uint16_t cap_id, next, size; + uint8_t cap_ver; + uint8_t *config; + + /* + * pcie_add_capability always inserts the new capability at the tail + * of the chain. Therefore to end up with a chain that matches the + * physical device, we cache the config space to avoid overwriting + * the original config space when we parse the extended capabilities. + */ + config = g_memdup(pdev->config, vdev->config_size); + + for (next = PCI_CONFIG_SPACE_SIZE; next; + next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) { + header = pci_get_long(config + next); + cap_id = PCI_EXT_CAP_ID(header); + cap_ver = PCI_EXT_CAP_VER(header); + + /* + * If it becomes important to configure extended capabilities to their + * actual size, use this as the default when it's something we don't + * recognize. Since QEMU doesn't actually handle many of the config + * accesses, exact size doesn't seem worthwhile. 
+ */ + size = vfio_ext_cap_max_size(config, next); + + pcie_add_capability(pdev, cap_id, cap_ver, next, size); + pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0)); + + /* Use emulated next pointer to allow dropping extended caps */ + pci_long_test_and_set_mask(vdev->emulated_config_bits + next, + PCI_EXT_CAP_NEXT_MASK); + } + + g_free(config); + return 0; +} + static int vfio_add_capabilities(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; + int ret; if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) || !pdev->config[PCI_CAPABILITY_LIST]) { return 0; /* Nothing to add */ } - return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); + ret = vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]); + if (ret) { + return ret; + } + + /* on PCI bus, it doesn't make sense to expose extended capabilities. */ + if (!pci_is_express(pdev) || + !pci_bus_is_express(pdev->bus) || + !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { + return 0; + } + + return vfio_add_ext_cap(vdev); } static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) From e37dac06dc4e85a2f46c24261c0dfdf2a30b50e3 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 30 Jun 2016 13:00:23 -0600 Subject: [PATCH 3/5] vfio/pci: Hide SR-IOV capability The kernel currently exposes the SR-IOV capability as read-only through vfio-pci. This is sufficient to protect the host kernel, but has the potential to confuse guests without further virtualization. In particular, OVMF tries to size the VF BARs and comes up with absurd results, ending with an assert. There's not much point in adding virtualization to a read-only capability, so we simply hide it for now. If the kernel ever enables SR-IOV virtualization, we should easily be able to test it through VF BAR sizing or explicit flags. Testing whether we should parse extended capabilities is also pulled into the function to keep these assumptions in one place. 
Tested-by: Laszlo Ersek Signed-off-by: Alex Williamson --- hw/vfio/pci.c | 49 +++++++++++++++++++++++++++++++++++--------- hw/vfio/trace-events | 1 + 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index a171056b41..f2c679e47c 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -1772,6 +1772,12 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) uint8_t cap_ver; uint8_t *config; + /* Only add extended caps if we have them and the guest can see them */ + if (!pci_is_express(pdev) || !pci_bus_is_express(pdev->bus) || + !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { + return 0; + } + /* * pcie_add_capability always inserts the new capability at the tail * of the chain. Therefore to end up with a chain that matches the @@ -1780,6 +1786,25 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) */ config = g_memdup(pdev->config, vdev->config_size); + /* + * Extended capabilities are chained with each pointing to the next, so we + * can drop anything other than the head of the chain simply by modifying + * the previous next pointer. For the head of the chain, we can modify the + * capability ID to something that cannot match a valid capability. ID + * 0 is reserved for this since absence of capabilities is indicated by + * 0 for the ID, version, AND next pointer. However, pcie_add_capability() + * uses ID 0 as reserved for list management and will incorrectly match and + * assert if we attempt to pre-load the head of the chain with this + * ID. Use ID 0xFFFF temporarily since it also seems to be reserved in + * part for identifying absence of capabilities in a root complex register + * block. If the ID still exists after adding capabilities, switch back to + * zero. We'll mark this entire first dword as emulated for this purpose. 
+ */ + pci_set_long(pdev->config + PCI_CONFIG_SPACE_SIZE, + PCI_EXT_CAP(0xFFFF, 0, 0)); + pci_set_long(pdev->wmask + PCI_CONFIG_SPACE_SIZE, 0); + pci_set_long(vdev->emulated_config_bits + PCI_CONFIG_SPACE_SIZE, ~0); + for (next = PCI_CONFIG_SPACE_SIZE; next; next = PCI_EXT_CAP_NEXT(pci_get_long(config + next))) { header = pci_get_long(config + next); @@ -1794,12 +1819,23 @@ static int vfio_add_ext_cap(VFIOPCIDevice *vdev) */ size = vfio_ext_cap_max_size(config, next); - pcie_add_capability(pdev, cap_id, cap_ver, next, size); - pci_set_long(pdev->config + next, PCI_EXT_CAP(cap_id, cap_ver, 0)); - /* Use emulated next pointer to allow dropping extended caps */ pci_long_test_and_set_mask(vdev->emulated_config_bits + next, PCI_EXT_CAP_NEXT_MASK); + + switch (cap_id) { + case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ + trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); + break; + default: + pcie_add_capability(pdev, cap_id, cap_ver, next, size); + } + + } + + /* Cleanup chain head ID if necessary */ + if (pci_get_word(pdev->config + PCI_CONFIG_SPACE_SIZE) == 0xFFFF) { + pci_set_word(pdev->config + PCI_CONFIG_SPACE_SIZE, 0); } g_free(config); @@ -1821,13 +1857,6 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev) return ret; } - /* on PCI bus, it doesn't make sense to expose extended capabilities. 
*/ - if (!pci_is_express(pdev) || - !pci_bus_is_express(pdev->bus) || - !pci_get_long(pdev->config + PCI_CONFIG_SPACE_SIZE)) { - return 0; - } - return vfio_add_ext_cap(vdev); } diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events index 9da0ff928b..a768fb54ec 100644 --- a/hw/vfio/trace-events +++ b/hw/vfio/trace-events @@ -37,6 +37,7 @@ vfio_pci_hot_reset_result(const char *name, const char *result) "%s hot reset: % vfio_populate_device_config(const char *name, unsigned long size, unsigned long offset, unsigned long flags) "Device %s config:\n size: 0x%lx, offset: 0x%lx, flags: 0x%lx" vfio_populate_device_get_irq_info_failure(void) "VFIO_DEVICE_GET_IRQ_INFO failure: %m" vfio_initfn(const char *name, int group_id) " (%s) group %d" +vfio_add_ext_cap_dropped(const char *name, uint16_t cap, uint16_t offset) "%s %x@%x" vfio_pci_reset(const char *name) " (%s)" vfio_pci_reset_flr(const char *name) "%s FLR/VFIO_DEVICE_RESET" vfio_pci_reset_pm(const char *name) "%s PCI PM Reset" From d22d8956b185c002b50a4d0883aff61f857347ef Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 30 Jun 2016 13:00:23 -0600 Subject: [PATCH 4/5] memory: Add MemoryRegionIOMMUOps.notify_started/stopped callbacks The IOMMU driver may change behavior depending on whether a notifier client is present. In the case of POWER, this represents a change in the visibility of the IOTLB, for other drivers such as intel-iommu and future AMD-Vi emulation, notifier support is not yet enabled and this provides the opportunity to flag that incompatibility. 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Reviewed-by: Peter Xu Tested-by: Peter Xu Acked-by: Paolo Bonzini [new log & extracted from [PATCH qemu v17 12/12] spapr_iommu, vfio, memory: Notify IOMMU about starting/stopping listening] Signed-off-by: Alex Williamson --- hw/vfio/common.c | 5 +++-- include/exec/memory.h | 8 +++++++- memory.c | 10 +++++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/hw/vfio/common.c b/hw/vfio/common.c index 27cc1596f9..7be638e0e3 100644 --- a/hw/vfio/common.c +++ b/hw/vfio/common.c @@ -455,7 +455,8 @@ static void vfio_listener_region_del(MemoryListener *listener, QLIST_FOREACH(giommu, &container->giommu_list, giommu_next) { if (giommu->iommu == section->mr) { - memory_region_unregister_iommu_notifier(&giommu->n); + memory_region_unregister_iommu_notifier(giommu->iommu, + &giommu->n); QLIST_REMOVE(giommu, giommu_next); g_free(giommu); break; @@ -991,7 +992,7 @@ static void vfio_disconnect_container(VFIOGroup *group) QLIST_REMOVE(container, next); QLIST_FOREACH_SAFE(giommu, &container->giommu_list, giommu_next, tmp) { - memory_region_unregister_iommu_notifier(&giommu->n); + memory_region_unregister_iommu_notifier(giommu->iommu, &giommu->n); QLIST_REMOVE(giommu, giommu_next); g_free(giommu); } diff --git a/include/exec/memory.h b/include/exec/memory.h index e3829f797a..23c7399131 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -153,6 +153,10 @@ struct MemoryRegionIOMMUOps { IOMMUTLBEntry (*translate)(MemoryRegion *iommu, hwaddr addr, bool is_write); /* Returns minimum supported page size */ uint64_t (*get_min_page_size)(MemoryRegion *iommu); + /* Called when the first notifier is set */ + void (*notify_started)(MemoryRegion *iommu); + /* Called when the last notifier is removed */ + void (*notify_stopped)(MemoryRegion *iommu); }; typedef struct CoalescedMemoryRange CoalescedMemoryRange; @@ -622,9 +626,11 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool 
is_write); * memory_region_unregister_iommu_notifier: unregister a notifier for * changes to IOMMU translation entries. * + * @mr: the memory region which was observed and for which notify_stopped() + * needs to be called * @n: the notifier to be removed. */ -void memory_region_unregister_iommu_notifier(Notifier *n); +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n); /** * memory_region_name: get a memory region's name diff --git a/memory.c b/memory.c index 8549c791d7..33799e810b 100644 --- a/memory.c +++ b/memory.c @@ -1499,6 +1499,10 @@ bool memory_region_is_logging(MemoryRegion *mr, uint8_t client) void memory_region_register_iommu_notifier(MemoryRegion *mr, Notifier *n) { + if (mr->iommu_ops->notify_started && + QLIST_EMPTY(&mr->iommu_notify.notifiers)) { + mr->iommu_ops->notify_started(mr); + } notifier_list_add(&mr->iommu_notify, n); } @@ -1532,9 +1536,13 @@ void memory_region_iommu_replay(MemoryRegion *mr, Notifier *n, bool is_write) } } -void memory_region_unregister_iommu_notifier(Notifier *n) +void memory_region_unregister_iommu_notifier(MemoryRegion *mr, Notifier *n) { notifier_remove(n); + if (mr->iommu_ops->notify_stopped && + QLIST_EMPTY(&mr->iommu_notify.notifiers)) { + mr->iommu_ops->notify_stopped(mr); + } } void memory_region_notify_iommu(MemoryRegion *mr, From 3cb3b1549f5401dc3a5e1d073e34063dc274136f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 30 Jun 2016 13:00:24 -0600 Subject: [PATCH 5/5] intel_iommu: Throw hw_error on notify_started We don't currently support the MemoryRegionIOMMUOps notifier, so throw an error should a device require it. 
Reviewed-by: Marcel Apfelbaum Reviewed-by: David Gibson Reviewed-by: Peter Xu Tested-by: Peter Xu Acked-by: Paolo Bonzini Signed-off-by: Alex Williamson --- hw/i386/intel_iommu.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 347718f938..5eba704477 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -24,6 +24,7 @@ #include "exec/address-spaces.h" #include "intel_iommu_internal.h" #include "hw/pci/pci.h" +#include "hw/pci/pci_bus.h" /*#define DEBUG_INTEL_IOMMU*/ #ifdef DEBUG_INTEL_IOMMU @@ -1871,6 +1872,16 @@ static IOMMUTLBEntry vtd_iommu_translate(MemoryRegion *iommu, hwaddr addr, return ret; } +static void vtd_iommu_notify_started(MemoryRegion *iommu) +{ + VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); + + hw_error("Device at bus %s addr %02x.%d requires iommu notifier which " + "is currently not supported by intel-iommu emulation", + vtd_as->bus->qbus.name, PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); +} + static const VMStateDescription vtd_vmstate = { .name = "iommu-intel", .unmigratable = 1, @@ -1938,6 +1949,7 @@ static void vtd_init(IntelIOMMUState *s) memset(s->womask, 0, DMAR_REG_SIZE); s->iommu_ops.translate = vtd_iommu_translate; + s->iommu_ops.notify_started = vtd_iommu_notify_started; s->root = 0; s->root_extended = false; s->dmar_enabled = false;