NUMA queue, 2015-07-03

-----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQIcBAABCAAGBQJVlvV2AAoJECgHk2+YTcWmrH4QAJvNqH6o3OQ1uywlr9t0v1nV
 GJcUc2bS4hU+TpKTDDucXvhOuA24d//32S/HmM2oL59PkQyg4zKzX2buHQHH4B11
 lPXmNVI9ci3JoRBzCLet9jzbam/ySFsTn8Q4W77IsWl/vK8QOqWTnsArMjrAAXXv
 sJKnKjZtvj6HDtRBZLF5pUU4kw1u8rsseWeAKizl4gSDhDFlWkDDn9uWklYBes1H
 S+MMiUgxUZdSxU6tRp5M5Hhef0Jh9xhkSYgnh0hRsCVPJn5lx4nvEFHvplM8J+bA
 wvZfDUHZEeMMHx6e1SM4/xCMPE9pE+7Kg9OoyieaLXIhhtleBC186hCMX5/dvrpm
 BEmAYHOGZ2UcX2dW+0nmogrtkQMW/FYlQxUC9sfD4trGJYtimkt4drRLrMzJ6tUz
 3dJXlBZFgfr1EmAXIbMemIZyPaMbvz1LjoAtw6ErsUfes18GHydo1eDj7yZHc/+V
 DNY0QdC3D2fUDwZPTfQBknqAHRBn6pT69G8DEJx+1OCLSXyDUkqHFYMZbs6xmxNX
 LMKq8cw8cZ9fnULD4geyGxQwIsxb8xQc4uG1hNGMRT+6d+5u0NqCiyeGE8KGh/io
 cY3hCKWhq9HSMQHjvecbbF5oKXJaB8W6eF0ouCtCz74DBgnNvDk8Ra9asTW+LjST
 BIhBge7sFBCFcMkO2IXM
 =GBW2
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/ehabkost/tags/numa-pull-request' into staging

NUMA queue, 2015-07-03

# gpg: Signature made Fri Jul  3 21:49:58 2015 BST using RSA key ID 984DC5A6
# gpg: Good signature from "Eduardo Habkost <ehabkost@redhat.com>"
# gpg: WARNING: This key is not certified with sufficiently trusted signatures!
# gpg:          It is not certain that the signature belongs to the owner.
# Primary key fingerprint: 5A32 2FD5 ABC4 D3DB ACCF  D1AA 2807 936F 984D C5A6

* remotes/ehabkost/tags/numa-pull-request:
  numa: API to lookup NUMA node by address
  numa: Store boot memory address range in node_info
  numa,pc-dimm: Store pc-dimm memory information in numa_info
  pc: Abort if HotplugHandlerClass::plug() fails
  pc,pc-dimm: Factor out reusable parts in pc_dimm_plug to a separate routine
  pc,pc-dimm: Extract hotplug related fields in PCMachineState to a structure

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2015-07-05 19:33:51 +01:00
commit 63a9294ddc
7 changed files with 219 additions and 78 deletions

View file

@ -1509,7 +1509,7 @@ build_srat(GArray *table_data, GArray *linker, PcGuestInfo *guest_info)
*/
if (hotplugabble_address_space_size) {
numamem = acpi_data_push(table_data, sizeof *numamem);
acpi_build_srat_memory(numamem, pcms->hotplug_memory_base,
acpi_build_srat_memory(numamem, pcms->hotplug_memory.base,
hotplugabble_address_space_size, 0,
MEM_AFFINITY_HOTPLUGGABLE |
MEM_AFFINITY_ENABLED);

View file

@ -64,7 +64,6 @@
#include "hw/pci/pci_host.h"
#include "acpi-build.h"
#include "hw/mem/pc-dimm.h"
#include "trace.h"
#include "qapi/visitor.h"
#include "qapi-visit.h"
@ -1297,7 +1296,7 @@ FWCfgState *pc_memory_init(MachineState *machine,
exit(EXIT_FAILURE);
}
pcms->hotplug_memory_base =
pcms->hotplug_memory.base =
ROUND_UP(0x100000000ULL + above_4g_mem_size, 1ULL << 30);
if (pcms->enforce_aligned_dimm) {
@ -1305,17 +1304,17 @@ FWCfgState *pc_memory_init(MachineState *machine,
hotplug_mem_size += (1ULL << 30) * machine->ram_slots;
}
if ((pcms->hotplug_memory_base + hotplug_mem_size) <
if ((pcms->hotplug_memory.base + hotplug_mem_size) <
hotplug_mem_size) {
error_report("unsupported amount of maximum memory: " RAM_ADDR_FMT,
machine->maxram_size);
exit(EXIT_FAILURE);
}
memory_region_init(&pcms->hotplug_memory, OBJECT(pcms),
memory_region_init(&pcms->hotplug_memory.mr, OBJECT(pcms),
"hotplug-memory", hotplug_mem_size);
memory_region_add_subregion(system_memory, pcms->hotplug_memory_base,
&pcms->hotplug_memory);
memory_region_add_subregion(system_memory, pcms->hotplug_memory.base,
&pcms->hotplug_memory.mr);
}
/* Initialize PC system firmware */
@ -1333,9 +1332,9 @@ FWCfgState *pc_memory_init(MachineState *machine,
fw_cfg = bochs_bios_init();
rom_set_fw(fw_cfg);
if (guest_info->has_reserved_memory && pcms->hotplug_memory_base) {
if (guest_info->has_reserved_memory && pcms->hotplug_memory.base) {
uint64_t *val = g_malloc(sizeof(*val));
*val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory_base, 0x1ULL << 30));
*val = cpu_to_le64(ROUND_UP(pcms->hotplug_memory.base, 0x1ULL << 30));
fw_cfg_add_file(fw_cfg, "etc/reserved-memory-end", val, sizeof(*val));
}
@ -1554,88 +1553,31 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
static void pc_dimm_plug(HotplugHandler *hotplug_dev,
DeviceState *dev, Error **errp)
{
int slot;
HotplugHandlerClass *hhc;
Error *local_err = NULL;
PCMachineState *pcms = PC_MACHINE(hotplug_dev);
MachineState *machine = MACHINE(hotplug_dev);
PCDIMMDevice *dimm = PC_DIMM(dev);
PCDIMMDeviceClass *ddc = PC_DIMM_GET_CLASS(dimm);
MemoryRegion *mr = ddc->get_memory_region(dimm);
uint64_t existing_dimms_capacity = 0;
uint64_t align = TARGET_PAGE_SIZE;
uint64_t addr;
addr = object_property_get_int(OBJECT(dimm), PC_DIMM_ADDR_PROP, &local_err);
if (local_err) {
goto out;
}
if (memory_region_get_alignment(mr) && pcms->enforce_aligned_dimm) {
align = memory_region_get_alignment(mr);
}
addr = pc_dimm_get_free_addr(pcms->hotplug_memory_base,
memory_region_size(&pcms->hotplug_memory),
!addr ? NULL : &addr, align,
memory_region_size(mr), &local_err);
if (local_err) {
goto out;
}
existing_dimms_capacity = pc_existing_dimms_capacity(&local_err);
if (local_err) {
goto out;
}
if (existing_dimms_capacity + memory_region_size(mr) >
machine->maxram_size - machine->ram_size) {
error_setg(&local_err, "not enough space, currently 0x%" PRIx64
" in use of total hot pluggable 0x" RAM_ADDR_FMT,
existing_dimms_capacity,
machine->maxram_size - machine->ram_size);
goto out;
}
object_property_set_int(OBJECT(dev), addr, PC_DIMM_ADDR_PROP, &local_err);
if (local_err) {
goto out;
}
trace_mhp_pc_dimm_assigned_address(addr);
slot = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &local_err);
if (local_err) {
goto out;
}
slot = pc_dimm_get_free_slot(slot == PC_DIMM_UNASSIGNED_SLOT ? NULL : &slot,
machine->ram_slots, &local_err);
if (local_err) {
goto out;
}
object_property_set_int(OBJECT(dev), slot, PC_DIMM_SLOT_PROP, &local_err);
if (local_err) {
goto out;
}
trace_mhp_pc_dimm_assigned_slot(slot);
if (!pcms->acpi_dev) {
error_setg(&local_err,
"memory hotplug is not enabled: missing acpi device");
goto out;
}
if (kvm_enabled() && !kvm_has_free_slot(machine)) {
error_setg(&local_err, "hypervisor has no free memory slots left");
pc_dimm_memory_plug(dev, &pcms->hotplug_memory, mr, align, &local_err);
if (local_err) {
goto out;
}
memory_region_add_subregion(&pcms->hotplug_memory,
addr - pcms->hotplug_memory_base, mr);
vmstate_register_ram(mr, dev);
hhc = HOTPLUG_HANDLER_GET_CLASS(pcms->acpi_dev);
hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &local_err);
hhc->plug(HOTPLUG_HANDLER(pcms->acpi_dev), dev, &error_abort);
out:
error_propagate(errp, local_err);
}
@ -1677,9 +1619,7 @@ static void pc_dimm_unplug(HotplugHandler *hotplug_dev,
goto out;
}
memory_region_del_subregion(&pcms->hotplug_memory, mr);
vmstate_unregister_ram(mr, dev);
pc_dimm_memory_unplug(dev, &pcms->hotplug_memory, mr);
object_unparent(OBJECT(dev));
out:
@ -1766,7 +1706,7 @@ pc_machine_get_hotplug_memory_region_size(Object *obj, Visitor *v, void *opaque,
const char *name, Error **errp)
{
PCMachineState *pcms = PC_MACHINE(obj);
int64_t value = memory_region_size(&pcms->hotplug_memory);
int64_t value = memory_region_size(&pcms->hotplug_memory.mr);
visit_type_int(v, &value, name, errp);
}

View file

@ -23,12 +23,96 @@
#include "qapi/visitor.h"
#include "qemu/range.h"
#include "sysemu/numa.h"
#include "sysemu/kvm.h"
#include "trace.h"
/*
 * Accumulator handed (as the opaque pointer) to
 * pc_existing_dimms_capacity_internal().
 */
typedef struct pc_dimms_capacity {
/* NOTE(review): presumably the running total of DIMM sizes - confirm */
uint64_t size;
/* NOTE(review): presumably where the walker reports a failure - confirm */
Error **errp;
} pc_dimms_capacity;
/*
 * pc_dimm_memory_plug:
 * @dev: the PC-DIMM device being plugged
 * @hpms: hotplug memory state (base address and container region)
 * @mr: memory region backing the DIMM
 * @align: alignment passed to the free-address search
 * @errp: set if any step fails
 *
 * Picks an address and a slot for the DIMM, writes both back into the
 * device's properties, then maps @mr into the hotplug container at
 * (address - @hpms->base), registers it with vmstate and records the
 * range for the DIMM's NUMA node.
 *
 * The steps run strictly in order and the first failure stops the
 * sequence; properties that were already written are not rolled back.
 */
void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
                         MemoryRegion *mr, uint64_t align, Error **errp)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    PCDIMMDevice *pcdimm = PC_DIMM(dev);
    Error *err = NULL;
    uint64_t plugged_capacity = 0;
    uint64_t dimm_addr;
    uint64_t *addr_hint;
    const int *slot_hint;
    int slot_nr;

    /* An address of 0 means "not chosen by the user": search without hint */
    dimm_addr = object_property_get_int(OBJECT(pcdimm), PC_DIMM_ADDR_PROP,
                                        &err);
    if (err) {
        goto out;
    }
    addr_hint = dimm_addr ? &dimm_addr : NULL;

    dimm_addr = pc_dimm_get_free_addr(hpms->base,
                                      memory_region_size(&hpms->mr),
                                      addr_hint, align,
                                      memory_region_size(mr), &err);
    if (err) {
        goto out;
    }

    /* The new DIMM must fit in what is left of (maxram_size - ram_size) */
    plugged_capacity = pc_existing_dimms_capacity(&err);
    if (err) {
        goto out;
    }
    if (plugged_capacity + memory_region_size(mr) >
        ms->maxram_size - ms->ram_size) {
        error_setg(&err, "not enough space, currently 0x%" PRIx64
                   " in use of total hot pluggable 0x" RAM_ADDR_FMT,
                   plugged_capacity,
                   ms->maxram_size - ms->ram_size);
        goto out;
    }

    object_property_set_int(OBJECT(dev), dimm_addr, PC_DIMM_ADDR_PROP, &err);
    if (err) {
        goto out;
    }
    trace_mhp_pc_dimm_assigned_address(dimm_addr);

    /* Same scheme for the slot: an unassigned slot means "no hint" */
    slot_nr = object_property_get_int(OBJECT(dev), PC_DIMM_SLOT_PROP, &err);
    if (err) {
        goto out;
    }
    slot_hint = slot_nr != PC_DIMM_UNASSIGNED_SLOT ? &slot_nr : NULL;

    slot_nr = pc_dimm_get_free_slot(slot_hint, ms->ram_slots, &err);
    if (err) {
        goto out;
    }
    object_property_set_int(OBJECT(dev), slot_nr, PC_DIMM_SLOT_PROP, &err);
    if (err) {
        goto out;
    }
    trace_mhp_pc_dimm_assigned_slot(slot_nr);

    if (kvm_enabled() && !kvm_has_free_slot(ms)) {
        error_setg(&err, "hypervisor has no free memory slots left");
        goto out;
    }

    /* Subregion offsets are relative to the start of the container */
    memory_region_add_subregion(&hpms->mr, dimm_addr - hpms->base, mr);
    vmstate_register_ram(mr, dev);
    numa_set_mem_node_id(dimm_addr, memory_region_size(mr), pcdimm->node);

out:
    error_propagate(errp, err);
}
/*
 * pc_dimm_memory_unplug:
 * @dev: the PC-DIMM device being unplugged
 * @hpms: hotplug memory state whose container holds @mr
 * @mr: memory region backing the DIMM
 *
 * Reverses pc_dimm_memory_plug(): drops the DIMM's address range from its
 * NUMA node, removes @mr from the hotplug container and unregisters it
 * from vmstate.
 */
void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
                           MemoryRegion *mr)
{
    PCDIMMDevice *pcdimm = PC_DIMM(dev);

    numa_unset_mem_node_id(pcdimm->addr, memory_region_size(mr),
                           pcdimm->node);

    memory_region_del_subregion(&hpms->mr, mr);
    vmstate_unregister_ram(mr, dev);
}
static int pc_existing_dimms_capacity_internal(Object *obj, void *opaque)
{
pc_dimms_capacity *cap = opaque;

View file

@ -15,14 +15,12 @@
#include "hw/pci/pci.h"
#include "hw/boards.h"
#include "hw/compat.h"
#include "hw/mem/pc-dimm.h"
#define HPET_INTCAP "hpet-intcap"
/**
* PCMachineState:
* @hotplug_memory_base: address in guest RAM address space where hotplug memory
* address space begins.
* @hotplug_memory: hotplug memory address space container
* @acpi_dev: link to ACPI PM device that performs ACPI hotplug handling
* @enforce_aligned_dimm: check that DIMM's address/size is aligned by
* backend's alignment value if provided
@ -32,8 +30,7 @@ struct PCMachineState {
MachineState parent_obj;
/* <public> */
ram_addr_t hotplug_memory_base;
MemoryRegion hotplug_memory;
MemoryHotplugState hotplug_memory;
HotplugHandler *acpi_dev;
ISADevice *rtc;

View file

@ -70,6 +70,17 @@ typedef struct PCDIMMDeviceClass {
MemoryRegion *(*get_memory_region)(PCDIMMDevice *dimm);
} PCDIMMDeviceClass;
/**
 * MemoryHotplugState:
 * @base: address in guest RAM address space where hotplug memory
 * address space begins.
 * @mr: hotplug memory address space container
 *
 * Bundles the hotplug memory container with its guest base address.
 * pc_dimm_memory_plug() maps each DIMM into @mr at an offset of
 * (DIMM address - @base), and pc_dimm_memory_unplug() removes it again.
 */
typedef struct MemoryHotplugState {
ram_addr_t base;
MemoryRegion mr;
} MemoryHotplugState;
uint64_t pc_dimm_get_free_addr(uint64_t address_space_start,
uint64_t address_space_size,
uint64_t *hint, uint64_t align, uint64_t size,
@ -79,4 +90,8 @@ int pc_dimm_get_free_slot(const int *hint, int max_slots, Error **errp);
int qmp_pc_dimm_device_list(Object *obj, void *opaque);
uint64_t pc_existing_dimms_capacity(Error **errp);
void pc_dimm_memory_plug(DeviceState *dev, MemoryHotplugState *hpms,
MemoryRegion *mr, uint64_t align, Error **errp);
void pc_dimm_memory_unplug(DeviceState *dev, MemoryHotplugState *hpms,
MemoryRegion *mr);
#endif

View file

@ -10,16 +10,27 @@
extern int nb_numa_nodes; /* Number of NUMA nodes */
/*
 * One contiguous address range belonging to a NUMA node.
 * Entries are linked into the owning node's NodeInfo.addr list.
 */
struct numa_addr_range {
/* First address of the range */
ram_addr_t mem_start;
/* Last address of the range (inclusive: start + size - 1) */
ram_addr_t mem_end;
/* Linkage in the per-node range list */
QLIST_ENTRY(numa_addr_range) entry;
};
/*
 * Per-node NUMA state; numa_info[] holds one of these per node index.
 */
typedef struct node_info {
/* Amount of memory assigned to the node; used as its range length */
uint64_t node_mem;
/* Bitmap of MAX_CPUMASK_BITS bits; presumably the node's CPUs - confirm */
DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
/* NOTE(review): presumably the backend providing the node's memory */
struct HostMemoryBackend *node_memdev;
/* NOTE(review): presumably true once the node has been configured */
bool present;
QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
} NodeInfo;
extern NodeInfo numa_info[MAX_NODES];
void parse_numa_opts(MachineClass *mc);
void numa_post_machine_init(void);
void query_numa_node_mem(uint64_t node_mem[]);
extern QemuOptsList qemu_numa_opts;
void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node);
uint32_t numa_get_node(ram_addr_t addr, Error **errp);
#endif

94
numa.c
View file

@ -52,6 +52,92 @@ static int max_numa_nodeid; /* Highest specified NUMA node ID, plus one.
int nb_numa_nodes;
NodeInfo numa_info[MAX_NODES];
/*
 * numa_set_mem_node_id:
 * @addr: first address of the range
 * @size: length of the range in bytes; 0 makes the call a no-op
 * @node: index into numa_info[] of the node that owns the range
 *
 * Records that [@addr, @addr + @size - 1] belongs to @node by pushing a
 * new entry onto the head of that node's address-range list.  The entry
 * is released by a matching numa_unset_mem_node_id() call.
 */
void numa_set_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
{
    struct numa_addr_range *range;

    /*
     * Memory-less nodes can come here with 0 size in which case,
     * there is nothing to do.  Bail out before allocating so the
     * range object is not leaked.
     */
    if (!size) {
        return;
    }

    range = g_malloc0(sizeof(*range));
    range->mem_start = addr;
    range->mem_end = addr + size - 1; /* inclusive end */
    QLIST_INSERT_HEAD(&numa_info[node].addr, range, entry);
}
/*
 * numa_unset_mem_node_id:
 * @addr: first address of the range to drop
 * @size: length of the range in bytes
 * @node: index into numa_info[] of the node that owns the range
 *
 * Removes and frees the first entry in @node's range list whose start
 * and inclusive end match exactly.  Does nothing if no entry matches.
 */
void numa_unset_mem_node_id(ram_addr_t addr, uint64_t size, uint32_t node)
{
    struct numa_addr_range *cur, *tmp;

    QLIST_FOREACH_SAFE(cur, &numa_info[node].addr, entry, tmp) {
        if (cur->mem_start != addr || cur->mem_end != (addr + size - 1)) {
            continue;
        }

        /* Exact match: unlink it and stop, only one entry is removed */
        QLIST_REMOVE(cur, entry);
        g_free(cur);
        return;
    }
}
/*
 * Lay the configured nodes out back to back starting at address 0 and
 * record each node's resulting range in its numa_info[] range list.
 */
static void numa_set_mem_ranges(void)
{
    ram_addr_t node_start = 0;
    int node = 0;

    /* Each node begins where the previous one ended */
    while (node < nb_numa_nodes) {
        numa_set_mem_node_id(node_start, numa_info[node].node_mem, node);
        node_start += numa_info[node].node_mem;
        node++;
    }
}
/*
 * Return true if @addr lies inside any address range recorded for
 * NUMA @node (range ends are inclusive), false otherwise.
 */
static bool numa_addr_belongs_to_node(ram_addr_t addr, uint32_t node)
{
    struct numa_addr_range *cur;

    QLIST_FOREACH(cur, &numa_info[node].addr, entry) {
        if (addr < cur->mem_start || addr > cur->mem_end) {
            continue;
        }
        return true;
    }

    return false;
}
/*
 * numa_get_node:
 * @addr: address to look up
 * @errp: set when @addr is not covered by any node's ranges
 *
 * Returns the index of the NUMA node whose recorded ranges contain @addr.
 * If no node matches, sets @errp and returns -1 (converted to uint32_t).
 */
uint32_t numa_get_node(ram_addr_t addr, Error **errp)
{
    uint32_t node;

    /* Without a NUMA configuration all memory is recorded under node 0 */
    if (nb_numa_nodes == 0 && numa_addr_belongs_to_node(addr, 0)) {
        return 0;
    }

    node = 0;
    while (node < nb_numa_nodes) {
        if (numa_addr_belongs_to_node(addr, node)) {
            return node;
        }
        node++;
    }

    error_setg(errp, "Address 0x" RAM_ADDR_FMT " doesn't belong to any "
               "NUMA node", addr);
    return -1;
}
static void numa_node_parse(NumaNodeOptions *node, QemuOpts *opts, Error **errp)
{
uint16_t nodenr;
@ -273,6 +359,12 @@ void parse_numa_opts(MachineClass *mc)
exit(1);
}
for (i = 0; i < nb_numa_nodes; i++) {
QLIST_INIT(&numa_info[i].addr);
}
numa_set_mem_ranges();
for (i = 0; i < nb_numa_nodes; i++) {
if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) {
break;
@ -297,6 +389,8 @@ void parse_numa_opts(MachineClass *mc)
}
validate_numa_cpus();
} else {
numa_set_mem_node_id(0, ram_size, 0);
}
}