qemu-patch-raspberry4/hw/virtio/virtio-balloon.c
David Gibson ed48c59875 virtio-balloon: Safely handle BALLOON_PAGE_SIZE < host page size
The virtio-balloon always works in units of 4kiB (BALLOON_PAGE_SIZE), but
we can only actually discard memory in units of the host page size.

Now, we handle this very badly: we silently ignore balloon requests that
aren't host page aligned, and for requests that are host page aligned we
discard the entire host page.  The latter can corrupt guest memory if its
page size is smaller than the host's.

The obvious choice would be to disable the balloon if the host page size is
not 4kiB.  However, that would break the special case where host and guest
have the same page size, but that's larger than 4kiB.  That case currently
works by accident[1] - and is used in practice on many production POWER
systems where 64kiB has long been the Linux default page size on both host
and guest.

To make the balloon safe, without breaking that useful special case, we
need to accumulate 4kiB balloon requests until we have a whole contiguous
host page to discard.

We could in principle do that across all guest memory, but it would require
a large bitmap to track.  This patch represents a compromise: we track
ballooned subpages for a single contiguous host page at a time.  This means
that if the guest discards all 4kiB chunks of a host page in succession,
we will discard it.  This is the expected behaviour in the (host page) ==
(guest page) != 4kiB case we want to support.

If the guest scatters 4kiB requests across different host pages, we don't
discard anything, and issue a warning.  Not ideal, but at least we don't
corrupt guest memory as the previous version could.

Warning reporting is kind of a compromise here.  Determining whether we're
in a problematic state at realize() time is tricky, because we'd have to
look at the host pagesizes of all memory backends, but we can't really know
if some of those backends could be for special purpose memory that's not
subject to ballooning.

Reporting only when the guest tries to balloon a partial page also isn't
great because if the guest page size happens to line up it won't indicate
that we're in a non ideal situation.  It could also cause alarming repeated
warnings whenever a migration is attempted.

So, what we do is warn the first time the guest attempts balloon a partial
host page, whether or not it will end up ballooning the rest of the page
immediately afterwards.

[1] Because when the guest attempts to balloon a page, it will submit
    requests for each 4kiB subpage.  Most will be ignored, but the one
    which happens to be host page aligned will discard the whole lot.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Message-Id: <20190214043916.22128-6-david@gibson.dropbear.id.au>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
2019-02-22 10:51:31 -05:00

616 lines
18 KiB
C

/*
* Virtio Balloon Device
*
* Copyright IBM, Corp. 2008
* Copyright (C) 2011 Red Hat, Inc.
* Copyright (C) 2011 Amit Shah <amit.shah@redhat.com>
*
* Authors:
* Anthony Liguori <aliguori@us.ibm.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "qemu/iov.h"
#include "qemu/timer.h"
#include "qemu-common.h"
#include "hw/virtio/virtio.h"
#include "hw/mem/pc-dimm.h"
#include "sysemu/balloon.h"
#include "hw/virtio/virtio-balloon.h"
#include "exec/address-spaces.h"
#include "qapi/error.h"
#include "qapi/qapi-events-misc.h"
#include "qapi/visitor.h"
#include "trace.h"
#include "qemu/error-report.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT)
struct PartiallyBalloonedPage {
RAMBlock *rb;
ram_addr_t base;
unsigned long bitmap[];
};
static void balloon_inflate_page(VirtIOBalloon *balloon,
MemoryRegion *mr, hwaddr offset)
{
void *addr = memory_region_get_ram_ptr(mr) + offset;
RAMBlock *rb;
size_t rb_page_size;
int subpages;
ram_addr_t ram_offset, host_page_base;
/* XXX is there a better way to get to the RAMBlock than via a
* host address? */
rb = qemu_ram_block_from_host(addr, false, &ram_offset);
rb_page_size = qemu_ram_pagesize(rb);
host_page_base = ram_offset & ~(rb_page_size - 1);
if (rb_page_size == BALLOON_PAGE_SIZE) {
/* Easy case */
ram_block_discard_range(rb, ram_offset, rb_page_size);
/* We ignore errors from ram_block_discard_range(), because it
* has already reported them, and failing to discard a balloon
* page is not fatal */
return;
}
/* Hard case
*
* We've put a piece of a larger host page into the balloon - we
* need to keep track until we have a whole host page to
* discard
*/
warn_report_once(
"Balloon used with backing page size > 4kiB, this may not be reliable");
subpages = rb_page_size / BALLOON_PAGE_SIZE;
if (balloon->pbp
&& (rb != balloon->pbp->rb
|| host_page_base != balloon->pbp->base)) {
/* We've partially ballooned part of a host page, but now
* we're trying to balloon part of a different one. Too hard,
* give up on the old partial page */
free(balloon->pbp);
balloon->pbp = NULL;
}
if (!balloon->pbp) {
/* Starting on a new host page */
size_t bitlen = BITS_TO_LONGS(subpages) * sizeof(unsigned long);
balloon->pbp = g_malloc0(sizeof(PartiallyBalloonedPage) + bitlen);
balloon->pbp->rb = rb;
balloon->pbp->base = host_page_base;
}
bitmap_set(balloon->pbp->bitmap,
(ram_offset - balloon->pbp->base) / BALLOON_PAGE_SIZE,
subpages);
if (bitmap_full(balloon->pbp->bitmap, subpages)) {
/* We've accumulated a full host page, we can actually discard
* it now */
ram_block_discard_range(rb, balloon->pbp->base, rb_page_size);
/* We ignore errors from ram_block_discard_range(), because it
* has already reported them, and failing to discard a balloon
* page is not fatal */
free(balloon->pbp);
balloon->pbp = NULL;
}
}
static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_SWAP_IN] = "stat-swap-in",
[VIRTIO_BALLOON_S_SWAP_OUT] = "stat-swap-out",
[VIRTIO_BALLOON_S_MAJFLT] = "stat-major-faults",
[VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
[VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
[VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
[VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
[VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
[VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc",
[VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail",
[VIRTIO_BALLOON_S_NR] = NULL
};
/*
* reset_stats - Mark all items in the stats array as unset
*
* This function needs to be called at device initialization and before
* updating to a set of newly-generated stats. This will ensure that no
* stale values stick around in case the guest reports a subset of the supported
* statistics.
*/
static inline void reset_stats(VirtIOBalloon *dev)
{
int i;
for (i = 0; i < VIRTIO_BALLOON_S_NR; dev->stats[i++] = -1);
}
static bool balloon_stats_supported(const VirtIOBalloon *s)
{
VirtIODevice *vdev = VIRTIO_DEVICE(s);
return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
}
static bool balloon_stats_enabled(const VirtIOBalloon *s)
{
return s->stats_poll_interval > 0;
}
static void balloon_stats_destroy_timer(VirtIOBalloon *s)
{
if (balloon_stats_enabled(s)) {
timer_del(s->stats_timer);
timer_free(s->stats_timer);
s->stats_timer = NULL;
s->stats_poll_interval = 0;
}
}
static void balloon_stats_change_timer(VirtIOBalloon *s, int64_t secs)
{
timer_mod(s->stats_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + secs * 1000);
}
static void balloon_stats_poll_cb(void *opaque)
{
VirtIOBalloon *s = opaque;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
/* re-schedule */
balloon_stats_change_timer(s, s->stats_poll_interval);
return;
}
virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset);
virtio_notify(vdev, s->svq);
g_free(s->stats_vq_elem);
s->stats_vq_elem = NULL;
}
static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name,
void *opaque, Error **errp)
{
Error *err = NULL;
VirtIOBalloon *s = opaque;
int i;
visit_start_struct(v, name, NULL, 0, &err);
if (err) {
goto out;
}
visit_type_int(v, "last-update", &s->stats_last_update, &err);
if (err) {
goto out_end;
}
visit_start_struct(v, "stats", NULL, 0, &err);
if (err) {
goto out_end;
}
for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
visit_type_uint64(v, balloon_stat_names[i], &s->stats[i], &err);
if (err) {
goto out_nested;
}
}
visit_check_struct(v, &err);
out_nested:
visit_end_struct(v, NULL);
if (!err) {
visit_check_struct(v, &err);
}
out_end:
visit_end_struct(v, NULL);
out:
error_propagate(errp, err);
}
static void balloon_stats_get_poll_interval(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
VirtIOBalloon *s = opaque;
visit_type_int(v, name, &s->stats_poll_interval, errp);
}
static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
const char *name, void *opaque,
Error **errp)
{
VirtIOBalloon *s = opaque;
Error *local_err = NULL;
int64_t value;
visit_type_int(v, name, &value, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
}
if (value < 0) {
error_setg(errp, "timer value must be greater than zero");
return;
}
if (value > UINT32_MAX) {
error_setg(errp, "timer value is too big");
return;
}
if (value == s->stats_poll_interval) {
return;
}
if (value == 0) {
/* timer=0 disables the timer */
balloon_stats_destroy_timer(s);
return;
}
if (balloon_stats_enabled(s)) {
/* timer interval change */
s->stats_poll_interval = value;
balloon_stats_change_timer(s, value);
return;
}
/* create a new timer */
g_assert(s->stats_timer == NULL);
s->stats_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, balloon_stats_poll_cb, s);
s->stats_poll_interval = value;
balloon_stats_change_timer(s, 0);
}
static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
VirtQueueElement *elem;
MemoryRegionSection section;
for (;;) {
size_t offset = 0;
uint32_t pfn;
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
return;
}
while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
hwaddr pa;
int p = virtio_ldl_p(vdev, &pfn);
pa = (hwaddr) p << VIRTIO_BALLOON_PFN_SHIFT;
offset += 4;
section = memory_region_find(get_system_memory(), pa,
BALLOON_PAGE_SIZE);
if (!section.mr) {
trace_virtio_balloon_bad_addr(pa);
continue;
}
if (!memory_region_is_ram(section.mr) ||
memory_region_is_rom(section.mr) ||
memory_region_is_romd(section.mr)) {
trace_virtio_balloon_bad_addr(pa);
memory_region_unref(section.mr);
continue;
}
trace_virtio_balloon_handle_output(memory_region_name(section.mr),
pa);
if (!qemu_balloon_is_inhibited() && vq != s->dvq) {
balloon_inflate_page(s, section.mr, section.offset_within_region);
}
memory_region_unref(section.mr);
}
virtqueue_push(vq, elem, offset);
virtio_notify(vdev, vq);
g_free(elem);
}
}
static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
VirtQueueElement *elem;
VirtIOBalloonStat stat;
size_t offset = 0;
qemu_timeval tv;
elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
if (!elem) {
goto out;
}
if (s->stats_vq_elem != NULL) {
/* This should never happen if the driver follows the spec. */
virtqueue_push(vq, s->stats_vq_elem, 0);
virtio_notify(vdev, vq);
g_free(s->stats_vq_elem);
}
s->stats_vq_elem = elem;
/* Initialize the stats to get rid of any stale values. This is only
* needed to handle the case where a guest supports fewer stats than it
* used to (ie. it has booted into an old kernel).
*/
reset_stats(s);
while (iov_to_buf(elem->out_sg, elem->out_num, offset, &stat, sizeof(stat))
== sizeof(stat)) {
uint16_t tag = virtio_tswap16(vdev, stat.tag);
uint64_t val = virtio_tswap64(vdev, stat.val);
offset += sizeof(stat);
if (tag < VIRTIO_BALLOON_S_NR)
s->stats[tag] = val;
}
s->stats_vq_offset = offset;
if (qemu_gettimeofday(&tv) < 0) {
warn_report("%s: failed to get time of day", __func__);
goto out;
}
s->stats_last_update = tv.tv_sec;
out:
if (balloon_stats_enabled(s)) {
balloon_stats_change_timer(s, s->stats_poll_interval);
}
}
static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
struct virtio_balloon_config config = {};
config.num_pages = cpu_to_le32(dev->num_pages);
config.actual = cpu_to_le32(dev->actual);
trace_virtio_balloon_get_config(config.num_pages, config.actual);
memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
}
static int build_dimm_list(Object *obj, void *opaque)
{
GSList **list = opaque;
if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
DeviceState *dev = DEVICE(obj);
if (dev->realized) { /* only realized DIMMs matter */
*list = g_slist_prepend(*list, dev);
}
}
object_child_foreach(obj, build_dimm_list, opaque);
return 0;
}
static ram_addr_t get_current_ram_size(void)
{
GSList *list = NULL, *item;
ram_addr_t size = ram_size;
build_dimm_list(qdev_get_machine(), &list);
for (item = list; item; item = g_slist_next(item)) {
Object *obj = OBJECT(item->data);
if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
&error_abort);
}
}
g_slist_free(list);
return size;
}
static void virtio_balloon_set_config(VirtIODevice *vdev,
const uint8_t *config_data)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
struct virtio_balloon_config config;
uint32_t oldactual = dev->actual;
ram_addr_t vm_ram_size = get_current_ram_size();
memcpy(&config, config_data, sizeof(struct virtio_balloon_config));
dev->actual = le32_to_cpu(config.actual);
if (dev->actual != oldactual) {
qapi_event_send_balloon_change(vm_ram_size -
((ram_addr_t) dev->actual << VIRTIO_BALLOON_PFN_SHIFT));
}
trace_virtio_balloon_set_config(dev->actual, oldactual);
}
static uint64_t virtio_balloon_get_features(VirtIODevice *vdev, uint64_t f,
Error **errp)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(vdev);
f |= dev->host_features;
virtio_add_feature(&f, VIRTIO_BALLOON_F_STATS_VQ);
return f;
}
static void virtio_balloon_stat(void *opaque, BalloonInfo *info)
{
VirtIOBalloon *dev = opaque;
info->actual = get_current_ram_size() - ((uint64_t) dev->actual <<
VIRTIO_BALLOON_PFN_SHIFT);
}
static void virtio_balloon_to_target(void *opaque, ram_addr_t target)
{
VirtIOBalloon *dev = VIRTIO_BALLOON(opaque);
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
ram_addr_t vm_ram_size = get_current_ram_size();
if (target > vm_ram_size) {
target = vm_ram_size;
}
if (target) {
dev->num_pages = (vm_ram_size - target) >> VIRTIO_BALLOON_PFN_SHIFT;
virtio_notify_config(vdev);
}
trace_virtio_balloon_to_target(target, dev->num_pages);
}
static int virtio_balloon_post_load_device(void *opaque, int version_id)
{
VirtIOBalloon *s = VIRTIO_BALLOON(opaque);
if (balloon_stats_enabled(s)) {
balloon_stats_change_timer(s, s->stats_poll_interval);
}
return 0;
}
static const VMStateDescription vmstate_virtio_balloon_device = {
.name = "virtio-balloon-device",
.version_id = 1,
.minimum_version_id = 1,
.post_load = virtio_balloon_post_load_device,
.fields = (VMStateField[]) {
VMSTATE_UINT32(num_pages, VirtIOBalloon),
VMSTATE_UINT32(actual, VirtIOBalloon),
VMSTATE_END_OF_LIST()
},
};
static void virtio_balloon_device_realize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOBalloon *s = VIRTIO_BALLOON(dev);
int ret;
virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
sizeof(struct virtio_balloon_config));
ret = qemu_add_balloon_handler(virtio_balloon_to_target,
virtio_balloon_stat, s);
if (ret < 0) {
error_setg(errp, "Only one balloon device is supported");
virtio_cleanup(vdev);
return;
}
s->ivq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
s->dvq = virtio_add_queue(vdev, 128, virtio_balloon_handle_output);
s->svq = virtio_add_queue(vdev, 128, virtio_balloon_receive_stats);
reset_stats(s);
}
static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIOBalloon *s = VIRTIO_BALLOON(dev);
balloon_stats_destroy_timer(s);
qemu_remove_balloon_handler(s);
virtio_cleanup(vdev);
}
static void virtio_balloon_device_reset(VirtIODevice *vdev)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
if (s->stats_vq_elem != NULL) {
virtqueue_unpop(s->svq, s->stats_vq_elem, 0);
g_free(s->stats_vq_elem);
s->stats_vq_elem = NULL;
}
}
static void virtio_balloon_set_status(VirtIODevice *vdev, uint8_t status)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
if (!s->stats_vq_elem && vdev->vm_running &&
(status & VIRTIO_CONFIG_S_DRIVER_OK) && virtqueue_rewind(s->svq, 1)) {
/* poll stats queue for the element we have discarded when the VM
* was stopped */
virtio_balloon_receive_stats(vdev, s->svq);
}
}
static void virtio_balloon_instance_init(Object *obj)
{
VirtIOBalloon *s = VIRTIO_BALLOON(obj);
object_property_add(obj, "guest-stats", "guest statistics",
balloon_stats_get_all, NULL, NULL, s, NULL);
object_property_add(obj, "guest-stats-polling-interval", "int",
balloon_stats_get_poll_interval,
balloon_stats_set_poll_interval,
NULL, s, NULL);
}
static const VMStateDescription vmstate_virtio_balloon = {
.name = "virtio-balloon",
.minimum_version_id = 1,
.version_id = 1,
.fields = (VMStateField[]) {
VMSTATE_VIRTIO_DEVICE,
VMSTATE_END_OF_LIST()
},
};
static Property virtio_balloon_properties[] = {
DEFINE_PROP_BIT("deflate-on-oom", VirtIOBalloon, host_features,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM, false),
DEFINE_PROP_END_OF_LIST(),
};
static void virtio_balloon_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
dc->props = virtio_balloon_properties;
dc->vmsd = &vmstate_virtio_balloon;
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
vdc->realize = virtio_balloon_device_realize;
vdc->unrealize = virtio_balloon_device_unrealize;
vdc->reset = virtio_balloon_device_reset;
vdc->get_config = virtio_balloon_get_config;
vdc->set_config = virtio_balloon_set_config;
vdc->get_features = virtio_balloon_get_features;
vdc->set_status = virtio_balloon_set_status;
vdc->vmsd = &vmstate_virtio_balloon_device;
}
static const TypeInfo virtio_balloon_info = {
.name = TYPE_VIRTIO_BALLOON,
.parent = TYPE_VIRTIO_DEVICE,
.instance_size = sizeof(VirtIOBalloon),
.instance_init = virtio_balloon_instance_init,
.class_init = virtio_balloon_class_init,
};
static void virtio_register_types(void)
{
type_register_static(&virtio_balloon_info);
}
type_init(virtio_register_types)