qemu-patch-raspberry4/util/qemu-coroutine.c
Roman Pen 528f449f59 coroutine-lock: do not touch coroutine after another one has been entered
Submission of requests on linux aio is a bit tricky and can lead to
requests completions on submission path:

44713c9e85 ("linux-aio: Handle io_submit() failure gracefully")
0ed93d84ed ("linux-aio: process completions from ioq_submit()")

That means that any coroutine which has been yielded in order to wait
for completion can be resumed from submission path and be eventually
terminated (freed).

The following use-after-free crash was observed when IO throttling
was enabled:

 Program received signal SIGSEGV, Segmentation fault.
 [Switching to Thread 0x7f5813dff700 (LWP 56417)]
 virtqueue_unmap_sg (elem=0x7f5804009a30, len=1, vq=<optimized out>) at virtio.c:252
 (gdb) bt
 #0  virtqueue_unmap_sg (elem=0x7f5804009a30, len=1, vq=<optimized out>) at virtio.c:252
                              ^^^^^^^^^^^^^^
                              remember the address

 #1  virtqueue_fill (vq=0x5598b20d21b0, elem=0x7f5804009a30, len=1, idx=0) at virtio.c:282
 #2  virtqueue_push (vq=0x5598b20d21b0, elem=elem@entry=0x7f5804009a30, len=<optimized out>) at virtio.c:308
 #3  virtio_blk_req_complete (req=req@entry=0x7f5804009a30, status=status@entry=0 '\000') at virtio-blk.c:61
 #4  virtio_blk_rw_complete (opaque=<optimized out>, ret=0) at virtio-blk.c:126
 #5  blk_aio_complete (acb=0x7f58040068d0) at block-backend.c:923
 #6  coroutine_trampoline (i0=<optimized out>, i1=<optimized out>) at coroutine-ucontext.c:78

 (gdb) p * elem
 $8 = {index = 77, out_num = 2, in_num = 1,
       in_addr = 0x7f5804009ad8, out_addr = 0x7f5804009ae0,
       in_sg = 0x0, out_sg = 0x7f5804009a50}
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
       'in_sg' and 'out_sg' are invalid.
       e.g. it is impossible that 'in_sg' is zero,
       instead its value must be equal to:

       (gdb) p/x 0x7f5804009ad8 + sizeof(elem->in_addr[0]) + 2 * sizeof(elem->out_addr[0])
       $26 = 0x7f5804009af0

Seems 'elem' was corrupted.  Meanwhile another thread raised an abort:

 Thread 12 (Thread 0x7f57f2ffd700 (LWP 56426)):
 #0  raise () from /lib/x86_64-linux-gnu/libc.so.6
 #1  abort () from /lib/x86_64-linux-gnu/libc.so.6
 #2  qemu_coroutine_enter (co=0x7f5804009af0) at qemu-coroutine.c:113
 #3  qemu_co_queue_run_restart (co=0x7f5804009a30) at qemu-coroutine-lock.c:60
 #4  qemu_coroutine_enter (co=0x7f5804009a30) at qemu-coroutine.c:119
                           ^^^^^^^^^^^^^^^^^^
                           WTF?? this is equal to elem from crashed thread

 #5  qemu_co_queue_run_restart (co=0x7f57e7f16ae0) at qemu-coroutine-lock.c:60
 #6  qemu_coroutine_enter (co=0x7f57e7f16ae0) at qemu-coroutine.c:119
 #7  qemu_co_queue_run_restart (co=0x7f5807e112a0) at qemu-coroutine-lock.c:60
 #8  qemu_coroutine_enter (co=0x7f5807e112a0) at qemu-coroutine.c:119
 #9  qemu_co_queue_run_restart (co=0x7f5807f17820) at qemu-coroutine-lock.c:60
 #10 qemu_coroutine_enter (co=0x7f5807f17820) at qemu-coroutine.c:119
 #11 qemu_co_queue_run_restart (co=0x7f57e7f18e10) at qemu-coroutine-lock.c:60
 #12 qemu_coroutine_enter (co=0x7f57e7f18e10) at qemu-coroutine.c:119
 #13 qemu_co_enter_next (queue=queue@entry=0x5598b1e742d0) at qemu-coroutine-lock.c:106
 #14 timer_cb (blk=0x5598b1e74280, is_write=<optimized out>) at throttle-groups.c:419

Crash can be explained by access of 'co' object from the loop inside
qemu_co_queue_run_restart():

  while ((next = QSIMPLEQ_FIRST(&co->co_queue_wakeup))) {
      QSIMPLEQ_REMOVE_HEAD(&co->co_queue_wakeup, co_queue_next);
                           ^^^^^^^^^^^^^^^^^^^^
                           on each iteration 'co' is accessed,
                           but 'co' can be already freed

      qemu_coroutine_enter(next);
  }

When 'next' coroutine is resumed (entered) it can in its turn resume
'co', and eventually free it.  That's why we see 'co' (which was freed)
has the same address as 'elem' from the first backtrace.

The fix is obvious: use temporary queue and do not touch coroutine after
first qemu_coroutine_enter() is invoked.

The issue is quite rare and happens every ~12 hours on very high IO
and CPU load (building linux kernel with -j512 inside guest) when IO
throttling is enabled.  With the fix applied guest is running ~35 hours
and is still alive so far.

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 20170601160847.23720-1-roman.penyaev@profitbricks.com
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Fam Zheng <famz@redhat.com>
Cc: Stefan Hajnoczi <stefanha@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: qemu-devel@nongnu.org
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
2017-06-07 14:39:00 +01:00

179 lines
4.6 KiB
C

/*
* QEMU coroutines
*
* Copyright IBM, Corp. 2011
*
* Authors:
* Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
* Kevin Wolf <kwolf@redhat.com>
*
* This work is licensed under the terms of the GNU LGPL, version 2 or later.
* See the COPYING.LIB file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "trace.h"
#include "qemu-common.h"
#include "qemu/thread.h"
#include "qemu/atomic.h"
#include "qemu/coroutine.h"
#include "qemu/coroutine_int.h"
#include "block/aio.h"
enum {
POOL_BATCH_SIZE = 64,
};
/** Free list to speed up creation */
static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
static unsigned int release_pool_size;
static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
static __thread unsigned int alloc_pool_size;
static __thread Notifier coroutine_pool_cleanup_notifier;
static void coroutine_pool_cleanup(Notifier *n, void *value)
{
Coroutine *co;
Coroutine *tmp;
QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
qemu_coroutine_delete(co);
}
}
Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
{
Coroutine *co = NULL;
if (CONFIG_COROUTINE_POOL) {
co = QSLIST_FIRST(&alloc_pool);
if (!co) {
if (release_pool_size > POOL_BATCH_SIZE) {
/* Slow path; a good place to register the destructor, too. */
if (!coroutine_pool_cleanup_notifier.notify) {
coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
}
/* This is not exact; there could be a little skew between
* release_pool_size and the actual size of release_pool. But
* it is just a heuristic, it does not need to be perfect.
*/
alloc_pool_size = atomic_xchg(&release_pool_size, 0);
QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
co = QSLIST_FIRST(&alloc_pool);
}
}
if (co) {
QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
alloc_pool_size--;
}
}
if (!co) {
co = qemu_coroutine_new();
}
co->entry = entry;
co->entry_arg = opaque;
QSIMPLEQ_INIT(&co->co_queue_wakeup);
return co;
}
static void coroutine_delete(Coroutine *co)
{
co->caller = NULL;
if (CONFIG_COROUTINE_POOL) {
if (release_pool_size < POOL_BATCH_SIZE * 2) {
QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
atomic_inc(&release_pool_size);
return;
}
if (alloc_pool_size < POOL_BATCH_SIZE) {
QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
alloc_pool_size++;
return;
}
}
qemu_coroutine_delete(co);
}
void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
{
Coroutine *self = qemu_coroutine_self();
CoroutineAction ret;
trace_qemu_aio_coroutine_enter(ctx, self, co, co->entry_arg);
if (co->caller) {
fprintf(stderr, "Co-routine re-entered recursively\n");
abort();
}
co->caller = self;
co->ctx = ctx;
/* Store co->ctx before anything that stores co. Matches
* barrier in aio_co_wake and qemu_co_mutex_wake.
*/
smp_wmb();
ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);
qemu_co_queue_run_restart(co);
/* Beware, if ret == COROUTINE_YIELD and qemu_co_queue_run_restart()
* has started any other coroutine, "co" might have been reentered
* and even freed by now! So be careful and do not touch it.
*/
switch (ret) {
case COROUTINE_YIELD:
return;
case COROUTINE_TERMINATE:
assert(!co->locks_held);
trace_qemu_coroutine_terminate(co);
coroutine_delete(co);
return;
default:
abort();
}
}
void qemu_coroutine_enter(Coroutine *co)
{
qemu_aio_coroutine_enter(qemu_get_current_aio_context(), co);
}
void qemu_coroutine_enter_if_inactive(Coroutine *co)
{
if (!qemu_coroutine_entered(co)) {
qemu_coroutine_enter(co);
}
}
void coroutine_fn qemu_coroutine_yield(void)
{
Coroutine *self = qemu_coroutine_self();
Coroutine *to = self->caller;
trace_qemu_coroutine_yield(self, to);
if (!to) {
fprintf(stderr, "Co-routine is yielding to no one\n");
abort();
}
self->caller = NULL;
qemu_coroutine_switch(self, to, COROUTINE_YIELD);
}
bool qemu_coroutine_entered(Coroutine *co)
{
return co->caller;
}