From e20e718cdea2527ce560a03c5d8cd248decb537a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 14 Oct 2016 10:45:45 +0200 Subject: [PATCH 01/30] checkpatch: tweak "struct should normally be const" warning Avoid triggering on typedef struct BlockJobDriver BlockJobDriver; or struct BlockJobDriver { Cc: John Snow Signed-off-by: Paolo Bonzini --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3afa19a766..1f1c9d3498 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2498,8 +2498,8 @@ sub process { VMStateDescription| VMStateInfo}x; if ($line !~ /\bconst\b/ && - $line =~ /\b($struct_ops)\b/) { - ERROR("struct $1 should normally be const\n" . + $line =~ /\b($struct_ops)\b.*=/) { + ERROR("initializer for struct $1 should normally be const\n" . $herecurr); } From 9bc9732faeff09828fe38c0ebe2401ee131a6fca Mon Sep 17 00:00:00 2001 From: Changlong Xie Date: Wed, 12 Oct 2016 18:18:28 +0800 Subject: [PATCH 02/30] nbd: Use CoQueue for free_sema instead of CoMutex NBD is using the CoMutex in a way that wasn't anticipated. For example, if there are N(N=26, MAX_NBD_REQUESTS=16) nbd write requests, so we will invoke nbd_client_co_pwritev N times. ---------------------------------------------------------------------------------------- time request Actions 1 1 in_flight=1, Coroutine=C1 2 2 in_flight=2, Coroutine=C2 ... 15 15 in_flight=15, Coroutine=C15 16 16 in_flight=16, Coroutine=C16, free_sema->holder=C16, mutex->locked=true 17 17 in_flight=16, Coroutine=C17, queue C17 into free_sema->queue 18 18 in_flight=16, Coroutine=C18, queue C18 into free_sema->queue ... 26 N in_flight=16, Coroutine=C26, queue C26 into free_sema->queue ---------------------------------------------------------------------------------------- Once nbd client recieves request No.16' reply, we will re-enter C16. It's ok, because it's equal to 'free_sema->holder'. ---------------------------------------------------------------------------------------- time request Actions 27 16 in_flight=15, Coroutine=C16, free_sema->holder=C16, mutex->locked=false ---------------------------------------------------------------------------------------- Then nbd_coroutine_end invokes qemu_co_mutex_unlock what will pop coroutines from free_sema->queue's head and enter C17. More free_sema->holder is C17 now. ---------------------------------------------------------------------------------------- time request Actions 28 17 in_flight=16, Coroutine=C17, free_sema->holder=C17, mutex->locked=true ---------------------------------------------------------------------------------------- In above scenario, we only recieves request No.16' reply. As time goes by, nbd client will almostly recieves replies from requests 1 to 15 rather than request 17 who owns C17. In this case, we will encounter assert "mutex->holder == self" failed since Kevin's commit 0e438cdc "coroutine: Let CoMutex remember who holds it". For example, if nbd client recieves request No.15' reply, qemu will stop unexpectedly: ---------------------------------------------------------------------------------------- time request Actions 29 15(most case) in_flight=15, Coroutine=C15, free_sema->holder=C17, mutex->locked=false ---------------------------------------------------------------------------------------- Per Paolo's suggestion "The simplest fix is to change it to CoQueue, which is like a condition variable", this patch replaces CoMutex with CoQueue. Cc: Wen Congyang Reported-by: zhanghailiang Suggested-by: Paolo Bonzini Signed-off-by: Changlong Xie Message-Id: <1476267508-19499-1-git-send-email-xiecl.fnst@cn.fujitsu.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 8 ++++---- block/nbd-client.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 2cf3237ef3..40b28ab00b 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -199,8 +199,8 @@ static void nbd_coroutine_start(NbdClientSession *s, { /* Poor man semaphore. The free_sema is locked when no other request * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight >= MAX_NBD_REQUESTS - 1) { - qemu_co_mutex_lock(&s->free_sema); + if (s->in_flight == MAX_NBD_REQUESTS) { + qemu_co_queue_wait(&s->free_sema); assert(s->in_flight < MAX_NBD_REQUESTS); } s->in_flight++; @@ -214,7 +214,7 @@ static void nbd_coroutine_end(NbdClientSession *s, int i = HANDLE_TO_INDEX(s, request->handle); s->recv_coroutine[i] = NULL; if (s->in_flight-- == MAX_NBD_REQUESTS) { - qemu_co_mutex_unlock(&s->free_sema); + qemu_co_queue_next(&s->free_sema); } } @@ -386,7 +386,7 @@ int nbd_client_init(BlockDriverState *bs, } qemu_co_mutex_init(&client->send_mutex); - qemu_co_mutex_init(&client->free_sema); + qemu_co_queue_init(&client->free_sema); client->sioc = sioc; object_ref(OBJECT(client->sioc)); diff --git a/block/nbd-client.h b/block/nbd-client.h index 044aca4530..307b8b1e99 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -24,7 +24,7 @@ typedef struct NbdClientSession { off_t size; CoMutex send_mutex; - CoMutex free_sema; + CoQueue free_sema; Coroutine *send_coroutine; int in_flight; From 397d30e9401d2da96dbdf0ce49805d6d4bb68833 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Oct 2016 18:31:02 +0200 Subject: [PATCH 03/30] qemu-error: remove dependency of stubs on monitor Leave the implementation of error_vprintf and error_vprintf_unless_qmp (the latter now trivially wrapped by error_printf_unless_qmp) to libqemustub.a and monitor.c. This has two advantages: it lets us remove the monitor_printf and monitor_vprintf stubs, and it lets tests provide a different implementation of the functions that uses g_test_message. Signed-off-by: Paolo Bonzini Message-Id: <1477326663-67817-2-git-send-email-pbonzini@redhat.com> --- include/qemu/error-report.h | 1 + monitor.c | 21 +++++++++++++++++++++ stubs/Makefile.objs | 2 +- stubs/error-printf.c | 13 +++++++++++++ stubs/mon-printf.c | 11 ----------- util/qemu-error.c | 26 +++----------------------- 6 files changed, 39 insertions(+), 35 deletions(-) create mode 100644 stubs/error-printf.c delete mode 100644 stubs/mon-printf.c diff --git a/include/qemu/error-report.h b/include/qemu/error-report.h index 499ec8b12a..3001865896 100644 --- a/include/qemu/error-report.h +++ b/include/qemu/error-report.h @@ -32,6 +32,7 @@ void loc_set_file(const char *fname, int lno); void error_vprintf(const char *fmt, va_list ap) GCC_FMT_ATTR(1, 0); void error_printf(const char *fmt, ...) GCC_FMT_ATTR(1, 2); +void error_vprintf_unless_qmp(const char *fmt, va_list ap) GCC_FMT_ATTR(1, 0); void error_printf_unless_qmp(const char *fmt, ...) GCC_FMT_ATTR(1, 2); void error_set_progname(const char *argv0); void error_vreport(const char *fmt, va_list ap) GCC_FMT_ATTR(1, 0); diff --git a/monitor.c b/monitor.c index 7b963ad1ad..0841d436b0 100644 --- a/monitor.c +++ b/monitor.c @@ -3955,6 +3955,27 @@ static void monitor_readline_flush(void *opaque) monitor_flush(opaque); } +/* + * Print to current monitor if we have one, else to stderr. + * TODO should return int, so callers can calculate width, but that + * requires surgery to monitor_vprintf(). Left for another day. + */ +void error_vprintf(const char *fmt, va_list ap) +{ + if (cur_mon && !monitor_cur_is_qmp()) { + monitor_vprintf(cur_mon, fmt, ap); + } else { + vfprintf(stderr, fmt, ap); + } +} + +void error_vprintf_unless_qmp(const char *fmt, va_list ap) +{ + if (cur_mon && !monitor_cur_is_qmp()) { + monitor_vprintf(cur_mon, fmt, ap); + } +} + static void __attribute__((constructor)) monitor_lock_init(void) { qemu_mutex_init(&monitor_lock); diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 7f236a7c1f..2b5bb74fce 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -9,6 +9,7 @@ stub-obj-y += clock-warp.o stub-obj-y += cpu-get-clock.o stub-obj-y += cpu-get-icount.o stub-obj-y += dump.o +stub-obj-y += error-printf.o stub-obj-y += fdset-add-fd.o stub-obj-y += fdset-find-fd.o stub-obj-y += fdset-get-fd.o @@ -23,7 +24,6 @@ stub-obj-y += is-daemonized.o stub-obj-y += machine-init-done.o stub-obj-y += migr-blocker.o stub-obj-y += mon-is-qmp.o -stub-obj-y += mon-printf.o stub-obj-y += monitor-init.o stub-obj-y += notify-event.o stub-obj-y += qtest.o diff --git a/stubs/error-printf.c b/stubs/error-printf.c new file mode 100644 index 0000000000..56379e648d --- /dev/null +++ b/stubs/error-printf.c @@ -0,0 +1,13 @@ +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/error-report.h" + +void error_vprintf(const char *fmt, va_list ap) +{ + vfprintf(stderr, fmt, ap); +} + +void error_vprintf_unless_qmp(const char *fmt, va_list ap) +{ + error_vprintf(fmt, ap); +} diff --git a/stubs/mon-printf.c b/stubs/mon-printf.c deleted file mode 100644 index e7c1e0cf74..0000000000 --- a/stubs/mon-printf.c +++ /dev/null @@ -1,11 +0,0 @@ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "monitor/monitor.h" - -void monitor_printf(Monitor *mon, const char *fmt, ...) -{ -} - -void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap) -{ -} diff --git a/util/qemu-error.c b/util/qemu-error.c index 1ef35664af..b331f8f4a4 100644 --- a/util/qemu-error.c +++ b/util/qemu-error.c @@ -14,24 +14,6 @@ #include "monitor/monitor.h" #include "qemu/error-report.h" -/* - * Print to current monitor if we have one, else to stderr. - * TODO should return int, so callers can calculate width, but that - * requires surgery to monitor_vprintf(). Left for another day. - */ -void error_vprintf(const char *fmt, va_list ap) -{ - if (cur_mon && !monitor_cur_is_qmp()) { - monitor_vprintf(cur_mon, fmt, ap); - } else { - vfprintf(stderr, fmt, ap); - } -} - -/* - * Print to current monitor if we have one, else to stderr. - * TODO just like error_vprintf() - */ void error_printf(const char *fmt, ...) { va_list ap; @@ -45,11 +27,9 @@ void error_printf_unless_qmp(const char *fmt, ...) { va_list ap; - if (!monitor_cur_is_qmp()) { - va_start(ap, fmt); - error_vprintf(fmt, ap); - va_end(ap); - } + va_start(ap, fmt); + error_vprintf_unless_qmp(fmt, ap); + va_end(ap); } static Location std_loc = { From 28017e010ddf6849cfa830e898da3e44e6610952 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 24 Oct 2016 18:31:03 +0200 Subject: [PATCH 04/30] tests: send error_report to test log Implement error_vprintf to send the output of error_report to the test log. This silences test-vmstate. Signed-off-by: Paolo Bonzini Message-Id: <1477326663-67817-3-git-send-email-pbonzini@redhat.com> --- include/glib-compat.h | 13 +++++++++++++ stubs/error-printf.c | 8 +++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/glib-compat.h b/include/glib-compat.h index 3f8370b3e4..acf254d2a0 100644 --- a/include/glib-compat.h +++ b/include/glib-compat.h @@ -315,4 +315,17 @@ static inline void g_source_set_name_by_id(guint tag, const char *name) } #endif +#if !GLIB_CHECK_VERSION(2, 36, 0) +/* Always fail. This will not include error_report output in the test log, + * sending it instead to stderr. + */ +#define g_test_initialized() (0) +#endif +#if !GLIB_CHECK_VERSION(2, 38, 0) +#ifdef CONFIG_HAS_GLIB_SUBPROCESS_TESTS +#error schizophrenic detection of glib subprocess testing +#endif +#define g_test_subprocess() (0) +#endif + #endif diff --git a/stubs/error-printf.c b/stubs/error-printf.c index 56379e648d..ac6b92aa69 100644 --- a/stubs/error-printf.c +++ b/stubs/error-printf.c @@ -4,7 +4,13 @@ void error_vprintf(const char *fmt, va_list ap) { - vfprintf(stderr, fmt, ap); + if (g_test_initialized() && !g_test_subprocess()) { + char *msg = g_strdup_vprintf(fmt, ap); + g_test_message("%s", msg); + g_free(msg); + } else { + vfprintf(stderr, fmt, ap); + } } void error_vprintf_unless_qmp(const char *fmt, va_list ap) From f35e44e7645edbb08e35b111c10c2fc57e2905c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= Date: Fri, 21 Oct 2016 16:34:18 +0100 Subject: [PATCH 05/30] exec.c: ensure all AddressSpaceDispatch updates under RCU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memory_dispatch field is meant to be protected by RCU so we should use the correct primitives when accessing it. This race was flagged up by the ThreadSanitizer. Signed-off-by: Alex Bennée Message-Id: <20161021153418.21571-1-alex.bennee@linaro.org> Signed-off-by: Paolo Bonzini --- exec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exec.c b/exec.c index 4d085812ca..a19ed21bff 100644 --- a/exec.c +++ b/exec.c @@ -493,7 +493,7 @@ address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, hwaddr *xlat, hwaddr *plen) { MemoryRegionSection *section; - AddressSpaceDispatch *d = cpu->cpu_ases[asidx].memory_dispatch; + AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch); section = address_space_translate_internal(d, addr, xlat, plen, false); @@ -2376,7 +2376,7 @@ static void tcg_commit(MemoryListener *listener) * may have split the RCU critical section. */ d = atomic_rcu_read(&cpuas->as->dispatch); - cpuas->memory_dispatch = d; + atomic_rcu_set(&cpuas->memory_dispatch, d); tlb_flush(cpuas->cpu, 1); } From d6af99c9f8d577927a52d00af3fa8e6ec0b2a4e2 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Thu, 27 Oct 2016 12:22:58 +0800 Subject: [PATCH 06/30] exec.c: do not truncate non-empty memory backend file For '-object memory-backend-file,mem-path=foo,size=xyz', if the size of file 'foo' does not match the given size 'xyz', the current QEMU will truncate the file to the given size, which may corrupt the existing data in that file. To avoid such data corruption, this patch disables truncating non-empty backend files. Signed-off-by: Haozhong Zhang Message-Id: <20161027042300.5929-2-haozhong.zhang@intel.com> Signed-off-by: Paolo Bonzini --- exec.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/exec.c b/exec.c index a19ed21bff..f471e7377a 100644 --- a/exec.c +++ b/exec.c @@ -1229,6 +1229,15 @@ void qemu_mutex_unlock_ramlist(void) } #ifdef __linux__ +static int64_t get_file_size(int fd) +{ + int64_t size = lseek(fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; +} + static void *file_ram_alloc(RAMBlock *block, ram_addr_t memory, const char *path, @@ -1240,6 +1249,7 @@ static void *file_ram_alloc(RAMBlock *block, char *c; void *area = MAP_FAILED; int fd = -1; + int64_t file_size; if (kvm_enabled() && !kvm_has_sync_mmu()) { error_setg(errp, @@ -1302,6 +1312,8 @@ static void *file_ram_alloc(RAMBlock *block, } #endif + file_size = get_file_size(fd); + if (memory < block->page_size) { error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " "or larger than page size 0x%zx", @@ -1316,8 +1328,16 @@ static void *file_ram_alloc(RAMBlock *block, * hosts, so don't bother bailing out on errors. * If anything goes wrong with it under other filesystems, * mmap will fail. + * + * Do not truncate the non-empty backend file to avoid corrupting + * the existing data in the file. Disabling shrinking is not + * enough. For example, the current vNVDIMM implementation stores + * the guest NVDIMM labels at the end of the backend file. If the + * backend file is later extended, QEMU will not be able to find + * those labels. Therefore, extending the non-empty backend file + * is disabled as well. */ - if (ftruncate(fd, memory)) { + if (!file_size && ftruncate(fd, memory)) { perror("ftruncate"); } From 1775f111eaf7f49efcec30152db44a184c1e2222 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 2 Nov 2016 09:05:51 +0800 Subject: [PATCH 07/30] exec.c: check memory backend file size with 'size' option If the memory backend file is not large enough to hold the required 'size', Qemu will report error and exit. Signed-off-by: Haozhong Zhang Message-Id: <20161027042300.5929-3-haozhong.zhang@intel.com> Reviewed-by: Eduardo Habkost Message-Id: <20161102010551.2723-1-haozhong.zhang@intel.com> Signed-off-by: Paolo Bonzini --- exec.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/exec.c b/exec.c index f471e7377a..f3c2770d54 100644 --- a/exec.c +++ b/exec.c @@ -1321,6 +1321,13 @@ static void *file_ram_alloc(RAMBlock *block, goto error; } + if (file_size > 0 && file_size < memory) { + error_setg(errp, "backing store %s size 0x%" PRIx64 + " does not match 'size' option 0x" RAM_ADDR_FMT, + path, file_size, memory); + goto error; + } + memory = ROUND_UP(memory, block->page_size); /* From b1a75b3348010820cc324943f09e090ea1fc524f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:03 -0500 Subject: [PATCH 08/30] nbd: Add qemu-nbd -D for human-readable description The NBD protocol allows servers to advertise a human-readable description alongside an export name during NBD_OPT_LIST. Add an option to pass through the user's string to the NBD client. Doing this also makes it easier to test commit 200650d4, which is the client counterpart of receiving the description. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-2-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- include/block/nbd.h | 1 + nbd/nbd-internal.h | 5 +++-- nbd/server.c | 34 ++++++++++++++++++++++++++-------- qemu-nbd.c | 12 +++++++++++- qemu-nbd.texi | 5 ++++- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/include/block/nbd.h b/include/block/nbd.h index 80610ff31b..fd58390d5d 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -115,6 +115,7 @@ BlockBackend *nbd_export_get_blockdev(NBDExport *exp); NBDExport *nbd_export_find(const char *name); void nbd_export_set_name(NBDExport *exp, const char *name); +void nbd_export_set_description(NBDExport *exp, const char *description); void nbd_export_close_all(void); void nbd_client_new(NBDExport *exp, diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index 93a6ca8549..7e78064021 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -104,9 +104,10 @@ static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) return nbd_wr_syncv(ioc, &iov, 1, size, true); } -static inline ssize_t write_sync(QIOChannel *ioc, void *buffer, size_t size) +static inline ssize_t write_sync(QIOChannel *ioc, const void *buffer, + size_t size) { - struct iovec iov = { .iov_base = buffer, .iov_len = size }; + struct iovec iov = { .iov_base = (void *) buffer, .iov_len = size }; return nbd_wr_syncv(ioc, &iov, 1, size, false); } diff --git a/nbd/server.c b/nbd/server.c index 36bcafcd50..ac42391b45 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -61,6 +61,7 @@ struct NBDExport { BlockBackend *blk; char *name; + char *description; off_t dev_offset; off_t size; uint16_t nbdflags; @@ -129,7 +130,8 @@ static ssize_t nbd_negotiate_read(QIOChannel *ioc, void *buffer, size_t size) } -static ssize_t nbd_negotiate_write(QIOChannel *ioc, void *buffer, size_t size) +static ssize_t nbd_negotiate_write(QIOChannel *ioc, const void *buffer, + size_t size) { ssize_t ret; guint watch; @@ -225,11 +227,15 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) { - uint64_t magic, name_len; + uint64_t magic; + size_t name_len, desc_len; uint32_t opt, type, len; + const char *name = exp->name ? exp->name : ""; + const char *desc = exp->description ? exp->description : ""; - TRACE("Advertising export name '%s'", exp->name ? exp->name : ""); - name_len = strlen(exp->name); + TRACE("Advertising export name '%s' description '%s'", name, desc); + name_len = strlen(name); + desc_len = strlen(desc); magic = cpu_to_be64(NBD_REP_MAGIC); if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) { LOG("write failed (magic)"); @@ -245,18 +251,22 @@ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) LOG("write failed (reply type)"); return -EINVAL; } - len = cpu_to_be32(name_len + sizeof(len)); + len = cpu_to_be32(name_len + desc_len + sizeof(len)); if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { LOG("write failed (length)"); return -EINVAL; } len = cpu_to_be32(name_len); if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { - LOG("write failed (length)"); + LOG("write failed (name length)"); return -EINVAL; } - if (nbd_negotiate_write(ioc, exp->name, name_len) != name_len) { - LOG("write failed (buffer)"); + if (nbd_negotiate_write(ioc, name, name_len) != name_len) { + LOG("write failed (name buffer)"); + return -EINVAL; + } + if (nbd_negotiate_write(ioc, desc, desc_len) != desc_len) { + LOG("write failed (description buffer)"); return -EINVAL; } return 0; @@ -894,6 +904,12 @@ void nbd_export_set_name(NBDExport *exp, const char *name) nbd_export_put(exp); } +void nbd_export_set_description(NBDExport *exp, const char *description) +{ + g_free(exp->description); + exp->description = g_strdup(description); +} + void nbd_export_close(NBDExport *exp) { NBDClient *client, *next; @@ -903,6 +919,7 @@ void nbd_export_close(NBDExport *exp) client_close(client); } nbd_export_set_name(exp, NULL); + nbd_export_set_description(exp, NULL); nbd_export_put(exp); } @@ -921,6 +938,7 @@ void nbd_export_put(NBDExport *exp) if (--exp->refcount == 0) { assert(exp->name == NULL); + assert(exp->description == NULL); if (exp->close) { exp->close(exp); diff --git a/qemu-nbd.c b/qemu-nbd.c index b757dc7621..c734f627b4 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -83,6 +83,7 @@ static void usage(const char *name) " -t, --persistent don't exit on the last connection\n" " -v, --verbose display extra debugging information\n" " -x, --export-name=NAME expose export by name\n" +" -D, --description=TEXT with -x, also export a human-readable description\n" "\n" "Exposing part of the image:\n" " -o, --offset=OFFSET offset into the image\n" @@ -477,7 +478,7 @@ int main(int argc, char **argv) off_t fd_size; QemuOpts *sn_opts = NULL; const char *sn_id_or_name = NULL; - const char *sopt = "hVb:o:p:rsnP:c:dvk:e:f:tl:x:T:"; + const char *sopt = "hVb:o:p:rsnP:c:dvk:e:f:tl:x:T:D:"; struct option lopt[] = { { "help", no_argument, NULL, 'h' }, { "version", no_argument, NULL, 'V' }, @@ -503,6 +504,7 @@ int main(int argc, char **argv) { "verbose", no_argument, NULL, 'v' }, { "object", required_argument, NULL, QEMU_NBD_OPT_OBJECT }, { "export-name", required_argument, NULL, 'x' }, + { "description", required_argument, NULL, 'D' }, { "tls-creds", required_argument, NULL, QEMU_NBD_OPT_TLSCREDS }, { "image-opts", no_argument, NULL, QEMU_NBD_OPT_IMAGE_OPTS }, { "trace", required_argument, NULL, 'T' }, @@ -524,6 +526,7 @@ int main(int argc, char **argv) BlockdevDetectZeroesOptions detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF; QDict *options = NULL; const char *export_name = NULL; + const char *export_description = NULL; const char *tlscredsid = NULL; bool imageOpts = false; bool writethrough = true; @@ -689,6 +692,9 @@ int main(int argc, char **argv) case 'x': export_name = optarg; break; + case 'D': + export_description = optarg; + break; case 'v': verbose = 1; break; @@ -937,7 +943,11 @@ int main(int argc, char **argv) } if (export_name) { nbd_export_set_name(exp, export_name); + nbd_export_set_description(exp, export_description); newproto = true; + } else if (export_description) { + error_report("Export description requires an export name"); + exit(EXIT_FAILURE); } server_ioc = qio_channel_socket_new(); diff --git a/qemu-nbd.texi b/qemu-nbd.texi index b7a9c6d02f..9a84e81eed 100644 --- a/qemu-nbd.texi +++ b/qemu-nbd.texi @@ -79,9 +79,12 @@ Disconnect the device @var{dev} Allow up to @var{num} clients to share the device (default @samp{1}) @item -t, --persistent Don't exit on the last connection -@item -x NAME, --export-name=NAME +@item -x, --export-name=@var{name} Set the NBD volume export name. This switches the server to use the new style NBD protocol negotiation +@item -D, --description=@var{description} +Set the NBD volume export description, as a human-readable +string. Requires the use of @option{-x} @item --tls-creds=ID Enable mandatory TLS encryption for the server by setting the ID of the TLS credentials object previously created with the --object From b626b51a6721e53817155af720243f59072e424f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:04 -0500 Subject: [PATCH 09/30] nbd: Treat flags vs. command type as separate fields Current upstream NBD documents that requests have a 16-bit flags, followed by a 16-bit type integer; although older versions mentioned only a 32-bit field with masking to find flags. Since the protocol is in network order (big-endian over the wire), the ABI is unchanged; but dealing with the flags as a separate field rather than masking will make it easier to add support for upcoming NBD extensions that increase the number of both flags and commands. Improve some comments in nbd.h based on the current upstream NBD protocol (https://github.com/yoe/nbd/blob/master/doc/proto.md), and touch some nearby code to keep checkpatch.pl happy. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-3-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 9 +++------ include/block/nbd.h | 18 ++++++++++++------ nbd/client.c | 9 ++++++--- nbd/nbd-internal.h | 4 ++-- nbd/server.c | 39 +++++++++++++++++++-------------------- 5 files changed, 42 insertions(+), 37 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 40b28ab00b..8ca503015c 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -1,6 +1,7 @@ /* * QEMU Block driver for NBD * + * Copyright (C) 2016 Red Hat, Inc. * Copyright (C) 2008 Bull S.A.S. * Author: Laurent Vivier * @@ -258,7 +259,7 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, if (flags & BDRV_REQ_FUA) { assert(client->nbdflags & NBD_FLAG_SEND_FUA); - request.type |= NBD_CMD_FLAG_FUA; + request.flags |= NBD_CMD_FLAG_FUA; } assert(bytes <= NBD_MAX_BUFFER_SIZE); @@ -343,11 +344,7 @@ void nbd_client_attach_aio_context(BlockDriverState *bs, void nbd_client_close(BlockDriverState *bs) { NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { - .type = NBD_CMD_DISC, - .from = 0, - .len = 0 - }; + struct nbd_request request = { .type = NBD_CMD_DISC }; if (client->ioc == NULL) { return; diff --git a/include/block/nbd.h b/include/block/nbd.h index fd58390d5d..5fe2670fdb 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -1,4 +1,5 @@ /* + * Copyright (C) 2016 Red Hat, Inc. * Copyright (C) 2005 Anthony Liguori * * Network Block Device @@ -32,7 +33,8 @@ struct nbd_request { uint64_t handle; uint64_t from; uint32_t len; - uint32_t type; + uint16_t flags; + uint16_t type; }; struct nbd_reply { @@ -40,6 +42,8 @@ struct nbd_reply { uint32_t error; }; +/* Transmission (export) flags: sent from server to client during handshake, + but describe what will happen during transmission */ #define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */ #define NBD_FLAG_READ_ONLY (1 << 1) /* Device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* Send FLUSH */ @@ -47,10 +51,12 @@ struct nbd_reply { #define NBD_FLAG_ROTATIONAL (1 << 4) /* Use elevator algorithm - rotational media */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* Send TRIM (discard) */ -/* New-style global flags. */ +/* New-style handshake (global) flags, sent from server to client, and + control what will happen during handshake phase. */ #define NBD_FLAG_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */ -/* New-style client flags. */ +/* New-style client flags, sent from client to server to control what happens + during handshake phase. */ #define NBD_FLAG_C_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */ /* Reply types. */ @@ -61,10 +67,10 @@ struct nbd_reply { #define NBD_REP_ERR_INVALID ((UINT32_C(1) << 31) | 3) /* Invalid length. */ #define NBD_REP_ERR_TLS_REQD ((UINT32_C(1) << 31) | 5) /* TLS required */ +/* Request flags, sent from client to server during transmission phase */ +#define NBD_CMD_FLAG_FUA (1 << 0) -#define NBD_CMD_MASK_COMMAND 0x0000ffff -#define NBD_CMD_FLAG_FUA (1 << 16) - +/* Supported request types */ enum { NBD_CMD_READ = 0, NBD_CMD_WRITE = 1, diff --git a/nbd/client.c b/nbd/client.c index f6db8369b3..4620e8dcba 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -1,4 +1,5 @@ /* + * Copyright (C) 2016 Red Hat, Inc. * Copyright (C) 2005 Anthony Liguori * * Network Block Device Client Side @@ -714,11 +715,13 @@ ssize_t nbd_send_request(QIOChannel *ioc, struct nbd_request *request) TRACE("Sending request to server: " "{ .from = %" PRIu64", .len = %" PRIu32 ", .handle = %" PRIu64 - ", .type=%" PRIu32 " }", - request->from, request->len, request->handle, request->type); + ", .flags = %" PRIx16 ", .type = %" PRIu16 " }", + request->from, request->len, request->handle, + request->flags, request->type); stl_be_p(buf, NBD_REQUEST_MAGIC); - stl_be_p(buf + 4, request->type); + stw_be_p(buf + 4, request->flags); + stw_be_p(buf + 6, request->type); stq_be_p(buf + 8, request->handle); stq_be_p(buf + 16, request->from); stl_be_p(buf + 24, request->len); diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index 7e78064021..99e51571a2 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -53,10 +53,10 @@ /* This is all part of the "official" NBD API. * * The most up-to-date documentation is available at: - * https://github.com/yoe/nbd/blob/master/doc/proto.txt + * https://github.com/yoe/nbd/blob/master/doc/proto.md */ -#define NBD_REQUEST_SIZE (4 + 4 + 8 + 8 + 4) +#define NBD_REQUEST_SIZE (4 + 2 + 2 + 8 + 8 + 4) #define NBD_REPLY_SIZE (4 + 4 + 8) #define NBD_REQUEST_MAGIC 0x25609513 #define NBD_REPLY_MAGIC 0x67446698 diff --git a/nbd/server.c b/nbd/server.c index ac42391b45..38a0fef4d9 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -1,4 +1,5 @@ /* + * Copyright (C) 2016 Red Hat, Inc. * Copyright (C) 2005 Anthony Liguori * * Network Block Device Server Side @@ -652,21 +653,23 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, struct nbd_request *request) /* Request [ 0 .. 3] magic (NBD_REQUEST_MAGIC) - [ 4 .. 7] type (0 == READ, 1 == WRITE) + [ 4 .. 5] flags (NBD_CMD_FLAG_FUA, ...) + [ 6 .. 7] type (NBD_CMD_READ, ...) [ 8 .. 15] handle [16 .. 23] from [24 .. 27] len */ magic = ldl_be_p(buf); - request->type = ldl_be_p(buf + 4); + request->flags = lduw_be_p(buf + 4); + request->type = lduw_be_p(buf + 6); request->handle = ldq_be_p(buf + 8); request->from = ldq_be_p(buf + 16); request->len = ldl_be_p(buf + 24); - TRACE("Got request: { magic = 0x%" PRIx32 ", .type = %" PRIx32 - ", from = %" PRIu64 " , len = %" PRIu32 " }", - magic, request->type, request->from, request->len); + TRACE("Got request: { magic = 0x%" PRIx32 ", .flags = %" PRIx16 + ", .type = %" PRIx16 ", from = %" PRIu64 ", len = %" PRIu32 " }", + magic, request->flags, request->type, request->from, request->len); if (magic != NBD_REQUEST_MAGIC) { LOG("invalid magic (got 0x%" PRIx32 ")", magic); @@ -1013,7 +1016,6 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request) { NBDClient *client = req->client; - uint32_t command; ssize_t rc; g_assert(qemu_in_coroutine()); @@ -1030,13 +1032,12 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, TRACE("Decoding type"); - command = request->type & NBD_CMD_MASK_COMMAND; - if (command != NBD_CMD_WRITE) { + if (request->type != NBD_CMD_WRITE) { /* No payload, we are ready to read the next request. */ req->complete = true; } - if (command == NBD_CMD_DISC) { + if (request->type == NBD_CMD_DISC) { /* Special case: we're going to disconnect without a reply, * whether or not flags, from, or len are bogus */ TRACE("Request type is DISCONNECT"); @@ -1053,7 +1054,7 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, goto out; } - if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) { + if (request->type == NBD_CMD_READ || request->type == NBD_CMD_WRITE) { if (request->len > NBD_MAX_BUFFER_SIZE) { LOG("len (%" PRIu32" ) is larger than max len (%u)", request->len, NBD_MAX_BUFFER_SIZE); @@ -1067,7 +1068,7 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, goto out; } } - if (command == NBD_CMD_WRITE) { + if (request->type == NBD_CMD_WRITE) { TRACE("Reading %" PRIu32 " byte(s)", request->len); if (read_sync(client->ioc, req->data, request->len) != request->len) { @@ -1083,12 +1084,11 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, LOG("operation past EOF; From: %" PRIu64 ", Len: %" PRIu32 ", Size: %" PRIu64, request->from, request->len, (uint64_t)client->exp->size); - rc = command == NBD_CMD_WRITE ? -ENOSPC : -EINVAL; + rc = request->type == NBD_CMD_WRITE ? -ENOSPC : -EINVAL; goto out; } - if (request->type & ~NBD_CMD_MASK_COMMAND & ~NBD_CMD_FLAG_FUA) { - LOG("unsupported flags (got 0x%x)", - request->type & ~NBD_CMD_MASK_COMMAND); + if (request->flags & ~NBD_CMD_FLAG_FUA) { + LOG("unsupported flags (got 0x%x)", request->flags); rc = -EINVAL; goto out; } @@ -1110,7 +1110,6 @@ static void nbd_trip(void *opaque) struct nbd_request request; struct nbd_reply reply; ssize_t ret; - uint32_t command; int flags; TRACE("Reading request."); @@ -1134,7 +1133,6 @@ static void nbd_trip(void *opaque) reply.error = -ret; goto error_reply; } - command = request.type & NBD_CMD_MASK_COMMAND; if (client->closing) { /* @@ -1144,11 +1142,12 @@ static void nbd_trip(void *opaque) goto done; } - switch (command) { + switch (request.type) { case NBD_CMD_READ: TRACE("Request type is READ"); - if (request.type & NBD_CMD_FLAG_FUA) { + /* XXX: NBD Protocol only documents use of FUA with WRITE */ + if (request.flags & NBD_CMD_FLAG_FUA) { ret = blk_co_flush(exp->blk); if (ret < 0) { LOG("flush failed"); @@ -1181,7 +1180,7 @@ static void nbd_trip(void *opaque) TRACE("Writing to device"); flags = 0; - if (request.type & NBD_CMD_FLAG_FUA) { + if (request.flags & NBD_CMD_FLAG_FUA) { flags |= BDRV_REQ_FUA; } ret = blk_pwrite(exp->blk, request.from + exp->dev_offset, From 315f78abfce1f12bca259e8d8a41a42efbf970d1 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:05 -0500 Subject: [PATCH 10/30] nbd: Rename NBDRequest to NBDRequestData We have both 'struct NBDRequest' and 'struct nbd_request'; making it confusing to see which does what. Furthermore, we want to rename nbd_request to align with our normal CamelCase naming conventions. So, rename the struct which is used to associate the data received during request callbacks, while leaving the shorter name for the description of the request sent over the wire in the NBD protocol. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-4-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/server.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 38a0fef4d9..09d949f0c1 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -47,10 +47,10 @@ static int system_errno_to_nbd_errno(int err) /* Definitions for opaque data types */ -typedef struct NBDRequest NBDRequest; +typedef struct NBDRequestData NBDRequestData; -struct NBDRequest { - QSIMPLEQ_ENTRY(NBDRequest) entry; +struct NBDRequestData { + QSIMPLEQ_ENTRY(NBDRequestData) entry; NBDClient *client; uint8_t *data; bool complete; @@ -760,21 +760,21 @@ static void client_close(NBDClient *client) } } -static NBDRequest *nbd_request_get(NBDClient *client) +static NBDRequestData *nbd_request_get(NBDClient *client) { - NBDRequest *req; + NBDRequestData *req; assert(client->nb_requests <= MAX_NBD_REQUESTS - 1); client->nb_requests++; nbd_update_can_read(client); - req = g_new0(NBDRequest, 1); + req = g_new0(NBDRequestData, 1); nbd_client_get(client); req->client = client; return req; } -static void nbd_request_put(NBDRequest *req) +static void nbd_request_put(NBDRequestData *req) { NBDClient *client = req->client; @@ -976,7 +976,7 @@ void nbd_export_close_all(void) } } -static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply, +static ssize_t nbd_co_send_reply(NBDRequestData *req, struct nbd_reply *reply, int len) { NBDClient *client = req->client; @@ -1012,7 +1012,7 @@ static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply, * and any other negative value to report an error to the client * (although the caller may still need to disconnect after reporting * the error). */ -static ssize_t nbd_co_receive_request(NBDRequest *req, +static ssize_t nbd_co_receive_request(NBDRequestData *req, struct nbd_request *request) { NBDClient *client = req->client; @@ -1106,7 +1106,7 @@ static void nbd_trip(void *opaque) { NBDClient *client = opaque; NBDExport *exp = client->exp; - NBDRequest *req; + NBDRequestData *req; struct nbd_request request; struct nbd_reply reply; ssize_t ret; From 10676b81a9a69e9660fc203e3fe7d61a80ef1089 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:06 -0500 Subject: [PATCH 11/30] nbd: Rename NbdClientSession to NBDClientSession It's better to use consistent capitalization of the namespace used for NBD functions; we have more instances of NBD* than Nbd*. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-5-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 26 +++++++++++++------------- block/nbd-client.h | 6 +++--- block/nbd.c | 4 ++-- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index 8ca503015c..e6fe603ef3 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -33,7 +33,7 @@ #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) #define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) -static void nbd_recv_coroutines_enter_all(NbdClientSession *s) +static void nbd_recv_coroutines_enter_all(NBDClientSession *s) { int i; @@ -46,7 +46,7 @@ static void nbd_recv_coroutines_enter_all(NbdClientSession *s) static void nbd_teardown_connection(BlockDriverState *bs) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); if (!client->ioc) { /* Already closed */ return; @@ -68,7 +68,7 @@ static void nbd_teardown_connection(BlockDriverState *bs) static void nbd_reply_ready(void *opaque) { BlockDriverState *bs = opaque; - NbdClientSession *s = nbd_get_client_session(bs); + NBDClientSession *s = nbd_get_client_session(bs); uint64_t i; int ret; @@ -119,7 +119,7 @@ static int nbd_co_send_request(BlockDriverState *bs, struct nbd_request *request, QEMUIOVector *qiov) { - NbdClientSession *s = nbd_get_client_session(bs); + NBDClientSession *s = nbd_get_client_session(bs); AioContext *aio_context; int rc, ret, i; @@ -167,7 +167,7 @@ static int nbd_co_send_request(BlockDriverState *bs, return rc; } -static void nbd_co_receive_reply(NbdClientSession *s, +static void nbd_co_receive_reply(NBDClientSession *s, struct nbd_request *request, struct nbd_reply *reply, QEMUIOVector *qiov) @@ -195,7 +195,7 @@ static void nbd_co_receive_reply(NbdClientSession *s, } } -static void nbd_coroutine_start(NbdClientSession *s, +static void nbd_coroutine_start(NBDClientSession *s, struct nbd_request *request) { /* Poor man semaphore. The free_sema is locked when no other request @@ -209,7 +209,7 @@ static void nbd_coroutine_start(NbdClientSession *s, /* s->recv_coroutine[i] is set as soon as we get the send_lock. */ } -static void nbd_coroutine_end(NbdClientSession *s, +static void nbd_coroutine_end(NBDClientSession *s, struct nbd_request *request) { int i = HANDLE_TO_INDEX(s, request->handle); @@ -222,7 +222,7 @@ static void nbd_coroutine_end(NbdClientSession *s, int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_READ, .from = offset, @@ -248,7 +248,7 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_WRITE, .from = offset, @@ -277,7 +277,7 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, int nbd_client_co_flush(BlockDriverState *bs) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_FLUSH }; struct nbd_reply reply; ssize_t ret; @@ -302,7 +302,7 @@ int nbd_client_co_flush(BlockDriverState *bs) int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_TRIM, .from = offset, @@ -343,7 +343,7 @@ void nbd_client_attach_aio_context(BlockDriverState *bs, void nbd_client_close(BlockDriverState *bs) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_DISC }; if (client->ioc == NULL) { @@ -362,7 +362,7 @@ int nbd_client_init(BlockDriverState *bs, const char *hostname, Error **errp) { - NbdClientSession *client = nbd_get_client_session(bs); + NBDClientSession *client = nbd_get_client_session(bs); int ret; /* NBD handshake */ diff --git a/block/nbd-client.h b/block/nbd-client.h index 307b8b1e99..d86dd22e35 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -17,7 +17,7 @@ #define MAX_NBD_REQUESTS 16 -typedef struct NbdClientSession { +typedef struct NBDClientSession { QIOChannelSocket *sioc; /* The master data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ uint16_t nbdflags; @@ -32,9 +32,9 @@ typedef struct NbdClientSession { struct nbd_reply reply; bool is_unix; -} NbdClientSession; +} NBDClientSession; -NbdClientSession *nbd_get_client_session(BlockDriverState *bs); +NBDClientSession *nbd_get_client_session(BlockDriverState *bs); int nbd_client_init(BlockDriverState *bs, QIOChannelSocket *sock, diff --git a/block/nbd.c b/block/nbd.c index 6e837f80c9..b281484648 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -44,7 +44,7 @@ #define EN_OPTSTR ":exportname=" typedef struct BDRVNBDState { - NbdClientSession client; + NBDClientSession client; /* For nbd_refresh_filename() */ SocketAddress *saddr; @@ -294,7 +294,7 @@ done: return saddr; } -NbdClientSession *nbd_get_client_session(BlockDriverState *bs) +NBDClientSession *nbd_get_client_session(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; return &s->client; From ed2dd91267a813e4ac5619a2fb53f55c005fcfa6 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:07 -0500 Subject: [PATCH 12/30] nbd: Rename struct nbd_request and nbd_reply Our coding convention prefers CamelCase names, and we already have other existing structs with NBDFoo naming. Let's be consistent, before later patches add even more structs. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-6-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 28 ++++++++++++++-------------- block/nbd-client.h | 2 +- include/block/nbd.h | 10 ++++++---- nbd/client.c | 4 ++-- nbd/server.c | 12 ++++++------ 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/block/nbd-client.c b/block/nbd-client.c index e6fe603ef3..3e5f9f5b77 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -116,7 +116,7 @@ static void nbd_restart_write(void *opaque) } static int nbd_co_send_request(BlockDriverState *bs, - struct nbd_request *request, + NBDRequest *request, QEMUIOVector *qiov) { NBDClientSession *s = nbd_get_client_session(bs); @@ -168,8 +168,8 @@ static int nbd_co_send_request(BlockDriverState *bs, } static void nbd_co_receive_reply(NBDClientSession *s, - struct nbd_request *request, - struct nbd_reply *reply, + NBDRequest *request, + NBDReply *reply, QEMUIOVector *qiov) { int ret; @@ -196,7 +196,7 @@ static void nbd_co_receive_reply(NBDClientSession *s, } static void nbd_coroutine_start(NBDClientSession *s, - struct nbd_request *request) + NBDRequest *request) { /* Poor man semaphore. The free_sema is locked when no other request * can be accepted, and unlocked after receiving one reply. */ @@ -210,7 +210,7 @@ static void nbd_coroutine_start(NBDClientSession *s, } static void nbd_coroutine_end(NBDClientSession *s, - struct nbd_request *request) + NBDRequest *request) { int i = HANDLE_TO_INDEX(s, request->handle); s->recv_coroutine[i] = NULL; @@ -223,12 +223,12 @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { NBDClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { + NBDRequest request = { .type = NBD_CMD_READ, .from = offset, .len = bytes, }; - struct nbd_reply reply; + NBDReply reply; ssize_t ret; assert(bytes <= NBD_MAX_BUFFER_SIZE); @@ -249,12 +249,12 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags) { NBDClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { + NBDRequest request = { .type = NBD_CMD_WRITE, .from = offset, .len = bytes, }; - struct nbd_reply reply; + NBDReply reply; ssize_t ret; if (flags & BDRV_REQ_FUA) { @@ -278,8 +278,8 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, int nbd_client_co_flush(BlockDriverState *bs) { NBDClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_FLUSH }; - struct nbd_reply reply; + NBDRequest request = { .type = NBD_CMD_FLUSH }; + NBDReply reply; ssize_t ret; if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { @@ -303,12 +303,12 @@ int nbd_client_co_flush(BlockDriverState *bs) int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count) { NBDClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { + NBDRequest request = { .type = NBD_CMD_TRIM, .from = offset, .len = count, }; - struct nbd_reply reply; + NBDReply reply; ssize_t ret; if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { @@ -344,7 +344,7 @@ void nbd_client_attach_aio_context(BlockDriverState *bs, void nbd_client_close(BlockDriverState *bs) { NBDClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_DISC }; + NBDRequest request = { .type = NBD_CMD_DISC }; if (client->ioc == NULL) { return; diff --git a/block/nbd-client.h b/block/nbd-client.h index d86dd22e35..51be419405 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -29,7 +29,7 @@ typedef struct NBDClientSession { int in_flight; Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; - struct nbd_reply reply; + NBDReply reply; bool is_unix; } NBDClientSession; diff --git a/include/block/nbd.h b/include/block/nbd.h index 5fe2670fdb..a33581b6a9 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -29,18 +29,20 @@ /* Note: these are _NOT_ the same as the network representation of an NBD * request and reply! */ -struct nbd_request { +struct NBDRequest { uint64_t handle; uint64_t from; uint32_t len; uint16_t flags; uint16_t type; }; +typedef struct NBDRequest NBDRequest; -struct nbd_reply { +struct NBDReply { uint64_t handle; uint32_t error; }; +typedef struct NBDReply NBDReply; /* Transmission (export) flags: sent from server to client during handshake, but describe what will happen during transmission */ @@ -101,8 +103,8 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, QIOChannel **outioc, off_t *size, Error **errp); int nbd_init(int fd, QIOChannelSocket *sioc, uint16_t flags, off_t size); -ssize_t nbd_send_request(QIOChannel *ioc, struct nbd_request *request); -ssize_t nbd_receive_reply(QIOChannel *ioc, struct nbd_reply *reply); +ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request); +ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply); int nbd_client(int fd); int nbd_disconnect(int fd); diff --git a/nbd/client.c b/nbd/client.c index 4620e8dcba..fe24d31ec6 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -708,7 +708,7 @@ int nbd_disconnect(int fd) } #endif -ssize_t nbd_send_request(QIOChannel *ioc, struct nbd_request *request) +ssize_t nbd_send_request(QIOChannel *ioc, NBDRequest *request) { uint8_t buf[NBD_REQUEST_SIZE]; ssize_t ret; @@ -738,7 +738,7 @@ ssize_t nbd_send_request(QIOChannel *ioc, struct nbd_request *request) return 0; } -ssize_t nbd_receive_reply(QIOChannel *ioc, struct nbd_reply *reply) +ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) { uint8_t buf[NBD_REPLY_SIZE]; uint32_t magic; diff --git a/nbd/server.c b/nbd/server.c index 09d949f0c1..badc2d1fe6 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -635,7 +635,7 @@ fail: return rc; } -static ssize_t nbd_receive_request(QIOChannel *ioc, struct nbd_request *request) +static ssize_t nbd_receive_request(QIOChannel *ioc, NBDRequest *request) { uint8_t buf[NBD_REQUEST_SIZE]; uint32_t magic; @@ -678,7 +678,7 @@ static ssize_t nbd_receive_request(QIOChannel *ioc, struct nbd_request *request) return 0; } -static ssize_t nbd_send_reply(QIOChannel *ioc, struct nbd_reply *reply) +static ssize_t nbd_send_reply(QIOChannel *ioc, NBDReply *reply) { uint8_t buf[NBD_REPLY_SIZE]; ssize_t ret; @@ -976,7 +976,7 @@ void nbd_export_close_all(void) } } -static ssize_t nbd_co_send_reply(NBDRequestData *req, struct nbd_reply *reply, +static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply, int len) { NBDClient *client = req->client; @@ -1013,7 +1013,7 @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, struct nbd_reply *reply, * (although the caller may still need to disconnect after reporting * the error). */ static ssize_t nbd_co_receive_request(NBDRequestData *req, - struct nbd_request *request) + NBDRequest *request) { NBDClient *client = req->client; ssize_t rc; @@ -1107,8 +1107,8 @@ static void nbd_trip(void *opaque) NBDClient *client = opaque; NBDExport *exp = client->exp; NBDRequestData *req; - struct nbd_request request; - struct nbd_reply reply; + NBDRequest request; + NBDReply reply; ssize_t ret; int flags; From 526e5c6559928ed98332dd27f026e8cbc95f466a Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:08 -0500 Subject: [PATCH 13/30] nbd: Share common reply-sending code in server Rather than open-coding NBD_REP_SERVER, reuse the code we already have by adding a length parameter. Additionally, the refactoring will make adding NBD_OPT_GO in a later patch easier. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-7-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/server.c | 52 +++++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index badc2d1fe6..0f0c68c7b3 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -196,12 +196,15 @@ static ssize_t nbd_negotiate_drop_sync(QIOChannel *ioc, size_t size) */ -static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) +/* Send a reply header, including length, but no payload. + * Return -errno on error, 0 on success. */ +static int nbd_negotiate_send_rep_len(QIOChannel *ioc, uint32_t type, + uint32_t opt, uint32_t len) { uint64_t magic; - uint32_t len; - TRACE("Reply opt=%" PRIx32 " type=%" PRIx32, type, opt); + TRACE("Reply opt=%" PRIx32 " type=%" PRIx32 " len=%" PRIu32, + type, opt, len); magic = cpu_to_be64(NBD_REP_MAGIC); if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) { @@ -218,7 +221,7 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) LOG("write failed (rep type)"); return -EINVAL; } - len = cpu_to_be32(0); + len = cpu_to_be32(len); if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { LOG("write failed (rep data length)"); return -EINVAL; @@ -226,37 +229,32 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) return 0; } +/* Send a reply header with default 0 length. + * Return -errno on error, 0 on success. */ +static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) +{ + return nbd_negotiate_send_rep_len(ioc, type, opt, 0); +} + +/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload. + * Return -errno on error, 0 on success. */ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) { - uint64_t magic; size_t name_len, desc_len; - uint32_t opt, type, len; + uint32_t len; const char *name = exp->name ? exp->name : ""; const char *desc = exp->description ? exp->description : ""; + int rc; TRACE("Advertising export name '%s' description '%s'", name, desc); name_len = strlen(name); desc_len = strlen(desc); - magic = cpu_to_be64(NBD_REP_MAGIC); - if (nbd_negotiate_write(ioc, &magic, sizeof(magic)) != sizeof(magic)) { - LOG("write failed (magic)"); - return -EINVAL; - } - opt = cpu_to_be32(NBD_OPT_LIST); - if (nbd_negotiate_write(ioc, &opt, sizeof(opt)) != sizeof(opt)) { - LOG("write failed (opt)"); - return -EINVAL; - } - type = cpu_to_be32(NBD_REP_SERVER); - if (nbd_negotiate_write(ioc, &type, sizeof(type)) != sizeof(type)) { - LOG("write failed (reply type)"); - return -EINVAL; - } - len = cpu_to_be32(name_len + desc_len + sizeof(len)); - if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { - LOG("write failed (length)"); - return -EINVAL; + len = name_len + desc_len + sizeof(len); + rc = nbd_negotiate_send_rep_len(ioc, NBD_REP_SERVER, NBD_OPT_LIST, len); + if (rc < 0) { + return rc; } + len = cpu_to_be32(name_len); if (nbd_negotiate_write(ioc, &len, sizeof(len)) != sizeof(len)) { LOG("write failed (name length)"); @@ -273,6 +271,8 @@ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) return 0; } +/* Process the NBD_OPT_LIST command, with a potential series of replies. + * Return -errno on error, 0 on success. */ static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) { NBDExport *exp; @@ -382,6 +382,8 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, } +/* Process all NBD_OPT_* client option commands. + * Return -errno on error, 0 on success. */ static int nbd_negotiate_options(NBDClient *client) { uint32_t flags; From 3668328303429f3bc93ab3365c66331600b06a2d Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:09 -0500 Subject: [PATCH 14/30] nbd: Send message along with server NBD_REP_ERR errors The NBD Protocol allows us to send human-readable messages along with any NBD_REP_ERR error during option negotiation; make use of this fact for clients that know what to do with our message. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-8-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/server.c | 78 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 0f0c68c7b3..fa01e4902f 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -236,6 +236,38 @@ static int nbd_negotiate_send_rep(QIOChannel *ioc, uint32_t type, uint32_t opt) return nbd_negotiate_send_rep_len(ioc, type, opt, 0); } +/* Send an error reply. + * Return -errno on error, 0 on success. */ +static int GCC_FMT_ATTR(4, 5) +nbd_negotiate_send_rep_err(QIOChannel *ioc, uint32_t type, + uint32_t opt, const char *fmt, ...) +{ + va_list va; + char *msg; + int ret; + size_t len; + + va_start(va, fmt); + msg = g_strdup_vprintf(fmt, va); + va_end(va); + len = strlen(msg); + assert(len < 4096); + TRACE("sending error message \"%s\"", msg); + ret = nbd_negotiate_send_rep_len(ioc, type, opt, len); + if (ret < 0) { + goto out; + } + if (nbd_negotiate_write(ioc, msg, len) != len) { + LOG("write failed (error message)"); + ret = -EIO; + } else { + ret = 0; + } +out: + g_free(msg); + return ret; +} + /* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload. * Return -errno on error, 0 on success. */ static int nbd_negotiate_send_rep_list(QIOChannel *ioc, NBDExport *exp) @@ -281,8 +313,9 @@ static int nbd_negotiate_handle_list(NBDClient *client, uint32_t length) if (nbd_negotiate_drop_sync(client->ioc, length) != length) { return -EIO; } - return nbd_negotiate_send_rep(client->ioc, - NBD_REP_ERR_INVALID, NBD_OPT_LIST); + return nbd_negotiate_send_rep_err(client->ioc, + NBD_REP_ERR_INVALID, NBD_OPT_LIST, + "OPT_LIST should not have length"); } /* For each export, send a NBD_REP_SERVER reply. */ @@ -329,7 +362,8 @@ fail: return rc; } - +/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the + * new channel for all further (now-encrypted) communication. */ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, uint32_t length) { @@ -343,7 +377,8 @@ static QIOChannel *nbd_negotiate_handle_starttls(NBDClient *client, if (nbd_negotiate_drop_sync(ioc, length) != length) { return NULL; } - nbd_negotiate_send_rep(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS); + nbd_negotiate_send_rep_err(ioc, NBD_REP_ERR_INVALID, NBD_OPT_STARTTLS, + "OPT_STARTTLS should not have length"); return NULL; } @@ -474,13 +509,15 @@ static int nbd_negotiate_options(NBDClient *client) return -EINVAL; default: - TRACE("Option 0x%" PRIx32 " not permitted before TLS", - clientflags); if (nbd_negotiate_drop_sync(client->ioc, length) != length) { return -EIO; } - ret = nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_TLS_REQD, - clientflags); + ret = nbd_negotiate_send_rep_err(client->ioc, + NBD_REP_ERR_TLS_REQD, + clientflags, + "Option 0x%" PRIx32 + "not permitted before TLS", + clientflags); if (ret < 0) { return ret; } @@ -506,27 +543,30 @@ static int nbd_negotiate_options(NBDClient *client) return -EIO; } if (client->tlscreds) { - TRACE("TLS already enabled"); - ret = nbd_negotiate_send_rep(client->ioc, - NBD_REP_ERR_INVALID, - clientflags); + ret = nbd_negotiate_send_rep_err(client->ioc, + NBD_REP_ERR_INVALID, + clientflags, + "TLS already enabled"); } else { - TRACE("TLS not configured"); - ret = nbd_negotiate_send_rep(client->ioc, - NBD_REP_ERR_POLICY, - clientflags); + ret = nbd_negotiate_send_rep_err(client->ioc, + NBD_REP_ERR_POLICY, + clientflags, + "TLS not configured"); } if (ret < 0) { return ret; } break; default: - TRACE("Unsupported option 0x%" PRIx32, clientflags); if (nbd_negotiate_drop_sync(client->ioc, length) != length) { return -EIO; } - ret = nbd_negotiate_send_rep(client->ioc, NBD_REP_ERR_UNSUP, - clientflags); + ret = nbd_negotiate_send_rep_err(client->ioc, + NBD_REP_ERR_UNSUP, + clientflags, + "Unsupported option 0x%" + PRIx32, + clientflags); if (ret < 0) { return ret; } From c8a3a1b6c4034cf53cdbd953331c72f9573edb77 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:10 -0500 Subject: [PATCH 15/30] nbd: Share common option-sending code in client Rather than open-coding each option request, it's easier to have common helper functions do the work. That in turn requires having convenient packed types for handling option requests and replies. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-9-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- include/block/nbd.h | 25 ++++- nbd/client.c | 255 +++++++++++++++++++------------------------- nbd/nbd-internal.h | 2 +- 3 files changed, 131 insertions(+), 151 deletions(-) diff --git a/include/block/nbd.h b/include/block/nbd.h index a33581b6a9..b69bf1dc68 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -26,15 +26,34 @@ #include "io/channel-socket.h" #include "crypto/tlscreds.h" -/* Note: these are _NOT_ the same as the network representation of an NBD +/* Handshake phase structs - this struct is passed on the wire */ + +struct nbd_option { + uint64_t magic; /* NBD_OPTS_MAGIC */ + uint32_t option; /* NBD_OPT_* */ + uint32_t length; +} QEMU_PACKED; +typedef struct nbd_option nbd_option; + +struct nbd_opt_reply { + uint64_t magic; /* NBD_REP_MAGIC */ + uint32_t option; /* NBD_OPT_* */ + uint32_t type; /* NBD_REP_* */ + uint32_t length; +} QEMU_PACKED; +typedef struct nbd_opt_reply nbd_opt_reply; + +/* Transmission phase structs + * + * Note: these are _NOT_ the same as the network representation of an NBD * request and reply! */ struct NBDRequest { uint64_t handle; uint64_t from; uint32_t len; - uint16_t flags; - uint16_t type; + uint16_t flags; /* NBD_CMD_FLAG_* */ + uint16_t type; /* NBD_CMD_* */ }; typedef struct NBDRequest NBDRequest; diff --git a/nbd/client.c b/nbd/client.c index fe24d31ec6..e78000a28a 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -75,64 +75,128 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); */ +/* Send an option request. + * + * The request is for option @opt, with @data containing @len bytes of + * additional payload for the request (@len may be -1 to treat @data as + * a C string; and @data may be NULL if @len is 0). + * Return 0 if successful, -1 with errp set if it is impossible to + * continue. */ +static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, + uint32_t len, const char *data, + Error **errp) +{ + nbd_option req; + QEMU_BUILD_BUG_ON(sizeof(req) != 16); -/* If type represents success, return 1 without further action. - * If type represents an error reply, consume the rest of the packet on ioc. - * Then return 0 for unsupported (so the client can fall back to - * other approaches), or -1 with errp set for other errors. + if (len == -1) { + req.length = len = strlen(data); + } + TRACE("Sending option request %" PRIu32", len %" PRIu32, opt, len); + + stq_be_p(&req.magic, NBD_OPTS_MAGIC); + stl_be_p(&req.option, opt); + stl_be_p(&req.length, len); + + if (write_sync(ioc, &req, sizeof(req)) != sizeof(req)) { + error_setg(errp, "Failed to send option request header"); + return -1; + } + + if (len && write_sync(ioc, (char *) data, len) != len) { + error_setg(errp, "Failed to send option request data"); + return -1; + } + + return 0; +} + +/* Receive the header of an option reply, which should match the given + * opt. Read through the length field, but NOT the length bytes of + * payload. Return 0 if successful, -1 with errp set if it is + * impossible to continue. */ +static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, + nbd_opt_reply *reply, Error **errp) +{ + QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); + if (read_sync(ioc, reply, sizeof(*reply)) != sizeof(*reply)) { + error_setg(errp, "failed to read option reply"); + return -1; + } + be64_to_cpus(&reply->magic); + be32_to_cpus(&reply->option); + be32_to_cpus(&reply->type); + be32_to_cpus(&reply->length); + + TRACE("Received option reply %" PRIx32", type %" PRIx32", len %" PRIu32, + reply->option, reply->type, reply->length); + + if (reply->magic != NBD_REP_MAGIC) { + error_setg(errp, "Unexpected option reply magic"); + return -1; + } + if (reply->option != opt) { + error_setg(errp, "Unexpected option type %x expected %x", + reply->option, opt); + return -1; + } + return 0; +} + +/* If reply represents success, return 1 without further action. + * If reply represents an error, consume the optional payload of + * the packet on ioc. Then return 0 for unsupported (so the client + * can fall back to other approaches), or -1 with errp set for other + * errors. */ -static int nbd_handle_reply_err(QIOChannel *ioc, uint32_t opt, uint32_t type, +static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, Error **errp) { - uint32_t len; char *msg = NULL; int result = -1; - if (!(type & (1 << 31))) { + if (!(reply->type & (1 << 31))) { return 1; } - if (read_sync(ioc, &len, sizeof(len)) != sizeof(len)) { - error_setg(errp, "failed to read option length"); - return -1; - } - len = be32_to_cpu(len); - if (len) { - if (len > NBD_MAX_BUFFER_SIZE) { + if (reply->length) { + if (reply->length > NBD_MAX_BUFFER_SIZE) { error_setg(errp, "server's error message is too long"); goto cleanup; } - msg = g_malloc(len + 1); - if (read_sync(ioc, msg, len) != len) { + msg = g_malloc(reply->length + 1); + if (read_sync(ioc, msg, reply->length) != reply->length) { error_setg(errp, "failed to read option error message"); goto cleanup; } - msg[len] = '\0'; + msg[reply->length] = '\0'; } - switch (type) { + switch (reply->type) { case NBD_REP_ERR_UNSUP: TRACE("server doesn't understand request %" PRIx32 - ", attempting fallback", opt); + ", attempting fallback", reply->option); result = 0; goto cleanup; case NBD_REP_ERR_POLICY: - error_setg(errp, "Denied by server for option %" PRIx32, opt); + error_setg(errp, "Denied by server for option %" PRIx32, + reply->option); break; case NBD_REP_ERR_INVALID: - error_setg(errp, "Invalid data length for option %" PRIx32, opt); + error_setg(errp, "Invalid data length for option %" PRIx32, + reply->option); break; case NBD_REP_ERR_TLS_REQD: error_setg(errp, "TLS negotiation required before option %" PRIx32, - opt); + reply->option); break; default: error_setg(errp, "Unknown error code when asking for option %" PRIx32, - opt); + reply->option); break; } @@ -147,58 +211,29 @@ static int nbd_handle_reply_err(QIOChannel *ioc, uint32_t opt, uint32_t type, static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) { - uint64_t magic; - uint32_t opt; - uint32_t type; + nbd_opt_reply reply; uint32_t len; uint32_t namelen; int error; *name = NULL; - if (read_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { - error_setg(errp, "failed to read list option magic"); + if (nbd_receive_option_reply(ioc, NBD_OPT_LIST, &reply, errp) < 0) { return -1; } - magic = be64_to_cpu(magic); - if (magic != NBD_REP_MAGIC) { - error_setg(errp, "Unexpected option list magic"); - return -1; - } - if (read_sync(ioc, &opt, sizeof(opt)) != sizeof(opt)) { - error_setg(errp, "failed to read list option"); - return -1; - } - opt = be32_to_cpu(opt); - if (opt != NBD_OPT_LIST) { - error_setg(errp, "Unexpected option type %" PRIx32 " expected %x", - opt, NBD_OPT_LIST); - return -1; - } - - if (read_sync(ioc, &type, sizeof(type)) != sizeof(type)) { - error_setg(errp, "failed to read list option type"); - return -1; - } - type = be32_to_cpu(type); - error = nbd_handle_reply_err(ioc, opt, type, errp); + error = nbd_handle_reply_err(ioc, &reply, errp); if (error <= 0) { return error; } + len = reply.length; - if (read_sync(ioc, &len, sizeof(len)) != sizeof(len)) { - error_setg(errp, "failed to read option length"); - return -1; - } - len = be32_to_cpu(len); - - if (type == NBD_REP_ACK) { + if (reply.type == NBD_REP_ACK) { if (len != 0) { error_setg(errp, "length too long for option end"); return -1; } - } else if (type == NBD_REP_SERVER) { + } else if (reply.type == NBD_REP_SERVER) { if (len < sizeof(namelen) || len > NBD_MAX_BUFFER_SIZE) { - error_setg(errp, "incorrect option length"); + error_setg(errp, "incorrect option length %" PRIu32, len); return -1; } if (read_sync(ioc, &namelen, sizeof(namelen)) != sizeof(namelen)) { @@ -240,7 +275,7 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) } } else { error_setg(errp, "Unexpected reply type %" PRIx32 " expected %x", - type, NBD_REP_SERVER); + reply.type, NBD_REP_SERVER); return -1; } return 1; @@ -251,24 +286,10 @@ static int nbd_receive_query_exports(QIOChannel *ioc, const char *wantname, Error **errp) { - uint64_t magic = cpu_to_be64(NBD_OPTS_MAGIC); - uint32_t opt = cpu_to_be32(NBD_OPT_LIST); - uint32_t length = 0; bool foundExport = false; TRACE("Querying export list"); - if (write_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { - error_setg(errp, "Failed to send list option magic"); - return -1; - } - - if (write_sync(ioc, &opt, sizeof(opt)) != sizeof(opt)) { - error_setg(errp, "Failed to send list option number"); - return -1; - } - - if (write_sync(ioc, &length, sizeof(length)) != sizeof(length)) { - error_setg(errp, "Failed to send list option length"); + if (nbd_send_option_request(ioc, NBD_OPT_LIST, 0, NULL, errp) < 0) { return -1; } @@ -314,72 +335,29 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, const char *hostname, Error **errp) { - uint64_t magic = cpu_to_be64(NBD_OPTS_MAGIC); - uint32_t opt = cpu_to_be32(NBD_OPT_STARTTLS); - uint32_t length = 0; - uint32_t type; + nbd_opt_reply reply; QIOChannelTLS *tioc; struct NBDTLSHandshakeData data = { 0 }; TRACE("Requesting TLS from server"); - if (write_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { - error_setg(errp, "Failed to send option magic"); - return NULL; - } - - if (write_sync(ioc, &opt, sizeof(opt)) != sizeof(opt)) { - error_setg(errp, "Failed to send option number"); - return NULL; - } - - if (write_sync(ioc, &length, sizeof(length)) != sizeof(length)) { - error_setg(errp, "Failed to send option length"); - return NULL; - } - - TRACE("Getting TLS reply from server1"); - if (read_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { - error_setg(errp, "failed to read option magic"); - return NULL; - } - magic = be64_to_cpu(magic); - if (magic != NBD_REP_MAGIC) { - error_setg(errp, "Unexpected option magic"); - return NULL; - } - TRACE("Getting TLS reply from server2"); - if (read_sync(ioc, &opt, sizeof(opt)) != sizeof(opt)) { - error_setg(errp, "failed to read option"); - return NULL; - } - opt = be32_to_cpu(opt); - if (opt != NBD_OPT_STARTTLS) { - error_setg(errp, "Unexpected option type %" PRIx32 " expected %x", - opt, NBD_OPT_STARTTLS); + if (nbd_send_option_request(ioc, NBD_OPT_STARTTLS, 0, NULL, errp) < 0) { return NULL; } TRACE("Getting TLS reply from server"); - if (read_sync(ioc, &type, sizeof(type)) != sizeof(type)) { - error_setg(errp, "failed to read option type"); + if (nbd_receive_option_reply(ioc, NBD_OPT_STARTTLS, &reply, errp) < 0) { return NULL; } - type = be32_to_cpu(type); - if (type != NBD_REP_ACK) { + + if (reply.type != NBD_REP_ACK) { error_setg(errp, "Server rejected request to start TLS %" PRIx32, - type); + reply.type); return NULL; } - TRACE("Getting TLS reply from server"); - if (read_sync(ioc, &length, sizeof(length)) != sizeof(length)) { - error_setg(errp, "failed to read option length"); - return NULL; - } - length = be32_to_cpu(length); - if (length != 0) { + if (reply.length != 0) { error_setg(errp, "Start TLS response was not zero %" PRIu32, - length); + reply.length); return NULL; } @@ -467,8 +445,6 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, if (magic == NBD_OPTS_MAGIC) { uint32_t clientflags = 0; - uint32_t opt; - uint32_t namesize; uint16_t globalflags; bool fixedNewStyle = false; @@ -518,28 +494,13 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, goto fail; } } - /* write the export name */ - magic = cpu_to_be64(magic); - if (write_sync(ioc, &magic, sizeof(magic)) != sizeof(magic)) { - error_setg(errp, "Failed to send export name magic"); - goto fail; - } - opt = cpu_to_be32(NBD_OPT_EXPORT_NAME); - if (write_sync(ioc, &opt, sizeof(opt)) != sizeof(opt)) { - error_setg(errp, "Failed to send export name option number"); - goto fail; - } - namesize = cpu_to_be32(strlen(name)); - if (write_sync(ioc, &namesize, sizeof(namesize)) != - sizeof(namesize)) { - error_setg(errp, "Failed to send export name length"); - goto fail; - } - if (write_sync(ioc, (char *)name, strlen(name)) != strlen(name)) { - error_setg(errp, "Failed to send export name"); + /* write the export name request */ + if (nbd_send_option_request(ioc, NBD_OPT_EXPORT_NAME, -1, name, + errp) < 0) { goto fail; } + /* Read the response */ if (read_sync(ioc, &s, sizeof(s)) != sizeof(s)) { error_setg(errp, "Failed to read export length"); goto fail; diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index 99e51571a2..dd57e18833 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -62,7 +62,7 @@ #define NBD_REPLY_MAGIC 0x67446698 #define NBD_OPTS_MAGIC 0x49484156454F5054LL #define NBD_CLIENT_MAGIC 0x0000420281861253LL -#define NBD_REP_MAGIC 0x3e889045565a9LL +#define NBD_REP_MAGIC 0x0003e889045565a9LL #define NBD_SET_SOCK _IO(0xab, 0) #define NBD_SET_BLKSIZE _IO(0xab, 1) From 2cdbf41362cbc9b977dd458b6508352e45f2cb42 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:11 -0500 Subject: [PATCH 16/30] nbd: Let server know when client gives up negotiation The NBD spec says that a client should send NBD_OPT_ABORT rather than just dropping the connection, if the client doesn't like something the server sent during option negotiation. This is a best-effort attempt only, and can only be done in places where we know the server is still in sync with what we've sent, whether or not we've read everything the server has sent. Technically, the server then has to reply with NBD_REP_ACK, but it's not worth complicating the client to wait around for that reply. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-10-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/nbd/client.c b/nbd/client.c index e78000a28a..651e08cd3a 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -111,6 +111,19 @@ static int nbd_send_option_request(QIOChannel *ioc, uint32_t opt, return 0; } +/* Send NBD_OPT_ABORT as a courtesy to let the server know that we are + * not going to attempt further negotiation. */ +static void nbd_send_opt_abort(QIOChannel *ioc) +{ + /* Technically, a compliant server is supposed to reply to us; but + * older servers disconnected instead. At any rate, we're allowed + * to disconnect without waiting for the server reply, so we don't + * even care if the request makes it to the server, let alone + * waiting around for whether the server replies. */ + nbd_send_option_request(ioc, NBD_OPT_ABORT, 0, NULL, NULL); +} + + /* Receive the header of an option reply, which should match the given * opt. Read through the length field, but NOT the length bytes of * payload. Return 0 if successful, -1 with errp set if it is @@ -121,6 +134,7 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, QEMU_BUILD_BUG_ON(sizeof(*reply) != 20); if (read_sync(ioc, reply, sizeof(*reply)) != sizeof(*reply)) { error_setg(errp, "failed to read option reply"); + nbd_send_opt_abort(ioc); return -1; } be64_to_cpus(&reply->magic); @@ -133,11 +147,13 @@ static int nbd_receive_option_reply(QIOChannel *ioc, uint32_t opt, if (reply->magic != NBD_REP_MAGIC) { error_setg(errp, "Unexpected option reply magic"); + nbd_send_opt_abort(ioc); return -1; } if (reply->option != opt) { error_setg(errp, "Unexpected option type %x expected %x", reply->option, opt); + nbd_send_opt_abort(ioc); return -1; } return 0; @@ -206,6 +222,9 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, cleanup: g_free(msg); + if (result < 0) { + nbd_send_opt_abort(ioc); + } return result; } @@ -229,25 +248,30 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) if (reply.type == NBD_REP_ACK) { if (len != 0) { error_setg(errp, "length too long for option end"); + nbd_send_opt_abort(ioc); return -1; } } else if (reply.type == NBD_REP_SERVER) { if (len < sizeof(namelen) || len > NBD_MAX_BUFFER_SIZE) { error_setg(errp, "incorrect option length %" PRIu32, len); + nbd_send_opt_abort(ioc); return -1; } if (read_sync(ioc, &namelen, sizeof(namelen)) != sizeof(namelen)) { error_setg(errp, "failed to read option name length"); + nbd_send_opt_abort(ioc); return -1; } namelen = be32_to_cpu(namelen); len -= sizeof(namelen); if (len < namelen) { error_setg(errp, "incorrect option name length"); + nbd_send_opt_abort(ioc); return -1; } if (namelen > NBD_MAX_NAME_SIZE) { error_setg(errp, "export name length too long %" PRIu32, namelen); + nbd_send_opt_abort(ioc); return -1; } @@ -256,6 +280,7 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) error_setg(errp, "failed to read export name"); g_free(*name); *name = NULL; + nbd_send_opt_abort(ioc); return -1; } (*name)[namelen] = '\0'; @@ -267,6 +292,7 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) g_free(*name); g_free(buf); *name = NULL; + nbd_send_opt_abort(ioc); return -1; } buf[len] = '\0'; @@ -276,6 +302,7 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) } else { error_setg(errp, "Unexpected reply type %" PRIx32 " expected %x", reply.type, NBD_REP_SERVER); + nbd_send_opt_abort(ioc); return -1; } return 1; @@ -325,6 +352,7 @@ static int nbd_receive_query_exports(QIOChannel *ioc, if (!foundExport) { error_setg(errp, "No export with name '%s' available", wantname); + nbd_send_opt_abort(ioc); return -1; } @@ -352,12 +380,14 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, if (reply.type != NBD_REP_ACK) { error_setg(errp, "Server rejected request to start TLS %" PRIx32, reply.type); + nbd_send_opt_abort(ioc); return NULL; } if (reply.length != 0) { error_setg(errp, "Start TLS response was not zero %" PRIu32, reply.length); + nbd_send_opt_abort(ioc); return NULL; } From 7d3123e1775f16bf23fa8d5bac17a3d8a9e72583 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:12 -0500 Subject: [PATCH 17/30] nbd: Let client skip portions of server reply The server has a nice helper function nbd_negotiate_drop_sync() which lets it easily ignore fluff from the client (such as the payload to an unknown option request). We can't quite make it common, since it depends on nbd_negotiate_read() which handles coroutine magic, but we can copy the idea into the client where we have places where we want to ignore data (such as the description tacked on the end of NBD_REP_SERVER). Signed-off-by: Eric Blake Message-Id: <1476469998-28592-11-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 47 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index 651e08cd3a..5d94e34760 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -75,6 +75,32 @@ static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports); */ +/* Discard length bytes from channel. Return -errno on failure, or + * the amount of bytes consumed. */ +static ssize_t drop_sync(QIOChannel *ioc, size_t size) +{ + ssize_t ret, dropped = size; + char small[1024]; + char *buffer; + + buffer = sizeof(small) < size ? small : g_malloc(MIN(65536, size)); + while (size > 0) { + ret = read_sync(ioc, buffer, MIN(65536, size)); + if (ret < 0) { + goto cleanup; + } + assert(ret <= size); + size -= ret; + } + ret = dropped; + + cleanup: + if (buffer != small) { + g_free(buffer); + } + return ret; +} + /* Send an option request. * * The request is for option @opt, with @data containing @len bytes of @@ -285,19 +311,12 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) } (*name)[namelen] = '\0'; len -= namelen; - if (len) { - char *buf = g_malloc(len + 1); - if (read_sync(ioc, buf, len) != len) { - error_setg(errp, "failed to read export description"); - g_free(*name); - g_free(buf); - *name = NULL; - nbd_send_opt_abort(ioc); - return -1; - } - buf[len] = '\0'; - TRACE("Ignoring export description: %s", buf); - g_free(buf); + if (drop_sync(ioc, len) != len) { + error_setg(errp, "failed to read export description"); + g_free(*name); + *name = NULL; + nbd_send_opt_abort(ioc); + return -1; } } else { error_setg(errp, "Unexpected reply type %" PRIx32 " expected %x", @@ -577,7 +596,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (read_sync(ioc, &buf, 124) != 124) { + if (drop_sync(ioc, 124) != 124) { error_setg(errp, "Failed to read reserved block"); goto fail; } From 75368aab9ba27f762acf5a51062c951a1fb1e006 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:13 -0500 Subject: [PATCH 18/30] nbd: Less allocation during NBD_OPT_LIST Since we know that the maximum name we are willing to accept is small enough to stack-allocate, rework the iteration over NBD_OPT_LIST responses to reuse a stack buffer rather than allocating every time. Furthermore, we don't even have to allocate if we know the server's length doesn't match what we are searching for. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-12-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 145 +++++++++++++++++++++++++-------------------------- 1 file changed, 70 insertions(+), 75 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index 5d94e34760..0afb8be15d 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -254,19 +254,28 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, return result; } -static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) +/* Process another portion of the NBD_OPT_LIST reply. Set *@match if + * the current reply matches @want or if the server does not support + * NBD_OPT_LIST, otherwise leave @match alone. Return 0 if iteration + * is complete, positive if more replies are expected, or negative + * with @errp set if an unrecoverable error occurred. */ +static int nbd_receive_list(QIOChannel *ioc, const char *want, bool *match, + Error **errp) { nbd_opt_reply reply; uint32_t len; uint32_t namelen; + char name[NBD_MAX_NAME_SIZE + 1]; int error; - *name = NULL; if (nbd_receive_option_reply(ioc, NBD_OPT_LIST, &reply, errp) < 0) { return -1; } error = nbd_handle_reply_err(ioc, &reply, errp); if (error <= 0) { + /* The server did not support NBD_OPT_LIST, so set *match on + * the assumption that any name will be accepted. */ + *match = true; return error; } len = reply.length; @@ -277,105 +286,91 @@ static int nbd_receive_list(QIOChannel *ioc, char **name, Error **errp) nbd_send_opt_abort(ioc); return -1; } - } else if (reply.type == NBD_REP_SERVER) { - if (len < sizeof(namelen) || len > NBD_MAX_BUFFER_SIZE) { - error_setg(errp, "incorrect option length %" PRIu32, len); - nbd_send_opt_abort(ioc); - return -1; - } - if (read_sync(ioc, &namelen, sizeof(namelen)) != sizeof(namelen)) { - error_setg(errp, "failed to read option name length"); - nbd_send_opt_abort(ioc); - return -1; - } - namelen = be32_to_cpu(namelen); - len -= sizeof(namelen); - if (len < namelen) { - error_setg(errp, "incorrect option name length"); - nbd_send_opt_abort(ioc); - return -1; - } - if (namelen > NBD_MAX_NAME_SIZE) { - error_setg(errp, "export name length too long %" PRIu32, namelen); - nbd_send_opt_abort(ioc); - return -1; - } - - *name = g_new0(char, namelen + 1); - if (read_sync(ioc, *name, namelen) != namelen) { - error_setg(errp, "failed to read export name"); - g_free(*name); - *name = NULL; - nbd_send_opt_abort(ioc); - return -1; - } - (*name)[namelen] = '\0'; - len -= namelen; - if (drop_sync(ioc, len) != len) { - error_setg(errp, "failed to read export description"); - g_free(*name); - *name = NULL; - nbd_send_opt_abort(ioc); - return -1; - } - } else { + return 0; + } else if (reply.type != NBD_REP_SERVER) { error_setg(errp, "Unexpected reply type %" PRIx32 " expected %x", reply.type, NBD_REP_SERVER); nbd_send_opt_abort(ioc); return -1; } + + if (len < sizeof(namelen) || len > NBD_MAX_BUFFER_SIZE) { + error_setg(errp, "incorrect option length %" PRIu32, len); + nbd_send_opt_abort(ioc); + return -1; + } + if (read_sync(ioc, &namelen, sizeof(namelen)) != sizeof(namelen)) { + error_setg(errp, "failed to read option name length"); + nbd_send_opt_abort(ioc); + return -1; + } + namelen = be32_to_cpu(namelen); + len -= sizeof(namelen); + if (len < namelen) { + error_setg(errp, "incorrect option name length"); + nbd_send_opt_abort(ioc); + return -1; + } + if (namelen != strlen(want)) { + if (drop_sync(ioc, len) != len) { + error_setg(errp, "failed to skip export name with wrong length"); + nbd_send_opt_abort(ioc); + return -1; + } + return 1; + } + + assert(namelen < sizeof(name)); + if (read_sync(ioc, name, namelen) != namelen) { + error_setg(errp, "failed to read export name"); + nbd_send_opt_abort(ioc); + return -1; + } + name[namelen] = '\0'; + len -= namelen; + if (drop_sync(ioc, len) != len) { + error_setg(errp, "failed to read export description"); + nbd_send_opt_abort(ioc); + return -1; + } + if (!strcmp(name, want)) { + *match = true; + } return 1; } +/* Return -1 on failure, 0 if wantname is an available export. */ static int nbd_receive_query_exports(QIOChannel *ioc, const char *wantname, Error **errp) { bool foundExport = false; - TRACE("Querying export list"); + TRACE("Querying export list for '%s'", wantname); if (nbd_send_option_request(ioc, NBD_OPT_LIST, 0, NULL, errp) < 0) { return -1; } TRACE("Reading available export names"); while (1) { - char *name = NULL; - int ret = nbd_receive_list(ioc, &name, errp); + int ret = nbd_receive_list(ioc, wantname, &foundExport, errp); if (ret < 0) { - g_free(name); - name = NULL; + /* Server gave unexpected reply */ return -1; + } else if (ret == 0) { + /* Done iterating. */ + if (!foundExport) { + error_setg(errp, "No export with name '%s' available", + wantname); + nbd_send_opt_abort(ioc); + return -1; + } + TRACE("Found desired export name '%s'", wantname); + return 0; } - if (ret == 0) { - /* Server doesn't support export listing, so - * we will just assume an export with our - * wanted name exists */ - foundExport = true; - break; - } - if (name == NULL) { - TRACE("End of export name list"); - break; - } - if (g_str_equal(name, wantname)) { - foundExport = true; - TRACE("Found desired export name '%s'", name); - } else { - TRACE("Ignored export name '%s'", name); - } - g_free(name); } - - if (!foundExport) { - error_setg(errp, "No export with name '%s' available", wantname); - nbd_send_opt_abort(ioc); - return -1; - } - - return 0; } static QIOChannel *nbd_receive_starttls(QIOChannel *ioc, From c203c59ad9dc677b27e00ba76d221fd7e93c1aa6 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:14 -0500 Subject: [PATCH 19/30] nbd: Support shorter handshake The NBD Protocol allows the server and client to mutually agree on a shorter handshake (omit the 124 bytes of reserved 0), via the server advertising NBD_FLAG_NO_ZEROES and the client acknowledging with NBD_FLAG_C_NO_ZEROES (only possible in newstyle, whether or not it is fixed newstyle). It doesn't shave much off the wire, but we might as well implement it. Signed-off-by: Eric Blake Reviewed-by: Alex Bligh Message-Id: <1476469998-28592-13-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- include/block/nbd.h | 6 ++++-- nbd/client.c | 8 +++++++- nbd/server.c | 15 +++++++++++---- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/include/block/nbd.h b/include/block/nbd.h index b69bf1dc68..d326308656 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -74,11 +74,13 @@ typedef struct NBDReply NBDReply; /* New-style handshake (global) flags, sent from server to client, and control what will happen during handshake phase. */ -#define NBD_FLAG_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */ +#define NBD_FLAG_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */ +#define NBD_FLAG_NO_ZEROES (1 << 1) /* End handshake without zeroes. */ /* New-style client flags, sent from client to server to control what happens during handshake phase. */ -#define NBD_FLAG_C_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */ +#define NBD_FLAG_C_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */ +#define NBD_FLAG_C_NO_ZEROES (1 << 1) /* End handshake without zeroes. */ /* Reply types. */ #define NBD_REP_ACK (1) /* Data sending finished. */ diff --git a/nbd/client.c b/nbd/client.c index 0afb8be15d..b29963b942 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -440,6 +440,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, char buf[256]; uint64_t magic, s; int rc; + bool zeroes = true; TRACE("Receiving negotiation tlscreds=%p hostname=%s.", tlscreds, hostname ? hostname : ""); @@ -504,6 +505,11 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, TRACE("Server supports fixed new style"); clientflags |= NBD_FLAG_C_FIXED_NEWSTYLE; } + if (globalflags & NBD_FLAG_NO_ZEROES) { + zeroes = false; + TRACE("Server supports no zeroes"); + clientflags |= NBD_FLAG_C_NO_ZEROES; + } /* client requested flags */ clientflags = cpu_to_be32(clientflags); if (write_sync(ioc, &clientflags, sizeof(clientflags)) != @@ -591,7 +597,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, const char *name, uint16_t *flags, } TRACE("Size is %" PRIu64 ", export flags %" PRIx16, *size, *flags); - if (drop_sync(ioc, 124) != 124) { + if (zeroes && drop_sync(ioc, 124) != 124) { error_setg(errp, "Failed to read reserved block"); goto fail; } diff --git a/nbd/server.c b/nbd/server.c index fa01e4902f..afc1ec47b3 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -81,6 +81,7 @@ struct NBDClient { int refcount; void (*close)(NBDClient *client); + bool no_zeroes; NBDExport *exp; QCryptoTLSCreds *tlscreds; char *tlsaclname; @@ -450,6 +451,11 @@ static int nbd_negotiate_options(NBDClient *client) fixedNewstyle = true; flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE; } + if (flags & NBD_FLAG_C_NO_ZEROES) { + TRACE("Client supports no zeroes at handshake end"); + client->no_zeroes = true; + flags &= ~NBD_FLAG_C_NO_ZEROES; + } if (flags != 0) { TRACE("Unknown client flags 0x%" PRIx32 " received", flags); return -EIO; @@ -602,6 +608,7 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA); bool oldStyle; + size_t len; /* Old style negotiation header without options [ 0 .. 7] passwd ("NBDMAGIC") @@ -618,7 +625,7 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) ....options sent.... [18 .. 25] size [26 .. 27] export flags - [28 .. 151] reserved (0) + [28 .. 151] reserved (0, omit if no_zeroes) */ qio_channel_set_blocking(client->ioc, false, NULL); @@ -637,7 +644,7 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) stw_be_p(buf + 26, client->exp->nbdflags | myflags); } else { stq_be_p(buf + 8, NBD_OPTS_MAGIC); - stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE); + stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES); } if (oldStyle) { @@ -664,8 +671,8 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) client->exp->size, client->exp->nbdflags | myflags); stq_be_p(buf + 18, client->exp->size); stw_be_p(buf + 26, client->exp->nbdflags | myflags); - if (nbd_negotiate_write(client->ioc, buf + 18, sizeof(buf) - 18) != - sizeof(buf) - 18) { + len = client->no_zeroes ? 10 : sizeof(buf) - 18; + if (nbd_negotiate_write(client->ioc, buf + 18, len) != len) { LOG("write failed"); goto fail; } From 8b34a9dbc3f2c0afe3450cb20b94cc30f450e77b Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:15 -0500 Subject: [PATCH 20/30] nbd: Refactor conversion to errno to silence checkpatch Checkpatch complains that 'return EINVAL' is usually wrong (since we tend to favor 'return -EINVAL'). But it is a false positive for nbd_errno_to_system_errno(). Since NBD may add future defined wire values, refactor the code to keep checkpatch happy. Signed-off-by: Eric Blake Message-Id: <1476469998-28592-14-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- nbd/client.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/nbd/client.c b/nbd/client.c index b29963b942..7bdce531cc 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -23,23 +23,31 @@ static int nbd_errno_to_system_errno(int err) { + int ret; switch (err) { case NBD_SUCCESS: - return 0; + ret = 0; + break; case NBD_EPERM: - return EPERM; + ret = EPERM; + break; case NBD_EIO: - return EIO; + ret = EIO; + break; case NBD_ENOMEM: - return ENOMEM; + ret = ENOMEM; + break; case NBD_ENOSPC: - return ENOSPC; + ret = ENOSPC; + break; default: TRACE("Squashing unexpected error %d to EINVAL", err); /* fallthrough */ case NBD_EINVAL: - return EINVAL; + ret = EINVAL; + break; } + return ret; } /* Definitions for opaque data types */ From b6f5d3b573fe43da1f9fa07b7454e4492f409411 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:16 -0500 Subject: [PATCH 21/30] nbd: Improve server handling of shutdown requests NBD commit 6d34500b clarified how clients and servers are supposed to behave before closing a connection. It added NBD_REP_ERR_SHUTDOWN (for the server to announce it is about to go away during option haggling, so the client should quit sending NBD_OPT_* other than NBD_OPT_ABORT) and ESHUTDOWN (for the server to announce it is about to go away during transmission, so the client should quit sending NBD_CMD_* other than NBD_CMD_DISC). It also clarified that NBD_OPT_ABORT gets a reply, while NBD_CMD_DISC does not. This patch merely adds the missing reply to NBD_OPT_ABORT and teaches the client to recognize server errors. Actually teaching the server to send NBD_REP_ERR_SHUTDOWN or ESHUTDOWN would require knowing that the server has been requested to shut down soon (maybe we could do that by installing a SIGINT handler in qemu-nbd, which transitions from RUNNING to a new state that waits for the client to react, rather than just out-right quitting - but that's a bigger task for another day). Signed-off-by: Eric Blake Message-Id: <1476469998-28592-15-git-send-email-eblake@redhat.com> [Move dummy ESHUTDOWN to include/qemu/osdep.h. - Paolo] Signed-off-by: Paolo Bonzini --- include/block/nbd.h | 13 +++++++++---- include/qemu/osdep.h | 3 +++ nbd/client.c | 18 ++++++++++++++++++ nbd/nbd-internal.h | 1 + nbd/server.c | 10 ++++++++++ 5 files changed, 41 insertions(+), 4 deletions(-) diff --git a/include/block/nbd.h b/include/block/nbd.h index d326308656..eea7ef0536 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -83,12 +83,17 @@ typedef struct NBDReply NBDReply; #define NBD_FLAG_C_NO_ZEROES (1 << 1) /* End handshake without zeroes. */ /* Reply types. */ +#define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value)) + #define NBD_REP_ACK (1) /* Data sending finished. */ #define NBD_REP_SERVER (2) /* Export description. */ -#define NBD_REP_ERR_UNSUP ((UINT32_C(1) << 31) | 1) /* Unknown option. */ -#define NBD_REP_ERR_POLICY ((UINT32_C(1) << 31) | 2) /* Server denied */ -#define NBD_REP_ERR_INVALID ((UINT32_C(1) << 31) | 3) /* Invalid length. */ -#define NBD_REP_ERR_TLS_REQD ((UINT32_C(1) << 31) | 5) /* TLS required */ + +#define NBD_REP_ERR_UNSUP NBD_REP_ERR(1) /* Unknown option */ +#define NBD_REP_ERR_POLICY NBD_REP_ERR(2) /* Server denied */ +#define NBD_REP_ERR_INVALID NBD_REP_ERR(3) /* Invalid length */ +#define NBD_REP_ERR_PLATFORM NBD_REP_ERR(4) /* Not compiled in */ +#define NBD_REP_ERR_TLS_REQD NBD_REP_ERR(5) /* TLS required */ +#define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR(7) /* Server shutting down */ /* Request flags, sent from client to server during transmission phase */ #define NBD_CMD_FLAG_FUA (1 << 0) diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h index 0e3c330e6b..689f253ea7 100644 --- a/include/qemu/osdep.h +++ b/include/qemu/osdep.h @@ -128,6 +128,9 @@ extern int daemon(int, int); #if !defined(EMEDIUMTYPE) #define EMEDIUMTYPE 4098 #endif +#if !defined(ESHUTDOWN) +#define ESHUTDOWN 4099 +#endif #ifndef TIME_MAX #define TIME_MAX LONG_MAX #endif diff --git a/nbd/client.c b/nbd/client.c index 7bdce531cc..7db4301d29 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -40,6 +40,9 @@ static int nbd_errno_to_system_errno(int err) case NBD_ENOSPC: ret = ENOSPC; break; + case NBD_ESHUTDOWN: + ret = ESHUTDOWN; + break; default: TRACE("Squashing unexpected error %d to EINVAL", err); /* fallthrough */ @@ -239,11 +242,21 @@ static int nbd_handle_reply_err(QIOChannel *ioc, nbd_opt_reply *reply, reply->option); break; + case NBD_REP_ERR_PLATFORM: + error_setg(errp, "Server lacks support for option %" PRIx32, + reply->option); + break; + case NBD_REP_ERR_TLS_REQD: error_setg(errp, "TLS negotiation required before option %" PRIx32, reply->option); break; + case NBD_REP_ERR_SHUTDOWN: + error_setg(errp, "Server shutting down before option %" PRIx32, + reply->option); + break; + default: error_setg(errp, "Unknown error code when asking for option %" PRIx32, reply->option); @@ -785,6 +798,11 @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply) reply->error = nbd_errno_to_system_errno(reply->error); + if (reply->error == ESHUTDOWN) { + /* This works even on mingw which lacks a native ESHUTDOWN */ + LOG("server shutting down"); + return -EINVAL; + } TRACE("Got reply: { magic = 0x%" PRIx32 ", .error = % " PRId32 ", handle = %" PRIu64" }", magic, reply->error, reply->handle); diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h index dd57e18833..eee20abc25 100644 --- a/nbd/nbd-internal.h +++ b/nbd/nbd-internal.h @@ -92,6 +92,7 @@ #define NBD_ENOMEM 12 #define NBD_EINVAL 22 #define NBD_ENOSPC 28 +#define NBD_ESHUTDOWN 108 static inline ssize_t read_sync(QIOChannel *ioc, void *buffer, size_t size) { diff --git a/nbd/server.c b/nbd/server.c index afc1ec47b3..0b50caad9e 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -39,6 +39,8 @@ static int system_errno_to_nbd_errno(int err) case EFBIG: case ENOSPC: return NBD_ENOSPC; + case ESHUTDOWN: + return NBD_ESHUTDOWN; case EINVAL: default: return NBD_EINVAL; @@ -527,6 +529,10 @@ static int nbd_negotiate_options(NBDClient *client) if (ret < 0) { return ret; } + /* Let the client keep trying, unless they asked to quit */ + if (clientflags == NBD_OPT_ABORT) { + return -EINVAL; + } break; } } else if (fixedNewstyle) { @@ -539,6 +545,10 @@ static int nbd_negotiate_options(NBDClient *client) break; case NBD_OPT_ABORT: + /* NBD spec says we must try to reply before + * disconnecting, but that we must also tolerate + * guests that don't wait for our reply. */ + nbd_negotiate_send_rep(client->ioc, NBD_REP_ACK, clientflags); return -EINVAL; case NBD_OPT_EXPORT_NAME: From 1f4d6d18edfeaea64ae74bf5254b8d0e923dc73f Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:17 -0500 Subject: [PATCH 22/30] nbd: Implement NBD_CMD_WRITE_ZEROES on server Upstream NBD protocol recently added the ability to efficiently write zeroes without having to send the zeroes over the wire, along with a flag to control whether the client wants to allow a hole. Note that when it comes to requiring full allocation, vs. permitting optimizations, the NBD spec intentionally picked a different sense for the flag; the rules in qemu are: MAY_UNMAP == 0: must write zeroes MAY_UNMAP == 1: may use holes if reads will see zeroes while in NBD, the rules are: FLAG_NO_HOLE == 1: must write zeroes FLAG_NO_HOLE == 0: may use holes if reads will see zeroes In all cases, the 'may use holes' scenario is optional (the server need not use a hole, and must not use a hole if subsequent reads would not see zeroes). Signed-off-by: Eric Blake Message-Id: <1476469998-28592-16-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- include/block/nbd.h | 8 ++++++-- nbd/server.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/include/block/nbd.h b/include/block/nbd.h index eea7ef0536..3e373f0498 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -71,6 +71,7 @@ typedef struct NBDReply NBDReply; #define NBD_FLAG_SEND_FUA (1 << 3) /* Send FUA (Force Unit Access) */ #define NBD_FLAG_ROTATIONAL (1 << 4) /* Use elevator algorithm - rotational media */ #define NBD_FLAG_SEND_TRIM (1 << 5) /* Send TRIM (discard) */ +#define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* Send WRITE_ZEROES */ /* New-style handshake (global) flags, sent from server to client, and control what will happen during handshake phase. */ @@ -96,7 +97,8 @@ typedef struct NBDReply NBDReply; #define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR(7) /* Server shutting down */ /* Request flags, sent from client to server during transmission phase */ -#define NBD_CMD_FLAG_FUA (1 << 0) +#define NBD_CMD_FLAG_FUA (1 << 0) /* 'force unit access' during write */ +#define NBD_CMD_FLAG_NO_HOLE (1 << 1) /* don't punch hole on zero run */ /* Supported request types */ enum { @@ -104,7 +106,9 @@ enum { NBD_CMD_WRITE = 1, NBD_CMD_DISC = 2, NBD_CMD_FLUSH = 3, - NBD_CMD_TRIM = 4 + NBD_CMD_TRIM = 4, + /* 5 reserved for failed experiment NBD_CMD_CACHE */ + NBD_CMD_WRITE_ZEROES = 6, }; #define NBD_DEFAULT_PORT 10809 diff --git a/nbd/server.c b/nbd/server.c index 0b50caad9e..5b76261666 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -616,7 +616,8 @@ static coroutine_fn int nbd_negotiate(NBDClientNewData *data) char buf[8 + 8 + 8 + 128]; int rc; const uint16_t myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM | - NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA); + NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA | + NBD_FLAG_SEND_WRITE_ZEROES); bool oldStyle; size_t len; @@ -1146,11 +1147,17 @@ static ssize_t nbd_co_receive_request(NBDRequestData *req, rc = request->type == NBD_CMD_WRITE ? -ENOSPC : -EINVAL; goto out; } - if (request->flags & ~NBD_CMD_FLAG_FUA) { + if (request->flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE)) { LOG("unsupported flags (got 0x%x)", request->flags); rc = -EINVAL; goto out; } + if (request->type != NBD_CMD_WRITE_ZEROES && + (request->flags & NBD_CMD_FLAG_NO_HOLE)) { + LOG("unexpected flags (got 0x%x)", request->flags); + rc = -EINVAL; + goto out; + } rc = 0; @@ -1255,6 +1262,37 @@ static void nbd_trip(void *opaque) } break; + case NBD_CMD_WRITE_ZEROES: + TRACE("Request type is WRITE_ZEROES"); + + if (exp->nbdflags & NBD_FLAG_READ_ONLY) { + TRACE("Server is read-only, return error"); + reply.error = EROFS; + goto error_reply; + } + + TRACE("Writing to device"); + + flags = 0; + if (request.flags & NBD_CMD_FLAG_FUA) { + flags |= BDRV_REQ_FUA; + } + if (!(request.flags & NBD_CMD_FLAG_NO_HOLE)) { + flags |= BDRV_REQ_MAY_UNMAP; + } + ret = blk_pwrite_zeroes(exp->blk, request.from + exp->dev_offset, + request.len, flags); + if (ret < 0) { + LOG("writing to file failed"); + reply.error = -ret; + goto error_reply; + } + + if (nbd_co_send_reply(req, &reply, 0) < 0) { + goto out; + } + break; + case NBD_CMD_DISC: /* unreachable, thanks to special case in nbd_co_receive_request() */ abort(); From fa778fffdfafce811bba3dd97de41fb524b861f7 Mon Sep 17 00:00:00 2001 From: Eric Blake Date: Fri, 14 Oct 2016 13:33:18 -0500 Subject: [PATCH 23/30] nbd: Implement NBD_CMD_WRITE_ZEROES on client Upstream NBD protocol recently added the ability to efficiently write zeroes without having to send the zeroes over the wire, along with a flag to control whether the client wants a hole. The generic block code takes care of falling back to the obvious write of lots of zeroes if we return -ENOTSUP because the server does not have WRITE_ZEROES. Ideally, since NBD_CMD_WRITE_ZEROES does not involve any data over the wire, we want to support transactions that are much larger than the normal 32M limit imposed on NBD_CMD_WRITE. But the server may still have a limit smaller than UINT_MAX, so until experimental NBD protocol additions for advertising various command sizes is finalized (see [1], [2]), for now we just stick to the same limits as normal writes. [1] https://github.com/yoe/nbd/blob/extension-info/doc/proto.md [2] https://sourceforge.net/p/nbd/mailman/message/35081223/ Signed-off-by: Eric Blake Message-Id: <1476469998-28592-17-git-send-email-eblake@redhat.com> Signed-off-by: Paolo Bonzini --- block/nbd-client.c | 35 +++++++++++++++++++++++++++++++++++ block/nbd-client.h | 2 ++ block/nbd.c | 4 ++++ 3 files changed, 41 insertions(+) diff --git a/block/nbd-client.c b/block/nbd-client.c index 3e5f9f5b77..2a302de674 100644 --- a/block/nbd-client.c +++ b/block/nbd-client.c @@ -275,6 +275,41 @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, return -reply.error; } +int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags) +{ + ssize_t ret; + NBDClientSession *client = nbd_get_client_session(bs); + NBDRequest request = { + .type = NBD_CMD_WRITE_ZEROES, + .from = offset, + .len = count, + }; + NBDReply reply; + + if (!(client->nbdflags & NBD_FLAG_SEND_WRITE_ZEROES)) { + return -ENOTSUP; + } + + if (flags & BDRV_REQ_FUA) { + assert(client->nbdflags & NBD_FLAG_SEND_FUA); + request.flags |= NBD_CMD_FLAG_FUA; + } + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + request.flags |= NBD_CMD_FLAG_NO_HOLE; + } + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(bs, &request, NULL); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + int nbd_client_co_flush(BlockDriverState *bs) { NBDClientSession *client = nbd_get_client_session(bs); diff --git a/block/nbd-client.h b/block/nbd-client.h index 51be419405..f8d6006849 100644 --- a/block/nbd-client.h +++ b/block/nbd-client.h @@ -48,6 +48,8 @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count); int nbd_client_co_flush(BlockDriverState *bs); int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); +int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, + int count, BdrvRequestFlags flags); int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags); diff --git a/block/nbd.c b/block/nbd.c index b281484648..9cff8396f9 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -466,6 +466,7 @@ static int nbd_co_flush(BlockDriverState *bs) static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) { bs->bl.max_pdiscard = NBD_MAX_BUFFER_SIZE; + bs->bl.max_pwrite_zeroes = NBD_MAX_BUFFER_SIZE; bs->bl.max_transfer = NBD_MAX_BUFFER_SIZE; } @@ -558,6 +559,7 @@ static BlockDriver bdrv_nbd = { .bdrv_file_open = nbd_open, .bdrv_co_preadv = nbd_client_co_preadv, .bdrv_co_pwritev = nbd_client_co_pwritev, + .bdrv_co_pwrite_zeroes = nbd_client_co_pwrite_zeroes, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, @@ -576,6 +578,7 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_file_open = nbd_open, .bdrv_co_preadv = nbd_client_co_preadv, .bdrv_co_pwritev = nbd_client_co_pwritev, + .bdrv_co_pwrite_zeroes = nbd_client_co_pwrite_zeroes, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, @@ -594,6 +597,7 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_file_open = nbd_open, .bdrv_co_preadv = nbd_client_co_preadv, .bdrv_co_pwritev = nbd_client_co_pwritev, + .bdrv_co_pwrite_zeroes = nbd_client_co_pwrite_zeroes, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_pdiscard = nbd_client_co_pdiscard, From fffbd9cf1befa256d75bc45541adca405bb65dd5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Oct 2016 15:38:19 +0200 Subject: [PATCH 24/30] qemu-char: do not forward events through the mux until QEMU has started Otherwise, the CHR_EVENT_OPENED event is sent twice: first when the backend (for example "stdio") is opened, and second after processing the command line. The incorrect sending of the event prints the monitor banner when QEMU is started with "-serial mon:stdio". This includes the "(qemu)" prompt; thus the monitor seems to be dead, whereas actually the active front-end is the serial port. Reported-by: Dr. David Alan Gilbert Tested-by: Dr. David Alan Gilbert Signed-off-by: Paolo Bonzini --- qemu-char.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/qemu-char.c b/qemu-char.c index 1e5a0e8cb9..2c9940cea4 100644 --- a/qemu-char.c +++ b/qemu-char.c @@ -735,19 +735,23 @@ static void mux_chr_read(void *opaque, const uint8_t *buf, int size) } } +static bool muxes_realized; + static void mux_chr_event(void *opaque, int event) { CharDriverState *chr = opaque; MuxDriver *d = chr->opaque; int i; + if (!muxes_realized) { + return; + } + /* Send the event to all registered listeners */ for (i = 0; i < d->mux_cnt; i++) mux_chr_send_event(d, i, event); } -static bool muxes_realized; - /** * Called after processing of default and command-line-specified * chardevs to deliver CHR_EVENT_OPENED events to any FEs attached From d14fabd9c2debef0d51ec90af1326e83111a77de Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Oct 2016 22:04:58 +0200 Subject: [PATCH 25/30] slirp: fix CharDriver breakage SLIRP expects a CharBackend as the third argument to slirp_add_exec, but net/slirp.c was passing a CharDriverState. Fix this to restore guestfwd functionality. Reported-by: Dr. David Alan Gilbert Signed-off-by: Paolo Bonzini --- net/slirp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/slirp.c b/net/slirp.c index 64dd3255ae..bcd1c5f57d 100644 --- a/net/slirp.c +++ b/net/slirp.c @@ -763,8 +763,7 @@ static int slirp_guestfwd(SlirpState *s, const char *config_str, return -1; } - if (slirp_add_exec(s->slirp, 3, qemu_chr_fe_get_driver(&fwd->hd), - &server, port) < 0) { + if (slirp_add_exec(s->slirp, 3, &fwd->hd, &server, port) < 0) { error_report("conflicting/invalid host:port in guest forwarding " "rule '%s'", config_str); g_free(fwd); From 95ea69fb46266aaa46d0c8b7f0ba8c4903dbe4e3 Mon Sep 17 00:00:00 2001 From: Luwei Kang Date: Mon, 31 Oct 2016 16:27:26 +0800 Subject: [PATCH 26/30] x86: add AVX512_4VNNIW and AVX512_4FMAPS features The spec can be found in Intel Software Developer Manual or in Instruction Set Extensions Programming Reference. Signed-off-by: Piotr Luc Signed-off-by: Luwei Kang Message-Id: <1477902446-5932-1-git-send-email-he.chen@linux.intel.com> Signed-off-by: Paolo Bonzini --- target-i386/cpu.c | 19 ++++++++++++++++++- target-i386/cpu.h | 4 ++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 0f8a8fbd3b..14c5186fe7 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -239,6 +239,7 @@ static void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM, CPUID_7_0_EBX_RDSEED */ #define TCG_7_0_ECX_FEATURES (CPUID_7_0_ECX_PKU | CPUID_7_0_ECX_OSPKE) +#define TCG_7_0_EDX_FEATURES 0 #define TCG_APM_FEATURES 0 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1) @@ -444,6 +445,22 @@ static FeatureWordInfo feature_word_info[FEATURE_WORDS] = { .cpuid_reg = R_ECX, .tcg_features = TCG_7_0_ECX_FEATURES, }, + [FEAT_7_0_EDX] = { + .feat_names = { + NULL, NULL, "avx512-4vnniw", "avx512-4fmaps", + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, + }, + .cpuid_eax = 7, + .cpuid_needs_ecx = true, .cpuid_ecx = 0, + .cpuid_reg = R_EDX, + .tcg_features = TCG_7_0_EDX_FEATURES, + }, [FEAT_8000_0007_EDX] = { .feat_names = { NULL, NULL, NULL, NULL, @@ -2560,7 +2577,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, if ((*ecx & CPUID_7_0_ECX_PKU) && env->cr[4] & CR4_PKE_MASK) { *ecx |= CPUID_7_0_ECX_OSPKE; } - *edx = 0; /* Reserved */ + *edx = env->features[FEAT_7_0_EDX]; /* Feature flags */ } else { *eax = 0; *ebx = 0; diff --git a/target-i386/cpu.h b/target-i386/cpu.h index 6303d6593d..c605724022 100644 --- a/target-i386/cpu.h +++ b/target-i386/cpu.h @@ -443,6 +443,7 @@ typedef enum FeatureWord { FEAT_1_ECX, /* CPUID[1].ECX */ FEAT_7_0_EBX, /* CPUID[EAX=7,ECX=0].EBX */ FEAT_7_0_ECX, /* CPUID[EAX=7,ECX=0].ECX */ + FEAT_7_0_EDX, /* CPUID[EAX=7,ECX=0].EDX */ FEAT_8000_0001_EDX, /* CPUID[8000_0001].EDX */ FEAT_8000_0001_ECX, /* CPUID[8000_0001].ECX */ FEAT_8000_0007_EDX, /* CPUID[8000_0007].EDX */ @@ -629,6 +630,9 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS]; #define CPUID_7_0_ECX_OSPKE (1U << 4) #define CPUID_7_0_ECX_RDPID (1U << 22) +#define CPUID_7_0_EDX_AVX512_4VNNIW (1U << 2) /* AVX512 Neural Network Instructions */ +#define CPUID_7_0_EDX_AVX512_4FMAPS (1U << 3) /* AVX512 Multiply Accumulation Single Precision */ + #define CPUID_XSAVE_XSAVEOPT (1U << 0) #define CPUID_XSAVE_XSAVEC (1U << 1) #define CPUID_XSAVE_XGETBV1 (1U << 2) From 000980cb8307afe0578368cee8f31018905bb036 Mon Sep 17 00:00:00 2001 From: Jeff Cody Date: Tue, 1 Nov 2016 10:38:35 -0400 Subject: [PATCH 27/30] checkpatch: allow spaces before parenthesis for 'coroutine_fn' Signed-off-by: Jeff Cody Message-Id: <83b0fae0728906e18849c971d22d077d7fc0f179.1478010883.git.jcody@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1f1c9d3498..f084542934 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1754,7 +1754,7 @@ sub process { # Ignore those directives where spaces _are_ permitted. if ($name =~ /^(?: if|for|while|switch|return|case| - volatile|__volatile__| + volatile|__volatile__|coroutine_fn| __attribute__|format|__extension__| asm|__asm__)$/x) { From 864111f422babcf8ce837fb47f7f9e1948446f22 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 18 Oct 2016 09:29:54 +0200 Subject: [PATCH 28/30] vl: exit qemu on guest panic if -no-shutdown is not set For automated testing purposes it can be helpful to exit qemu (poweroff) when the guest panics. Make this the default unless -no-shutdown is specified. For internal-errors like errors from KVM_RUN the behaviour is not changed, in other words QEMU does not exit to allow debugging in the QEMU monitor. Signed-off-by: Christian Borntraeger Message-Id: <1476775794-108012-1-git-send-email-borntraeger@de.ibm.com> Reviewed-by: Eric Blake Signed-off-by: Paolo Bonzini --- qapi-schema.json | 4 ++-- vl.c | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/qapi-schema.json b/qapi-schema.json index 5dc96af469..b0b4bf64cc 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4621,10 +4621,10 @@ # # @pause: system pauses # -# Since: 2.1 +# Since: 2.1 (poweroff since 2.8) ## { 'enum': 'GuestPanicAction', - 'data': [ 'pause' ] } + 'data': [ 'pause', 'poweroff' ] } ## # @rtc-reset-reinjection diff --git a/vl.c b/vl.c index 368510fd8c..319f6413f2 100644 --- a/vl.c +++ b/vl.c @@ -1792,6 +1792,11 @@ void qemu_system_guest_panicked(void) } qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_PAUSE, &error_abort); vm_stop(RUN_STATE_GUEST_PANICKED); + if (!no_shutdown) { + qapi_event_send_guest_panicked(GUEST_PANIC_ACTION_POWEROFF, + &error_abort); + qemu_system_shutdown_request(); + } } void qemu_system_reset_request(void) From 85cdeb36a4d4f46f04ea3f4a9e4f1e86dd8e5c9a Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Tue, 18 Oct 2016 01:04:18 -0400 Subject: [PATCH 29/30] docs/rcu.txt: Fix minor typo s/presented/prevented/ Signed-off-by: Pranith Kumar Message-Id: <20161018050418.4912-1-bobby.prani@gmail.com> Signed-off-by: Paolo Bonzini --- docs/rcu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/rcu.txt b/docs/rcu.txt index a70b72c545..c84e7f42b2 100644 --- a/docs/rcu.txt +++ b/docs/rcu.txt @@ -145,7 +145,7 @@ The core RCU API is small: and then read from there. RCU read-side critical sections must use atomic_rcu_read() to - read data, unless concurrent writes are presented by another + read data, unless concurrent writes are prevented by another synchronization mechanism. Furthermore, RCU read-side critical sections should traverse the From 7d175d29c9430fcba7a98f2c71925137b7870da4 Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Mon, 17 Oct 2016 20:09:39 +0200 Subject: [PATCH 30/30] main-loop: Suppress I/O thread warning under qtest We do not want to display the "I/O thread spun" warning for test cases that run under qtest. The first attempt for this (commit 01c22f2cdd4fcf02276ea10f48253850a5fd7259) tested whether qtest_enabled() was true. Commit 21a24302e85024dd7b2a151158adbc1f5dc5c4dd correctly recognized that just testing qtest_enabled() is not sufficient since there are some tests that do not use the qtest accelerator but just the qtest character device, and thus replaced qtest_enabled() by qtest_driver(). However, there are also some tests that only use the qtest accelerator and not the qtest chardev; perhaps most notably the bash iotests. Therefore, we have to check both qtest_enabled() and qtest_driver(). Signed-off-by: Max Reitz Message-Id: <20161017180939.27912-1-mreitz@redhat.com> Signed-off-by: Paolo Bonzini --- main-loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main-loop.c b/main-loop.c index 66c4eb69a3..ad10bca211 100644 --- a/main-loop.c +++ b/main-loop.c @@ -234,7 +234,7 @@ static int os_host_main_loop_wait(int64_t timeout) if (!timeout && (spin_counter > MAX_MAIN_LOOP_SPIN)) { static bool notified; - if (!notified && !qtest_driver()) { + if (!notified && !qtest_enabled() && !qtest_driver()) { fprintf(stderr, "main-loop: WARNING: I/O thread spun for %d iterations\n", MAX_MAIN_LOOP_SPIN);