From aec21d31756cbda4ad1d0fc5c60797d2c1eecf33 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 25 Jul 2019 12:49:37 +0300 Subject: [PATCH 1/9] qapi: Add InetSocketAddress member keep-alive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's needed to provide keepalive for nbd client to track server availability. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20190725094937.32454-1-vsementsov@virtuozzo.com> Reviewed-by: Markus Armbruster Acked-by: Daniel P. Berrangé [eblake: Fix error message typo] Signed-off-by: Eric Blake --- qapi/sockets.json | 6 +++++- util/qemu-sockets.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/qapi/sockets.json b/qapi/sockets.json index fc81d8d5e8..32375f3a36 100644 --- a/qapi/sockets.json +++ b/qapi/sockets.json @@ -53,6 +53,9 @@ # # @ipv6: whether to accept IPv6 addresses, default try both IPv4 and IPv6 # +# @keep-alive: enable keep-alive when connecting to this socket. Not supported +# for passive sockets. (Since 4.2) +# # Since: 1.3 ## { 'struct': 'InetSocketAddress', @@ -61,7 +64,8 @@ '*numeric': 'bool', '*to': 'uint16', '*ipv4': 'bool', - '*ipv6': 'bool' } } + '*ipv6': 'bool', + '*keep-alive': 'bool' } } ## # @UnixSocketAddress: diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index a5092dbd12..e3a1666578 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -219,6 +219,12 @@ static int inet_listen_saddr(InetSocketAddress *saddr, bool socket_created = false; Error *err = NULL; + if (saddr->keep_alive) { + error_setg(errp, "keep-alive option is not supported for passive " + "sockets"); + return -1; + } + memset(&ai,0, sizeof(ai)); ai.ai_flags = AI_PASSIVE; if (saddr->has_numeric && saddr->numeric) { @@ -458,6 +464,19 @@ int inet_connect_saddr(InetSocketAddress *saddr, Error **errp) } freeaddrinfo(res); + + if (saddr->keep_alive) { + int val = 1; + int ret = qemu_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + &val, sizeof(val)); + + if (ret < 0) { + error_setg_errno(errp, errno, "Unable to set KEEPALIVE"); + close(sock); + return -1; + } + } + return sock; } @@ -653,6 +672,15 @@ int inet_parse(InetSocketAddress *addr, const char *str, Error **errp) } addr->has_ipv6 = true; } + begin = strstr(optstr, ",keep-alive"); + if (begin) { + if (inet_parse_flag("keep-alive", begin + strlen(",keep-alive"), + &addr->keep_alive, errp) < 0) + { + return -1; + } + addr->has_keep_alive = true; + } return 0; } From 3299e5ecf7e1c9443189e50c949d97e536022c59 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 25 Jul 2019 13:05:48 +0300 Subject: [PATCH 2/9] block: implement BDRV_REQ_PREFETCH Do effective copy-on-read request when we don't need data actually. It will be used for block-stream and NBD_CMD_CACHE. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20190725100550.33801-2-vsementsov@virtuozzo.com> Reviewed-by: Stefan Hajnoczi [eblake: comment grammar fix] Signed-off-by: Eric Blake --- block/io.c | 18 ++++++++++++------ include/block/block.h | 8 +++++++- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/block/io.c b/block/io.c index 06305c6ea6..9d99858b55 100644 --- a/block/io.c +++ b/block/io.c @@ -1167,7 +1167,8 @@ bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset, } static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov) + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + int flags) { BlockDriverState *bs = child->bs; @@ -1278,9 +1279,11 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child, goto err; } - qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes, - pnum - skip_bytes); - } else { + if (!(flags & BDRV_REQ_PREFETCH)) { + qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes, + pnum - skip_bytes); + } + } else if (!(flags & BDRV_REQ_PREFETCH)) { /* Read directly into the destination */ qemu_iovec_init(&local_qiov, qiov->niov); qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes); @@ -1331,7 +1334,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, * potential fallback support, if we ever implement any read flags * to pass through to drivers. For now, there aren't any * passthrough flags. */ - assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ))); + assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ | + BDRV_REQ_PREFETCH))); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -1359,7 +1363,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child, } if (!ret || pnum != bytes) { - ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov); + ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov, flags); + goto out; + } else if (flags & BDRV_REQ_PREFETCH) { goto out; } } diff --git a/include/block/block.h b/include/block/block.h index 50a07c1c33..a9df34ff94 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -88,8 +88,14 @@ typedef enum { * fallback. */ BDRV_REQ_NO_FALLBACK = 0x100, + /* + * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ + * on read request and means that caller doesn't really need data to be + * written to qiov parameter which may be NULL. + */ + BDRV_REQ_PREFETCH = 0x200, /* Mask of valid flags */ - BDRV_REQ_MASK = 0x1ff, + BDRV_REQ_MASK = 0x3ff, } BdrvRequestFlags; typedef struct BlockSizes { From 99136607b1edb0129d1ebb6ecac21738ad3d9c36 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 25 Jul 2019 13:05:49 +0300 Subject: [PATCH 3/9] block/stream: use BDRV_REQ_PREFETCH This helps to avoid extra io, allocations and memory copying. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20190725100550.33801-3-vsementsov@virtuozzo.com> Reviewed-by: Stefan Hajnoczi [eblake: fix comment grammar] Signed-off-by: Eric Blake --- block/stream.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/block/stream.c b/block/stream.c index 6ac1e7bec4..0d3a6ac7c3 100644 --- a/block/stream.c +++ b/block/stream.c @@ -22,11 +22,11 @@ enum { /* - * Size of data buffer for populating the image file. This should be large - * enough to process multiple clusters in a single call, so that populating - * contiguous regions of the image is efficient. + * Maximum chunk size to feed to copy-on-read. This should be + * large enough to process multiple clusters in a single call, so + * that populating contiguous regions of the image is efficient. */ - STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ + STREAM_CHUNK = 512 * 1024, /* in bytes */ }; typedef struct StreamBlockJob { @@ -39,13 +39,12 @@ typedef struct StreamBlockJob { } StreamBlockJob; static int coroutine_fn stream_populate(BlockBackend *blk, - int64_t offset, uint64_t bytes, - void *buf) + int64_t offset, uint64_t bytes) { assert(bytes < SIZE_MAX); - /* Copy-on-read the unallocated clusters */ - return blk_co_pread(blk, offset, bytes, buf, BDRV_REQ_COPY_ON_READ); + return blk_co_preadv(blk, offset, bytes, NULL, + BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); } static void stream_abort(Job *job) @@ -117,7 +116,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp) int error = 0; int ret = 0; int64_t n = 0; /* bytes */ - void *buf; if (bs == s->bottom) { /* Nothing to stream */ @@ -130,8 +128,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } job_progress_set_remaining(&s->common.job, len); - buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); - /* Turn on copy-on-read for the whole block device so that guest read * requests help us make progress. Only do this when copying the entire * backing chain since the copy-on-read operation does not take base into @@ -154,7 +150,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) copy = false; - ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &n); + ret = bdrv_is_allocated(bs, offset, STREAM_CHUNK, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ } else if (ret >= 0) { @@ -171,7 +167,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp) } trace_stream_one_iteration(s, offset, n, ret); if (copy) { - ret = stream_populate(blk, offset, n, buf); + ret = stream_populate(blk, offset, n); } if (ret < 0) { BlockErrorAction action = @@ -202,8 +198,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp) bdrv_disable_copy_on_read(bs); } - qemu_vfree(buf); - /* Do not remove the backing file if an error was there but ignored. */ return error; } From 7fa5c5657f11096101f5be18fe1f7395776be5a6 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Thu, 25 Jul 2019 13:05:50 +0300 Subject: [PATCH 4/9] nbd: improve CMD_CACHE: use BDRV_REQ_PREFETCH This helps to avoid extra io, allocations and memory copying. We assume here that CMD_CACHE is always used with copy-on-read, as otherwise it's a noop. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20190725100550.33801-4-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake Reviewed-by: Stefan Hajnoczi Signed-off-by: Eric Blake --- nbd/server.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 10faedcfc5..a2cf085f76 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -2104,12 +2104,15 @@ static int nbd_co_receive_request(NBDRequestData *req, NBDRequest *request, return -EINVAL; } - req->data = blk_try_blockalign(client->exp->blk, request->len); - if (req->data == NULL) { - error_setg(errp, "No memory"); - return -ENOMEM; + if (request->type != NBD_CMD_CACHE) { + req->data = blk_try_blockalign(client->exp->blk, request->len); + if (req->data == NULL) { + error_setg(errp, "No memory"); + return -ENOMEM; + } } } + if (request->type == NBD_CMD_WRITE) { if (nbd_read(client->ioc, req->data, request->len, "CMD_WRITE data", errp) < 0) @@ -2194,7 +2197,7 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request, int ret; NBDExport *exp = client->exp; - assert(request->type == NBD_CMD_READ || request->type == NBD_CMD_CACHE); + assert(request->type == NBD_CMD_READ); /* XXX: NBD Protocol only documents use of FUA with WRITE */ if (request->flags & NBD_CMD_FLAG_FUA) { @@ -2206,7 +2209,7 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request, } if (client->structured_reply && !(request->flags & NBD_CMD_FLAG_DF) && - request->len && request->type != NBD_CMD_CACHE) + request->len) { return nbd_co_send_sparse_read(client, request->handle, request->from, data, request->len, errp); @@ -2214,7 +2217,7 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request, ret = blk_pread(exp->blk, request->from + exp->dev_offset, data, request->len); - if (ret < 0 || request->type == NBD_CMD_CACHE) { + if (ret < 0) { return nbd_send_generic_reply(client, request->handle, ret, "reading from file failed", errp); } @@ -2233,6 +2236,28 @@ static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request, } } +/* + * nbd_do_cmd_cache + * + * Handle NBD_CMD_CACHE request. + * Return -errno if sending fails. Other errors are reported directly to the + * client as an error reply. + */ +static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request, + Error **errp) +{ + int ret; + NBDExport *exp = client->exp; + + assert(request->type == NBD_CMD_CACHE); + + ret = blk_co_preadv(exp->blk, request->from + exp->dev_offset, request->len, + NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH); + + return nbd_send_generic_reply(client, request->handle, ret, + "caching data failed", errp); +} + /* Handle NBD request. * Return -errno if sending fails. Other errors are reported directly to the * client as an error reply. */ @@ -2246,8 +2271,10 @@ static coroutine_fn int nbd_handle_request(NBDClient *client, char *msg; switch (request->type) { - case NBD_CMD_READ: case NBD_CMD_CACHE: + return nbd_do_cmd_cache(client, request, errp); + + case NBD_CMD_READ: return nbd_do_cmd_read(client, request, data, errp); case NBD_CMD_WRITE: From 962b7b3d4c3aed241ec28ea712ec9079ac98d3ad Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 18 Jun 2019 14:43:20 +0300 Subject: [PATCH 5/9] block/nbd: split connection_co start out of nbd_client_connect nbd_client_connect is going to be used from connection_co, so, let's refactor nbd_client_connect in advance, leaving io channel configuration all in nbd_client_connect. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Message-Id: <20190618114328.55249-2-vsementsov@virtuozzo.com> Signed-off-by: Eric Blake --- block/nbd.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 57c1a20581..c16d02528b 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1231,14 +1231,8 @@ static int nbd_client_connect(BlockDriverState *bs, object_ref(OBJECT(s->ioc)); } - /* - * Now that we're connected, set the socket to be non-blocking and - * kick the reply mechanism. - */ qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); - s->connection_co = qemu_coroutine_create(nbd_connection_entry, s); - bdrv_inc_in_flight(bs); - nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs)); + qio_channel_attach_aio_context(s->ioc, bdrv_get_aio_context(bs)); trace_nbd_client_connect_success(export); @@ -1269,14 +1263,24 @@ static int nbd_client_init(BlockDriverState *bs, const char *x_dirty_bitmap, Error **errp) { + int ret; BDRVNBDState *s = (BDRVNBDState *)bs->opaque; s->bs = bs; qemu_co_mutex_init(&s->send_mutex); qemu_co_queue_init(&s->free_sema); - return nbd_client_connect(bs, saddr, export, tlscreds, hostname, - x_dirty_bitmap, errp); + ret = nbd_client_connect(bs, saddr, export, tlscreds, hostname, + x_dirty_bitmap, errp); + if (ret < 0) { + return ret; + } + + s->connection_co = qemu_coroutine_create(nbd_connection_entry, s); + bdrv_inc_in_flight(bs); + aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co); + + return 0; } static int nbd_parse_uri(const char *filename, QDict *options) From a8e2bb6a76c7c661c117327f70f06eb628554230 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 18 Jun 2019 14:43:21 +0300 Subject: [PATCH 6/9] block/nbd: use non-blocking io channel for nbd negotiation No reason to use blocking channel for negotiation and we'll benefit in further reconnect feature, as qio_channel reads and writes will do qemu_coroutine_yield while waiting for io completion. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Message-Id: <20190618114328.55249-3-vsementsov@virtuozzo.com> Signed-off-by: Eric Blake --- block/nbd.c | 16 +++++++--------- include/block/nbd.h | 3 ++- nbd/client.c | 16 +++++++++++----- qemu-nbd.c | 2 +- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index c16d02528b..3a243d9de9 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1175,6 +1175,7 @@ static int nbd_client_connect(BlockDriverState *bs, Error **errp) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; + AioContext *aio_context = bdrv_get_aio_context(bs); int ret; /* @@ -1189,15 +1190,16 @@ static int nbd_client_connect(BlockDriverState *bs, /* NBD handshake */ trace_nbd_client_connect(export); - qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL); + qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); + qio_channel_attach_aio_context(QIO_CHANNEL(sioc), aio_context); s->info.request_sizes = true; s->info.structured_reply = true; s->info.base_allocation = true; s->info.x_dirty_bitmap = g_strdup(x_dirty_bitmap); s->info.name = g_strdup(export ?: ""); - ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), tlscreds, hostname, - &s->ioc, &s->info, errp); + ret = nbd_receive_negotiate(aio_context, QIO_CHANNEL(sioc), tlscreds, + hostname, &s->ioc, &s->info, errp); g_free(s->info.x_dirty_bitmap); g_free(s->info.name); if (ret < 0) { @@ -1231,18 +1233,14 @@ static int nbd_client_connect(BlockDriverState *bs, object_ref(OBJECT(s->ioc)); } - qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); - qio_channel_attach_aio_context(s->ioc, bdrv_get_aio_context(bs)); - trace_nbd_client_connect_success(export); return 0; fail: /* - * We have connected, but must fail for other reasons. The - * connection is still blocking; send NBD_CMD_DISC as a courtesy - * to the server. + * We have connected, but must fail for other reasons. + * Send NBD_CMD_DISC as a courtesy to the server. */ { NBDRequest request = { .type = NBD_CMD_DISC }; diff --git a/include/block/nbd.h b/include/block/nbd.h index bb9f5bc021..7b36d672f0 100644 --- a/include/block/nbd.h +++ b/include/block/nbd.h @@ -304,7 +304,8 @@ struct NBDExportInfo { }; typedef struct NBDExportInfo NBDExportInfo; -int nbd_receive_negotiate(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, +int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc, + QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, NBDExportInfo *info, Error **errp); void nbd_free_export_list(NBDExportInfo *info, int count); diff --git a/nbd/client.c b/nbd/client.c index 4de30630c7..8f524c3e35 100644 --- a/nbd/client.c +++ b/nbd/client.c @@ -867,7 +867,8 @@ static int nbd_list_meta_contexts(QIOChannel *ioc, * 2: server is newstyle, but lacks structured replies * 3: server is newstyle and set up for structured replies */ -static int nbd_start_negotiate(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, +static int nbd_start_negotiate(AioContext *aio_context, QIOChannel *ioc, + QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, bool structured_reply, bool *zeroes, Error **errp) @@ -934,6 +935,10 @@ static int nbd_start_negotiate(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, return -EINVAL; } ioc = *outioc; + if (aio_context) { + qio_channel_set_blocking(ioc, false, NULL); + qio_channel_attach_aio_context(ioc, aio_context); + } } else { error_setg(errp, "Server does not support STARTTLS"); return -EINVAL; @@ -998,7 +1003,8 @@ static int nbd_negotiate_finish_oldstyle(QIOChannel *ioc, NBDExportInfo *info, * Returns: negative errno: failure talking to server * 0: server is connected */ -int nbd_receive_negotiate(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, +int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc, + QCryptoTLSCreds *tlscreds, const char *hostname, QIOChannel **outioc, NBDExportInfo *info, Error **errp) { @@ -1009,7 +1015,7 @@ int nbd_receive_negotiate(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, assert(info->name); trace_nbd_receive_negotiate_name(info->name); - result = nbd_start_negotiate(ioc, tlscreds, hostname, outioc, + result = nbd_start_negotiate(aio_context, ioc, tlscreds, hostname, outioc, info->structured_reply, &zeroes, errp); info->structured_reply = false; @@ -1129,8 +1135,8 @@ int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, QIOChannel *sioc = NULL; *info = NULL; - result = nbd_start_negotiate(ioc, tlscreds, hostname, &sioc, true, NULL, - errp); + result = nbd_start_negotiate(NULL, ioc, tlscreds, hostname, &sioc, true, + NULL, errp); if (tlscreds && sioc) { ioc = sioc; } diff --git a/qemu-nbd.c b/qemu-nbd.c index a8cb39e510..049645491d 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -362,7 +362,7 @@ static void *nbd_client_thread(void *arg) goto out; } - ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), + ret = nbd_receive_negotiate(NULL, QIO_CHANNEL(sioc), NULL, NULL, NULL, &info, &local_error); if (ret < 0) { if (local_error) { From a34b1e5e06de777c0faaa78694c4e0775b2bef0c Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 18 Jun 2019 14:43:22 +0300 Subject: [PATCH 7/9] block/nbd: move from quit to state To implement reconnect we need several states for the client: CONNECTED, QUIT and two different CONNECTING states. CONNECTING states will be added in the following patches. This patch implements CONNECTED and QUIT. QUIT means, that we should close the connection and fail all current and further requests (like old quit = true). CONNECTED means that connection is ok, we can send requests (like old quit = false). For receiving loop we use a comparison of the current state with QUIT, because reconnect will be in the same loop, so it should be looping until the end. Opposite, for requests we use a comparison of the current state with CONNECTED, as we don't want to send requests in future CONNECTING states. Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Message-Id: <20190618114328.55249-4-vsementsov@virtuozzo.com> Signed-off-by: Eric Blake --- block/nbd.c | 58 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index 3a243d9de9..d03b00fc30 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -53,6 +53,11 @@ typedef struct { bool receiving; /* waiting for connection_co? */ } NBDClientRequest; +typedef enum NBDClientState { + NBD_CLIENT_CONNECTED, + NBD_CLIENT_QUIT +} NBDClientState; + typedef struct BDRVNBDState { QIOChannelSocket *sioc; /* The master data channel */ QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ @@ -62,17 +67,23 @@ typedef struct BDRVNBDState { CoQueue free_sema; Coroutine *connection_co; int in_flight; + NBDClientState state; NBDClientRequest requests[MAX_NBD_REQUESTS]; NBDReply reply; BlockDriverState *bs; - bool quit; /* For nbd_refresh_filename() */ SocketAddress *saddr; char *export, *tlscredsid; } BDRVNBDState; +/* @ret will be used for reconnect in future */ +static void nbd_channel_error(BDRVNBDState *s, int ret) +{ + s->state = NBD_CLIENT_QUIT; +} + static void nbd_recv_coroutines_wake_all(BDRVNBDState *s) { int i; @@ -151,7 +162,7 @@ static coroutine_fn void nbd_connection_entry(void *opaque) int ret = 0; Error *local_err = NULL; - while (!s->quit) { + while (s->state != NBD_CLIENT_QUIT) { /* * The NBD client can only really be considered idle when it has * yielded from qio_channel_readv_all_eof(), waiting for data. This is @@ -169,6 +180,7 @@ static coroutine_fn void nbd_connection_entry(void *opaque) error_free(local_err); } if (ret <= 0) { + nbd_channel_error(s, ret ? ret : -EIO); break; } @@ -183,6 +195,7 @@ static coroutine_fn void nbd_connection_entry(void *opaque) !s->requests[i].receiving || (nbd_reply_is_structured(&s->reply) && !s->info.structured_reply)) { + nbd_channel_error(s, -EINVAL); break; } @@ -202,7 +215,6 @@ static coroutine_fn void nbd_connection_entry(void *opaque) qemu_coroutine_yield(); } - s->quit = true; nbd_recv_coroutines_wake_all(s); bdrv_dec_in_flight(s->bs); @@ -215,12 +227,18 @@ static int nbd_co_send_request(BlockDriverState *bs, QEMUIOVector *qiov) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - int rc, i; + int rc, i = -1; qemu_co_mutex_lock(&s->send_mutex); while (s->in_flight == MAX_NBD_REQUESTS) { qemu_co_queue_wait(&s->free_sema, &s->send_mutex); } + + if (s->state != NBD_CLIENT_CONNECTED) { + rc = -EIO; + goto err; + } + s->in_flight++; for (i = 0; i < MAX_NBD_REQUESTS; i++) { @@ -238,16 +256,12 @@ static int nbd_co_send_request(BlockDriverState *bs, request->handle = INDEX_TO_HANDLE(s, i); - if (s->quit) { - rc = -EIO; - goto err; - } assert(s->ioc); if (qiov) { qio_channel_set_cork(s->ioc, true); rc = nbd_send_request(s->ioc, request); - if (rc >= 0 && !s->quit) { + if (rc >= 0 && s->state == NBD_CLIENT_CONNECTED) { if (qio_channel_writev_all(s->ioc, qiov->iov, qiov->niov, NULL) < 0) { rc = -EIO; @@ -262,9 +276,11 @@ static int nbd_co_send_request(BlockDriverState *bs, err: if (rc < 0) { - s->quit = true; - s->requests[i].coroutine = NULL; - s->in_flight--; + nbd_channel_error(s, rc); + if (i != -1) { + s->requests[i].coroutine = NULL; + s->in_flight--; + } qemu_co_queue_next(&s->free_sema); } qemu_co_mutex_unlock(&s->send_mutex); @@ -556,7 +572,7 @@ static coroutine_fn int nbd_co_do_receive_one_chunk( s->requests[i].receiving = true; qemu_coroutine_yield(); s->requests[i].receiving = false; - if (s->quit) { + if (s->state != NBD_CLIENT_CONNECTED) { error_setg(errp, "Connection closed"); return -EIO; } @@ -641,7 +657,7 @@ static coroutine_fn int nbd_co_receive_one_chunk( if (ret < 0) { memset(reply, 0, sizeof(*reply)); - s->quit = true; + nbd_channel_error(s, ret); } else { /* For assert at loop start in nbd_connection_entry */ *reply = s->reply; @@ -709,7 +725,7 @@ static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s, NBDReply local_reply; NBDStructuredReplyChunk *chunk; Error *local_err = NULL; - if (s->quit) { + if (s->state != NBD_CLIENT_CONNECTED) { error_setg(&local_err, "Connection closed"); nbd_iter_channel_error(iter, -EIO, &local_err); goto break_loop; @@ -734,7 +750,7 @@ static bool nbd_reply_chunk_iter_receive(BDRVNBDState *s, } /* Do not execute the body of NBD_FOREACH_REPLY_CHUNK for simple reply. */ - if (nbd_reply_is_simple(reply) || s->quit) { + if (nbd_reply_is_simple(reply) || s->state != NBD_CLIENT_CONNECTED) { goto break_loop; } @@ -808,14 +824,14 @@ static int nbd_co_receive_cmdread_reply(BDRVNBDState *s, uint64_t handle, ret = nbd_parse_offset_hole_payload(s, &reply.structured, payload, offset, qiov, &local_err); if (ret < 0) { - s->quit = true; + nbd_channel_error(s, ret); nbd_iter_channel_error(&iter, ret, &local_err); } break; default: if (!nbd_reply_type_is_error(chunk->type)) { /* not allowed reply type */ - s->quit = true; + nbd_channel_error(s, -EINVAL); error_setg(&local_err, "Unexpected reply type: %d (%s) for CMD_READ", chunk->type, nbd_reply_type_lookup(chunk->type)); @@ -853,7 +869,7 @@ static int nbd_co_receive_blockstatus_reply(BDRVNBDState *s, switch (chunk->type) { case NBD_REPLY_TYPE_BLOCK_STATUS: if (received) { - s->quit = true; + nbd_channel_error(s, -EINVAL); error_setg(&local_err, "Several BLOCK_STATUS chunks in reply"); nbd_iter_channel_error(&iter, -EINVAL, &local_err); } @@ -863,13 +879,13 @@ static int nbd_co_receive_blockstatus_reply(BDRVNBDState *s, payload, length, extent, &local_err); if (ret < 0) { - s->quit = true; + nbd_channel_error(s, ret); nbd_iter_channel_error(&iter, ret, &local_err); } break; default: if (!nbd_reply_type_is_error(chunk->type)) { - s->quit = true; + nbd_channel_error(s, -EINVAL); error_setg(&local_err, "Unexpected reply type: %d (%s) " "for CMD_BLOCK_STATUS", From b172ae2e0ed38a1d058babe6a788f97c402e0e51 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 18 Jun 2019 14:43:23 +0300 Subject: [PATCH 8/9] block/nbd: add cmdline and qapi parameter reconnect-delay Reconnect will be implemented in the following commit, so for now, in semantics below, disconnect itself is a "serious error". Signed-off-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Eric Blake Message-Id: <20190618114328.55249-5-vsementsov@virtuozzo.com> [eblake: slipped from 4.1 to 4.2] Signed-off-by: Eric Blake --- block/nbd.c | 16 +++++++++++++++- qapi/block-core.json | 11 ++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index d03b00fc30..de2a26097b 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -1275,6 +1275,7 @@ static int nbd_client_init(BlockDriverState *bs, QCryptoTLSCreds *tlscreds, const char *hostname, const char *x_dirty_bitmap, + uint32_t reconnect_delay, Error **errp) { int ret; @@ -1600,6 +1601,17 @@ static QemuOptsList nbd_runtime_opts = { .help = "experimental: expose named dirty bitmap in place of " "block status", }, + { + .name = "reconnect-delay", + .type = QEMU_OPT_NUMBER, + .help = "On an unexpected disconnect, the nbd client tries to " + "connect again until succeeding or encountering a serious " + "error. During the first @reconnect-delay seconds, all " + "requests are paused and will be rerun on a successful " + "reconnect. After that time, any delayed requests and all " + "future requests before a successful reconnect will " + "immediately fail. Default 0", + }, { /* end of list */ } }, }; @@ -1651,7 +1663,9 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, /* NBD handshake */ ret = nbd_client_init(bs, s->saddr, s->export, tlscreds, hostname, - qemu_opt_get(opts, "x-dirty-bitmap"), errp); + qemu_opt_get(opts, "x-dirty-bitmap"), + qemu_opt_get_number(opts, "reconnect-delay", 0), + errp); error: if (tlscreds) { diff --git a/qapi/block-core.json b/qapi/block-core.json index 0d43d4f37c..f1e7701fbe 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -3860,13 +3860,22 @@ # traditional "base:allocation" block status (see # NBD_OPT_LIST_META_CONTEXT in the NBD protocol) (since 3.0) # +# @reconnect-delay: On an unexpected disconnect, the nbd client tries to +# connect again until succeeding or encountering a serious +# error. During the first @reconnect-delay seconds, all +# requests are paused and will be rerun on a successful +# reconnect. After that time, any delayed requests and all +# future requests before a successful reconnect will +# immediately fail. Default 0 (Since 4.2) +# # Since: 2.9 ## { 'struct': 'BlockdevOptionsNbd', 'data': { 'server': 'SocketAddress', '*export': 'str', '*tls-creds': 'str', - '*x-dirty-bitmap': 'str' } } + '*x-dirty-bitmap': 'str', + '*reconnect-delay': 'uint32' } } ## # @BlockdevOptionsRaw: From 8f071c9db506e03abcb1b76ec6d3d2f9488cc3b3 Mon Sep 17 00:00:00 2001 From: Vladimir Sementsov-Ogievskiy Date: Tue, 18 Jun 2019 14:43:24 +0300 Subject: [PATCH 9/9] block/nbd: refactor nbd connection parameters We'll need some connection parameters to be available all the time to implement nbd reconnect. So, let's refactor them: define additional parameters in BDRVNBDState, drop them from function parameters, drop nbd_client_init and separate options parsing instead from nbd_open. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20190618114328.55249-6-vsementsov@virtuozzo.com> Reviewed-by: Eric Blake [eblake: Drop useless 'if' before object_unref] Signed-off-by: Eric Blake --- block/nbd.c | 121 ++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 61 deletions(-) diff --git a/block/nbd.c b/block/nbd.c index de2a26097b..00b8d86783 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -73,9 +73,13 @@ typedef struct BDRVNBDState { NBDReply reply; BlockDriverState *bs; - /* For nbd_refresh_filename() */ + /* Connection parameters */ + uint32_t reconnect_delay; SocketAddress *saddr; char *export, *tlscredsid; + QCryptoTLSCreds *tlscreds; + const char *hostname; + char *x_dirty_bitmap; } BDRVNBDState; /* @ret will be used for reconnect in future */ @@ -1182,13 +1186,7 @@ static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr, return sioc; } -static int nbd_client_connect(BlockDriverState *bs, - SocketAddress *saddr, - const char *export, - QCryptoTLSCreds *tlscreds, - const char *hostname, - const char *x_dirty_bitmap, - Error **errp) +static int nbd_client_connect(BlockDriverState *bs, Error **errp) { BDRVNBDState *s = (BDRVNBDState *)bs->opaque; AioContext *aio_context = bdrv_get_aio_context(bs); @@ -1198,33 +1196,33 @@ static int nbd_client_connect(BlockDriverState *bs, * establish TCP connection, return error if it fails * TODO: Configurable retry-until-timeout behaviour. */ - QIOChannelSocket *sioc = nbd_establish_connection(saddr, errp); + QIOChannelSocket *sioc = nbd_establish_connection(s->saddr, errp); if (!sioc) { return -ECONNREFUSED; } /* NBD handshake */ - trace_nbd_client_connect(export); + trace_nbd_client_connect(s->export); qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); qio_channel_attach_aio_context(QIO_CHANNEL(sioc), aio_context); s->info.request_sizes = true; s->info.structured_reply = true; s->info.base_allocation = true; - s->info.x_dirty_bitmap = g_strdup(x_dirty_bitmap); - s->info.name = g_strdup(export ?: ""); - ret = nbd_receive_negotiate(aio_context, QIO_CHANNEL(sioc), tlscreds, - hostname, &s->ioc, &s->info, errp); + s->info.x_dirty_bitmap = g_strdup(s->x_dirty_bitmap); + s->info.name = g_strdup(s->export ?: ""); + ret = nbd_receive_negotiate(aio_context, QIO_CHANNEL(sioc), s->tlscreds, + s->hostname, &s->ioc, &s->info, errp); g_free(s->info.x_dirty_bitmap); g_free(s->info.name); if (ret < 0) { object_unref(OBJECT(sioc)); return ret; } - if (x_dirty_bitmap && !s->info.base_allocation) { + if (s->x_dirty_bitmap && !s->info.base_allocation) { error_setg(errp, "requested x-dirty-bitmap %s not found", - x_dirty_bitmap); + s->x_dirty_bitmap); ret = -EINVAL; goto fail; } @@ -1249,7 +1247,7 @@ static int nbd_client_connect(BlockDriverState *bs, object_ref(OBJECT(s->ioc)); } - trace_nbd_client_connect_success(export); + trace_nbd_client_connect_success(s->export); return 0; @@ -1269,34 +1267,9 @@ static int nbd_client_connect(BlockDriverState *bs, } } -static int nbd_client_init(BlockDriverState *bs, - SocketAddress *saddr, - const char *export, - QCryptoTLSCreds *tlscreds, - const char *hostname, - const char *x_dirty_bitmap, - uint32_t reconnect_delay, - Error **errp) -{ - int ret; - BDRVNBDState *s = (BDRVNBDState *)bs->opaque; - - s->bs = bs; - qemu_co_mutex_init(&s->send_mutex); - qemu_co_queue_init(&s->free_sema); - - ret = nbd_client_connect(bs, saddr, export, tlscreds, hostname, - x_dirty_bitmap, errp); - if (ret < 0) { - return ret; - } - - s->connection_co = qemu_coroutine_create(nbd_connection_entry, s); - bdrv_inc_in_flight(bs); - aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co); - - return 0; -} +/* + * Parse nbd_open options + */ static int nbd_parse_uri(const char *filename, QDict *options) { @@ -1616,14 +1589,12 @@ static QemuOptsList nbd_runtime_opts = { }, }; -static int nbd_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) +static int nbd_process_options(BlockDriverState *bs, QDict *options, + Error **errp) { BDRVNBDState *s = bs->opaque; - QemuOpts *opts = NULL; + QemuOpts *opts; Error *local_err = NULL; - QCryptoTLSCreds *tlscreds = NULL; - const char *hostname = NULL; int ret = -EINVAL; opts = qemu_opts_create(&nbd_runtime_opts, NULL, 0, &error_abort); @@ -1648,8 +1619,8 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, s->tlscredsid = g_strdup(qemu_opt_get(opts, "tls-creds")); if (s->tlscredsid) { - tlscreds = nbd_get_tls_creds(s->tlscredsid, errp); - if (!tlscreds) { + s->tlscreds = nbd_get_tls_creds(s->tlscredsid, errp); + if (!s->tlscreds) { goto error; } @@ -1658,20 +1629,17 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "TLS only supported over IP sockets"); goto error; } - hostname = s->saddr->u.inet.host; + s->hostname = s->saddr->u.inet.host; } - /* NBD handshake */ - ret = nbd_client_init(bs, s->saddr, s->export, tlscreds, hostname, - qemu_opt_get(opts, "x-dirty-bitmap"), - qemu_opt_get_number(opts, "reconnect-delay", 0), - errp); + s->x_dirty_bitmap = g_strdup(qemu_opt_get(opts, "x-dirty-bitmap")); + s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0); + + ret = 0; error: - if (tlscreds) { - object_unref(OBJECT(tlscreds)); - } if (ret < 0) { + object_unref(OBJECT(s->tlscreds)); qapi_free_SocketAddress(s->saddr); g_free(s->export); g_free(s->tlscredsid); @@ -1680,6 +1648,35 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags, return ret; } +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + int ret; + BDRVNBDState *s = (BDRVNBDState *)bs->opaque; + + ret = nbd_process_options(bs, options, errp); + if (ret < 0) { + return ret; + } + + s->bs = bs; + qemu_co_mutex_init(&s->send_mutex); + qemu_co_queue_init(&s->free_sema); + + ret = nbd_client_connect(bs, errp); + if (ret < 0) { + return ret; + } + /* successfully connected */ + s->state = NBD_CLIENT_CONNECTED; + + s->connection_co = qemu_coroutine_create(nbd_connection_entry, s); + bdrv_inc_in_flight(bs); + aio_co_schedule(bdrv_get_aio_context(bs), s->connection_co); + + return 0; +} + static int nbd_co_flush(BlockDriverState *bs) { return nbd_client_co_flush(bs); @@ -1725,9 +1722,11 @@ static void nbd_close(BlockDriverState *bs) nbd_client_close(bs); + object_unref(OBJECT(s->tlscreds)); qapi_free_SocketAddress(s->saddr); g_free(s->export); g_free(s->tlscredsid); + g_free(s->x_dirty_bitmap); } static int64_t nbd_getlength(BlockDriverState *bs)