From 987a9b4800003567b1a47a379255e886a77d57ea Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Aug 2012 16:45:55 +0200 Subject: [PATCH 01/13] net: notify iothread after flushing queue virtio-net has code to flush the queue and notify the iothread whenever new receive buffers are added by the guest. That is fine, and indeed we need to do the same in all other drivers. However, notifying the iothread should be work for the network subsystem. And since we are at it we can add a little smartness: if some of the queued packets already could not be delivered, there is no need to notify the iothread. Reported-by: Luigi Rizzo Cc: Stefan Hajnoczi Cc: Jan Kiszka Signed-off-by: Paolo Bonzini Reviewed-by: Amos Kong Signed-off-by: Stefan Hajnoczi --- hw/virtio-net.c | 4 ---- net.c | 7 ++++++- net/queue.c | 5 +++-- net/queue.h | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index b1998b27d3..6490743290 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -447,10 +447,6 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) VirtIONet *n = to_virtio_net(vdev); qemu_flush_queued_packets(&n->nic->nc); - - /* We now have RX buffers, signal to the IO thread to break out of the - * select to re-poll the tap file descriptor */ - qemu_notify_event(); } static int virtio_net_can_receive(NetClientState *nc) diff --git a/net.c b/net.c index e5d25d4b6d..d9ba1e5b74 100644 --- a/net.c +++ b/net.c @@ -357,7 +357,12 @@ void qemu_flush_queued_packets(NetClientState *nc) { nc->receive_disabled = 0; - qemu_net_queue_flush(nc->send_queue); + if (qemu_net_queue_flush(nc->send_queue)) { + /* We emptied the queue successfully, signal to the IO thread to repoll + * the file descriptor (for tap, for example). + */ + qemu_notify_event(); + } } static ssize_t qemu_send_packet_async_with_flags(NetClientState *sender, diff --git a/net/queue.c b/net/queue.c index e8030aafe4..6e64091463 100644 --- a/net/queue.c +++ b/net/queue.c @@ -228,7 +228,7 @@ void qemu_net_queue_purge(NetQueue *queue, NetClientState *from) } } -void qemu_net_queue_flush(NetQueue *queue) +bool qemu_net_queue_flush(NetQueue *queue) { while (!QTAILQ_EMPTY(&queue->packets)) { NetPacket *packet; @@ -244,7 +244,7 @@ void qemu_net_queue_flush(NetQueue *queue) packet->size); if (ret == 0) { QTAILQ_INSERT_HEAD(&queue->packets, packet, entry); - break; + return false; } if (packet->sent_cb) { @@ -253,4 +253,5 @@ void qemu_net_queue_flush(NetQueue *queue) g_free(packet); } + return true; } diff --git a/net/queue.h b/net/queue.h index 9d44a9b3b8..fc02b33915 100644 --- a/net/queue.h +++ b/net/queue.h @@ -53,6 +53,6 @@ ssize_t qemu_net_queue_send_iov(NetQueue *queue, NetPacketSent *sent_cb); void qemu_net_queue_purge(NetQueue *queue, NetClientState *from); -void qemu_net_queue_flush(NetQueue *queue); +bool qemu_net_queue_flush(NetQueue *queue); #endif /* QEMU_NET_QUEUE_H */ From e8b4c680b41bd960ecccd9ff076b7b058e0afcd4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Aug 2012 16:45:56 +0200 Subject: [PATCH 02/13] e1000: flush queue whenever can_receive can go from false to true When the guests replenish the receive ring buffer, the network device should flush its queue of pending packets. This is done with qemu_flush_queued_packets. e1000's can_receive can go from false to true when RCTL or RDT are modified. Reported-by: Luigi Rizzo Cc: Stefan Hajnoczi Cc: Jan Kiszka Signed-off-by: Paolo Bonzini Reviewed-by: Amos Kong Signed-off-by: Stefan Hajnoczi --- hw/e1000.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/e1000.c b/hw/e1000.c index ae8a6c5523..ec3a7c4ecc 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -295,6 +295,7 @@ set_rx_control(E1000State *s, int index, uint32_t val) s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1; DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT], s->mac_reg[RCTL]); + qemu_flush_queued_packets(&s->nic->nc); } static void @@ -926,6 +927,9 @@ set_rdt(E1000State *s, int index, uint32_t val) { s->check_rxov = 0; s->mac_reg[index] = val & 0xffff; + if (e1000_has_rxbufs(s, 1)) { + qemu_flush_queued_packets(&s->nic->nc); + } } static void From a98b140223d3a627eab7ee3ddec645bab630d756 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 9 Aug 2012 16:45:57 +0200 Subject: [PATCH 03/13] xen: flush queue when getting an event xen does not have a register that, when written, will cause can_receive to go from false to true. However, flushing the queue can be attempted whenever the front-end raises its side of the Xen event channel. There is a single event channel for tx and rx. Cc: Stefano Stabellini Cc: Stefan Hajnoczi Signed-off-by: Paolo Bonzini Reviewed-by: Amos Kong Signed-off-by: Stefan Hajnoczi --- hw/xen_nic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/xen_nic.c b/hw/xen_nic.c index 8b79bfb73e..cf7d5591b3 100644 --- a/hw/xen_nic.c +++ b/hw/xen_nic.c @@ -415,6 +415,7 @@ static void net_event(struct XenDevice *xendev) { struct XenNetDev *netdev = container_of(xendev, struct XenNetDev, xendev); net_tx_packets(netdev); + qemu_flush_queued_packets(&netdev->nic->nc); } static int net_free(struct XenDevice *xendev) From 1069985fb132cd4324fc02d371f1e61492a1823f Mon Sep 17 00:00:00 2001 From: Bo Yang Date: Wed, 29 Aug 2012 19:26:11 +0800 Subject: [PATCH 04/13] eepro100: Fix network hang when rx buffers run out This is reported by QA. When installing os with pxe, after the initial kernel and initrd are loaded, the procedure tries to copy files from install server to local harddisk, the network becomes stall because of running out of receive descriptor. [Whitespace fixes and removed qemu_notify_event() because Paolo's earlier net patches have moved it into qemu_flush_queued_packets(). Additional info: I can reproduce the network hang with a tap device doing a iPXE HTTP boot as follows: $ qemu -enable-kvm -m 1024 \ -netdev tap,id=netdev0,script=no,downscript=no \ -device i82559er,netdev=netdev0,romfile=80861209.rom \ -drive if=virtio,cache=none,file=test.img iPXE> ifopen net0 iPXE> config # set static network configuration iPXE> kernel http://mirror.bytemark.co.uk/fedora/linux/releases/17/Fedora/x86_64/os/images/pxeboot/vmlinuz I needed a vanilla iPXE ROM to get to the iPXE prompt. I think the boot prompt has been disabled in the ROMs that ship with QEMU to reduce boot time. During the vmlinuz HTTP download there is a network hang. hw/eepro100.c has reached the end of the rx descriptor list. When the iPXE driver replenishes the rx descriptor list we don't kick the QEMU net subsystem and event loop, thereby leaving the tap netdev without its file descriptor in select(2). Stefan Hajnoczi ] Signed-off-by: Bo Yang Signed-off-by: Stefan Hajnoczi --- hw/eepro100.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hw/eepro100.c b/hw/eepro100.c index 50d117e35e..5b231163d8 100644 --- a/hw/eepro100.c +++ b/hw/eepro100.c @@ -1036,6 +1036,7 @@ static void eepro100_ru_command(EEPRO100State * s, uint8_t val) } set_ru_state(s, ru_ready); s->ru_offset = e100_read_reg4(s, SCBPointer); + qemu_flush_queued_packets(&s->nic->nc); TRACE(OTHER, logout("val=0x%02x (rx start)\n", val)); break; case RX_RESUME: @@ -1770,7 +1771,8 @@ static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size) if (rfd_command & COMMAND_EL) { /* EL bit is set, so this was the last frame. */ logout("receive: Running out of frames\n"); - set_ru_state(s, ru_suspended); + set_ru_state(s, ru_no_resources); + eepro100_rnr_interrupt(s); } if (rfd_command & COMMAND_S) { /* S bit is set. */ From c67f5dc10573687497f0f5c3aec19b15c35c63d7 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 17 Aug 2012 21:16:42 +0100 Subject: [PATCH 05/13] net: add receive_disabled logic to iov delivery path This patch adds the missing NetClient->receive_disabled logic in the sendv delivery code path. It seems that commit 893379efd0e1b84ceb0c42a713293f3dbd27b1bd ("net: disable receiving if client returns zero") only added the logic to qemu_deliver_packet() and not qemu_deliver_packet_iov(). The receive_disabled flag should be automatically set when .receive(), .receive_raw(), or .receive_iov() return 0. No further packets will be delivered to the NetClient until the receive_disabled flag is cleared again by calling qemu_flush_queued_packets(). Typically the NetClient will wait until its file descriptor becomes writable and then invoke qemu_flush_queued_packets() to resume transmission. Signed-off-by: Stefan Hajnoczi --- net.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/net.c b/net.c index d9ba1e5b74..a187a7b3db 100644 --- a/net.c +++ b/net.c @@ -423,16 +423,27 @@ ssize_t qemu_deliver_packet_iov(NetClientState *sender, void *opaque) { NetClientState *nc = opaque; + int ret; if (nc->link_down) { return iov_size(iov, iovcnt); } - if (nc->info->receive_iov) { - return nc->info->receive_iov(nc, iov, iovcnt); - } else { - return nc_sendv_compat(nc, iov, iovcnt); + if (nc->receive_disabled) { + return 0; } + + if (nc->info->receive_iov) { + ret = nc->info->receive_iov(nc, iov, iovcnt); + } else { + ret = nc_sendv_compat(nc, iov, iovcnt); + } + + if (ret == 0) { + nc->receive_disabled = 1; + } + + return ret; } ssize_t qemu_sendv_packet_async(NetClientState *sender, From 06b5f36d052b540a59b52150582d65674199b2ce Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 20 Aug 2012 13:35:23 +0100 Subject: [PATCH 06/13] net: do not report queued packets as sent Net send functions have a return value where 0 means the packet has not been sent and will be queued. A non-zero value means the packet was sent or an error caused the packet to be dropped. This patch fixes two instances where packets are queued but we return their size. This causes callers to believe the packets were sent. When the caller uses the async send interface this creates a real problem because the callback will be invoked for a packet that the caller believed to be already sent. This bug can cause double-frees in the caller. Signed-off-by: Stefan Hajnoczi --- net/queue.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/net/queue.c b/net/queue.c index 6e64091463..254f28013a 100644 --- a/net/queue.c +++ b/net/queue.c @@ -83,12 +83,12 @@ void qemu_del_net_queue(NetQueue *queue) g_free(queue); } -static ssize_t qemu_net_queue_append(NetQueue *queue, - NetClientState *sender, - unsigned flags, - const uint8_t *buf, - size_t size, - NetPacketSent *sent_cb) +static void qemu_net_queue_append(NetQueue *queue, + NetClientState *sender, + unsigned flags, + const uint8_t *buf, + size_t size, + NetPacketSent *sent_cb) { NetPacket *packet; @@ -100,16 +100,14 @@ static ssize_t qemu_net_queue_append(NetQueue *queue, memcpy(packet->data, buf, size); QTAILQ_INSERT_TAIL(&queue->packets, packet, entry); - - return size; } -static ssize_t qemu_net_queue_append_iov(NetQueue *queue, - NetClientState *sender, - unsigned flags, - const struct iovec *iov, - int iovcnt, - NetPacketSent *sent_cb) +static void qemu_net_queue_append_iov(NetQueue *queue, + NetClientState *sender, + unsigned flags, + const struct iovec *iov, + int iovcnt, + NetPacketSent *sent_cb) { NetPacket *packet; size_t max_len = 0; @@ -133,8 +131,6 @@ static ssize_t qemu_net_queue_append_iov(NetQueue *queue, } QTAILQ_INSERT_TAIL(&queue->packets, packet, entry); - - return packet->size; } static ssize_t qemu_net_queue_deliver(NetQueue *queue, @@ -177,7 +173,8 @@ ssize_t qemu_net_queue_send(NetQueue *queue, ssize_t ret; if (queue->delivering || !qemu_can_send_packet(sender)) { - return qemu_net_queue_append(queue, sender, flags, data, size, sent_cb); + qemu_net_queue_append(queue, sender, flags, data, size, sent_cb); + return 0; } ret = qemu_net_queue_deliver(queue, sender, flags, data, size); @@ -201,8 +198,8 @@ ssize_t qemu_net_queue_send_iov(NetQueue *queue, ssize_t ret; if (queue->delivering || !qemu_can_send_packet(sender)) { - return qemu_net_queue_append_iov(queue, sender, flags, - iov, iovcnt, sent_cb); + qemu_net_queue_append_iov(queue, sender, flags, iov, iovcnt, sent_cb); + return 0; } ret = qemu_net_queue_deliver_iov(queue, sender, flags, iov, iovcnt); From 08d12022c7f1aba6acccc75150659c6e4c9dff23 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 14 Aug 2012 14:14:27 +0100 Subject: [PATCH 07/13] net: add -netdev options to man page Document the -netdev syntax which supercedes the older -net syntax. This patch is a first step to making -netdev prominent in the QEMU manual. Reported-by: Anatoly Techtonik Signed-off-by: Stefan Hajnoczi --- qemu-options.hx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/qemu-options.hx b/qemu-options.hx index 804a2d1739..0977b3f783 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -1357,6 +1357,7 @@ Valid values for @var{type} are Not all devices are supported on all targets. Use -net nic,model=? for a list of available devices for your target. +@item -netdev user,id=@var{id}[,@var{option}][,@var{option}][,...] @item -net user[,@var{option}][,@var{option}][,...] Use the user mode network stack which requires no administrator privilege to run. Valid options are: @@ -1365,6 +1366,7 @@ privilege to run. Valid options are: @item vlan=@var{n} Connect user mode stack to VLAN @var{n} (@var{n} = 0 is the default). +@item id=@var{id} @item name=@var{name} Assign symbolic name for use in monitor commands. @@ -1490,6 +1492,7 @@ processed and applied to -net user. Mixing them with the new configuration syntax gives undefined results. Their use for new applications is discouraged as they will be removed from future versions. +@item -netdev tap,id=@var{id}[,fd=@var{h}][,ifname=@var{name}][,script=@var{file}][,downscript=@var{dfile}][,helper=@var{helper}] @item -net tap[,vlan=@var{n}][,name=@var{name}][,fd=@var{h}][,ifname=@var{name}][,script=@var{file}][,downscript=@var{dfile}][,helper=@var{helper}] Connect the host TAP network interface @var{name} to VLAN @var{n}. @@ -1529,6 +1532,7 @@ qemu-system-i386 linux.img \ -net nic -net tap,"helper=/usr/local/libexec/qemu-bridge-helper" @end example +@item -netdev bridge,id=@var{id}[,br=@var{bridge}][,helper=@var{helper}] @item -net bridge[,vlan=@var{n}][,name=@var{name}][,br=@var{bridge}][,helper=@var{helper}] Connect a host TAP network interface to a host bridge device. @@ -1551,6 +1555,7 @@ qemu-system-i386 linux.img -net bridge -net nic,model=virtio qemu-system-i386 linux.img -net bridge,br=qemubr0 -net nic,model=virtio @end example +@item -netdev socket,id=@var{id}[,fd=@var{h}][,listen=[@var{host}]:@var{port}][,connect=@var{host}:@var{port}] @item -net socket[,vlan=@var{n}][,name=@var{name}][,fd=@var{h}] [,listen=[@var{host}]:@var{port}][,connect=@var{host}:@var{port}] Connect the VLAN @var{n} to a remote VLAN in another QEMU virtual @@ -1573,6 +1578,7 @@ qemu-system-i386 linux.img \ -net socket,connect=127.0.0.1:1234 @end example +@item -netdev socket,id=@var{id}[,fd=@var{h}][,mcast=@var{maddr}:@var{port}[,localaddr=@var{addr}]] @item -net socket[,vlan=@var{n}][,name=@var{name}][,fd=@var{h}][,mcast=@var{maddr}:@var{port}[,localaddr=@var{addr}]] Create a VLAN @var{n} shared with another QEMU virtual @@ -1624,6 +1630,7 @@ qemu-system-i386 linux.img \ -net socket,mcast=239.192.168.1:1102,localaddr=1.2.3.4 @end example +@item -netdev vde,id=@var{id}[,sock=@var{socketpath}][,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}] @item -net vde[,vlan=@var{n}][,name=@var{name}][,sock=@var{socketpath}] [,port=@var{n}][,group=@var{groupname}][,mode=@var{octalmode}] Connect VLAN @var{n} to PORT @var{n} of a vde switch running on host and listening for incoming connections on @var{socketpath}. Use GROUP @var{groupname} From f237ddbb89142c6948a2257c459e49dee7500a7c Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 24 Aug 2012 13:32:16 +0100 Subject: [PATCH 08/13] net: clean up usbnet_receive() The USB network interface has two code paths depending on whether or not RNDIS mode is enabled. Refactor usbnet_receive() so that there is a common path throughout the function instead of duplicating everything across if (is_rndis(s)) ... else ... code paths. Clean up coding style and 80 character line wrap along the way. Signed-off-by: Stefan Hajnoczi --- hw/usb/dev-network.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/hw/usb/dev-network.c b/hw/usb/dev-network.c index c84892c98d..0b5cb71f98 100644 --- a/hw/usb/dev-network.c +++ b/hw/usb/dev-network.c @@ -1250,20 +1250,27 @@ static int usb_net_handle_data(USBDevice *dev, USBPacket *p) static ssize_t usbnet_receive(NetClientState *nc, const uint8_t *buf, size_t size) { USBNetState *s = DO_UPCAST(NICState, nc, nc)->opaque; - struct rndis_packet_msg_type *msg; + uint8_t *in_buf = s->in_buf; + size_t total_size = size; if (is_rndis(s)) { - msg = (struct rndis_packet_msg_type *) s->in_buf; if (s->rndis_state != RNDIS_DATA_INITIALIZED) { return -1; } - if (size + sizeof(struct rndis_packet_msg_type) > sizeof(s->in_buf)) - return -1; + total_size += sizeof(struct rndis_packet_msg_type); + } + if (total_size > sizeof(s->in_buf)) { + return -1; + } + if (is_rndis(s)) { + struct rndis_packet_msg_type *msg; + + msg = (struct rndis_packet_msg_type *)in_buf; memset(msg, 0, sizeof(struct rndis_packet_msg_type)); msg->MessageType = cpu_to_le32(RNDIS_PACKET_MSG); - msg->MessageLength = cpu_to_le32(size + sizeof(struct rndis_packet_msg_type)); - msg->DataOffset = cpu_to_le32(sizeof(struct rndis_packet_msg_type) - 8); + msg->MessageLength = cpu_to_le32(size + sizeof(*msg)); + msg->DataOffset = cpu_to_le32(sizeof(*msg) - 8); msg->DataLength = cpu_to_le32(size); /* msg->OOBDataOffset; * msg->OOBDataLength; @@ -1273,14 +1280,11 @@ static ssize_t usbnet_receive(NetClientState *nc, const uint8_t *buf, size_t siz * msg->VcHandle; * msg->Reserved; */ - memcpy(msg + 1, buf, size); - s->in_len = size + sizeof(struct rndis_packet_msg_type); - } else { - if (size > sizeof(s->in_buf)) - return -1; - memcpy(s->in_buf, buf, size); - s->in_len = size; + in_buf += sizeof(*msg); } + + memcpy(in_buf, buf, size); + s->in_len = total_size; s->in_ptr = 0; return size; } From 190563f9a90c9df8ad32fc7f3e4b166deda949a6 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 24 Aug 2012 13:37:29 +0100 Subject: [PATCH 09/13] net: fix usbnet_receive() packet drops The USB network interface has a single buffer which the guest reads from. This patch prevents multiple calls to usbnet_receive() from clobbering the input buffer. Instead we queue packets until buffer space becomes available again. This is inspired by virtio-net and e1000 rxbuf handling. Signed-off-by: Stefan Hajnoczi --- hw/usb/dev-network.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/hw/usb/dev-network.c b/hw/usb/dev-network.c index 0b5cb71f98..e4a43599b5 100644 --- a/hw/usb/dev-network.c +++ b/hw/usb/dev-network.c @@ -1001,6 +1001,13 @@ static int rndis_keepalive_response(USBNetState *s, return 0; } +/* Prepare to receive the next packet */ +static void usb_net_reset_in_buf(USBNetState *s) +{ + s->in_ptr = s->in_len = 0; + qemu_flush_queued_packets(&s->nic->nc); +} + static int rndis_parse(USBNetState *s, uint8_t *data, int length) { uint32_t msg_type; @@ -1025,7 +1032,8 @@ static int rndis_parse(USBNetState *s, uint8_t *data, int length) case RNDIS_RESET_MSG: rndis_clear_responsequeue(s); - s->out_ptr = s->in_ptr = s->in_len = 0; + s->out_ptr = 0; + usb_net_reset_in_buf(s); return rndis_reset_response(s, (rndis_reset_msg_type *) data); case RNDIS_KEEPALIVE_MSG: @@ -1135,7 +1143,7 @@ static int usb_net_handle_datain(USBNetState *s, USBPacket *p) int ret = USB_RET_NAK; if (s->in_ptr > s->in_len) { - s->in_ptr = s->in_len = 0; + usb_net_reset_in_buf(s); ret = USB_RET_NAK; return ret; } @@ -1152,7 +1160,7 @@ static int usb_net_handle_datain(USBNetState *s, USBPacket *p) if (s->in_ptr >= s->in_len && (is_rndis(s) || (s->in_len & (64 - 1)) || !ret)) { /* no short packet necessary */ - s->in_ptr = s->in_len = 0; + usb_net_reset_in_buf(s); } #ifdef TRAFFIC_DEBUG @@ -1263,6 +1271,11 @@ static ssize_t usbnet_receive(NetClientState *nc, const uint8_t *buf, size_t siz return -1; } + /* Only accept packet if input buffer is empty */ + if (s->in_len > 0) { + return 0; + } + if (is_rndis(s)) { struct rndis_packet_msg_type *msg; From 61518a74ca98870e8ff132f91dd5dda252e31f58 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Fri, 24 Aug 2012 13:50:30 +0100 Subject: [PATCH 10/13] net: broadcast hub packets if at least one port can receive In commit 60c07d933c66c4b30a83b7ccbc8a0cb3df1b2d0e ("net: fix qemu_can_send_packet logic") the "VLAN" broadcast behavior was changed to queue packets if any net client cannot receive. It turns out that this was not actually the right fix and just hides the real bug that hw/usb/dev-network.c:usbnet_receive() clobbers its receive buffer when called multiple times in a row. The commit also introduced a new bug that "VLAN" packets would not be sent if one of multiple net clients was down. The hw/usb/dev-network.c bug has since been fixed, so this patch reverts broadcast behavior to send packets as long as one net client can receive. Packets simply get queued for the net clients that are temporarily unable to receive. Reported-by: Roy.Li Signed-off-by: Stefan Hajnoczi --- net/hub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/hub.c b/net/hub.c index ac157e32ee..650a8b4a40 100644 --- a/net/hub.c +++ b/net/hub.c @@ -97,12 +97,12 @@ static int net_hub_port_can_receive(NetClientState *nc) continue; } - if (!qemu_can_send_packet(&port->nc)) { - return 0; + if (qemu_can_send_packet(&port->nc)) { + return 1; } } - return 1; + return 0; } static ssize_t net_hub_port_receive(NetClientState *nc, From 863f678fba4191f3b695620f41056cb7c124425d Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 20 Aug 2012 10:21:54 +0100 Subject: [PATCH 11/13] net: asynchronous send/receive infrastructure for net/socket.c The net/socket.c net client is not truly asynchronous. This patch borrows the qemu_set_fd_handler2() code from net/tap.c as the basis for proper asynchronous send/receive. Only read packets from the socket when the peer is able to receive. This avoids needless queuing. Later patches implement asynchronous send. Signed-off-by: Stefan Hajnoczi --- net/socket.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/net/socket.c b/net/socket.c index 7c602e4c3a..7bff5367ac 100644 --- a/net/socket.c +++ b/net/socket.c @@ -42,9 +42,51 @@ typedef struct NetSocketState { unsigned int packet_len; uint8_t buf[4096]; struct sockaddr_in dgram_dst; /* contains inet host and port destination iff connectionless (SOCK_DGRAM) */ + IOHandler *send_fn; /* differs between SOCK_STREAM/SOCK_DGRAM */ + bool read_poll; /* waiting to receive data? */ + bool write_poll; /* waiting to transmit data? */ } NetSocketState; static void net_socket_accept(void *opaque); +static void net_socket_writable(void *opaque); + +/* Only read packets from socket when peer can receive them */ +static int net_socket_can_send(void *opaque) +{ + NetSocketState *s = opaque; + + return qemu_can_send_packet(&s->nc); +} + +static void net_socket_update_fd_handler(NetSocketState *s) +{ + qemu_set_fd_handler2(s->fd, + s->read_poll ? net_socket_can_send : NULL, + s->read_poll ? s->send_fn : NULL, + s->write_poll ? net_socket_writable : NULL, + s); +} + +static void net_socket_read_poll(NetSocketState *s, bool enable) +{ + s->read_poll = enable; + net_socket_update_fd_handler(s); +} + +static void net_socket_write_poll(NetSocketState *s, bool enable) +{ + s->write_poll = enable; + net_socket_update_fd_handler(s); +} + +static void net_socket_writable(void *opaque) +{ + NetSocketState *s = opaque; + + net_socket_write_poll(s, false); + + qemu_flush_queued_packets(&s->nc); +} /* XXX: we consider we can send the whole packet without blocking */ static ssize_t net_socket_receive(NetClientState *nc, const uint8_t *buf, size_t size) @@ -81,7 +123,8 @@ static void net_socket_send(void *opaque) } else if (size == 0) { /* end of connection */ eoc: - qemu_set_fd_handler(s->fd, NULL, NULL, NULL); + net_socket_read_poll(s, false); + net_socket_write_poll(s, false); if (s->listen_fd != -1) { qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s); } @@ -152,7 +195,8 @@ static void net_socket_send_dgram(void *opaque) return; if (size == 0) { /* end of connection */ - qemu_set_fd_handler(s->fd, NULL, NULL, NULL); + net_socket_read_poll(s, false); + net_socket_write_poll(s, false); return; } qemu_send_packet(&s->nc, s->buf, size); @@ -243,7 +287,8 @@ static void net_socket_cleanup(NetClientState *nc) { NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc); if (s->fd != -1) { - qemu_set_fd_handler(s->fd, NULL, NULL, NULL); + net_socket_read_poll(s, false); + net_socket_write_poll(s, false); close(s->fd); s->fd = -1; } @@ -314,8 +359,8 @@ static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer, s->fd = fd; s->listen_fd = -1; - - qemu_set_fd_handler(s->fd, net_socket_send_dgram, NULL, s); + s->send_fn = net_socket_send_dgram; + net_socket_read_poll(s, true); /* mcast: save bound address as dst */ if (is_connected) { @@ -332,7 +377,8 @@ err: static void net_socket_connect(void *opaque) { NetSocketState *s = opaque; - qemu_set_fd_handler(s->fd, net_socket_send, NULL, s); + s->send_fn = net_socket_send; + net_socket_read_poll(s, true); } static NetClientInfo net_socket_info = { From 213fd5087e2e4e2da10ad266df0ba950cf7618bf Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 20 Aug 2012 10:28:53 +0100 Subject: [PATCH 12/13] net: EAGAIN handling for net/socket.c UDP Implement asynchronous send for UDP (or other SOCK_DGRAM) sockets. If send fails with EAGAIN we wait for the socket to become writable again. Signed-off-by: Stefan Hajnoczi --- net/socket.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/socket.c b/net/socket.c index 7bff5367ac..aabf0a48c5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -102,9 +102,19 @@ static ssize_t net_socket_receive(NetClientState *nc, const uint8_t *buf, size_t static ssize_t net_socket_receive_dgram(NetClientState *nc, const uint8_t *buf, size_t size) { NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc); + ssize_t ret; - return sendto(s->fd, (const void *)buf, size, 0, - (struct sockaddr *)&s->dgram_dst, sizeof(s->dgram_dst)); + do { + ret = sendto(s->fd, buf, size, 0, + (struct sockaddr *)&s->dgram_dst, + sizeof(s->dgram_dst)); + } while (ret == -1 && errno == EINTR); + + if (ret == -1 && errno == EAGAIN) { + net_socket_write_poll(s, true); + return 0; + } + return ret; } static void net_socket_send(void *opaque) From 45a7f54a8bb3928ffa58d522e0d61acaee8277bb Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 20 Aug 2012 10:14:35 +0100 Subject: [PATCH 13/13] net: EAGAIN handling for net/socket.c TCP Replace spinning send_all() with a proper non-blocking send. When the socket write buffer limit is reached, we should stop trying to send and wait for the socket to become writable again. Non-blocking TCP sockets can return in two different ways when the write buffer limit is reached: 1. ret = -1 and errno = EAGAIN/EWOULDBLOCK. No data has been written. 2. ret < total_size. Short write, only part of the message was transmitted. Handle both cases and keep track of how many bytes have been written in s->send_index. (This includes the 'length' header before the actual payload buffer.) Signed-off-by: Stefan Hajnoczi --- net/socket.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/net/socket.c b/net/socket.c index aabf0a48c5..5e0c92e062 100644 --- a/net/socket.c +++ b/net/socket.c @@ -32,6 +32,7 @@ #include "qemu-error.h" #include "qemu-option.h" #include "qemu_socket.h" +#include "iov.h" typedef struct NetSocketState { NetClientState nc; @@ -40,6 +41,7 @@ typedef struct NetSocketState { int state; /* 0 = getting length, 1 = getting data */ unsigned int index; unsigned int packet_len; + unsigned int send_index; /* number of bytes sent (only SOCK_STREAM) */ uint8_t buf[4096]; struct sockaddr_in dgram_dst; /* contains inet host and port destination iff connectionless (SOCK_DGRAM) */ IOHandler *send_fn; /* differs between SOCK_STREAM/SOCK_DGRAM */ @@ -88,15 +90,39 @@ static void net_socket_writable(void *opaque) qemu_flush_queued_packets(&s->nc); } -/* XXX: we consider we can send the whole packet without blocking */ static ssize_t net_socket_receive(NetClientState *nc, const uint8_t *buf, size_t size) { NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc); - uint32_t len; - len = htonl(size); + uint32_t len = htonl(size); + struct iovec iov[] = { + { + .iov_base = &len, + .iov_len = sizeof(len), + }, { + .iov_base = (void *)buf, + .iov_len = size, + }, + }; + size_t remaining; + ssize_t ret; - send_all(s->fd, (const uint8_t *)&len, sizeof(len)); - return send_all(s->fd, buf, size); + remaining = iov_size(iov, 2) - s->send_index; + ret = iov_send(s->fd, iov, 2, s->send_index, remaining); + + if (ret == -1 && errno == EAGAIN) { + ret = 0; /* handled further down */ + } + if (ret == -1) { + s->send_index = 0; + return -errno; + } + if (ret < (ssize_t)remaining) { + s->send_index += ret; + net_socket_write_poll(s, true); + return 0; + } + s->send_index = 0; + return size; } static ssize_t net_socket_receive_dgram(NetClientState *nc, const uint8_t *buf, size_t size)