Diffstat (limited to 'qemu/hw/virtio')
-rw-r--r--  qemu/hw/virtio/Makefile.objs              1
-rw-r--r--  qemu/hw/virtio/dataplane/Makefile.objs    1
-rw-r--r--  qemu/hw/virtio/dataplane/vring.c        453
-rw-r--r--  qemu/hw/virtio/vhost-backend.c          147
-rw-r--r--  qemu/hw/virtio/vhost-user.c             624
-rw-r--r--  qemu/hw/virtio/vhost.c                  213
-rw-r--r--  qemu/hw/virtio/virtio-balloon.c         124
-rw-r--r--  qemu/hw/virtio/virtio-bus.c               4
-rw-r--r--  qemu/hw/virtio/virtio-mmio.c              1
-rw-r--r--  qemu/hw/virtio/virtio-pci.c             398
-rw-r--r--  qemu/hw/virtio/virtio-pci.h              49
-rw-r--r--  qemu/hw/virtio/virtio-rng.c              19
-rw-r--r--  qemu/hw/virtio/virtio.c                 562
13 files changed, 1692 insertions, 904 deletions
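
The bulk of this diff replaces the single multiplexing hook vhost_ops->vhost_call(dev, request, arg) with one typed callback per request in VhostOps, implemented by both backends (kernel_ops in vhost-backend.c and user_ops in vhost-user.c). A minimal, self-contained sketch of that pattern follows; the struct is trimmed for illustration and is not the full vhost-backend.h definition.

/* Sketch of the VhostOps refactor: one untyped dispatcher becomes a
 * table of typed per-request hooks, so each backend (kernel ioctl vs.
 * vhost-user socket) fills in the same operation table. */
#include <stdint.h>
#include <stdio.h>

struct vhost_dev;          /* opaque for this sketch */
struct vhost_memory;       /* opaque for this sketch */

typedef struct VhostOps {
    /* before: int (*vhost_call)(struct vhost_dev *dev,
     *                           unsigned long request, void *arg); */
    int (*vhost_set_mem_table)(struct vhost_dev *dev,
                               struct vhost_memory *mem);
    int (*vhost_get_features)(struct vhost_dev *dev, uint64_t *features);
    /* ...one typed hook per request, as in kernel_ops and user_ops... */
} VhostOps;

static int demo_set_mem_table(struct vhost_dev *dev, struct vhost_memory *mem)
{
    (void)dev; (void)mem;
    puts("reached the backend through the typed hook");
    return 0;
}

int main(void)
{
    VhostOps ops = { .vhost_set_mem_table = demo_set_mem_table };
    /* call sites change from ops.vhost_call(dev, VHOST_SET_MEM_TABLE, mem)
     * to the direct, type-checked form: */
    return ops.vhost_set_mem_table(NULL, NULL) < 0;
}

Call sites in vhost.c change accordingly, e.g. dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem) becomes dev->vhost_ops->vhost_set_mem_table(dev, dev->mem).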
diff --git a/qemu/hw/virtio/Makefile.objs b/qemu/hw/virtio/Makefile.objs index 19b224a44..3e2b175da 100644 --- a/qemu/hw/virtio/Makefile.objs +++ b/qemu/hw/virtio/Makefile.objs @@ -2,7 +2,6 @@ common-obj-y += virtio-rng.o common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o common-obj-y += virtio-bus.o common-obj-y += virtio-mmio.o -obj-$(CONFIG_VIRTIO) += dataplane/ obj-y += virtio.o virtio-balloon.o obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o diff --git a/qemu/hw/virtio/dataplane/Makefile.objs b/qemu/hw/virtio/dataplane/Makefile.objs deleted file mode 100644 index 753a9cab4..000000000 --- a/qemu/hw/virtio/dataplane/Makefile.objs +++ /dev/null @@ -1 +0,0 @@ -obj-y += vring.o diff --git a/qemu/hw/virtio/dataplane/vring.c b/qemu/hw/virtio/dataplane/vring.c deleted file mode 100644 index 07fd69c69..000000000 --- a/qemu/hw/virtio/dataplane/vring.c +++ /dev/null @@ -1,453 +0,0 @@ -/* Copyright 2012 Red Hat, Inc. - * Copyright IBM, Corp. 2012 - * - * Based on Linux 2.6.39 vhost code: - * Copyright (C) 2009 Red Hat, Inc. - * Copyright (C) 2006 Rusty Russell IBM Corporation - * - * Author: Michael S. Tsirkin <mst@redhat.com> - * Stefan Hajnoczi <stefanha@redhat.com> - * - * Inspiration, some code, and most witty comments come from - * Documentation/virtual/lguest/lguest.c, by Rusty Russell - * - * This work is licensed under the terms of the GNU GPL, version 2. - */ - -#include "trace.h" -#include "hw/hw.h" -#include "exec/memory.h" -#include "exec/address-spaces.h" -#include "hw/virtio/virtio-access.h" -#include "hw/virtio/dataplane/vring.h" -#include "hw/virtio/dataplane/vring-accessors.h" -#include "qemu/error-report.h" - -/* vring_map can be coupled with vring_unmap or (if you still have the - * value returned in *mr) memory_region_unref. - */ -static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len, - bool is_write) -{ - MemoryRegionSection section = memory_region_find(get_system_memory(), phys, len); - - if (!section.mr || int128_get64(section.size) < len) { - goto out; - } - if (is_write && section.readonly) { - goto out; - } - if (!memory_region_is_ram(section.mr)) { - goto out; - } - - /* Ignore regions with dirty logging, we cannot mark them dirty */ - if (memory_region_get_dirty_log_mask(section.mr)) { - goto out; - } - - *mr = section.mr; - return memory_region_get_ram_ptr(section.mr) + section.offset_within_region; - -out: - memory_region_unref(section.mr); - *mr = NULL; - return NULL; -} - -static void vring_unmap(void *buffer, bool is_write) -{ - ram_addr_t addr; - MemoryRegion *mr; - - mr = qemu_ram_addr_from_host(buffer, &addr); - memory_region_unref(mr); -} - -/* Map the guest's vring to host memory */ -bool vring_setup(Vring *vring, VirtIODevice *vdev, int n) -{ - hwaddr vring_addr = virtio_queue_get_ring_addr(vdev, n); - hwaddr vring_size = virtio_queue_get_ring_size(vdev, n); - void *vring_ptr; - - vring->broken = false; - - vring_ptr = vring_map(&vring->mr, vring_addr, vring_size, true); - if (!vring_ptr) { - error_report("Failed to map vring " - "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu, - vring_addr, vring_size); - vring->broken = true; - return false; - } - - vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096); - - vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n); - vring->last_used_idx = vring_get_used_idx(vdev, vring); - vring->signalled_used = 0; - vring->signalled_used_valid = false; - - trace_vring_setup(virtio_queue_get_ring_addr(vdev, n), - vring->vr.desc, vring->vr.avail, vring->vr.used); - return true; -} 
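
The deleted vring_should_notify() below relies on vring_need_event(), which this file never defines; per the virtio specification (and linux/virtio_ring.h) it is a wraparound-safe window test on 16-bit ring indices. A minimal sketch, assuming the standard definition:

#include <stdint.h>
#include <stdio.h>

/* Standard event-index test: true when new_idx has moved past event_idx
 * since old_idx. Unsigned 16-bit subtraction keeps the comparison correct
 * when the free-running ring indices wrap around. */
static int vring_need_event(uint16_t event_idx, uint16_t new_idx,
                            uint16_t old_idx)
{
    return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx);
}

int main(void)
{
    /* guest asked to be notified once the used index passes 5 */
    printf("%d\n", vring_need_event(5, 6, 4));      /* 1: crossed the event */
    printf("%d\n", vring_need_event(5, 5, 4));      /* 0: not yet past it   */
    printf("%d\n", vring_need_event(1, 2, 0xfff0)); /* 1: across wraparound */
    return 0;
}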
- -void vring_teardown(Vring *vring, VirtIODevice *vdev, int n) -{ - virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx); - virtio_queue_invalidate_signalled_used(vdev, n); - - memory_region_unref(vring->mr); -} - -/* Disable guest->host notifies */ -void vring_disable_notification(VirtIODevice *vdev, Vring *vring) -{ - if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { - vring_set_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY); - } -} - -/* Enable guest->host notifies - * - * Return true if the vring is empty, false if there are more requests. - */ -bool vring_enable_notification(VirtIODevice *vdev, Vring *vring) -{ - if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { - vring_avail_event(&vring->vr) = vring->vr.avail->idx; - } else { - vring_clear_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY); - } - smp_mb(); /* ensure update is seen before reading avail_idx */ - return !vring_more_avail(vdev, vring); -} - -/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */ -bool vring_should_notify(VirtIODevice *vdev, Vring *vring) -{ - uint16_t old, new; - bool v; - /* Flush out used index updates. This is paired - * with the barrier that the Guest executes when enabling - * interrupts. */ - smp_mb(); - - if (virtio_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) && - unlikely(!vring_more_avail(vdev, vring))) { - return true; - } - - if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { - return !(vring_get_avail_flags(vdev, vring) & - VRING_AVAIL_F_NO_INTERRUPT); - } - old = vring->signalled_used; - v = vring->signalled_used_valid; - new = vring->signalled_used = vring->last_used_idx; - vring->signalled_used_valid = true; - - if (unlikely(!v)) { - return true; - } - - return vring_need_event(virtio_tswap16(vdev, vring_used_event(&vring->vr)), - new, old); -} - - -static int get_desc(Vring *vring, VirtQueueElement *elem, - struct vring_desc *desc) -{ - unsigned *num; - struct iovec *iov; - hwaddr *addr; - MemoryRegion *mr; - - if (desc->flags & VRING_DESC_F_WRITE) { - num = &elem->in_num; - iov = &elem->in_sg[*num]; - addr = &elem->in_addr[*num]; - } else { - num = &elem->out_num; - iov = &elem->out_sg[*num]; - addr = &elem->out_addr[*num]; - - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ - if (unlikely(elem->in_num)) { - error_report("Descriptor has out after in"); - return -EFAULT; - } - } - - /* Stop for now if there are not enough iovecs available. */ - if (*num >= VIRTQUEUE_MAX_SIZE) { - error_report("Invalid SG num: %u", *num); - return -EFAULT; - } - - /* TODO handle non-contiguous memory across region boundaries */ - iov->iov_base = vring_map(&mr, desc->addr, desc->len, - desc->flags & VRING_DESC_F_WRITE); - if (!iov->iov_base) { - error_report("Failed to map descriptor addr %#" PRIx64 " len %u", - (uint64_t)desc->addr, desc->len); - return -EFAULT; - } - - /* The MemoryRegion is looked up again and unref'ed later, leave the - * ref in place. */ - iov->iov_len = desc->len; - *addr = desc->addr; - *num += 1; - return 0; -} - -static void copy_in_vring_desc(VirtIODevice *vdev, - const struct vring_desc *guest, - struct vring_desc *host) -{ - host->addr = virtio_ldq_p(vdev, &guest->addr); - host->len = virtio_ldl_p(vdev, &guest->len); - host->flags = virtio_lduw_p(vdev, &guest->flags); - host->next = virtio_lduw_p(vdev, &guest->next); -} - -/* This is stolen from linux/drivers/vhost/vhost.c. 
*/ -static int get_indirect(VirtIODevice *vdev, Vring *vring, - VirtQueueElement *elem, struct vring_desc *indirect) -{ - struct vring_desc desc; - unsigned int i = 0, count, found = 0; - int ret; - - /* Sanity check */ - if (unlikely(indirect->len % sizeof(desc))) { - error_report("Invalid length in indirect descriptor: " - "len %#x not multiple of %#zx", - indirect->len, sizeof(desc)); - vring->broken = true; - return -EFAULT; - } - - count = indirect->len / sizeof(desc); - /* Buffers are chained via a 16 bit next field, so - * we can have at most 2^16 of these. */ - if (unlikely(count > USHRT_MAX + 1)) { - error_report("Indirect buffer length too big: %d", indirect->len); - vring->broken = true; - return -EFAULT; - } - - do { - struct vring_desc *desc_ptr; - MemoryRegion *mr; - - /* Translate indirect descriptor */ - desc_ptr = vring_map(&mr, - indirect->addr + found * sizeof(desc), - sizeof(desc), false); - if (!desc_ptr) { - error_report("Failed to map indirect descriptor " - "addr %#" PRIx64 " len %zu", - (uint64_t)indirect->addr + found * sizeof(desc), - sizeof(desc)); - vring->broken = true; - return -EFAULT; - } - copy_in_vring_desc(vdev, desc_ptr, &desc); - memory_region_unref(mr); - - /* Ensure descriptor has been loaded before accessing fields */ - barrier(); /* read_barrier_depends(); */ - - if (unlikely(++found > count)) { - error_report("Loop detected: last one at %u " - "indirect size %u", i, count); - vring->broken = true; - return -EFAULT; - } - - if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { - error_report("Nested indirect descriptor"); - vring->broken = true; - return -EFAULT; - } - - ret = get_desc(vring, elem, &desc); - if (ret < 0) { - vring->broken |= (ret == -EFAULT); - return ret; - } - i = desc.next; - } while (desc.flags & VRING_DESC_F_NEXT); - return 0; -} - -static void vring_unmap_element(VirtQueueElement *elem) -{ - int i; - - /* This assumes that the iovecs, if changed, are never moved past - * the end of the valid area. This is true if iovec manipulations - * are done with iov_discard_front and iov_discard_back. - */ - for (i = 0; i < elem->out_num; i++) { - vring_unmap(elem->out_sg[i].iov_base, false); - } - - for (i = 0; i < elem->in_num; i++) { - vring_unmap(elem->in_sg[i].iov_base, true); - } -} - -/* This looks in the virtqueue and for the first available buffer, and converts - * it to an iovec for convenient access. Since descriptors consist of some - * number of output then some number of input descriptors, it's actually two - * iovecs, but we pack them into one and note how many of each there were. - * - * This function returns the descriptor number found, or vq->num (which is - * never a valid descriptor number) if none was found. A negative code is - * returned on error. - * - * Stolen from linux/drivers/vhost/vhost.c. - */ -int vring_pop(VirtIODevice *vdev, Vring *vring, - VirtQueueElement *elem) -{ - struct vring_desc desc; - unsigned int i, head, found = 0, num = vring->vr.num; - uint16_t avail_idx, last_avail_idx; - int ret; - - /* Initialize elem so it can be safely unmapped */ - elem->in_num = elem->out_num = 0; - - /* If there was a fatal error then refuse operation */ - if (vring->broken) { - ret = -EFAULT; - goto out; - } - - /* Check it isn't doing very strange things with descriptor numbers. 
*/ - last_avail_idx = vring->last_avail_idx; - avail_idx = vring_get_avail_idx(vdev, vring); - barrier(); /* load indices now and not again later */ - - if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) { - error_report("Guest moved used index from %u to %u", - last_avail_idx, avail_idx); - ret = -EFAULT; - goto out; - } - - /* If there's nothing new since last we looked. */ - if (avail_idx == last_avail_idx) { - ret = -EAGAIN; - goto out; - } - - /* Only get avail ring entries after they have been exposed by guest. */ - smp_rmb(); - - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ - head = vring_get_avail_ring(vdev, vring, last_avail_idx % num); - - elem->index = head; - - /* If their number is silly, that's an error. */ - if (unlikely(head >= num)) { - error_report("Guest says index %u > %u is available", head, num); - ret = -EFAULT; - goto out; - } - - i = head; - do { - if (unlikely(i >= num)) { - error_report("Desc index is %u > %u, head = %u", i, num, head); - ret = -EFAULT; - goto out; - } - if (unlikely(++found > num)) { - error_report("Loop detected: last one at %u vq size %u head %u", - i, num, head); - ret = -EFAULT; - goto out; - } - copy_in_vring_desc(vdev, &vring->vr.desc[i], &desc); - - /* Ensure descriptor is loaded before accessing fields */ - barrier(); - - if (desc.flags & VRING_DESC_F_INDIRECT) { - ret = get_indirect(vdev, vring, elem, &desc); - if (ret < 0) { - goto out; - } - continue; - } - - ret = get_desc(vring, elem, &desc); - if (ret < 0) { - goto out; - } - - i = desc.next; - } while (desc.flags & VRING_DESC_F_NEXT); - - /* On success, increment avail index. */ - vring->last_avail_idx++; - if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { - vring_avail_event(&vring->vr) = - virtio_tswap16(vdev, vring->last_avail_idx); - } - - return head; - -out: - assert(ret < 0); - if (ret == -EFAULT) { - vring->broken = true; - } - vring_unmap_element(elem); - return ret; -} - -/* After we've used one of their buffers, we tell them about it. - * - * Stolen from linux/drivers/vhost/vhost.c. - */ -void vring_push(VirtIODevice *vdev, Vring *vring, VirtQueueElement *elem, - int len) -{ - unsigned int head = elem->index; - uint16_t new; - - vring_unmap_element(elem); - - /* Don't touch vring if a fatal error occurred */ - if (vring->broken) { - return; - } - - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ - vring_set_used_ring_id(vdev, vring, vring->last_used_idx % vring->vr.num, - head); - vring_set_used_ring_len(vdev, vring, vring->last_used_idx % vring->vr.num, - len); - - /* Make sure buffer is written before we update index. 
*/ - smp_wmb(); - - new = ++vring->last_used_idx; - vring_set_used_idx(vdev, vring, new); - if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) { - vring->signalled_used_valid = false; - } -} diff --git a/qemu/hw/virtio/vhost-backend.c b/qemu/hw/virtio/vhost-backend.c index 4d68a2765..b35890289 100644 --- a/qemu/hw/virtio/vhost-backend.c +++ b/qemu/hw/virtio/vhost-backend.c @@ -8,9 +8,11 @@ * */ +#include "qemu/osdep.h" #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-backend.h" #include "qemu/error-report.h" +#include "linux/vhost.h" #include <sys/ioctl.h> @@ -42,11 +44,152 @@ static int vhost_kernel_cleanup(struct vhost_dev *dev) return close(fd); } +static int vhost_kernel_memslots_limit(struct vhost_dev *dev) +{ + int limit = 64; + char *s; + + if (g_file_get_contents("/sys/module/vhost/parameters/max_mem_regions", + &s, NULL, NULL)) { + uint64_t val = g_ascii_strtoull(s, NULL, 10); + if (!((val == G_MAXUINT64 || !val) && errno)) { + return val; + } + error_report("ignoring invalid max_mem_regions value in vhost module:" + " %s", s); + } + return limit; +} + +static int vhost_kernel_net_set_backend(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_NET_SET_BACKEND, file); +} + +static int vhost_kernel_scsi_set_endpoint(struct vhost_dev *dev, + struct vhost_scsi_target *target) +{ + return vhost_kernel_call(dev, VHOST_SCSI_SET_ENDPOINT, target); +} + +static int vhost_kernel_scsi_clear_endpoint(struct vhost_dev *dev, + struct vhost_scsi_target *target) +{ + return vhost_kernel_call(dev, VHOST_SCSI_CLEAR_ENDPOINT, target); +} + +static int vhost_kernel_scsi_get_abi_version(struct vhost_dev *dev, int *version) +{ + return vhost_kernel_call(dev, VHOST_SCSI_GET_ABI_VERSION, version); +} + +static int vhost_kernel_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) +{ + return vhost_kernel_call(dev, VHOST_SET_LOG_BASE, &base); +} + +static int vhost_kernel_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem); +} + +static int vhost_kernel_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ADDR, addr); +} + +static int vhost_kernel_set_vring_endian(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_ENDIAN, ring); +} + +static int vhost_kernel_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_NUM, ring); +} + +static int vhost_kernel_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_BASE, ring); +} + +static int vhost_kernel_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_kernel_call(dev, VHOST_GET_VRING_BASE, ring); +} + +static int vhost_kernel_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file); +} + +static int vhost_kernel_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file); +} + +static int vhost_kernel_set_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features); +} + +static int vhost_kernel_get_features(struct vhost_dev *dev, + uint64_t *features) +{ + return vhost_kernel_call(dev, 
VHOST_GET_FEATURES, features); +} + +static int vhost_kernel_set_owner(struct vhost_dev *dev) +{ + return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL); +} + +static int vhost_kernel_reset_device(struct vhost_dev *dev) +{ + return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL); +} + +static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx) +{ + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + + return idx - dev->vq_index; +} + static const VhostOps kernel_ops = { .backend_type = VHOST_BACKEND_TYPE_KERNEL, - .vhost_call = vhost_kernel_call, .vhost_backend_init = vhost_kernel_init, - .vhost_backend_cleanup = vhost_kernel_cleanup + .vhost_backend_cleanup = vhost_kernel_cleanup, + .vhost_backend_memslots_limit = vhost_kernel_memslots_limit, + .vhost_net_set_backend = vhost_kernel_net_set_backend, + .vhost_scsi_set_endpoint = vhost_kernel_scsi_set_endpoint, + .vhost_scsi_clear_endpoint = vhost_kernel_scsi_clear_endpoint, + .vhost_scsi_get_abi_version = vhost_kernel_scsi_get_abi_version, + .vhost_set_log_base = vhost_kernel_set_log_base, + .vhost_set_mem_table = vhost_kernel_set_mem_table, + .vhost_set_vring_addr = vhost_kernel_set_vring_addr, + .vhost_set_vring_endian = vhost_kernel_set_vring_endian, + .vhost_set_vring_num = vhost_kernel_set_vring_num, + .vhost_set_vring_base = vhost_kernel_set_vring_base, + .vhost_get_vring_base = vhost_kernel_get_vring_base, + .vhost_set_vring_kick = vhost_kernel_set_vring_kick, + .vhost_set_vring_call = vhost_kernel_set_vring_call, + .vhost_set_features = vhost_kernel_set_features, + .vhost_get_features = vhost_kernel_get_features, + .vhost_set_owner = vhost_kernel_set_owner, + .vhost_reset_device = vhost_kernel_reset_device, + .vhost_get_vq_index = vhost_kernel_get_vq_index, }; int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type) diff --git a/qemu/hw/virtio/vhost-user.c b/qemu/hw/virtio/vhost-user.c index e7ab8293d..5914e8510 100644 --- a/qemu/hw/virtio/vhost-user.c +++ b/qemu/hw/virtio/vhost-user.c @@ -8,22 +8,35 @@ * */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "hw/virtio/vhost.h" #include "hw/virtio/vhost-backend.h" +#include "hw/virtio/virtio-net.h" #include "sysemu/char.h" #include "sysemu/kvm.h" #include "qemu/error-report.h" #include "qemu/sockets.h" #include "exec/ram_addr.h" +#include "migration/migration.h" -#include <fcntl.h> -#include <unistd.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <sys/un.h> #include <linux/vhost.h> #define VHOST_MEMORY_MAX_NREGIONS 8 +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +enum VhostUserProtocolFeature { + VHOST_USER_PROTOCOL_F_MQ = 0, + VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1, + VHOST_USER_PROTOCOL_F_RARP = 2, + + VHOST_USER_PROTOCOL_F_MAX +}; + +#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1) typedef enum VhostUserRequest { VHOST_USER_NONE = 0, @@ -41,6 +54,11 @@ typedef enum VhostUserRequest { VHOST_USER_SET_VRING_KICK = 12, VHOST_USER_SET_VRING_CALL = 13, VHOST_USER_SET_VRING_ERR = 14, + VHOST_USER_GET_PROTOCOL_FEATURES = 15, + VHOST_USER_SET_PROTOCOL_FEATURES = 16, + VHOST_USER_GET_QUEUE_NUM = 17, + VHOST_USER_SET_VRING_ENABLE = 18, + VHOST_USER_SEND_RARP = 19, VHOST_USER_MAX } VhostUserRequest; @@ -57,6 +75,11 @@ typedef struct VhostUserMemory { VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS]; } VhostUserMemory; +typedef struct VhostUserLog { + uint64_t mmap_size; + uint64_t mmap_offset; +} VhostUserLog; + typedef struct VhostUserMsg { VhostUserRequest request; @@ -71,7 +94,8 @@ 
typedef struct VhostUserMsg { struct vhost_vring_state state; struct vhost_vring_addr addr; VhostUserMemory memory; - }; + VhostUserLog log; + } payload; } QEMU_PACKED VhostUserMsg; static VhostUserMsg m __attribute__ ((unused)); @@ -89,37 +113,6 @@ static bool ioeventfd_enabled(void) return kvm_enabled() && kvm_eventfds_enabled(); } -static unsigned long int ioctl_to_vhost_user_request[VHOST_USER_MAX] = { - -1, /* VHOST_USER_NONE */ - VHOST_GET_FEATURES, /* VHOST_USER_GET_FEATURES */ - VHOST_SET_FEATURES, /* VHOST_USER_SET_FEATURES */ - VHOST_SET_OWNER, /* VHOST_USER_SET_OWNER */ - VHOST_RESET_OWNER, /* VHOST_USER_RESET_OWNER */ - VHOST_SET_MEM_TABLE, /* VHOST_USER_SET_MEM_TABLE */ - VHOST_SET_LOG_BASE, /* VHOST_USER_SET_LOG_BASE */ - VHOST_SET_LOG_FD, /* VHOST_USER_SET_LOG_FD */ - VHOST_SET_VRING_NUM, /* VHOST_USER_SET_VRING_NUM */ - VHOST_SET_VRING_ADDR, /* VHOST_USER_SET_VRING_ADDR */ - VHOST_SET_VRING_BASE, /* VHOST_USER_SET_VRING_BASE */ - VHOST_GET_VRING_BASE, /* VHOST_USER_GET_VRING_BASE */ - VHOST_SET_VRING_KICK, /* VHOST_USER_SET_VRING_KICK */ - VHOST_SET_VRING_CALL, /* VHOST_USER_SET_VRING_CALL */ - VHOST_SET_VRING_ERR /* VHOST_USER_SET_VRING_ERR */ -}; - -static VhostUserRequest vhost_user_request_translate(unsigned long int request) -{ - VhostUserRequest idx; - - for (idx = 0; idx < VHOST_USER_MAX; idx++) { - if (ioctl_to_vhost_user_request[idx] == request) { - break; - } - } - - return (idx == VHOST_USER_MAX) ? VHOST_USER_NONE : idx; -} - static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg) { CharDriverState *chr = dev->opaque; @@ -128,8 +121,8 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg) r = qemu_chr_fe_read_all(chr, p, size); if (r != size) { - error_report("Failed to read msg header. Read %d instead of %d.", r, - size); + error_report("Failed to read msg header. Read %d instead of %d." + " Original request %d.", r, size, msg->request); goto fail; } @@ -166,12 +159,35 @@ fail: return -1; } +static bool vhost_user_one_time_request(VhostUserRequest request) +{ + switch (request) { + case VHOST_USER_SET_OWNER: + case VHOST_USER_RESET_OWNER: + case VHOST_USER_SET_MEM_TABLE: + case VHOST_USER_GET_QUEUE_NUM: + return true; + default: + return false; + } +} + +/* most non-init callers ignore the error */ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, int *fds, int fd_num) { CharDriverState *chr = dev->opaque; int size = VHOST_USER_HDR_SIZE + msg->size; + /* + * For non-vring specific requests, like VHOST_USER_SET_MEM_TABLE, + * we just need send it once in the first time. For later such + * request, we just ignore it. 
+ */ + if (vhost_user_one_time_request(msg->request) && dev->vq_index != 0) { + return 0; + } + if (fd_num) { qemu_chr_fe_set_msgfds(chr, fds, fd_num); } @@ -180,157 +196,364 @@ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg, 0 : -1; } -static int vhost_user_call(struct vhost_dev *dev, unsigned long int request, - void *arg) +static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base, + struct vhost_log *log) { - VhostUserMsg msg; - VhostUserRequest msg_request; - struct vhost_vring_file *file = 0; - int need_reply = 0; int fds[VHOST_MEMORY_MAX_NREGIONS]; - int i, fd; size_t fd_num = 0; + bool shmfd = virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD); + VhostUserMsg msg = { + .request = VHOST_USER_SET_LOG_BASE, + .flags = VHOST_USER_VERSION, + .payload.log.mmap_size = log->size * sizeof(*(log->log)), + .payload.log.mmap_offset = 0, + .size = sizeof(msg.payload.log), + }; - assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + if (shmfd && log->fd != -1) { + fds[fd_num++] = log->fd; + } - msg_request = vhost_user_request_translate(request); - msg.request = msg_request; - msg.flags = VHOST_USER_VERSION; - msg.size = 0; + vhost_user_write(dev, &msg, fds, fd_num); - switch (request) { - case VHOST_GET_FEATURES: - need_reply = 1; - break; - - case VHOST_SET_FEATURES: - case VHOST_SET_LOG_BASE: - msg.u64 = *((__u64 *) arg); - msg.size = sizeof(m.u64); - break; - - case VHOST_SET_OWNER: - case VHOST_RESET_OWNER: - break; - - case VHOST_SET_MEM_TABLE: - for (i = 0; i < dev->mem->nregions; ++i) { - struct vhost_memory_region *reg = dev->mem->regions + i; - ram_addr_t ram_addr; - - assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); - qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, &ram_addr); - fd = qemu_get_ram_fd(ram_addr); - if (fd > 0) { - msg.memory.regions[fd_num].userspace_addr = reg->userspace_addr; - msg.memory.regions[fd_num].memory_size = reg->memory_size; - msg.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr; - msg.memory.regions[fd_num].mmap_offset = reg->userspace_addr - - (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr); - assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); - fds[fd_num++] = fd; - } + if (shmfd) { + msg.size = 0; + if (vhost_user_read(dev, &msg) < 0) { + return 0; } - msg.memory.nregions = fd_num; - - if (!fd_num) { - error_report("Failed initializing vhost-user memory map, " - "consider using -object memory-backend-file share=on"); + if (msg.request != VHOST_USER_SET_LOG_BASE) { + error_report("Received unexpected msg type. 
" + "Expected %d received %d", + VHOST_USER_SET_LOG_BASE, msg.request); return -1; } + } - msg.size = sizeof(m.memory.nregions); - msg.size += sizeof(m.memory.padding); - msg.size += fd_num * sizeof(VhostUserMemoryRegion); - - break; - - case VHOST_SET_LOG_FD: - fds[fd_num++] = *((int *) arg); - break; - - case VHOST_SET_VRING_NUM: - case VHOST_SET_VRING_BASE: - memcpy(&msg.state, arg, sizeof(struct vhost_vring_state)); - msg.size = sizeof(m.state); - break; - - case VHOST_GET_VRING_BASE: - memcpy(&msg.state, arg, sizeof(struct vhost_vring_state)); - msg.size = sizeof(m.state); - need_reply = 1; - break; - - case VHOST_SET_VRING_ADDR: - memcpy(&msg.addr, arg, sizeof(struct vhost_vring_addr)); - msg.size = sizeof(m.addr); - break; - - case VHOST_SET_VRING_KICK: - case VHOST_SET_VRING_CALL: - case VHOST_SET_VRING_ERR: - file = arg; - msg.u64 = file->index & VHOST_USER_VRING_IDX_MASK; - msg.size = sizeof(m.u64); - if (ioeventfd_enabled() && file->fd > 0) { - fds[fd_num++] = file->fd; - } else { - msg.u64 |= VHOST_USER_VRING_NOFD_MASK; + return 0; +} + +static int vhost_user_set_mem_table(struct vhost_dev *dev, + struct vhost_memory *mem) +{ + int fds[VHOST_MEMORY_MAX_NREGIONS]; + int i, fd; + size_t fd_num = 0; + VhostUserMsg msg = { + .request = VHOST_USER_SET_MEM_TABLE, + .flags = VHOST_USER_VERSION, + }; + + for (i = 0; i < dev->mem->nregions; ++i) { + struct vhost_memory_region *reg = dev->mem->regions + i; + ram_addr_t ram_addr; + + assert((uintptr_t)reg->userspace_addr == reg->userspace_addr); + qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, + &ram_addr); + fd = qemu_get_ram_fd(ram_addr); + if (fd > 0) { + msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr; + msg.payload.memory.regions[fd_num].memory_size = reg->memory_size; + msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr; + msg.payload.memory.regions[fd_num].mmap_offset = reg->userspace_addr - + (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr); + assert(fd_num < VHOST_MEMORY_MAX_NREGIONS); + fds[fd_num++] = fd; } - break; - default: - error_report("vhost-user trying to send unhandled ioctl"); + } + + msg.payload.memory.nregions = fd_num; + + if (!fd_num) { + error_report("Failed initializing vhost-user memory map, " + "consider using -object memory-backend-file share=on"); return -1; - break; } - if (vhost_user_write(dev, &msg, fds, fd_num) < 0) { + msg.size = sizeof(msg.payload.memory.nregions); + msg.size += sizeof(msg.payload.memory.padding); + msg.size += fd_num * sizeof(VhostUserMemoryRegion); + + vhost_user_write(dev, &msg, fds, fd_num); + + return 0; +} + +static int vhost_user_set_vring_addr(struct vhost_dev *dev, + struct vhost_vring_addr *addr) +{ + VhostUserMsg msg = { + .request = VHOST_USER_SET_VRING_ADDR, + .flags = VHOST_USER_VERSION, + .payload.addr = *addr, + .size = sizeof(msg.payload.addr), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_set_vring_endian(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + error_report("vhost-user trying to send unhandled ioctl"); + return -1; +} + +static int vhost_set_vring(struct vhost_dev *dev, + unsigned long int request, + struct vhost_vring_state *ring) +{ + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + .payload.state = *ring, + .size = sizeof(msg.payload.state), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_set_vring_num(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return 
vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring); +} + +static int vhost_user_set_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring); +} + +static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable) +{ + int i; + + if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) { + return -1; + } + + for (i = 0; i < dev->nvqs; ++i) { + struct vhost_vring_state state = { + .index = dev->vq_index + i, + .num = enable, + }; + + vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state); + } + + return 0; +} + +static int vhost_user_get_vring_base(struct vhost_dev *dev, + struct vhost_vring_state *ring) +{ + VhostUserMsg msg = { + .request = VHOST_USER_GET_VRING_BASE, + .flags = VHOST_USER_VERSION, + .payload.state = *ring, + .size = sizeof(msg.payload.state), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + if (vhost_user_read(dev, &msg) < 0) { return 0; } - if (need_reply) { - if (vhost_user_read(dev, &msg) < 0) { - return 0; - } + if (msg.request != VHOST_USER_GET_VRING_BASE) { + error_report("Received unexpected msg type. Expected %d received %d", + VHOST_USER_GET_VRING_BASE, msg.request); + return -1; + } - if (msg_request != msg.request) { - error_report("Received unexpected msg type." - " Expected %d received %d", msg_request, msg.request); - return -1; - } + if (msg.size != sizeof(msg.payload.state)) { + error_report("Received bad msg size."); + return -1; + } - switch (msg_request) { - case VHOST_USER_GET_FEATURES: - if (msg.size != sizeof(m.u64)) { - error_report("Received bad msg size."); - return -1; - } - *((__u64 *) arg) = msg.u64; - break; - case VHOST_USER_GET_VRING_BASE: - if (msg.size != sizeof(m.state)) { - error_report("Received bad msg size."); - return -1; - } - memcpy(arg, &msg.state, sizeof(struct vhost_vring_state)); - break; - default: - error_report("Received unexpected msg type."); - return -1; - break; - } + *ring = msg.payload.state; + + return 0; +} + +static int vhost_set_vring_file(struct vhost_dev *dev, + VhostUserRequest request, + struct vhost_vring_file *file) +{ + int fds[VHOST_MEMORY_MAX_NREGIONS]; + size_t fd_num = 0; + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK, + .size = sizeof(msg.payload.u64), + }; + + if (ioeventfd_enabled() && file->fd > 0) { + fds[fd_num++] = file->fd; + } else { + msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK; } + vhost_user_write(dev, &msg, fds, fd_num); + + return 0; +} + +static int vhost_user_set_vring_kick(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file); +} + +static int vhost_user_set_vring_call(struct vhost_dev *dev, + struct vhost_vring_file *file) +{ + return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file); +} + +static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64) +{ + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + .payload.u64 = u64, + .size = sizeof(msg.payload.u64), + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_set_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES, features); +} + +static int vhost_user_set_protocol_features(struct vhost_dev *dev, + uint64_t features) +{ + return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features); +} + +static int 
vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64) +{ + VhostUserMsg msg = { + .request = request, + .flags = VHOST_USER_VERSION, + }; + + if (vhost_user_one_time_request(request) && dev->vq_index != 0) { + return 0; + } + + vhost_user_write(dev, &msg, NULL, 0); + + if (vhost_user_read(dev, &msg) < 0) { + return 0; + } + + if (msg.request != request) { + error_report("Received unexpected msg type. Expected %d received %d", + request, msg.request); + return -1; + } + + if (msg.size != sizeof(msg.payload.u64)) { + error_report("Received bad msg size."); + return -1; + } + + *u64 = msg.payload.u64; + + return 0; +} + +static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features) +{ + return vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features); +} + +static int vhost_user_set_owner(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .request = VHOST_USER_SET_OWNER, + .flags = VHOST_USER_VERSION, + }; + + vhost_user_write(dev, &msg, NULL, 0); + + return 0; +} + +static int vhost_user_reset_device(struct vhost_dev *dev) +{ + VhostUserMsg msg = { + .request = VHOST_USER_RESET_OWNER, + .flags = VHOST_USER_VERSION, + }; + + vhost_user_write(dev, &msg, NULL, 0); + return 0; } static int vhost_user_init(struct vhost_dev *dev, void *opaque) { + uint64_t features; + int err; + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); dev->opaque = opaque; + err = vhost_user_get_features(dev, &features); + if (err < 0) { + return err; + } + + if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) { + dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES; + + err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES, + &features); + if (err < 0) { + return err; + } + + dev->protocol_features = features & VHOST_USER_PROTOCOL_FEATURE_MASK; + err = vhost_user_set_protocol_features(dev, dev->protocol_features); + if (err < 0) { + return err; + } + + /* query the max queues we support if backend supports Multiple Queue */ + if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) { + err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM, + &dev->max_queues); + if (err < 0) { + return err; + } + } + } + + if (dev->migration_blocker == NULL && + !virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD)) { + error_setg(&dev->migration_blocker, + "Migration disabled: vhost-user backend lacks " + "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature."); + } + return 0; } @@ -343,9 +566,92 @@ static int vhost_user_cleanup(struct vhost_dev *dev) return 0; } +static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx) +{ + assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); + + return idx; +} + +static int vhost_user_memslots_limit(struct vhost_dev *dev) +{ + return VHOST_MEMORY_MAX_NREGIONS; +} + +static bool vhost_user_requires_shm_log(struct vhost_dev *dev) +{ + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + return virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_LOG_SHMFD); +} + +static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr) +{ + VhostUserMsg msg = { 0 }; + int err; + + assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER); + + /* If guest supports GUEST_ANNOUNCE do nothing */ + if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) { + return 0; + } + + /* if backend supports VHOST_USER_PROTOCOL_F_RARP ask it to send the RARP */ + if (virtio_has_feature(dev->protocol_features, + VHOST_USER_PROTOCOL_F_RARP)) 
{ + msg.request = VHOST_USER_SEND_RARP; + msg.flags = VHOST_USER_VERSION; + memcpy((char *)&msg.payload.u64, mac_addr, 6); + msg.size = sizeof(msg.payload.u64); + + err = vhost_user_write(dev, &msg, NULL, 0); + return err; + } + return -1; +} + +static bool vhost_user_can_merge(struct vhost_dev *dev, + uint64_t start1, uint64_t size1, + uint64_t start2, uint64_t size2) +{ + ram_addr_t ram_addr; + int mfd, rfd; + MemoryRegion *mr; + + mr = qemu_ram_addr_from_host((void *)(uintptr_t)start1, &ram_addr); + assert(mr); + mfd = qemu_get_ram_fd(ram_addr); + + mr = qemu_ram_addr_from_host((void *)(uintptr_t)start2, &ram_addr); + assert(mr); + rfd = qemu_get_ram_fd(ram_addr); + + return mfd == rfd; +} + const VhostOps user_ops = { .backend_type = VHOST_BACKEND_TYPE_USER, - .vhost_call = vhost_user_call, .vhost_backend_init = vhost_user_init, - .vhost_backend_cleanup = vhost_user_cleanup - }; + .vhost_backend_cleanup = vhost_user_cleanup, + .vhost_backend_memslots_limit = vhost_user_memslots_limit, + .vhost_set_log_base = vhost_user_set_log_base, + .vhost_set_mem_table = vhost_user_set_mem_table, + .vhost_set_vring_addr = vhost_user_set_vring_addr, + .vhost_set_vring_endian = vhost_user_set_vring_endian, + .vhost_set_vring_num = vhost_user_set_vring_num, + .vhost_set_vring_base = vhost_user_set_vring_base, + .vhost_get_vring_base = vhost_user_get_vring_base, + .vhost_set_vring_kick = vhost_user_set_vring_kick, + .vhost_set_vring_call = vhost_user_set_vring_call, + .vhost_set_features = vhost_user_set_features, + .vhost_get_features = vhost_user_get_features, + .vhost_set_owner = vhost_user_set_owner, + .vhost_reset_device = vhost_user_reset_device, + .vhost_get_vq_index = vhost_user_get_vq_index, + .vhost_set_vring_enable = vhost_user_set_vring_enable, + .vhost_requires_shm_log = vhost_user_requires_shm_log, + .vhost_migration_done = vhost_user_migration_done, + .vhost_backend_can_merge = vhost_user_can_merge, +}; diff --git a/qemu/hw/virtio/vhost.c b/qemu/hw/virtio/vhost.c index 2712c6fc0..440071815 100644 --- a/qemu/hw/virtio/vhost.c +++ b/qemu/hw/virtio/vhost.c @@ -13,11 +13,14 @@ * GNU GPL, version 2 or (at your option) any later version. 
*/ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "hw/virtio/vhost.h" #include "hw/hw.h" #include "qemu/atomic.h" #include "qemu/range.h" #include "qemu/error-report.h" +#include "qemu/memfd.h" #include <linux/vhost.h> #include "exec/address-spaces.h" #include "hw/virtio/virtio-bus.h" @@ -25,6 +28,23 @@ #include "migration/migration.h" static struct vhost_log *vhost_log; +static struct vhost_log *vhost_log_shm; + +static unsigned int used_memslots; +static QLIST_HEAD(, vhost_dev) vhost_devices = + QLIST_HEAD_INITIALIZER(vhost_devices); + +bool vhost_has_free_slot(void) +{ + unsigned int slots_limit = ~0U; + struct vhost_dev *hdev; + + QLIST_FOREACH(hdev, &vhost_devices, entry) { + unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev); + slots_limit = MIN(slots_limit, r); + } + return slots_limit > used_memslots; +} static void vhost_dev_sync_region(struct vhost_dev *dev, MemoryRegionSection *section, @@ -241,6 +261,13 @@ static void vhost_dev_assign_memory(struct vhost_dev *dev, continue; } + if (dev->vhost_ops->vhost_backend_can_merge && + !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size, + reg->userspace_addr, + reg->memory_size)) { + continue; + } + if (merged) { --to; assert(to >= 0); @@ -286,25 +313,46 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev) } return log_size; } -static struct vhost_log *vhost_log_alloc(uint64_t size) + +static struct vhost_log *vhost_log_alloc(uint64_t size, bool share) { - struct vhost_log *log = g_malloc0(sizeof *log + size * sizeof(*(log->log))); + struct vhost_log *log; + uint64_t logsize = size * sizeof(*(log->log)); + int fd = -1; + + log = g_new0(struct vhost_log, 1); + if (share) { + log->log = qemu_memfd_alloc("vhost-log", logsize, + F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL, + &fd); + memset(log->log, 0, logsize); + } else { + log->log = g_malloc0(logsize); + } log->size = size; log->refcnt = 1; + log->fd = fd; return log; } -static struct vhost_log *vhost_log_get(uint64_t size) +static struct vhost_log *vhost_log_get(uint64_t size, bool share) { - if (!vhost_log || vhost_log->size != size) { - vhost_log = vhost_log_alloc(size); + struct vhost_log *log = share ? 
vhost_log_shm : vhost_log; + + if (!log || log->size != size) { + log = vhost_log_alloc(size, share); + if (share) { + vhost_log_shm = log; + } else { + vhost_log = log; + } } else { - ++vhost_log->refcnt; + ++log->refcnt; } - return vhost_log; + return log; } static void vhost_log_put(struct vhost_dev *dev, bool sync) @@ -321,20 +369,35 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync) if (dev->log_size && sync) { vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1); } + if (vhost_log == log) { + g_free(log->log); vhost_log = NULL; + } else if (vhost_log_shm == log) { + qemu_memfd_free(log->log, log->size * sizeof(*(log->log)), + log->fd); + vhost_log_shm = NULL; } + g_free(log); } } -static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size) +static bool vhost_dev_log_is_shared(struct vhost_dev *dev) { - struct vhost_log *log = vhost_log_get(size); + return dev->vhost_ops->vhost_requires_shm_log && + dev->vhost_ops->vhost_requires_shm_log(dev); +} + +static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size) +{ + struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev)); uint64_t log_base = (uintptr_t)log->log; int r; - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_LOG_BASE, &log_base); + /* inform backend of log switching, this must be done before + releasing the current log, to ensure no logging is lost */ + r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log); assert(r >= 0); vhost_log_put(dev, true); dev->log = log; @@ -457,6 +520,7 @@ static void vhost_set_memory(MemoryListener *listener, dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr); dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1); dev->memory_changed = true; + used_memslots = dev->mem->nregions; } static bool vhost_section(MemoryRegionSection *section) @@ -500,7 +564,7 @@ static void vhost_commit(MemoryListener *listener) } if (!dev->log_enabled) { - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem); + r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); assert(r >= 0); dev->memory_changed = false; return; @@ -513,7 +577,7 @@ static void vhost_commit(MemoryListener *listener) if (dev->log_size < log_size) { vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER); } - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem); + r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem); assert(r >= 0); /* To log less, can only decrease log size after table update. */ if (dev->log_size > log_size + VHOST_LOG_BUFFER) { @@ -581,7 +645,7 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev, .log_guest_addr = vq->used_phys, .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0, }; - int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr); + int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr); if (r < 0) { return -errno; } @@ -595,19 +659,20 @@ static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log) if (enable_log) { features |= 0x1ULL << VHOST_F_LOG_ALL; } - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features); + r = dev->vhost_ops->vhost_set_features(dev, features); return r < 0 ? 
-errno : 0; } static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) { - int r, t, i; + int r, t, i, idx; r = vhost_dev_set_features(dev, enable_log); if (r < 0) { goto err_features; } for (i = 0; i < dev->nvqs; ++i) { - r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, + idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); + r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, enable_log); if (r < 0) { goto err_vq; @@ -616,7 +681,8 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log) return 0; err_vq: for (; i >= 0; --i) { - t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i, + idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i); + t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx, dev->log_enabled); assert(t >= 0); } @@ -691,6 +757,27 @@ static void vhost_log_stop(MemoryListener *listener, /* FIXME: implement */ } +/* The vhost driver natively knows how to handle the vrings of non + * cross-endian legacy devices and modern devices. Only legacy devices + * exposed to a bi-endian guest may require the vhost driver to use a + * specific endianness. + */ +static inline bool vhost_needs_vring_endian(VirtIODevice *vdev) +{ + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { + return false; + } +#ifdef TARGET_IS_BIENDIAN +#ifdef HOST_WORDS_BIGENDIAN + return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE; +#else + return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG; +#endif +#else + return false; +#endif +} + static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, bool is_big_endian, int vhost_vq_index) @@ -700,7 +787,7 @@ static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev, .num = is_big_endian }; - if (!dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ENDIAN, &s)) { + if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) { return 0; } @@ -719,7 +806,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, { hwaddr s, l, a; int r; - int vhost_vq_index = idx - dev->vq_index; + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); struct vhost_vring_file file = { .index = vhost_vq_index }; @@ -728,22 +815,20 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, }; struct VirtQueue *vvq = virtio_get_queue(vdev, idx); - assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); vq->num = state.num = virtio_queue_get_num(vdev, idx); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state); + r = dev->vhost_ops->vhost_set_vring_num(dev, &state); if (r) { return -errno; } state.num = virtio_queue_get_last_avail_idx(vdev, idx); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state); + r = dev->vhost_ops->vhost_set_vring_base(dev, &state); if (r) { return -errno; } - if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) && - virtio_legacy_is_cross_endian(vdev)) { + if (vhost_needs_vring_endian(vdev)) { r = vhost_virtqueue_set_vring_endian_legacy(dev, virtio_is_big_endian(vdev), vhost_vq_index); @@ -789,7 +874,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, } file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq)); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file); + r = dev->vhost_ops->vhost_set_vring_kick(dev, &file); if (r) { r = -errno; goto fail_kick; @@ -798,6 +883,14 @@ static int vhost_virtqueue_start(struct vhost_dev *dev, /* Clear and discard previous events if any. 
*/ event_notifier_test_and_clear(&vq->masked_notifier); + /* Init vring in unmasked state, unless guest_notifier_mask + * will do it later. + */ + if (!vdev->use_guest_notifier_mask) { + /* TODO: check and handle errors. */ + vhost_virtqueue_mask(dev, vdev, idx, false); + } + return 0; fail_kick: @@ -822,13 +915,13 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev, struct vhost_virtqueue *vq, unsigned idx) { - int vhost_vq_index = idx - dev->vq_index; + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx); struct vhost_vring_state state = { .index = vhost_vq_index, }; int r; - assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs); - r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state); + + r = dev->vhost_ops->vhost_get_vring_base(dev, &state); if (r < 0) { fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r); fflush(stderr); @@ -839,8 +932,7 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev, /* In the cross-endian case, we need to reset the vring endianness to * native as legacy devices expect so by default. */ - if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) && - virtio_legacy_is_cross_endian(vdev)) { + if (vhost_needs_vring_endian(vdev)) { r = vhost_virtqueue_set_vring_endian_legacy(dev, !virtio_is_big_endian(vdev), vhost_vq_index); @@ -875,8 +967,9 @@ static void vhost_eventfd_del(MemoryListener *listener, static int vhost_virtqueue_init(struct vhost_dev *dev, struct vhost_virtqueue *vq, int n) { + int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n); struct vhost_vring_file file = { - .index = n, + .index = vhost_vq_index, }; int r = event_notifier_init(&vq->masked_notifier, 0); if (r < 0) { @@ -884,7 +977,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, } file.fd = event_notifier_get_fd(&vq->masked_notifier); - r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file); + r = dev->vhost_ops->vhost_set_vring_call(dev, &file); if (r) { r = -errno; goto fail_call; @@ -906,6 +999,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, uint64_t features; int i, r; + hdev->migration_blocker = NULL; + if (vhost_set_backend_type(hdev, backend_type) < 0) { close((uintptr_t)opaque); return -1; @@ -916,18 +1011,26 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, return -errno; } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL); + if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) { + fprintf(stderr, "vhost backend memory slots limit is less" + " than current number of present memory slots\n"); + close((uintptr_t)opaque); + return -1; + } + QLIST_INSERT_HEAD(&vhost_devices, hdev, entry); + + r = hdev->vhost_ops->vhost_set_owner(hdev); if (r < 0) { goto fail; } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features); + r = hdev->vhost_ops->vhost_get_features(hdev, &features); if (r < 0) { goto fail; } for (i = 0; i < hdev->nvqs; ++i) { - r = vhost_virtqueue_init(hdev, hdev->vqs + i, i); + r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i); if (r < 0) { goto fail_vq; } @@ -949,12 +1052,21 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque, .eventfd_del = vhost_eventfd_del, .priority = 10 }; - hdev->migration_blocker = NULL; - if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { - error_setg(&hdev->migration_blocker, - "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); + + if (hdev->migration_blocker == NULL) { + if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) { + error_setg(&hdev->migration_blocker, 
+ "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature."); + } else if (!qemu_memfd_check()) { + error_setg(&hdev->migration_blocker, + "Migration disabled: failed to allocate shared memory"); + } + } + + if (hdev->migration_blocker != NULL) { migrate_add_blocker(hdev->migration_blocker); } + hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions)); hdev->n_mem_sections = 0; hdev->mem_sections = NULL; @@ -972,6 +1084,7 @@ fail_vq: fail: r = -errno; hdev->vhost_ops->vhost_backend_cleanup(hdev); + QLIST_REMOVE(hdev, entry); return r; } @@ -989,6 +1102,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev) g_free(hdev->mem); g_free(hdev->mem_sections); hdev->vhost_ops->vhost_backend_cleanup(hdev); + QLIST_REMOVE(hdev, entry); } /* Stop processing guest IO notifications in qemu. @@ -1066,18 +1180,17 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n, { struct VirtQueue *vvq = virtio_get_queue(vdev, n); int r, index = n - hdev->vq_index; + struct vhost_vring_file file; - assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs); - - struct vhost_vring_file file = { - .index = index - }; if (mask) { + assert(vdev->use_guest_notifier_mask); file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier); } else { file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq)); } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file); + + file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n); + r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file); assert(r >= 0); } @@ -1119,7 +1232,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) if (r < 0) { goto fail_features; } - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem); + r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem); if (r < 0) { r = -errno; goto fail_mem; @@ -1138,10 +1251,12 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev) uint64_t log_base; hdev->log_size = vhost_get_log_size(hdev); - hdev->log = vhost_log_get(hdev->log_size); + hdev->log = vhost_log_get(hdev->log_size, + vhost_dev_log_is_shared(hdev)); log_base = (uintptr_t)hdev->log->log; - r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_LOG_BASE, - hdev->log_size ? &log_base : NULL); + r = hdev->vhost_ops->vhost_set_log_base(hdev, + hdev->log_size ? log_base : 0, + hdev->log); if (r < 0) { r = -errno; goto fail_log; diff --git a/qemu/hw/virtio/virtio-balloon.c b/qemu/hw/virtio/virtio-balloon.c index 3577b7af9..9dbe68179 100644 --- a/qemu/hw/virtio/virtio-balloon.c +++ b/qemu/hw/virtio/virtio-balloon.c @@ -13,12 +13,12 @@ * */ +#include "qemu/osdep.h" #include "qemu/iov.h" #include "qemu/timer.h" #include "qemu-common.h" #include "hw/virtio/virtio.h" #include "hw/i386/pc.h" -#include "cpu.h" #include "sysemu/balloon.h" #include "hw/virtio/virtio-balloon.h" #include "sysemu/kvm.h" @@ -34,12 +34,16 @@ #include "hw/virtio/virtio-bus.h" #include "hw/virtio/virtio-access.h" +#define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) + static void balloon_page(void *addr, int deflate) { #if defined(__linux__) - if (!kvm_enabled() || kvm_has_sync_mmu()) - qemu_madvise(addr, TARGET_PAGE_SIZE, + if (!qemu_balloon_is_inhibited() && (!kvm_enabled() || + kvm_has_sync_mmu())) { + qemu_madvise(addr, BALLOON_PAGE_SIZE, deflate ? 
QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED); + } #endif } @@ -50,6 +54,7 @@ static const char *balloon_stat_names[] = { [VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults", [VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory", [VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory", + [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory", [VIRTIO_BALLOON_S_NR] = NULL }; @@ -70,7 +75,7 @@ static inline void reset_stats(VirtIOBalloon *dev) static bool balloon_stats_supported(const VirtIOBalloon *s) { VirtIODevice *vdev = VIRTIO_DEVICE(s); - return virtio_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); + return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ); } static bool balloon_stats_enabled(const VirtIOBalloon *s) @@ -98,39 +103,43 @@ static void balloon_stats_poll_cb(void *opaque) VirtIOBalloon *s = opaque; VirtIODevice *vdev = VIRTIO_DEVICE(s); - if (!balloon_stats_supported(s)) { + if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) { /* re-schedule */ balloon_stats_change_timer(s, s->stats_poll_interval); return; } - virtqueue_push(s->svq, &s->stats_vq_elem, s->stats_vq_offset); + virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset); virtio_notify(vdev, s->svq); + g_free(s->stats_vq_elem); + s->stats_vq_elem = NULL; } -static void balloon_stats_get_all(Object *obj, struct Visitor *v, - void *opaque, const char *name, Error **errp) +static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) { Error *err = NULL; VirtIOBalloon *s = opaque; int i; - visit_start_struct(v, NULL, "guest-stats", name, 0, &err); + visit_start_struct(v, name, NULL, 0, &err); if (err) { goto out; } - visit_type_int(v, &s->stats_last_update, "last-update", &err); + visit_type_int(v, "last-update", &s->stats_last_update, &err); if (err) { goto out_end; } - visit_start_struct(v, NULL, NULL, "stats", 0, &err); + visit_start_struct(v, "stats", NULL, 0, &err); if (err) { goto out_end; } - for (i = 0; !err && i < VIRTIO_BALLOON_S_NR; i++) { - visit_type_int64(v, (int64_t *) &s->stats[i], balloon_stat_names[i], - &err); + for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) { + visit_type_uint64(v, balloon_stat_names[i], &s->stats[i], &err); + if (err) { + break; + } } error_propagate(errp, err); err = NULL; @@ -144,23 +153,23 @@ out: error_propagate(errp, err); } -static void balloon_stats_get_poll_interval(Object *obj, struct Visitor *v, - void *opaque, const char *name, +static void balloon_stats_get_poll_interval(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) { VirtIOBalloon *s = opaque; - visit_type_int(v, &s->stats_poll_interval, name, errp); + visit_type_int(v, name, &s->stats_poll_interval, errp); } -static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v, - void *opaque, const char *name, +static void balloon_stats_set_poll_interval(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) { VirtIOBalloon *s = opaque; Error *local_err = NULL; int64_t value; - visit_type_int(v, &value, name, &local_err); + visit_type_int(v, name, &value, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -203,14 +212,18 @@ static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v, static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); - VirtQueueElement elem; + VirtQueueElement *elem; MemoryRegionSection section; - while (virtqueue_pop(vq, &elem)) { + for (;;) { size_t offset = 0; uint32_t pfn; + elem = virtqueue_pop(vq, 
sizeof(VirtQueueElement)); + if (!elem) { + return; + } - while (iov_to_buf(elem.out_sg, elem.out_num, offset, &pfn, 4) == 4) { + while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) { ram_addr_t pa; ram_addr_t addr; int p = virtio_ldl_p(vdev, &pfn); @@ -233,23 +246,34 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq) memory_region_unref(section.mr); } - virtqueue_push(vq, &elem, offset); + virtqueue_push(vq, elem, offset); virtio_notify(vdev, vq); + g_free(elem); } } static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq) { VirtIOBalloon *s = VIRTIO_BALLOON(vdev); - VirtQueueElement *elem = &s->stats_vq_elem; + VirtQueueElement *elem; VirtIOBalloonStat stat; size_t offset = 0; qemu_timeval tv; - if (!virtqueue_pop(vq, elem)) { + elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); + if (!elem) { goto out; } + if (s->stats_vq_elem != NULL) { + /* This should never happen if the driver follows the spec. */ + virtqueue_push(vq, s->stats_vq_elem, 0); + virtio_notify(vdev, vq); + g_free(s->stats_vq_elem); + } + + s->stats_vq_elem = elem; + /* Initialize the stats to get rid of any stale values. This is only * needed to handle the case where a guest supports fewer stats than it * used to (ie. it has booted into an old kernel). @@ -292,6 +316,39 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data) memcpy(config_data, &config, sizeof(struct virtio_balloon_config)); } +static int build_dimm_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_PC_DIMM)) { + DeviceState *dev = DEVICE(obj); + if (dev->realized) { /* only realized DIMMs matter */ + *list = g_slist_prepend(*list, dev); + } + } + + object_child_foreach(obj, build_dimm_list, opaque); + return 0; +} + +static ram_addr_t get_current_ram_size(void) +{ + GSList *list = NULL, *item; + ram_addr_t size = ram_size; + + build_dimm_list(qdev_get_machine(), &list); + for (item = list; item; item = g_slist_next(item)) { + Object *obj = OBJECT(item->data); + if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) { + size += object_property_get_int(obj, PC_DIMM_SIZE_PROP, + &error_abort); + } + } + g_slist_free(list); + + return size; +} + static void virtio_balloon_set_config(VirtIODevice *vdev, const uint8_t *config_data) { @@ -370,6 +427,10 @@ static int virtio_balloon_load_device(VirtIODevice *vdev, QEMUFile *f, s->num_pages = qemu_get_be32(f); s->actual = qemu_get_be32(f); + + if (balloon_stats_enabled(s)) { + balloon_stats_change_timer(s, s->stats_poll_interval); + } return 0; } @@ -412,6 +473,16 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp) virtio_cleanup(vdev); } +static void virtio_balloon_device_reset(VirtIODevice *vdev) +{ + VirtIOBalloon *s = VIRTIO_BALLOON(vdev); + + if (s->stats_vq_elem != NULL) { + g_free(s->stats_vq_elem); + s->stats_vq_elem = NULL; + } +} + static void virtio_balloon_instance_init(Object *obj) { VirtIOBalloon *s = VIRTIO_BALLOON(obj); @@ -440,6 +511,7 @@ static void virtio_balloon_class_init(ObjectClass *klass, void *data) set_bit(DEVICE_CATEGORY_MISC, dc->categories); vdc->realize = virtio_balloon_device_realize; vdc->unrealize = virtio_balloon_device_unrealize; + vdc->reset = virtio_balloon_device_reset; vdc->get_config = virtio_balloon_get_config; vdc->set_config = virtio_balloon_set_config; vdc->get_features = virtio_balloon_get_features; diff --git a/qemu/hw/virtio/virtio-bus.c b/qemu/hw/virtio/virtio-bus.c index febda76b9..574f0e23f 100644 --- 
a/qemu/hw/virtio/virtio-bus.c +++ b/qemu/hw/virtio/virtio-bus.c @@ -22,6 +22,7 @@ * */ +#include "qemu/osdep.h" #include "hw/hw.h" #include "qemu/error-report.h" #include "hw/qdev.h" @@ -56,6 +57,9 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp) assert(vdc->get_features != NULL); vdev->host_features = vdc->get_features(vdev, vdev->host_features, errp); + if (klass->post_plugged != NULL) { + klass->post_plugged(qbus->parent, errp); + } } /* Reset the virtio_bus */ diff --git a/qemu/hw/virtio/virtio-mmio.c b/qemu/hw/virtio/virtio-mmio.c index 18660b07b..d4cd91f8c 100644 --- a/qemu/hw/virtio/virtio-mmio.c +++ b/qemu/hw/virtio/virtio-mmio.c @@ -19,6 +19,7 @@ * with this program; if not, see <http://www.gnu.org/licenses/>. */ +#include "qemu/osdep.h" #include "hw/sysbus.h" #include "hw/virtio/virtio.h" #include "qemu/host-utils.h" diff --git a/qemu/hw/virtio/virtio-pci.c b/qemu/hw/virtio/virtio-pci.c index c024161f5..bfedbbf17 100644 --- a/qemu/hw/virtio/virtio-pci.c +++ b/qemu/hw/virtio/virtio-pci.c @@ -15,7 +15,7 @@ * GNU GPL, version 2 or (at your option) any later version. */ -#include <inttypes.h> +#include "qemu/osdep.h" #include "standard-headers/linux/virtio_pci.h" #include "hw/virtio/virtio.h" @@ -26,6 +26,7 @@ #include "hw/virtio/virtio-balloon.h" #include "hw/virtio/virtio-input.h" #include "hw/pci/pci.h" +#include "qapi/error.h" #include "qemu/error-report.h" #include "hw/pci/msi.h" #include "hw/pci/msix.h" @@ -47,6 +48,7 @@ static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size, VirtIOPCIProxy *dev); +static void virtio_pci_reset(DeviceState *qdev); /* virtio device */ /* DeviceState to VirtIOPCIProxy. For use off data-path. TODO: use QOM. */ @@ -86,6 +88,129 @@ static void virtio_pci_save_config(DeviceState *d, QEMUFile *f) qemu_put_be16(f, vdev->config_vector); } +static void virtio_pci_load_modern_queue_state(VirtIOPCIQueue *vq, + QEMUFile *f) +{ + vq->num = qemu_get_be16(f); + vq->enabled = qemu_get_be16(f); + vq->desc[0] = qemu_get_be32(f); + vq->desc[1] = qemu_get_be32(f); + vq->avail[0] = qemu_get_be32(f); + vq->avail[1] = qemu_get_be32(f); + vq->used[0] = qemu_get_be32(f); + vq->used[1] = qemu_get_be32(f); +} + +static bool virtio_pci_has_extra_state(DeviceState *d) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + return proxy->flags & VIRTIO_PCI_FLAG_MIGRATE_EXTRA; +} + +static int get_virtio_pci_modern_state(QEMUFile *f, void *pv, size_t size) +{ + VirtIOPCIProxy *proxy = pv; + int i; + + proxy->dfselect = qemu_get_be32(f); + proxy->gfselect = qemu_get_be32(f); + proxy->guest_features[0] = qemu_get_be32(f); + proxy->guest_features[1] = qemu_get_be32(f); + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + virtio_pci_load_modern_queue_state(&proxy->vqs[i], f); + } + + return 0; +} + +static void virtio_pci_save_modern_queue_state(VirtIOPCIQueue *vq, + QEMUFile *f) +{ + qemu_put_be16(f, vq->num); + qemu_put_be16(f, vq->enabled); + qemu_put_be32(f, vq->desc[0]); + qemu_put_be32(f, vq->desc[1]); + qemu_put_be32(f, vq->avail[0]); + qemu_put_be32(f, vq->avail[1]); + qemu_put_be32(f, vq->used[0]); + qemu_put_be32(f, vq->used[1]); +} + +static void put_virtio_pci_modern_state(QEMUFile *f, void *pv, size_t size) +{ + VirtIOPCIProxy *proxy = pv; + int i; + + qemu_put_be32(f, proxy->dfselect); + qemu_put_be32(f, proxy->gfselect); + qemu_put_be32(f, proxy->guest_features[0]); + qemu_put_be32(f, proxy->guest_features[1]); + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + virtio_pci_save_modern_queue_state(&proxy->vqs[i], f); + } +} + +static const 
VMStateInfo vmstate_info_virtio_pci_modern_state = { + .name = "virtqueue_state", + .get = get_virtio_pci_modern_state, + .put = put_virtio_pci_modern_state, +}; + +static bool virtio_pci_modern_state_needed(void *opaque) +{ + VirtIOPCIProxy *proxy = opaque; + + return !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN); +} + +static const VMStateDescription vmstate_virtio_pci_modern_state = { + .name = "virtio_pci/modern_state", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_pci_modern_state_needed, + .fields = (VMStateField[]) { + { + .name = "modern_state", + .version_id = 0, + .field_exists = NULL, + .size = 0, + .info = &vmstate_info_virtio_pci_modern_state, + .flags = VMS_SINGLE, + .offset = 0, + }, + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_pci = { + .name = "virtio_pci", + .version_id = 1, + .minimum_version_id = 1, + .minimum_version_id_old = 1, + .fields = (VMStateField[]) { + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_virtio_pci_modern_state, + NULL + } +}; + +static void virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL); +} + +static int virtio_pci_load_extra_state(DeviceState *d, QEMUFile *f) +{ + VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); + + return vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1); +} + static void virtio_pci_save_queue(DeviceState *d, int n, QEMUFile *f) { VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d); @@ -133,6 +258,7 @@ static int virtio_pci_load_queue(DeviceState *d, int n, QEMUFile *f) if (vector != VIRTIO_NO_VECTOR) { return msix_vector_use(&proxy->pci_dev, vector); } + return 0; } @@ -146,7 +272,10 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy, EventNotifier *notifier = virtio_queue_get_host_notifier(vq); bool legacy = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_LEGACY); bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN); + bool fast_mmio = kvm_ioeventfd_any_length_enabled(); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; MemoryRegion *modern_mr = &proxy->notify.mr; + MemoryRegion *modern_notify_mr = &proxy->notify_pio.mr; MemoryRegion *legacy_mr = &proxy->bar; hwaddr modern_addr = QEMU_VIRTIO_PCI_QUEUE_MEM_MULT * virtio_get_queue_index(vq); @@ -162,8 +291,17 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy, } virtio_queue_set_host_notifier_fd_handler(vq, true, set_handler); if (modern) { - memory_region_add_eventfd(modern_mr, modern_addr, 2, - true, n, notifier); + if (fast_mmio) { + memory_region_add_eventfd(modern_mr, modern_addr, 0, + false, n, notifier); + } else { + memory_region_add_eventfd(modern_mr, modern_addr, 2, + false, n, notifier); + } + if (modern_pio) { + memory_region_add_eventfd(modern_notify_mr, 0, 2, + true, n, notifier); + } } if (legacy) { memory_region_add_eventfd(legacy_mr, legacy_addr, 2, @@ -171,8 +309,17 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy, } } else { if (modern) { - memory_region_del_eventfd(modern_mr, modern_addr, 2, - true, n, notifier); + if (fast_mmio) { + memory_region_del_eventfd(modern_mr, modern_addr, 0, + false, n, notifier); + } else { + memory_region_del_eventfd(modern_mr, modern_addr, 2, + false, n, notifier); + } + if (modern_pio) { + memory_region_del_eventfd(modern_notify_mr, 0, 2, + true, n, notifier); + } } if (legacy) { memory_region_del_eventfd(legacy_mr, 
legacy_addr, 2, @@ -259,9 +406,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) case VIRTIO_PCI_QUEUE_PFN: pa = (hwaddr)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT; if (pa == 0) { - virtio_pci_stop_ioeventfd(proxy); - virtio_reset(vdev); - msix_unuse_all_vectors(&proxy->pci_dev); + virtio_pci_reset(DEVICE(proxy)); } else virtio_queue_set_addr(vdev, vdev->queue_sel, pa); @@ -287,8 +432,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) } if (vdev->status == 0) { - virtio_reset(vdev); - msix_unuse_all_vectors(&proxy->pci_dev); + virtio_pci_reset(DEVICE(proxy)); } /* Linux before 2.6.34 drives the device without enabling @@ -590,7 +734,7 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy, int ret; if (irqfd->users == 0) { - ret = kvm_irqchip_add_msi_route(kvm_state, msg); + ret = kvm_irqchip_add_msi_route(kvm_state, msg, &proxy->pci_dev); if (ret < 0) { return ret; } @@ -661,7 +805,7 @@ static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs) /* If guest supports masking, set up irqfd now. * Otherwise, delay until unmasked in the frontend. */ - if (k->guest_notifier_mask) { + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector); if (ret < 0) { kvm_virtio_pci_vq_vector_release(proxy, vector); @@ -677,7 +821,7 @@ undo: if (vector >= msix_nr_vectors_allocated(dev)) { continue; } - if (k->guest_notifier_mask) { + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { kvm_virtio_pci_irqfd_release(proxy, queue_no, vector); } kvm_virtio_pci_vq_vector_release(proxy, vector); @@ -704,7 +848,7 @@ static void kvm_virtio_pci_vector_release(VirtIOPCIProxy *proxy, int nvqs) /* If guest supports masking, clean up irqfd now. * Otherwise, it was cleaned when masked in the frontend. */ - if (k->guest_notifier_mask) { + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { kvm_virtio_pci_irqfd_release(proxy, queue_no, vector); } kvm_virtio_pci_vq_vector_release(proxy, vector); @@ -726,7 +870,8 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, if (proxy->vector_irqfd) { irqfd = &proxy->vector_irqfd[vector]; if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) { - ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg); + ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg, + &proxy->pci_dev); if (ret < 0) { return ret; } @@ -736,7 +881,7 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy, /* If guest supports masking, irqfd is already setup, unmask it. * Otherwise, set it up now. */ - if (k->guest_notifier_mask) { + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { k->guest_notifier_mask(vdev, queue_no, false); /* Test after unmasking to avoid losing events. */ if (k->guest_notifier_pending && @@ -759,7 +904,7 @@ static void virtio_pci_vq_vector_mask(VirtIOPCIProxy *proxy, /* If guest supports masking, keep irqfd but mask it. * Otherwise, clean it up now. 
*/ - if (k->guest_notifier_mask) { + if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) { k->guest_notifier_mask(vdev, queue_no, true); } else { kvm_virtio_pci_irqfd_release(proxy, queue_no, vector); @@ -876,7 +1021,9 @@ static int virtio_pci_set_guest_notifier(DeviceState *d, int n, bool assign, event_notifier_cleanup(notifier); } - if (!msix_enabled(&proxy->pci_dev) && vdc->guest_notifier_mask) { + if (!msix_enabled(&proxy->pci_dev) && + vdev->use_guest_notifier_mask && + vdc->guest_notifier_mask) { vdc->guest_notifier_mask(vdev, n, !assign); } @@ -1205,8 +1352,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, } if (vdev->status == 0) { - virtio_reset(vdev); - msix_unuse_all_vectors(&proxy->pci_dev); + virtio_pci_reset(DEVICE(proxy)); } break; @@ -1238,6 +1384,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr, proxy->vqs[vdev->queue_sel].avail[0], ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 | proxy->vqs[vdev->queue_sel].used[0]); + proxy->vqs[vdev->queue_sel].enabled = 1; break; case VIRTIO_PCI_COMMON_Q_DESCLO: proxy->vqs[vdev->queue_sel].desc[0] = val; @@ -1280,6 +1427,17 @@ static void virtio_pci_notify_write(void *opaque, hwaddr addr, } } +static void virtio_pci_notify_write_pio(void *opaque, hwaddr addr, + uint64_t val, unsigned size) +{ + VirtIODevice *vdev = opaque; + unsigned queue = val; + + if (queue < VIRTIO_QUEUE_MAX) { + virtio_queue_notify(vdev, queue); + } +} + static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr, unsigned size) { @@ -1373,6 +1531,16 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy) }, .endianness = DEVICE_LITTLE_ENDIAN, }; + static const MemoryRegionOps notify_pio_ops = { + .read = virtio_pci_notify_read, + .write = virtio_pci_notify_write_pio, + .impl = { + .min_access_size = 1, + .max_access_size = 4, + }, + .endianness = DEVICE_LITTLE_ENDIAN, + }; + memory_region_init_io(&proxy->common.mr, OBJECT(proxy), &common_ops, @@ -1397,30 +1565,60 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy) virtio_bus_get_device(&proxy->bus), "virtio-pci-notify", proxy->notify.size); + + memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy), + &notify_pio_ops, + virtio_bus_get_device(&proxy->bus), + "virtio-pci-notify-pio", + proxy->notify.size); } static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy, VirtIOPCIRegion *region, - struct virtio_pci_cap *cap) + struct virtio_pci_cap *cap, + MemoryRegion *mr, + uint8_t bar) { - memory_region_add_subregion(&proxy->modern_bar, - region->offset, - &region->mr); + memory_region_add_subregion(mr, region->offset, &region->mr); cap->cfg_type = region->type; - cap->bar = proxy->modern_mem_bar; + cap->bar = bar; cap->offset = cpu_to_le32(region->offset); cap->length = cpu_to_le32(region->size); virtio_pci_add_mem_cap(proxy, cap); + +} + +static void virtio_pci_modern_mem_region_map(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region, + struct virtio_pci_cap *cap) +{ + virtio_pci_modern_region_map(proxy, region, cap, + &proxy->modern_bar, proxy->modern_mem_bar); } -static void virtio_pci_modern_region_unmap(VirtIOPCIProxy *proxy, - VirtIOPCIRegion *region) +static void virtio_pci_modern_io_region_map(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region, + struct virtio_pci_cap *cap) +{ + virtio_pci_modern_region_map(proxy, region, cap, + &proxy->io_bar, proxy->modern_io_bar); +} + +static void virtio_pci_modern_mem_region_unmap(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region) { memory_region_del_subregion(&proxy->modern_bar, &region->mr); }
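/*
 * A sketch, not part of the patch: the region_map refactor above boils
 * down to advertising each modern virtio-pci region to the guest as a
 * capability that says "config type T lives in BAR B at
 * [offset, offset+length)"; the new mr/bar parameters simply let one
 * helper target either the modern memory BAR or the new I/O BAR. The
 * struct below mirrors the tail of struct virtio_pci_cap from the
 * virtio 1.0 spec (PCI capability-list header omitted) and every value
 * in it is invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

struct cap_sketch {
    uint8_t  cfg_type;   /* e.g. 2 == VIRTIO_PCI_CAP_NOTIFY_CFG */
    uint8_t  bar;        /* PCI BAR that holds the region */
    uint32_t offset;     /* offset of the region within that BAR */
    uint32_t length;     /* size of the region */
};

int main(void)
{
    /* The same notify region, advertised once in a memory BAR and once
     * through a tiny 4-byte I/O BAR, as the modern_pio path below does. */
    struct cap_sketch mem_notify = { 2, 4, 0x3000, 0x1000 };
    struct cap_sketch pio_notify = { 2, 2, 0x0, 0x4 };

    printf("mem: type %u -> BAR %u [0x%x, +0x%x)\n",
           (unsigned)mem_notify.cfg_type, (unsigned)mem_notify.bar,
           (unsigned)mem_notify.offset, (unsigned)mem_notify.length);
    printf("pio: type %u -> BAR %u [0x%x, +0x%x)\n",
           (unsigned)pio_notify.cfg_type, (unsigned)pio_notify.bar,
           (unsigned)pio_notify.offset, (unsigned)pio_notify.length);
    return 0;
}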
+static void virtio_pci_modern_io_region_unmap(VirtIOPCIProxy *proxy, + VirtIOPCIRegion *region) +{ + memory_region_del_subregion(&proxy->io_bar, + &region->mr); +} + /* This is called by virtio-bus just after the device is plugged. */ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) { @@ -1428,6 +1626,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) VirtioBusState *bus = &proxy->bus; bool legacy = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_LEGACY); bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; uint8_t *config; uint32_t size; VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus); @@ -1466,16 +1665,31 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) .cap.cap_len = sizeof cfg, .cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG, }; - struct virtio_pci_cfg_cap *cfg_mask; + struct virtio_pci_notify_cap notify_pio = { + .cap.cap_len = sizeof notify, + .notify_off_multiplier = cpu_to_le32(0x0), + }; - /* TODO: add io access for speed */ + struct virtio_pci_cfg_cap *cfg_mask; virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1); virtio_pci_modern_regions_init(proxy); - virtio_pci_modern_region_map(proxy, &proxy->common, &cap); - virtio_pci_modern_region_map(proxy, &proxy->isr, &cap); - virtio_pci_modern_region_map(proxy, &proxy->device, &cap); - virtio_pci_modern_region_map(proxy, &proxy->notify, &notify.cap); + + virtio_pci_modern_mem_region_map(proxy, &proxy->common, &cap); + virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap); + virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap); + virtio_pci_modern_mem_region_map(proxy, &proxy->notify, &notify.cap); + + if (modern_pio) { + memory_region_init(&proxy->io_bar, OBJECT(proxy), + "virtio-pci-io", 0x4); + + pci_register_bar(&proxy->pci_dev, proxy->modern_io_bar, + PCI_BASE_ADDRESS_SPACE_IO, &proxy->io_bar); + + virtio_pci_modern_io_region_map(proxy, &proxy->notify_pio, + &notify_pio.cap); + } pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar, PCI_BASE_ADDRESS_SPACE_MEMORY | @@ -1491,12 +1705,17 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) pci_set_long(cfg_mask->pci_cfg_data, ~0x0); } - if (proxy->nvectors && - msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors, - proxy->msix_bar)) { - error_report("unable to init msix vectors to %" PRIu32, - proxy->nvectors); - proxy->nvectors = 0; + if (proxy->nvectors) { + int err = msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors, + proxy->msix_bar); + if (err) { + /* Notice when a system that supports MSIx can't initialize it.
*/ + if (err != -ENOTSUP) { + error_report("unable to init msix vectors to %" PRIu32, + proxy->nvectors); + } + proxy->nvectors = 0; + } } proxy->pci_dev.config_write = virtio_write_config; @@ -1505,9 +1724,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp) if (legacy) { size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev) + virtio_bus_get_vdev_config_len(bus); - if (size & (size - 1)) { - size = 1 << qemu_fls(size); - } + size = pow2ceil(size); memory_region_init_io(&proxy->bar, OBJECT(proxy), &virtio_pci_config_ops, @@ -1528,14 +1745,18 @@ static void virtio_pci_device_unplugged(DeviceState *d) { VirtIOPCIProxy *proxy = VIRTIO_PCI(d); bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN); + bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY; virtio_pci_stop_ioeventfd(proxy); if (modern) { - virtio_pci_modern_region_unmap(proxy, &proxy->common); - virtio_pci_modern_region_unmap(proxy, &proxy->isr); - virtio_pci_modern_region_unmap(proxy, &proxy->device); - virtio_pci_modern_region_unmap(proxy, &proxy->notify); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->common); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->device); + virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify); + if (modern_pio) { + virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio); + } } } @@ -1555,6 +1776,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) */ proxy->legacy_io_bar = 0; proxy->msix_bar = 1; + proxy->modern_io_bar = 2; proxy->modern_mem_bar = 4; proxy->common.offset = 0x0; @@ -1574,6 +1796,10 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) QEMU_VIRTIO_PCI_QUEUE_MEM_MULT * VIRTIO_QUEUE_MAX; proxy->notify.type = VIRTIO_PCI_CAP_NOTIFY_CFG; + proxy->notify_pio.offset = 0x0; + proxy->notify_pio.size = 0x4; + proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG; + /* subclasses can enforce modern, so do this unconditionally */ memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci", 2 * QEMU_VIRTIO_PCI_QUEUE_MEM_MULT * @@ -1588,6 +1814,29 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp) address_space_init(&proxy->modern_as, &proxy->modern_cfg, "virtio-pci-cfg-as"); + if (pci_is_express(pci_dev) && pci_bus_is_express(pci_dev->bus) && + !pci_bus_is_root(pci_dev->bus)) { + int pos; + + pos = pcie_endpoint_cap_init(pci_dev, 0); + assert(pos > 0); + + pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF); + assert(pos > 0); + + /* + * Indicates that this function complies with revision 1.2 of the + * PCI Power Management Interface Specification. + */ + pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3); + } else { + /* + * make future invocations of pci_is_express() return false + * and pci_config_size() return PCI_CONFIG_SPACE_SIZE. 
+ */ + pci_dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS; + } + virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy); if (k->realize) { k->realize(proxy, errp); @@ -1606,9 +1855,15 @@ static void virtio_pci_reset(DeviceState *qdev) { VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev); VirtioBusState *bus = VIRTIO_BUS(&proxy->bus); + int i; + virtio_pci_stop_ioeventfd(proxy); virtio_bus_reset(bus); msix_unuse_all_vectors(&proxy->pci_dev); + + for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { + proxy->vqs[i].enabled = 0; + } } static Property virtio_pci_properties[] = { @@ -1618,13 +1873,34 @@ static Property virtio_pci_properties[] = { VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT, false), DEFINE_PROP_BIT("disable-modern", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT, true), + DEFINE_PROP_BIT("migrate-extra", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, true), + DEFINE_PROP_BIT("modern-pio-notify", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, false), + DEFINE_PROP_BIT("x-disable-pcie", VirtIOPCIProxy, flags, + VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, false), DEFINE_PROP_END_OF_LIST(), }; +static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp) +{ + VirtioPCIClass *vpciklass = VIRTIO_PCI_GET_CLASS(qdev); + VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev); + PCIDevice *pci_dev = &proxy->pci_dev; + + if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) && + !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN)) { + pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS; + } + + vpciklass->parent_dc_realize(qdev, errp); +} + static void virtio_pci_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); PCIDeviceClass *k = PCI_DEVICE_CLASS(klass); + VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass); dc->props = virtio_pci_properties; k->realize = virtio_pci_realize; @@ -1632,6 +1908,8 @@ static void virtio_pci_class_init(ObjectClass *klass, void *data) k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; k->revision = VIRTIO_PCI_ABI_VERSION; k->class_id = PCI_CLASS_OTHERS; + vpciklass->parent_dc_realize = dc->realize; + dc->realize = virtio_pci_dc_realize; dc->reset = virtio_pci_reset; } @@ -2009,10 +2287,6 @@ static const TypeInfo virtio_net_pci_info = { /* virtio-rng-pci */ -static Property virtio_rng_pci_properties[] = { - DEFINE_PROP_END_OF_LIST(), -}; - static void virtio_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) { VirtIORngPCI *vrng = VIRTIO_RNG_PCI(vpci_dev); @@ -2039,7 +2313,6 @@ static void virtio_rng_pci_class_init(ObjectClass *klass, void *data) k->realize = virtio_rng_pci_realize; set_bit(DEVICE_CATEGORY_MISC, dc->categories); - dc->props = virtio_rng_pci_properties; pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET; pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_RNG; @@ -2136,14 +2409,6 @@ static void virtio_tablet_initfn(Object *obj) TYPE_VIRTIO_TABLET); } -static void virtio_host_initfn(Object *obj) -{ - VirtIOInputHostPCI *dev = VIRTIO_INPUT_HOST_PCI(obj); - - virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), - TYPE_VIRTIO_INPUT_HOST); -} - static const TypeInfo virtio_input_pci_info = { .name = TYPE_VIRTIO_INPUT_PCI, .parent = TYPE_VIRTIO_PCI, @@ -2182,12 +2447,22 @@ static const TypeInfo virtio_tablet_pci_info = { .instance_init = virtio_tablet_initfn, }; +#ifdef CONFIG_LINUX +static void virtio_host_initfn(Object *obj) +{ + VirtIOInputHostPCI *dev = VIRTIO_INPUT_HOST_PCI(obj); + + virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev), + TYPE_VIRTIO_INPUT_HOST); +} + static const TypeInfo virtio_host_pci_info = { .name = 
TYPE_VIRTIO_INPUT_HOST_PCI, .parent = TYPE_VIRTIO_INPUT_PCI, .instance_size = sizeof(VirtIOInputHostPCI), .instance_init = virtio_host_initfn, }; +#endif /* virtio-pci-bus */ @@ -2211,6 +2486,9 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data) k->load_config = virtio_pci_load_config; k->save_queue = virtio_pci_save_queue; k->load_queue = virtio_pci_load_queue; + k->save_extra_state = virtio_pci_save_extra_state; + k->load_extra_state = virtio_pci_load_extra_state; + k->has_extra_state = virtio_pci_has_extra_state; k->query_guest_notifiers = virtio_pci_query_guest_notifiers; k->set_host_notifier = virtio_pci_set_host_notifier; k->set_guest_notifiers = virtio_pci_set_guest_notifiers; @@ -2235,7 +2513,9 @@ static void virtio_pci_register_types(void) type_register_static(&virtio_keyboard_pci_info); type_register_static(&virtio_mouse_pci_info); type_register_static(&virtio_tablet_pci_info); +#ifdef CONFIG_LINUX type_register_static(&virtio_host_pci_info); +#endif type_register_static(&virtio_pci_bus_info); type_register_static(&virtio_pci_info); #ifdef CONFIG_VIRTFS diff --git a/qemu/hw/virtio/virtio-pci.h b/qemu/hw/virtio/virtio-pci.h index b6c442f52..e4548c2f9 100644 --- a/qemu/hw/virtio/virtio-pci.h +++ b/qemu/hw/virtio/virtio-pci.h @@ -23,7 +23,6 @@ #include "hw/virtio/virtio-scsi.h" #include "hw/virtio/virtio-balloon.h" #include "hw/virtio/virtio-bus.h" -#include "hw/virtio/virtio-9p.h" #include "hw/virtio/virtio-input.h" #include "hw/virtio/virtio-gpu.h" #ifdef CONFIG_VIRTFS @@ -59,21 +58,35 @@ typedef struct VirtioBusClass VirtioPCIBusClass; #define VIRTIO_PCI_BUS_CLASS(klass) \ OBJECT_CLASS_CHECK(VirtioPCIBusClass, klass, TYPE_VIRTIO_PCI_BUS) +enum { + VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT, + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, + VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT, + VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT, + VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, + VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, + VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, +}; + /* Need to activate work-arounds for buggy guests at vmstate load. */ -#define VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT 0 #define VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION \ (1 << VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT) /* Performance improves when virtqueue kick processing is decoupled from the * vcpu thread using ioeventfd for some devices. */ -#define VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT 1 #define VIRTIO_PCI_FLAG_USE_IOEVENTFD (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT) /* virtio version flags */ -#define VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT 2 -#define VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT 3 #define VIRTIO_PCI_FLAG_DISABLE_LEGACY (1 << VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT) #define VIRTIO_PCI_FLAG_DISABLE_MODERN (1 << VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT) +#define VIRTIO_PCI_FLAG_DISABLE_PCIE (1 << VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT) + +/* migrate extra state */ +#define VIRTIO_PCI_FLAG_MIGRATE_EXTRA (1 << VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT) + +/* have pio notification for modern device ? 
*/ +#define VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY \ + (1 << VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT) typedef struct { MSIMessage msg; @@ -94,6 +107,7 @@ typedef struct { typedef struct VirtioPCIClass { PCIDeviceClass parent_class; + DeviceRealize parent_dc_realize; void (*realize)(VirtIOPCIProxy *vpci_dev, Error **errp); } VirtioPCIClass; @@ -104,6 +118,14 @@ typedef struct VirtIOPCIRegion { uint32_t type; } VirtIOPCIRegion; +typedef struct VirtIOPCIQueue { + uint16_t num; + bool enabled; + uint32_t desc[2]; + uint32_t avail[2]; + uint32_t used[2]; +} VirtIOPCIQueue; + struct VirtIOPCIProxy { PCIDevice pci_dev; MemoryRegion bar; @@ -111,11 +133,14 @@ struct VirtIOPCIProxy { VirtIOPCIRegion isr; VirtIOPCIRegion device; VirtIOPCIRegion notify; + VirtIOPCIRegion notify_pio; MemoryRegion modern_bar; + MemoryRegion io_bar; MemoryRegion modern_cfg; AddressSpace modern_as; uint32_t legacy_io_bar; uint32_t msix_bar; + uint32_t modern_io_bar; uint32_t modern_mem_bar; int config_cap; uint32_t flags; @@ -124,13 +149,7 @@ struct VirtIOPCIProxy { uint32_t dfselect; uint32_t gfselect; uint32_t guest_features[2]; - struct { - uint16_t num; - bool enabled; - uint32_t desc[2]; - uint32_t avail[2]; - uint32_t used[2]; - } vqs[VIRTIO_QUEUE_MAX]; + VirtIOPCIQueue vqs[VIRTIO_QUEUE_MAX]; bool ioeventfd_disabled; bool ioeventfd_started; @@ -226,7 +245,7 @@ struct VirtIONetPCI { typedef struct V9fsPCIState { VirtIOPCIProxy parent_obj; - V9fsState vdev; + V9fsVirtioState vdev; } V9fsPCIState; #endif @@ -267,6 +286,8 @@ struct VirtIOInputHIDPCI { VirtIOInputHID vdev; }; +#ifdef CONFIG_LINUX + #define TYPE_VIRTIO_INPUT_HOST_PCI "virtio-input-host-pci" #define VIRTIO_INPUT_HOST_PCI(obj) \ OBJECT_CHECK(VirtIOInputHostPCI, (obj), TYPE_VIRTIO_INPUT_HOST_PCI) @@ -276,6 +297,8 @@ struct VirtIOInputHostPCI { VirtIOInputHost vdev; }; +#endif + /* * virtio-gpu-pci: This extends VirtioPCIProxy. */ diff --git a/qemu/hw/virtio/virtio-rng.c b/qemu/hw/virtio/virtio-rng.c index 97d154191..6b991a764 100644 --- a/qemu/hw/virtio/virtio-rng.c +++ b/qemu/hw/virtio/virtio-rng.c @@ -9,6 +9,8 @@ * top-level directory. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu/iov.h" #include "hw/qdev.h" #include "hw/virtio/virtio.h" @@ -43,7 +45,7 @@ static void chr_read(void *opaque, const void *buf, size_t size) { VirtIORNG *vrng = opaque; VirtIODevice *vdev = VIRTIO_DEVICE(vrng); - VirtQueueElement elem; + VirtQueueElement *elem; size_t len; int offset; @@ -55,17 +57,26 @@ static void chr_read(void *opaque, const void *buf, size_t size) offset = 0; while (offset < size) { - if (!virtqueue_pop(vrng->vq, &elem)) { + elem = virtqueue_pop(vrng->vq, sizeof(VirtQueueElement)); + if (!elem) { break; } - len = iov_from_buf(elem.in_sg, elem.in_num, + len = iov_from_buf(elem->in_sg, elem->in_num, 0, buf + offset, size - offset); offset += len; - virtqueue_push(vrng->vq, &elem, len); + virtqueue_push(vrng->vq, elem, len); trace_virtio_rng_pushed(vrng, len); + g_free(elem); } virtio_notify(vdev, vrng->vq); + + if (!virtio_queue_empty(vrng->vq)) { + /* If we didn't drain the queue, call virtio_rng_process + * to take care of asking for more data as appropriate. 
+ */ + virtio_rng_process(vrng); + } } static void virtio_rng_process(VirtIORNG *vrng) diff --git a/qemu/hw/virtio/virtio.c b/qemu/hw/virtio/virtio.c index 788b556a7..30ede3d1c 100644 --- a/qemu/hw/virtio/virtio.c +++ b/qemu/hw/virtio/virtio.c @@ -11,8 +11,10 @@ * */ -#include <inttypes.h> - +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "cpu.h" #include "trace.h" #include "exec/address-spaces.h" #include "qemu/error-report.h" @@ -60,6 +62,7 @@ typedef struct VRingUsed typedef struct VRing { unsigned int num; + unsigned int num_default; unsigned int align; hwaddr desc; hwaddr avail; @@ -69,7 +72,15 @@ typedef struct VRing struct VirtQueue { VRing vring; + + /* Next head to pop */ uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + /* Last used index value we have signalled on */ uint16_t signalled_used; @@ -85,6 +96,7 @@ struct VirtQueue uint16_t vector; void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); + void (*handle_aio_output)(VirtIODevice *vdev, VirtQueue *vq); VirtIODevice *vdev; EventNotifier guest_notifier; EventNotifier host_notifier; @@ -106,35 +118,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) vring->align); } -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa, - int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr); - return virtio_ldq_phys(vdev, pa); -} - -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i) +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, + hwaddr desc_pa, int i) { - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len); - return virtio_ldl_phys(vdev, pa); -} - -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa, - int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags); - return virtio_lduw_phys(vdev, pa); -} - -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa, - int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next); - return virtio_lduw_phys(vdev, pa); + address_space_read(&address_space_memory, desc_pa + i * sizeof(VRingDesc), + MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc)); + virtio_tswap64s(vdev, &desc->addr); + virtio_tswap32s(vdev, &desc->len); + virtio_tswap16s(vdev, &desc->flags); + virtio_tswap16s(vdev, &desc->next); } static inline uint16_t vring_avail_flags(VirtQueue *vq) @@ -148,7 +140,8 @@ static inline uint16_t vring_avail_idx(VirtQueue *vq) { hwaddr pa; pa = vq->vring.avail + offsetof(VRingAvail, idx); - return virtio_lduw_phys(vq->vdev, pa); + vq->shadow_avail_idx = virtio_lduw_phys(vq->vdev, pa); + return vq->shadow_avail_idx; } static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) @@ -163,18 +156,15 @@ static inline uint16_t vring_get_used_event(VirtQueue *vq) return vring_avail_ring(vq, vq->vring.num); } -static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, ring[i].id); - virtio_stl_phys(vq->vdev, pa, val); -} - -static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val) +static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem, + int i) { hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, ring[i].len); - virtio_stl_phys(vq->vdev, pa, val); + virtio_tswap32s(vq->vdev, &uelem->id); + virtio_tswap32s(vq->vdev, &uelem->len); + pa = 
vq->vring.used + offsetof(VRingUsed, ring[i]); + address_space_write(&address_space_memory, pa, MEMTXATTRS_UNSPECIFIED, + (void *)uelem, sizeof(VRingUsedElem)); } static uint16_t vring_used_idx(VirtQueue *vq) @@ -189,6 +179,7 @@ static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, idx); virtio_stw_phys(vq->vdev, pa, val); + vq->used_idx = val; } static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) @@ -220,7 +211,7 @@ static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) void virtio_queue_set_notification(VirtQueue *vq, int enable) { vq->notification = enable; - if (virtio_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vring_avail_idx(vq)); } else if (enable) { vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); @@ -238,19 +229,23 @@ int virtio_queue_ready(VirtQueue *vq) return vq->vring.avail != 0; } +/* Fetch avail_idx from VQ memory only when we really need to know if + * guest has added some buffers. */ int virtio_queue_empty(VirtQueue *vq) { + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + return vring_avail_idx(vq) == vq->last_avail_idx; } -void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, - unsigned int len, unsigned int idx) +static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) { unsigned int offset; int i; - trace_virtqueue_fill(vq, elem, len, idx); - offset = 0; for (i = 0; i < elem->in_num; i++) { size_t size = MIN(len - offset, elem->in_sg[i].iov_len); @@ -266,12 +261,29 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, cpu_physical_memory_unmap(elem->out_sg[i].iov_base, elem->out_sg[i].iov_len, 0, elem->out_sg[i].iov_len); +} - idx = (idx + vring_used_idx(vq)) % vq->vring.num; +void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + vq->last_avail_idx--; + virtqueue_unmap_sg(vq, elem, len); +} - /* Get a pointer to the next entry in the used ring. */ - vring_used_ring_id(vq, idx, elem->index); - vring_used_ring_len(vq, idx, len); +void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx) +{ + VRingUsedElem uelem; + + trace_virtqueue_fill(vq, elem, len, idx); + + virtqueue_unmap_sg(vq, elem, len); + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = elem->index; + uelem.len = len; + vring_used_write(vq, &uelem, idx); } void virtqueue_flush(VirtQueue *vq, unsigned int count) @@ -280,7 +292,7 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) /* Make sure buffer is written before we update index. */ smp_wmb(); trace_virtqueue_flush(vq, count); - old = vring_used_idx(vq); + old = vq->used_idx; new = old + count; vring_used_idx_set(vq, new); vq->inuse -= count; @@ -302,7 +314,7 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) /* Check it isn't doing very strange things with descriptor numbers. */ if (num_heads > vq->vring.num) { error_report("Guest moved used index from %u to %u", - idx, vring_avail_idx(vq)); + idx, vq->shadow_avail_idx); exit(1); } /* On success, callers read a descriptor at vq->last_avail_idx. 
@@ -331,18 +343,18 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx) return head; } -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa, - unsigned int i, unsigned int max) +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, + hwaddr desc_pa, unsigned int max) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ - if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) { + if (!(desc->flags & VRING_DESC_F_NEXT)) { return max; } /* Check they're not leading us off end of descriptors. */ - next = vring_desc_next(vdev, desc_pa, i); + next = desc->next; /* Make sure compiler knows to grab that: we don't want it changing! */ smp_wmb(); @@ -351,6 +363,7 @@ static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa, exit(1); } + vring_desc_read(vdev, desc, desc_pa, next); return next; } @@ -367,6 +380,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, while (virtqueue_num_heads(vq, idx)) { VirtIODevice *vdev = vq->vdev; unsigned int max, num_bufs, indirect = 0; + VRingDesc desc; hwaddr desc_pa; int i; @@ -374,9 +388,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, num_bufs = total_bufs; i = virtqueue_get_head(vq, idx++); desc_pa = vq->vring.desc; + vring_desc_read(vdev, &desc, desc_pa, i); - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) { - if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) { + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (desc.len % sizeof(VRingDesc)) { error_report("Invalid size for indirect buffer table"); exit(1); } @@ -389,9 +404,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, /* loop over the indirect descriptor table */ indirect = 1; - max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc); - desc_pa = vring_desc_addr(vdev, desc_pa, i); + max = desc.len / sizeof(VRingDesc); + desc_pa = desc.addr; num_bufs = i = 0; + vring_desc_read(vdev, &desc, desc_pa, i); } do { @@ -401,15 +417,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, exit(1); } - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) { - in_total += vring_desc_len(vdev, desc_pa, i); + if (desc.flags & VRING_DESC_F_WRITE) { + in_total += desc.len; } else { - out_total += vring_desc_len(vdev, desc_pa, i); + out_total += desc.len; } if (in_total >= max_in_bytes && out_total >= max_out_bytes) { goto done; } - } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max); + } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max); if (!indirect) total_bufs = num_bufs; @@ -434,98 +450,256 @@ int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, return in_bytes <= in_total && out_bytes <= out_total; } -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write) +static void virtqueue_map_desc(unsigned int *p_num_sg, hwaddr *addr, struct iovec *iov, + unsigned int max_num_sg, bool is_write, + hwaddr pa, size_t sz) +{ + unsigned num_sg = *p_num_sg; + assert(num_sg <= max_num_sg); + + while (sz) { + hwaddr len = sz; + + if (num_sg == max_num_sg) { + error_report("virtio: too many write descriptors in indirect table"); + exit(1); + } + + iov[num_sg].iov_base = cpu_physical_memory_map(pa, &len, is_write); + iov[num_sg].iov_len = len; + addr[num_sg] = pa; + + sz -= len; + pa += len; + num_sg++; + } + *p_num_sg = num_sg; +} + +static void virtqueue_map_iovec(struct iovec *sg, hwaddr *addr, + unsigned int *num_sg, 
unsigned int max_size, + int is_write) { unsigned int i; hwaddr len; - if (num_sg > VIRTQUEUE_MAX_SIZE) { - error_report("virtio: map attempt out of bounds: %zd > %d", - num_sg, VIRTQUEUE_MAX_SIZE); - exit(1); - } + /* Note: this function MUST validate input, some callers + * are passing in num_sg values received over the network. + */ + /* TODO: teach all callers that this can fail, and return failure instead + * of asserting here. + * When we do, we might be able to re-enable NDEBUG below. + */ +#ifdef NDEBUG +#error building with NDEBUG is not supported +#endif + assert(*num_sg <= max_size); - for (i = 0; i < num_sg; i++) { + for (i = 0; i < *num_sg; i++) { len = sg[i].iov_len; sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write); - if (sg[i].iov_base == NULL || len != sg[i].iov_len) { + if (!sg[i].iov_base) { error_report("virtio: error trying to map MMIO memory"); exit(1); } + if (len != sg[i].iov_len) { + error_report("virtio: unexpected memory split"); + exit(1); + } } } -int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) +void virtqueue_map(VirtQueueElement *elem) +{ + virtqueue_map_iovec(elem->in_sg, elem->in_addr, &elem->in_num, + VIRTQUEUE_MAX_SIZE, 1); + virtqueue_map_iovec(elem->out_sg, elem->out_addr, &elem->out_num, + VIRTQUEUE_MAX_SIZE, 0); +} + +void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num) +{ + VirtQueueElement *elem; + size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0])); + size_t out_addr_ofs = in_addr_ofs + in_num * sizeof(elem->in_addr[0]); + size_t out_addr_end = out_addr_ofs + out_num * sizeof(elem->out_addr[0]); + size_t in_sg_ofs = QEMU_ALIGN_UP(out_addr_end, __alignof__(elem->in_sg[0])); + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); + + assert(sz >= sizeof(VirtQueueElement)); + elem = g_malloc(out_sg_end); + elem->out_num = out_num; + elem->in_num = in_num; + elem->in_addr = (void *)elem + in_addr_ofs; + elem->out_addr = (void *)elem + out_addr_ofs; + elem->in_sg = (void *)elem + in_sg_ofs; + elem->out_sg = (void *)elem + out_sg_ofs; + return elem; +} + +void *virtqueue_pop(VirtQueue *vq, size_t sz) { unsigned int i, head, max; hwaddr desc_pa = vq->vring.desc; VirtIODevice *vdev = vq->vdev; + VirtQueueElement *elem; + unsigned out_num, in_num; + hwaddr addr[VIRTQUEUE_MAX_SIZE]; + struct iovec iov[VIRTQUEUE_MAX_SIZE]; + VRingDesc desc; - if (!virtqueue_num_heads(vq, vq->last_avail_idx)) - return 0; + if (virtio_queue_empty(vq)) { + return NULL; + } + /* Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). */ + smp_rmb(); /* When we start there are none of either input nor output. 
*/ - elem->out_num = elem->in_num = 0; + out_num = in_num = 0; max = vq->vring.num; i = head = virtqueue_get_head(vq, vq->last_avail_idx++); - if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { + if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vq->last_avail_idx); } - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) { - if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) { + vring_desc_read(vdev, &desc, desc_pa, i); + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (desc.len % sizeof(VRingDesc)) { error_report("Invalid size for indirect buffer table"); exit(1); } /* loop over the indirect descriptor table */ - max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc); - desc_pa = vring_desc_addr(vdev, desc_pa, i); + max = desc.len / sizeof(VRingDesc); + desc_pa = desc.addr; i = 0; + vring_desc_read(vdev, &desc, desc_pa, i); } /* Collect all the descriptors */ do { - struct iovec *sg; - - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) { - if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) { - error_report("Too many write descriptors in indirect table"); - exit(1); - } - elem->in_addr[elem->in_num] = vring_desc_addr(vdev, desc_pa, i); - sg = &elem->in_sg[elem->in_num++]; + if (desc.flags & VRING_DESC_F_WRITE) { + virtqueue_map_desc(&in_num, addr + out_num, iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, desc.addr, desc.len); } else { - if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) { - error_report("Too many read descriptors in indirect table"); + if (in_num) { + error_report("Incorrect order for descriptors"); exit(1); } - elem->out_addr[elem->out_num] = vring_desc_addr(vdev, desc_pa, i); - sg = &elem->out_sg[elem->out_num++]; + virtqueue_map_desc(&out_num, addr, iov, + VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len); } - sg->iov_len = vring_desc_len(vdev, desc_pa, i); - /* If we've got too many, that implies a descriptor loop. */ - if ((elem->in_num + elem->out_num) > max) { + if ((in_num + out_num) > max) { error_report("Looped descriptor"); exit(1); } - } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max); - - /* Now map what we have collected */ - virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1); - virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0); + } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max); + /* Now copy what we have collected and mapped */ + elem = virtqueue_alloc_element(sz, out_num, in_num); elem->index = head; + for (i = 0; i < out_num; i++) { + elem->out_addr[i] = addr[i]; + elem->out_sg[i] = iov[i]; + } + for (i = 0; i < in_num; i++) { + elem->in_addr[i] = addr[out_num + i]; + elem->in_sg[i] = iov[out_num + i]; + } vq->inuse++; trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); - return elem->in_num + elem->out_num; + return elem; +} + +/* Reading and writing a structure directly to QEMUFile is *awful*, but + * it is what QEMU has always done by mistake. We can change it sooner + * or later by bumping the version number of the affected vm states. + * In the meanwhile, since the in-memory layout of VirtQueueElement + * has changed, we need to marshal to and from the layout that was + * used before the change. 
+ */ +typedef struct VirtQueueElementOld { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + hwaddr in_addr[VIRTQUEUE_MAX_SIZE]; + hwaddr out_addr[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; +} VirtQueueElementOld; + +void *qemu_get_virtqueue_element(QEMUFile *f, size_t sz) +{ + VirtQueueElement *elem; + VirtQueueElementOld data; + int i; + + qemu_get_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld)); + + elem = virtqueue_alloc_element(sz, data.out_num, data.in_num); + elem->index = data.index; + + for (i = 0; i < elem->in_num; i++) { + elem->in_addr[i] = data.in_addr[i]; + } + + for (i = 0; i < elem->out_num; i++) { + elem->out_addr[i] = data.out_addr[i]; + } + + for (i = 0; i < elem->in_num; i++) { + /* Base is overwritten by virtqueue_map. */ + elem->in_sg[i].iov_base = 0; + elem->in_sg[i].iov_len = data.in_sg[i].iov_len; + } + + for (i = 0; i < elem->out_num; i++) { + /* Base is overwritten by virtqueue_map. */ + elem->out_sg[i].iov_base = 0; + elem->out_sg[i].iov_len = data.out_sg[i].iov_len; + } + + virtqueue_map(elem); + return elem; +} + +void qemu_put_virtqueue_element(QEMUFile *f, VirtQueueElement *elem) +{ + VirtQueueElementOld data; + int i; + + memset(&data, 0, sizeof(data)); + data.index = elem->index; + data.in_num = elem->in_num; + data.out_num = elem->out_num; + + for (i = 0; i < elem->in_num; i++) { + data.in_addr[i] = elem->in_addr[i]; + } + + for (i = 0; i < elem->out_num; i++) { + data.out_addr[i] = elem->out_addr[i]; + } + + for (i = 0; i < elem->in_num; i++) { + /* Base is overwritten by virtqueue_map when loading. Do not + * save it, as it would leak the QEMU address space layout. */ + data.in_sg[i].iov_len = elem->in_sg[i].iov_len; + } + + for (i = 0; i < elem->out_num; i++) { + /* Do not save iov_base as above. 
  */
+        data.out_sg[i].iov_len = elem->out_sg[i].iov_len;
+    }
+    qemu_put_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
 }
 
 /* virtio device */
@@ -560,7 +734,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val)
     VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
     trace_virtio_set_status(vdev, val);
 
-    if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
         if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) &&
             val & VIRTIO_CONFIG_S_FEATURES_OK) {
             int ret = virtio_validate_features(vdev);
@@ -629,10 +803,13 @@ void virtio_reset(void *opaque)
         vdev->vq[i].vring.avail = 0;
         vdev->vq[i].vring.used = 0;
         vdev->vq[i].last_avail_idx = 0;
+        vdev->vq[i].shadow_avail_idx = 0;
+        vdev->vq[i].used_idx = 0;
         virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
         vdev->vq[i].signalled_used = 0;
         vdev->vq[i].signalled_used_valid = false;
         vdev->vq[i].notification = true;
+        vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
     }
 }
 
@@ -898,7 +1075,7 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 
     /* virtio-1 compliant devices cannot change the alignment */
-    if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
         error_report("tried to modify queue alignment for virtio-1 device");
         return;
     }
@@ -912,7 +1089,17 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
     virtio_queue_update_rings(vdev, n);
 }
 
-void virtio_queue_notify_vq(VirtQueue *vq)
+static void virtio_queue_notify_aio_vq(VirtQueue *vq)
+{
+    if (vq->vring.desc && vq->handle_aio_output) {
+        VirtIODevice *vdev = vq->vdev;
+
+        trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+        vq->handle_aio_output(vdev, vq);
+    }
+}
+
+static void virtio_queue_notify_vq(VirtQueue *vq)
 {
     if (vq->vring.desc && vq->handle_output) {
         VirtIODevice *vdev = vq->vdev;
@@ -964,8 +1151,10 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
         abort();
 
     vdev->vq[i].vring.num = queue_size;
+    vdev->vq[i].vring.num_default = queue_size;
     vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
     vdev->vq[i].handle_output = handle_output;
+    vdev->vq[i].handle_aio_output = NULL;
 
     return &vdev->vq[i];
 }
@@ -977,6 +1166,7 @@ void virtio_del_queue(VirtIODevice *vdev, int n)
     }
 
     vdev->vq[n].vring.num = 0;
+    vdev->vq[n].vring.num_default = 0;
 }
 
 void virtio_irq(VirtQueue *vq)
@@ -986,32 +1176,32 @@ void virtio_irq(VirtQueue *vq)
     virtio_notify_vector(vq->vdev, vq->vector);
 }
 
-static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq)
+bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
     uint16_t old, new;
     bool v;
     /* We need to expose used array entries before checking used event. */
     smp_mb();
     /* Always notify when queue is empty (when feature acknowledge) */
-    if (virtio_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
-        !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx) {
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+        !vq->inuse && virtio_queue_empty(vq)) {
         return true;
     }
 
-    if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+    if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
         return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
     }
 
     v = vq->signalled_used_valid;
     vq->signalled_used_valid = true;
     old = vq->signalled_used;
-    new = vq->signalled_used = vring_used_idx(vq);
+    new = vq->signalled_used = vq->used_idx;
     return !v || vring_need_event(vring_get_used_event(vq), new, old);
 }
 
 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
-    if (!vring_notify(vdev, vq)) {
+    if (!virtio_should_notify(vdev, vq)) {
         return;
     }
@@ -1035,7 +1225,7 @@ static bool virtio_device_endian_needed(void *opaque)
     VirtIODevice *vdev = opaque;
 
     assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
-    if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+    if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
         return vdev->device_endian != virtio_default_endian();
     }
     /* Devices conforming to VIRTIO 1.0 or later are always LE. */
@@ -1056,33 +1246,38 @@ static bool virtio_virtqueue_needed(void *opaque)
     return virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1);
 }
 
-static void put_virtqueue_state(QEMUFile *f, void *pv, size_t size)
+static bool virtio_ringsize_needed(void *opaque)
 {
-    VirtIODevice *vdev = pv;
+    VirtIODevice *vdev = opaque;
     int i;
 
     for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
-        qemu_put_be64(f, vdev->vq[i].vring.avail);
-        qemu_put_be64(f, vdev->vq[i].vring.used);
+        if (vdev->vq[i].vring.num != vdev->vq[i].vring.num_default) {
+            return true;
+        }
     }
+    return false;
 }
 
-static int get_virtqueue_state(QEMUFile *f, void *pv, size_t size)
+static bool virtio_extra_state_needed(void *opaque)
 {
-    VirtIODevice *vdev = pv;
-    int i;
+    VirtIODevice *vdev = opaque;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 
-    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
-        vdev->vq[i].vring.avail = qemu_get_be64(f);
-        vdev->vq[i].vring.used = qemu_get_be64(f);
-    }
-    return 0;
+    return k->has_extra_state &&
+           k->has_extra_state(qbus->parent);
 }
 
-static VMStateInfo vmstate_info_virtqueue = {
+static const VMStateDescription vmstate_virtqueue = {
     .name = "virtqueue_state",
-    .get = get_virtqueue_state,
-    .put = put_virtqueue_state,
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(vring.avail, struct VirtQueue),
+        VMSTATE_UINT64(vring.used, struct VirtQueue),
+        VMSTATE_END_OF_LIST()
+    }
 };
 
 static const VMStateDescription vmstate_virtio_virtqueues = {
@@ -1091,12 +1286,74 @@ static const VMStateDescription vmstate_virtio_virtqueues = {
     .minimum_version_id = 1,
     .needed = &virtio_virtqueue_needed,
     .fields = (VMStateField[]) {
+        VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+                      VIRTIO_QUEUE_MAX, 0, vmstate_virtqueue, VirtQueue),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_ringsize = {
+    .name = "ringsize_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(vring.num_default, struct VirtQueue),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_virtio_ringsize = {
+    .name = "virtio/ringsize",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_ringsize_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+                      VIRTIO_QUEUE_MAX, 0, vmstate_ringsize, VirtQueue),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static int get_extra_state(QEMUFile *f, void *pv, size_t size)
+{
+    VirtIODevice *vdev = pv;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    if (!k->load_extra_state) {
+        return -1;
+    } else {
+        return k->load_extra_state(qbus->parent, f);
+    }
+}
+
+static void put_extra_state(QEMUFile *f, void *pv, size_t size)
+{
+    VirtIODevice *vdev = pv;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    k->save_extra_state(qbus->parent, f);
+}
+
+static const VMStateInfo vmstate_info_extra_state = {
+    .name = "virtqueue_extra_state",
+    .get = get_extra_state,
+    .put = put_extra_state,
+};
+
+static const VMStateDescription vmstate_virtio_extra_state = {
+    .name = "virtio/extra_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_extra_state_needed,
+    .fields = (VMStateField[]) {
         {
-            .name = "virtqueues",
+            .name = "extra_state",
             .version_id = 0,
             .field_exists = NULL,
            .size = 0,
-            .info = &vmstate_info_virtqueue,
+            .info = &vmstate_info_extra_state,
             .flags = VMS_SINGLE,
             .offset = 0,
         },
@@ -1138,6 +1395,8 @@ static const VMStateDescription vmstate_virtio = {
         &vmstate_virtio_device_endian,
         &vmstate_virtio_64bit_features,
         &vmstate_virtio_virtqueues,
+        &vmstate_virtio_ringsize,
+        &vmstate_virtio_extra_state,
         NULL
     }
 };
@@ -1264,7 +1523,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
     num = qemu_get_be32(f);
 
     if (num > VIRTIO_QUEUE_MAX) {
-        error_report("Invalid number of PCI queues: 0x%x", num);
+        error_report("Invalid number of virtqueues: 0x%x", num);
         return -1;
     }
@@ -1348,6 +1607,8 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
                              vdev->vq[i].last_avail_idx, nheads);
                 return -1;
             }
+            vdev->vq[i].used_idx = vring_used_idx(&vdev->vq[i]);
+            vdev->vq[i].shadow_avail_idx = vring_avail_idx(&vdev->vq[i]);
         }
     }
@@ -1430,6 +1691,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
     vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
                                                      vdev);
     vdev->device_endian = virtio_default_endian();
+    vdev->use_guest_notifier_mask = true;
 }
 
 hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n)
@@ -1460,7 +1722,7 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n)
 hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n)
 {
     return offsetof(VRingAvail, ring) +
-           sizeof(uint64_t) * vdev->vq[n].vring.num;
+           sizeof(uint16_t) * vdev->vq[n].vring.num;
 }
 
 hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n)
@@ -1483,6 +1745,7 @@ uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
 void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx)
 {
     vdev->vq[n].last_avail_idx = idx;
+    vdev->vq[n].shadow_avail_idx = idx;
 }
 
 void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n)
@@ -1512,10 +1775,10 @@ void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign,
                                                 bool with_irqfd)
 {
     if (assign && !with_irqfd) {
-        event_notifier_set_handler(&vq->guest_notifier,
+        event_notifier_set_handler(&vq->guest_notifier, false,
                                    virtio_queue_guest_notifier_read);
     } else {
-        event_notifier_set_handler(&vq->guest_notifier, NULL);
+        event_notifier_set_handler(&vq->guest_notifier, false, NULL);
     }
     if (!assign) {
         /* Test and clear notifier before closing it,
@@ -1529,6 +1792,31 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq)
     return &vq->guest_notifier;
 }
 
+static void virtio_queue_host_notifier_aio_read(EventNotifier *n)
+{
+    VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+    if (event_notifier_test_and_clear(n)) {
+        virtio_queue_notify_aio_vq(vq);
+    }
+}
+
+void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
+                                                void (*handle_output)(VirtIODevice *,
+                                                                      VirtQueue *))
+{
+    if (handle_output) {
+        vq->handle_aio_output = handle_output;
+        aio_set_event_notifier(ctx, &vq->host_notifier, true,
+                               virtio_queue_host_notifier_aio_read);
+    } else {
+        aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL);
+        /* Test and clear notifier before after disabling event,
+         * in case poll callback didn't have time to run. */
+        virtio_queue_host_notifier_aio_read(&vq->host_notifier);
+        vq->handle_aio_output = NULL;
+    }
+}
+
 static void virtio_queue_host_notifier_read(EventNotifier *n)
 {
     VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
@@ -1541,10 +1829,10 @@ void virtio_queue_set_host_notifier_fd_handler(VirtQueue *vq, bool assign,
                                                bool set_handler)
 {
     if (assign && set_handler) {
-        event_notifier_set_handler(&vq->host_notifier,
+        event_notifier_set_handler(&vq->host_notifier, true,
                                    virtio_queue_host_notifier_read);
    } else {
-        event_notifier_set_handler(&vq->host_notifier, NULL);
+        event_notifier_set_handler(&vq->host_notifier, true, NULL);
    }
     if (!assign) {
         /* Test and clear notifier before after disabling event,
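
A note on the virtio_should_notify() hunk above: with VIRTIO_RING_F_EVENT_IDX negotiated, the guest suppresses interrupts by publishing a used_event index, and the device signals only when its used index crosses that value. The vring_need_event() helper called there is the standard test from the virtio spec (the same definition appears in Linux's virtio_ring.h); shown here for reference:

#include <stdint.h>

/* Standard event-idx test (virtio spec): signal the guest only when
 * event_idx lies in the window (old, new_idx], computed modulo 2^16,
 * i.e. the used index has just moved past the guest's used_event. */
static inline int vring_need_event(uint16_t event_idx,
                                   uint16_t new_idx, uint16_t old)
{
    return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
}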
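
The new vmstate_virtio_virtqueues, vmstate_virtio_ringsize and vmstate_virtio_extra_state sections all use the vmstate subsection mechanism: a subsection is written to the migration stream only when its .needed callback returns true, so streams stay compatible with older QEMUs whenever the ring sizes are still at their defaults and the transport reports no extra state. Reduced to its essentials, the pattern looks like the following sketch (MyDevice, value and MY_DEFAULT are illustrative names, not part of this patch):

#include "migration/vmstate.h"   /* QEMU tree: VMStateDescription etc. */

typedef struct MyDevice {
    uint32_t value;
} MyDevice;

#define MY_DEFAULT 0

static bool my_feature_needed(void *opaque)
{
    MyDevice *s = opaque;

    /* Omit the subsection entirely while the state is at its default,
     * so older destinations that lack it can still accept the stream. */
    return s->value != MY_DEFAULT;
}

static const VMStateDescription vmstate_my_feature = {
    .name = "mydev/my_feature",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = &my_feature_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(value, MyDevice),
        VMSTATE_END_OF_LIST()
    }
};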
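
The virtio_queue_get_avail_size() change is a plain bug fix: each available-ring slot holds a 16-bit descriptor index, so sizing the ring with sizeof(uint64_t) overstated the guest-memory mapping by six bytes per entry. The layout the corrected arithmetic assumes, mirroring the spec's split available ring (sketched here only for illustration):

#include <stddef.h>
#include <stdint.h>

/* Split-ring available ring as it sits in guest memory (virtio spec):
 * two 16-bit header fields followed by one 16-bit index per entry. */
typedef struct VRingAvail {
    uint16_t flags;
    uint16_t idx;
    uint16_t ring[];
} VRingAvail;

/* The corrected size computation, matching the hunk above. */
static inline size_t avail_ring_size(unsigned int num)
{
    return offsetof(VRingAvail, ring) + sizeof(uint16_t) * num;
}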
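
Finally, virtio_queue_aio_set_host_notifier_handler() is the replacement for the hw/virtio/dataplane/vring.c path deleted earlier in this diff: rather than maintaining a parallel vring implementation, a dataplane device now reroutes its ioeventfd into an IOThread's AioContext and reuses the regular virtqueue code. A minimal sketch of the intended call pattern, assuming the host notifier has already been assigned by the transport (handle_vq and the start/stop helpers are illustrative names, not part of this patch):

#include "hw/virtio/virtio.h"   /* QEMU tree: declares the new helper */

/* Runs in the IOThread's AioContext, not the QEMU main loop. */
static void handle_vq(VirtIODevice *vdev, VirtQueue *vq)
{
    /* ... pop elements and submit I/O ... */
}

static void my_dataplane_start(VirtQueue *vq, AioContext *ctx)
{
    /* Attach: queue kicks are now serviced as events of 'ctx'. */
    virtio_queue_aio_set_host_notifier_handler(vq, ctx, handle_vq);
}

static void my_dataplane_stop(VirtQueue *vq, AioContext *ctx)
{
    /* Detach: passing NULL also drains a kick that raced with teardown. */
    virtio_queue_aio_set_host_notifier_handler(vq, ctx, NULL);
}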