Diffstat (limited to 'qemu/hw/virtio')
-rw-r--r--  qemu/hw/virtio/Makefile.objs              1
-rw-r--r--  qemu/hw/virtio/dataplane/Makefile.objs    1
-rw-r--r--  qemu/hw/virtio/dataplane/vring.c        453
-rw-r--r--  qemu/hw/virtio/vhost-backend.c          147
-rw-r--r--  qemu/hw/virtio/vhost-user.c             624
-rw-r--r--  qemu/hw/virtio/vhost.c                  213
-rw-r--r--  qemu/hw/virtio/virtio-balloon.c         124
-rw-r--r--  qemu/hw/virtio/virtio-bus.c               4
-rw-r--r--  qemu/hw/virtio/virtio-mmio.c              1
-rw-r--r--  qemu/hw/virtio/virtio-pci.c             398
-rw-r--r--  qemu/hw/virtio/virtio-pci.h              49
-rw-r--r--  qemu/hw/virtio/virtio-rng.c              19
-rw-r--r--  qemu/hw/virtio/virtio.c                 562
13 files changed, 1692 insertions, 904 deletions
diff --git a/qemu/hw/virtio/Makefile.objs b/qemu/hw/virtio/Makefile.objs
index 19b224a44..3e2b175da 100644
--- a/qemu/hw/virtio/Makefile.objs
+++ b/qemu/hw/virtio/Makefile.objs
@@ -2,7 +2,6 @@ common-obj-y += virtio-rng.o
common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
common-obj-y += virtio-bus.o
common-obj-y += virtio-mmio.o
-obj-$(CONFIG_VIRTIO) += dataplane/
obj-y += virtio.o virtio-balloon.o
obj-$(CONFIG_LINUX) += vhost.o vhost-backend.o vhost-user.o
diff --git a/qemu/hw/virtio/dataplane/Makefile.objs b/qemu/hw/virtio/dataplane/Makefile.objs
deleted file mode 100644
index 753a9cab4..000000000
--- a/qemu/hw/virtio/dataplane/Makefile.objs
+++ /dev/null
@@ -1 +0,0 @@
-obj-y += vring.o
diff --git a/qemu/hw/virtio/dataplane/vring.c b/qemu/hw/virtio/dataplane/vring.c
deleted file mode 100644
index 07fd69c69..000000000
--- a/qemu/hw/virtio/dataplane/vring.c
+++ /dev/null
@@ -1,453 +0,0 @@
-/* Copyright 2012 Red Hat, Inc.
- * Copyright IBM, Corp. 2012
- *
- * Based on Linux 2.6.39 vhost code:
- * Copyright (C) 2009 Red Hat, Inc.
- * Copyright (C) 2006 Rusty Russell IBM Corporation
- *
- * Author: Michael S. Tsirkin <mst@redhat.com>
- * Stefan Hajnoczi <stefanha@redhat.com>
- *
- * Inspiration, some code, and most witty comments come from
- * Documentation/virtual/lguest/lguest.c, by Rusty Russell
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
- */
-
-#include "trace.h"
-#include "hw/hw.h"
-#include "exec/memory.h"
-#include "exec/address-spaces.h"
-#include "hw/virtio/virtio-access.h"
-#include "hw/virtio/dataplane/vring.h"
-#include "hw/virtio/dataplane/vring-accessors.h"
-#include "qemu/error-report.h"
-
-/* vring_map can be coupled with vring_unmap or (if you still have the
- * value returned in *mr) memory_region_unref.
- */
-static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
- bool is_write)
-{
- MemoryRegionSection section = memory_region_find(get_system_memory(), phys, len);
-
- if (!section.mr || int128_get64(section.size) < len) {
- goto out;
- }
- if (is_write && section.readonly) {
- goto out;
- }
- if (!memory_region_is_ram(section.mr)) {
- goto out;
- }
-
- /* Ignore regions with dirty logging, we cannot mark them dirty */
- if (memory_region_get_dirty_log_mask(section.mr)) {
- goto out;
- }
-
- *mr = section.mr;
- return memory_region_get_ram_ptr(section.mr) + section.offset_within_region;
-
-out:
- memory_region_unref(section.mr);
- *mr = NULL;
- return NULL;
-}
-
-static void vring_unmap(void *buffer, bool is_write)
-{
- ram_addr_t addr;
- MemoryRegion *mr;
-
- mr = qemu_ram_addr_from_host(buffer, &addr);
- memory_region_unref(mr);
-}
-
-/* Map the guest's vring to host memory */
-bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
-{
- hwaddr vring_addr = virtio_queue_get_ring_addr(vdev, n);
- hwaddr vring_size = virtio_queue_get_ring_size(vdev, n);
- void *vring_ptr;
-
- vring->broken = false;
-
- vring_ptr = vring_map(&vring->mr, vring_addr, vring_size, true);
- if (!vring_ptr) {
- error_report("Failed to map vring "
- "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu,
- vring_addr, vring_size);
- vring->broken = true;
- return false;
- }
-
- vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
-
- vring->last_avail_idx = virtio_queue_get_last_avail_idx(vdev, n);
- vring->last_used_idx = vring_get_used_idx(vdev, vring);
- vring->signalled_used = 0;
- vring->signalled_used_valid = false;
-
- trace_vring_setup(virtio_queue_get_ring_addr(vdev, n),
- vring->vr.desc, vring->vr.avail, vring->vr.used);
- return true;
-}
-
-void vring_teardown(Vring *vring, VirtIODevice *vdev, int n)
-{
- virtio_queue_set_last_avail_idx(vdev, n, vring->last_avail_idx);
- virtio_queue_invalidate_signalled_used(vdev, n);
-
- memory_region_unref(vring->mr);
-}
-
-/* Disable guest->host notifies */
-void vring_disable_notification(VirtIODevice *vdev, Vring *vring)
-{
- if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
- vring_set_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
- }
-}
-
-/* Enable guest->host notifies
- *
- * Return true if the vring is empty, false if there are more requests.
- */
-bool vring_enable_notification(VirtIODevice *vdev, Vring *vring)
-{
- if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
- vring_avail_event(&vring->vr) = vring->vr.avail->idx;
- } else {
- vring_clear_used_flags(vdev, vring, VRING_USED_F_NO_NOTIFY);
- }
- smp_mb(); /* ensure update is seen before reading avail_idx */
- return !vring_more_avail(vdev, vring);
-}
-
-/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
-bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
-{
- uint16_t old, new;
- bool v;
- /* Flush out used index updates. This is paired
- * with the barrier that the Guest executes when enabling
- * interrupts. */
- smp_mb();
-
- if (virtio_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
- unlikely(!vring_more_avail(vdev, vring))) {
- return true;
- }
-
- if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
- return !(vring_get_avail_flags(vdev, vring) &
- VRING_AVAIL_F_NO_INTERRUPT);
- }
- old = vring->signalled_used;
- v = vring->signalled_used_valid;
- new = vring->signalled_used = vring->last_used_idx;
- vring->signalled_used_valid = true;
-
- if (unlikely(!v)) {
- return true;
- }
-
- return vring_need_event(virtio_tswap16(vdev, vring_used_event(&vring->vr)),
- new, old);
-}
-
-
-static int get_desc(Vring *vring, VirtQueueElement *elem,
- struct vring_desc *desc)
-{
- unsigned *num;
- struct iovec *iov;
- hwaddr *addr;
- MemoryRegion *mr;
-
- if (desc->flags & VRING_DESC_F_WRITE) {
- num = &elem->in_num;
- iov = &elem->in_sg[*num];
- addr = &elem->in_addr[*num];
- } else {
- num = &elem->out_num;
- iov = &elem->out_sg[*num];
- addr = &elem->out_addr[*num];
-
- /* If it's an output descriptor, they're all supposed
- * to come before any input descriptors. */
- if (unlikely(elem->in_num)) {
- error_report("Descriptor has out after in");
- return -EFAULT;
- }
- }
-
- /* Stop for now if there are not enough iovecs available. */
- if (*num >= VIRTQUEUE_MAX_SIZE) {
- error_report("Invalid SG num: %u", *num);
- return -EFAULT;
- }
-
- /* TODO handle non-contiguous memory across region boundaries */
- iov->iov_base = vring_map(&mr, desc->addr, desc->len,
- desc->flags & VRING_DESC_F_WRITE);
- if (!iov->iov_base) {
- error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
- (uint64_t)desc->addr, desc->len);
- return -EFAULT;
- }
-
- /* The MemoryRegion is looked up again and unref'ed later, leave the
- * ref in place. */
- iov->iov_len = desc->len;
- *addr = desc->addr;
- *num += 1;
- return 0;
-}
-
-static void copy_in_vring_desc(VirtIODevice *vdev,
- const struct vring_desc *guest,
- struct vring_desc *host)
-{
- host->addr = virtio_ldq_p(vdev, &guest->addr);
- host->len = virtio_ldl_p(vdev, &guest->len);
- host->flags = virtio_lduw_p(vdev, &guest->flags);
- host->next = virtio_lduw_p(vdev, &guest->next);
-}
-
-/* This is stolen from linux/drivers/vhost/vhost.c. */
-static int get_indirect(VirtIODevice *vdev, Vring *vring,
- VirtQueueElement *elem, struct vring_desc *indirect)
-{
- struct vring_desc desc;
- unsigned int i = 0, count, found = 0;
- int ret;
-
- /* Sanity check */
- if (unlikely(indirect->len % sizeof(desc))) {
- error_report("Invalid length in indirect descriptor: "
- "len %#x not multiple of %#zx",
- indirect->len, sizeof(desc));
- vring->broken = true;
- return -EFAULT;
- }
-
- count = indirect->len / sizeof(desc);
- /* Buffers are chained via a 16 bit next field, so
- * we can have at most 2^16 of these. */
- if (unlikely(count > USHRT_MAX + 1)) {
- error_report("Indirect buffer length too big: %d", indirect->len);
- vring->broken = true;
- return -EFAULT;
- }
-
- do {
- struct vring_desc *desc_ptr;
- MemoryRegion *mr;
-
- /* Translate indirect descriptor */
- desc_ptr = vring_map(&mr,
- indirect->addr + found * sizeof(desc),
- sizeof(desc), false);
- if (!desc_ptr) {
- error_report("Failed to map indirect descriptor "
- "addr %#" PRIx64 " len %zu",
- (uint64_t)indirect->addr + found * sizeof(desc),
- sizeof(desc));
- vring->broken = true;
- return -EFAULT;
- }
- copy_in_vring_desc(vdev, desc_ptr, &desc);
- memory_region_unref(mr);
-
- /* Ensure descriptor has been loaded before accessing fields */
- barrier(); /* read_barrier_depends(); */
-
- if (unlikely(++found > count)) {
- error_report("Loop detected: last one at %u "
- "indirect size %u", i, count);
- vring->broken = true;
- return -EFAULT;
- }
-
- if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
- error_report("Nested indirect descriptor");
- vring->broken = true;
- return -EFAULT;
- }
-
- ret = get_desc(vring, elem, &desc);
- if (ret < 0) {
- vring->broken |= (ret == -EFAULT);
- return ret;
- }
- i = desc.next;
- } while (desc.flags & VRING_DESC_F_NEXT);
- return 0;
-}
-
-static void vring_unmap_element(VirtQueueElement *elem)
-{
- int i;
-
- /* This assumes that the iovecs, if changed, are never moved past
- * the end of the valid area. This is true if iovec manipulations
- * are done with iov_discard_front and iov_discard_back.
- */
- for (i = 0; i < elem->out_num; i++) {
- vring_unmap(elem->out_sg[i].iov_base, false);
- }
-
- for (i = 0; i < elem->in_num; i++) {
- vring_unmap(elem->in_sg[i].iov_base, true);
- }
-}
-
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access. Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which is
- * never a valid descriptor number) if none was found. A negative code is
- * returned on error.
- *
- * Stolen from linux/drivers/vhost/vhost.c.
- */
-int vring_pop(VirtIODevice *vdev, Vring *vring,
- VirtQueueElement *elem)
-{
- struct vring_desc desc;
- unsigned int i, head, found = 0, num = vring->vr.num;
- uint16_t avail_idx, last_avail_idx;
- int ret;
-
- /* Initialize elem so it can be safely unmapped */
- elem->in_num = elem->out_num = 0;
-
- /* If there was a fatal error then refuse operation */
- if (vring->broken) {
- ret = -EFAULT;
- goto out;
- }
-
- /* Check it isn't doing very strange things with descriptor numbers. */
- last_avail_idx = vring->last_avail_idx;
- avail_idx = vring_get_avail_idx(vdev, vring);
- barrier(); /* load indices now and not again later */
-
- if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) {
- error_report("Guest moved used index from %u to %u",
- last_avail_idx, avail_idx);
- ret = -EFAULT;
- goto out;
- }
-
- /* If there's nothing new since last we looked. */
- if (avail_idx == last_avail_idx) {
- ret = -EAGAIN;
- goto out;
- }
-
- /* Only get avail ring entries after they have been exposed by guest. */
- smp_rmb();
-
- /* Grab the next descriptor number they're advertising, and increment
- * the index we've seen. */
- head = vring_get_avail_ring(vdev, vring, last_avail_idx % num);
-
- elem->index = head;
-
- /* If their number is silly, that's an error. */
- if (unlikely(head >= num)) {
- error_report("Guest says index %u > %u is available", head, num);
- ret = -EFAULT;
- goto out;
- }
-
- i = head;
- do {
- if (unlikely(i >= num)) {
- error_report("Desc index is %u > %u, head = %u", i, num, head);
- ret = -EFAULT;
- goto out;
- }
- if (unlikely(++found > num)) {
- error_report("Loop detected: last one at %u vq size %u head %u",
- i, num, head);
- ret = -EFAULT;
- goto out;
- }
- copy_in_vring_desc(vdev, &vring->vr.desc[i], &desc);
-
- /* Ensure descriptor is loaded before accessing fields */
- barrier();
-
- if (desc.flags & VRING_DESC_F_INDIRECT) {
- ret = get_indirect(vdev, vring, elem, &desc);
- if (ret < 0) {
- goto out;
- }
- continue;
- }
-
- ret = get_desc(vring, elem, &desc);
- if (ret < 0) {
- goto out;
- }
-
- i = desc.next;
- } while (desc.flags & VRING_DESC_F_NEXT);
-
- /* On success, increment avail index. */
- vring->last_avail_idx++;
- if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
- vring_avail_event(&vring->vr) =
- virtio_tswap16(vdev, vring->last_avail_idx);
- }
-
- return head;
-
-out:
- assert(ret < 0);
- if (ret == -EFAULT) {
- vring->broken = true;
- }
- vring_unmap_element(elem);
- return ret;
-}
-
-/* After we've used one of their buffers, we tell them about it.
- *
- * Stolen from linux/drivers/vhost/vhost.c.
- */
-void vring_push(VirtIODevice *vdev, Vring *vring, VirtQueueElement *elem,
- int len)
-{
- unsigned int head = elem->index;
- uint16_t new;
-
- vring_unmap_element(elem);
-
- /* Don't touch vring if a fatal error occurred */
- if (vring->broken) {
- return;
- }
-
- /* The virtqueue contains a ring of used buffers. Get a pointer to the
- * next entry in that used ring. */
- vring_set_used_ring_id(vdev, vring, vring->last_used_idx % vring->vr.num,
- head);
- vring_set_used_ring_len(vdev, vring, vring->last_used_idx % vring->vr.num,
- len);
-
- /* Make sure buffer is written before we update index. */
- smp_wmb();
-
- new = ++vring->last_used_idx;
- vring_set_used_idx(vdev, vring, new);
- if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) {
- vring->signalled_used_valid = false;
- }
-}
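
For reference, the notification test that the deleted vring_should_notify() builds on is the VIRTIO_RING_F_EVENT_IDX rule from the virtio spec. A minimal standalone sketch of that rule (mirroring the standard vring_need_event() definition, not code taken from this tree):

#include <stdint.h>
#include <stdbool.h>

/* Event-index suppression rule: signal the guest only if the used index
 * has moved past the event index the guest published since the last
 * notification. All arithmetic is modulo 2^16, matching the ring's
 * 16-bit indices. */
static bool vring_need_event_sketch(uint16_t event_idx, uint16_t new_idx,
                                    uint16_t old_idx)
{
    return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old_idx);
}
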
diff --git a/qemu/hw/virtio/vhost-backend.c b/qemu/hw/virtio/vhost-backend.c
index 4d68a2765..b35890289 100644
--- a/qemu/hw/virtio/vhost-backend.c
+++ b/qemu/hw/virtio/vhost-backend.c
@@ -8,9 +8,11 @@
*
*/
+#include "qemu/osdep.h"
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
#include "qemu/error-report.h"
+#include "linux/vhost.h"
#include <sys/ioctl.h>
@@ -42,11 +44,152 @@ static int vhost_kernel_cleanup(struct vhost_dev *dev)
return close(fd);
}
+static int vhost_kernel_memslots_limit(struct vhost_dev *dev)
+{
+ int limit = 64;
+ char *s;
+
+ if (g_file_get_contents("/sys/module/vhost/parameters/max_mem_regions",
+ &s, NULL, NULL)) {
+ uint64_t val = g_ascii_strtoull(s, NULL, 10);
+ if (!((val == G_MAXUINT64 || !val) && errno)) {
+ return val;
+ }
+ error_report("ignoring invalid max_mem_regions value in vhost module:"
+ " %s", s);
+ }
+ return limit;
+}
+
+static int vhost_kernel_net_set_backend(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_NET_SET_BACKEND, file);
+}
+
+static int vhost_kernel_scsi_set_endpoint(struct vhost_dev *dev,
+ struct vhost_scsi_target *target)
+{
+ return vhost_kernel_call(dev, VHOST_SCSI_SET_ENDPOINT, target);
+}
+
+static int vhost_kernel_scsi_clear_endpoint(struct vhost_dev *dev,
+ struct vhost_scsi_target *target)
+{
+ return vhost_kernel_call(dev, VHOST_SCSI_CLEAR_ENDPOINT, target);
+}
+
+static int vhost_kernel_scsi_get_abi_version(struct vhost_dev *dev, int *version)
+{
+ return vhost_kernel_call(dev, VHOST_SCSI_GET_ABI_VERSION, version);
+}
+
+static int vhost_kernel_set_log_base(struct vhost_dev *dev, uint64_t base,
+ struct vhost_log *log)
+{
+ return vhost_kernel_call(dev, VHOST_SET_LOG_BASE, &base);
+}
+
+static int vhost_kernel_set_mem_table(struct vhost_dev *dev,
+ struct vhost_memory *mem)
+{
+ return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem);
+}
+
+static int vhost_kernel_set_vring_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_ADDR, addr);
+}
+
+static int vhost_kernel_set_vring_endian(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_ENDIAN, ring);
+}
+
+static int vhost_kernel_set_vring_num(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_NUM, ring);
+}
+
+static int vhost_kernel_set_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_BASE, ring);
+}
+
+static int vhost_kernel_get_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_kernel_call(dev, VHOST_GET_VRING_BASE, ring);
+}
+
+static int vhost_kernel_set_vring_kick(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_KICK, file);
+}
+
+static int vhost_kernel_set_vring_call(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_kernel_call(dev, VHOST_SET_VRING_CALL, file);
+}
+
+static int vhost_kernel_set_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ return vhost_kernel_call(dev, VHOST_SET_FEATURES, &features);
+}
+
+static int vhost_kernel_get_features(struct vhost_dev *dev,
+ uint64_t *features)
+{
+ return vhost_kernel_call(dev, VHOST_GET_FEATURES, features);
+}
+
+static int vhost_kernel_set_owner(struct vhost_dev *dev)
+{
+ return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL);
+}
+
+static int vhost_kernel_reset_device(struct vhost_dev *dev)
+{
+ return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL);
+}
+
+static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
+{
+ assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
+
+ return idx - dev->vq_index;
+}
+
static const VhostOps kernel_ops = {
.backend_type = VHOST_BACKEND_TYPE_KERNEL,
- .vhost_call = vhost_kernel_call,
.vhost_backend_init = vhost_kernel_init,
- .vhost_backend_cleanup = vhost_kernel_cleanup
+ .vhost_backend_cleanup = vhost_kernel_cleanup,
+ .vhost_backend_memslots_limit = vhost_kernel_memslots_limit,
+ .vhost_net_set_backend = vhost_kernel_net_set_backend,
+ .vhost_scsi_set_endpoint = vhost_kernel_scsi_set_endpoint,
+ .vhost_scsi_clear_endpoint = vhost_kernel_scsi_clear_endpoint,
+ .vhost_scsi_get_abi_version = vhost_kernel_scsi_get_abi_version,
+ .vhost_set_log_base = vhost_kernel_set_log_base,
+ .vhost_set_mem_table = vhost_kernel_set_mem_table,
+ .vhost_set_vring_addr = vhost_kernel_set_vring_addr,
+ .vhost_set_vring_endian = vhost_kernel_set_vring_endian,
+ .vhost_set_vring_num = vhost_kernel_set_vring_num,
+ .vhost_set_vring_base = vhost_kernel_set_vring_base,
+ .vhost_get_vring_base = vhost_kernel_get_vring_base,
+ .vhost_set_vring_kick = vhost_kernel_set_vring_kick,
+ .vhost_set_vring_call = vhost_kernel_set_vring_call,
+ .vhost_set_features = vhost_kernel_set_features,
+ .vhost_get_features = vhost_kernel_get_features,
+ .vhost_set_owner = vhost_kernel_set_owner,
+ .vhost_reset_device = vhost_kernel_reset_device,
+ .vhost_get_vq_index = vhost_kernel_get_vq_index,
};
int vhost_set_backend_type(struct vhost_dev *dev, VhostBackendType backend_type)
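
The conversion above replaces the old catch-all .vhost_call with one typed callback per request; each vhost_kernel_* helper reduces to an ioctl on the backend fd. A sketch of the shared wrapper, assuming (as the close((uintptr_t)opaque) calls elsewhere in this series suggest) that dev->opaque carries the /dev/vhost-* file descriptor:

#include <stdint.h>
#include <sys/ioctl.h>
#include "hw/virtio/vhost.h"    /* struct vhost_dev */

/* Sketch of the ioctl passthrough that every typed callback above
 * delegates to; dev->opaque holds the backend fd. */
static int vhost_kernel_call(struct vhost_dev *dev, unsigned long request,
                             void *arg)
{
    int fd = (uintptr_t)dev->opaque;

    return ioctl(fd, request, arg);
}

Callers now dispatch through a type-checked entry point, e.g. dev->vhost_ops->vhost_set_features(dev, features), rather than passing raw request codes and void pointers.
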
diff --git a/qemu/hw/virtio/vhost-user.c b/qemu/hw/virtio/vhost-user.c
index e7ab8293d..5914e8510 100644
--- a/qemu/hw/virtio/vhost-user.c
+++ b/qemu/hw/virtio/vhost-user.c
@@ -8,22 +8,35 @@
*
*/
+#include "qemu/osdep.h"
+#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "hw/virtio/vhost-backend.h"
+#include "hw/virtio/virtio-net.h"
#include "sysemu/char.h"
#include "sysemu/kvm.h"
#include "qemu/error-report.h"
#include "qemu/sockets.h"
#include "exec/ram_addr.h"
+#include "migration/migration.h"
-#include <fcntl.h>
-#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <linux/vhost.h>
#define VHOST_MEMORY_MAX_NREGIONS 8
+#define VHOST_USER_F_PROTOCOL_FEATURES 30
+
+enum VhostUserProtocolFeature {
+ VHOST_USER_PROTOCOL_F_MQ = 0,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD = 1,
+ VHOST_USER_PROTOCOL_F_RARP = 2,
+
+ VHOST_USER_PROTOCOL_F_MAX
+};
+
+#define VHOST_USER_PROTOCOL_FEATURE_MASK ((1 << VHOST_USER_PROTOCOL_F_MAX) - 1)
typedef enum VhostUserRequest {
VHOST_USER_NONE = 0,
@@ -41,6 +54,11 @@ typedef enum VhostUserRequest {
VHOST_USER_SET_VRING_KICK = 12,
VHOST_USER_SET_VRING_CALL = 13,
VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_SEND_RARP = 19,
VHOST_USER_MAX
} VhostUserRequest;
@@ -57,6 +75,11 @@ typedef struct VhostUserMemory {
VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
} VhostUserMemory;
+typedef struct VhostUserLog {
+ uint64_t mmap_size;
+ uint64_t mmap_offset;
+} VhostUserLog;
+
typedef struct VhostUserMsg {
VhostUserRequest request;
@@ -71,7 +94,8 @@ typedef struct VhostUserMsg {
struct vhost_vring_state state;
struct vhost_vring_addr addr;
VhostUserMemory memory;
- };
+ VhostUserLog log;
+ } payload;
} QEMU_PACKED VhostUserMsg;
static VhostUserMsg m __attribute__ ((unused));
@@ -89,37 +113,6 @@ static bool ioeventfd_enabled(void)
return kvm_enabled() && kvm_eventfds_enabled();
}
-static unsigned long int ioctl_to_vhost_user_request[VHOST_USER_MAX] = {
- -1, /* VHOST_USER_NONE */
- VHOST_GET_FEATURES, /* VHOST_USER_GET_FEATURES */
- VHOST_SET_FEATURES, /* VHOST_USER_SET_FEATURES */
- VHOST_SET_OWNER, /* VHOST_USER_SET_OWNER */
- VHOST_RESET_OWNER, /* VHOST_USER_RESET_OWNER */
- VHOST_SET_MEM_TABLE, /* VHOST_USER_SET_MEM_TABLE */
- VHOST_SET_LOG_BASE, /* VHOST_USER_SET_LOG_BASE */
- VHOST_SET_LOG_FD, /* VHOST_USER_SET_LOG_FD */
- VHOST_SET_VRING_NUM, /* VHOST_USER_SET_VRING_NUM */
- VHOST_SET_VRING_ADDR, /* VHOST_USER_SET_VRING_ADDR */
- VHOST_SET_VRING_BASE, /* VHOST_USER_SET_VRING_BASE */
- VHOST_GET_VRING_BASE, /* VHOST_USER_GET_VRING_BASE */
- VHOST_SET_VRING_KICK, /* VHOST_USER_SET_VRING_KICK */
- VHOST_SET_VRING_CALL, /* VHOST_USER_SET_VRING_CALL */
- VHOST_SET_VRING_ERR /* VHOST_USER_SET_VRING_ERR */
-};
-
-static VhostUserRequest vhost_user_request_translate(unsigned long int request)
-{
- VhostUserRequest idx;
-
- for (idx = 0; idx < VHOST_USER_MAX; idx++) {
- if (ioctl_to_vhost_user_request[idx] == request) {
- break;
- }
- }
-
- return (idx == VHOST_USER_MAX) ? VHOST_USER_NONE : idx;
-}
-
static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
{
CharDriverState *chr = dev->opaque;
@@ -128,8 +121,8 @@ static int vhost_user_read(struct vhost_dev *dev, VhostUserMsg *msg)
r = qemu_chr_fe_read_all(chr, p, size);
if (r != size) {
- error_report("Failed to read msg header. Read %d instead of %d.", r,
- size);
+ error_report("Failed to read msg header. Read %d instead of %d."
+ " Original request %d.", r, size, msg->request);
goto fail;
}
@@ -166,12 +159,35 @@ fail:
return -1;
}
+static bool vhost_user_one_time_request(VhostUserRequest request)
+{
+ switch (request) {
+ case VHOST_USER_SET_OWNER:
+ case VHOST_USER_RESET_OWNER:
+ case VHOST_USER_SET_MEM_TABLE:
+ case VHOST_USER_GET_QUEUE_NUM:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/* most non-init callers ignore the error */
static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
int *fds, int fd_num)
{
CharDriverState *chr = dev->opaque;
int size = VHOST_USER_HDR_SIZE + msg->size;
+    /*
+     * For non-vring-specific requests, like VHOST_USER_SET_MEM_TABLE,
+     * we only need to send the message once, the first time it comes
+     * up. Later requests of the same kind are simply ignored.
+     */
+ if (vhost_user_one_time_request(msg->request) && dev->vq_index != 0) {
+ return 0;
+ }
+
if (fd_num) {
qemu_chr_fe_set_msgfds(chr, fds, fd_num);
}
@@ -180,157 +196,364 @@ static int vhost_user_write(struct vhost_dev *dev, VhostUserMsg *msg,
0 : -1;
}
-static int vhost_user_call(struct vhost_dev *dev, unsigned long int request,
- void *arg)
+static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
+ struct vhost_log *log)
{
- VhostUserMsg msg;
- VhostUserRequest msg_request;
- struct vhost_vring_file *file = 0;
- int need_reply = 0;
int fds[VHOST_MEMORY_MAX_NREGIONS];
- int i, fd;
size_t fd_num = 0;
+ bool shmfd = virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD);
+ VhostUserMsg msg = {
+ .request = VHOST_USER_SET_LOG_BASE,
+ .flags = VHOST_USER_VERSION,
+ .payload.log.mmap_size = log->size * sizeof(*(log->log)),
+ .payload.log.mmap_offset = 0,
+ .size = sizeof(msg.payload.log),
+ };
- assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+ if (shmfd && log->fd != -1) {
+ fds[fd_num++] = log->fd;
+ }
- msg_request = vhost_user_request_translate(request);
- msg.request = msg_request;
- msg.flags = VHOST_USER_VERSION;
- msg.size = 0;
+ vhost_user_write(dev, &msg, fds, fd_num);
- switch (request) {
- case VHOST_GET_FEATURES:
- need_reply = 1;
- break;
-
- case VHOST_SET_FEATURES:
- case VHOST_SET_LOG_BASE:
- msg.u64 = *((__u64 *) arg);
- msg.size = sizeof(m.u64);
- break;
-
- case VHOST_SET_OWNER:
- case VHOST_RESET_OWNER:
- break;
-
- case VHOST_SET_MEM_TABLE:
- for (i = 0; i < dev->mem->nregions; ++i) {
- struct vhost_memory_region *reg = dev->mem->regions + i;
- ram_addr_t ram_addr;
-
- assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
- qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr, &ram_addr);
- fd = qemu_get_ram_fd(ram_addr);
- if (fd > 0) {
- msg.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
- msg.memory.regions[fd_num].memory_size = reg->memory_size;
- msg.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
- msg.memory.regions[fd_num].mmap_offset = reg->userspace_addr -
- (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr);
- assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
- fds[fd_num++] = fd;
- }
+ if (shmfd) {
+ msg.size = 0;
+ if (vhost_user_read(dev, &msg) < 0) {
+ return 0;
}
- msg.memory.nregions = fd_num;
-
- if (!fd_num) {
- error_report("Failed initializing vhost-user memory map, "
- "consider using -object memory-backend-file share=on");
+ if (msg.request != VHOST_USER_SET_LOG_BASE) {
+ error_report("Received unexpected msg type. "
+ "Expected %d received %d",
+ VHOST_USER_SET_LOG_BASE, msg.request);
return -1;
}
+ }
- msg.size = sizeof(m.memory.nregions);
- msg.size += sizeof(m.memory.padding);
- msg.size += fd_num * sizeof(VhostUserMemoryRegion);
-
- break;
-
- case VHOST_SET_LOG_FD:
- fds[fd_num++] = *((int *) arg);
- break;
-
- case VHOST_SET_VRING_NUM:
- case VHOST_SET_VRING_BASE:
- memcpy(&msg.state, arg, sizeof(struct vhost_vring_state));
- msg.size = sizeof(m.state);
- break;
-
- case VHOST_GET_VRING_BASE:
- memcpy(&msg.state, arg, sizeof(struct vhost_vring_state));
- msg.size = sizeof(m.state);
- need_reply = 1;
- break;
-
- case VHOST_SET_VRING_ADDR:
- memcpy(&msg.addr, arg, sizeof(struct vhost_vring_addr));
- msg.size = sizeof(m.addr);
- break;
-
- case VHOST_SET_VRING_KICK:
- case VHOST_SET_VRING_CALL:
- case VHOST_SET_VRING_ERR:
- file = arg;
- msg.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
- msg.size = sizeof(m.u64);
- if (ioeventfd_enabled() && file->fd > 0) {
- fds[fd_num++] = file->fd;
- } else {
- msg.u64 |= VHOST_USER_VRING_NOFD_MASK;
+ return 0;
+}
+
+static int vhost_user_set_mem_table(struct vhost_dev *dev,
+ struct vhost_memory *mem)
+{
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ int i, fd;
+ size_t fd_num = 0;
+ VhostUserMsg msg = {
+ .request = VHOST_USER_SET_MEM_TABLE,
+ .flags = VHOST_USER_VERSION,
+ };
+
+ for (i = 0; i < dev->mem->nregions; ++i) {
+ struct vhost_memory_region *reg = dev->mem->regions + i;
+ ram_addr_t ram_addr;
+
+ assert((uintptr_t)reg->userspace_addr == reg->userspace_addr);
+ qemu_ram_addr_from_host((void *)(uintptr_t)reg->userspace_addr,
+ &ram_addr);
+ fd = qemu_get_ram_fd(ram_addr);
+ if (fd > 0) {
+ msg.payload.memory.regions[fd_num].userspace_addr = reg->userspace_addr;
+ msg.payload.memory.regions[fd_num].memory_size = reg->memory_size;
+ msg.payload.memory.regions[fd_num].guest_phys_addr = reg->guest_phys_addr;
+ msg.payload.memory.regions[fd_num].mmap_offset = reg->userspace_addr -
+ (uintptr_t) qemu_get_ram_block_host_ptr(ram_addr);
+ assert(fd_num < VHOST_MEMORY_MAX_NREGIONS);
+ fds[fd_num++] = fd;
}
- break;
- default:
- error_report("vhost-user trying to send unhandled ioctl");
+ }
+
+ msg.payload.memory.nregions = fd_num;
+
+ if (!fd_num) {
+ error_report("Failed initializing vhost-user memory map, "
+ "consider using -object memory-backend-file share=on");
return -1;
- break;
}
- if (vhost_user_write(dev, &msg, fds, fd_num) < 0) {
+ msg.size = sizeof(msg.payload.memory.nregions);
+ msg.size += sizeof(msg.payload.memory.padding);
+ msg.size += fd_num * sizeof(VhostUserMemoryRegion);
+
+ vhost_user_write(dev, &msg, fds, fd_num);
+
+ return 0;
+}
+
+static int vhost_user_set_vring_addr(struct vhost_dev *dev,
+ struct vhost_vring_addr *addr)
+{
+ VhostUserMsg msg = {
+ .request = VHOST_USER_SET_VRING_ADDR,
+ .flags = VHOST_USER_VERSION,
+ .payload.addr = *addr,
+ .size = sizeof(msg.payload.addr),
+ };
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
+ return 0;
+}
+
+static int vhost_user_set_vring_endian(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ error_report("vhost-user trying to send unhandled ioctl");
+ return -1;
+}
+
+static int vhost_set_vring(struct vhost_dev *dev,
+ unsigned long int request,
+ struct vhost_vring_state *ring)
+{
+ VhostUserMsg msg = {
+ .request = request,
+ .flags = VHOST_USER_VERSION,
+ .payload.state = *ring,
+ .size = sizeof(msg.payload.state),
+ };
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
+ return 0;
+}
+
+static int vhost_user_set_vring_num(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_set_vring(dev, VHOST_USER_SET_VRING_NUM, ring);
+}
+
+static int vhost_user_set_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ return vhost_set_vring(dev, VHOST_USER_SET_VRING_BASE, ring);
+}
+
+static int vhost_user_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+ int i;
+
+ if (!virtio_has_feature(dev->features, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ return -1;
+ }
+
+ for (i = 0; i < dev->nvqs; ++i) {
+ struct vhost_vring_state state = {
+ .index = dev->vq_index + i,
+ .num = enable,
+ };
+
+ vhost_set_vring(dev, VHOST_USER_SET_VRING_ENABLE, &state);
+ }
+
+ return 0;
+}
+
+static int vhost_user_get_vring_base(struct vhost_dev *dev,
+ struct vhost_vring_state *ring)
+{
+ VhostUserMsg msg = {
+ .request = VHOST_USER_GET_VRING_BASE,
+ .flags = VHOST_USER_VERSION,
+ .payload.state = *ring,
+ .size = sizeof(msg.payload.state),
+ };
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
+ if (vhost_user_read(dev, &msg) < 0) {
return 0;
}
- if (need_reply) {
- if (vhost_user_read(dev, &msg) < 0) {
- return 0;
- }
+ if (msg.request != VHOST_USER_GET_VRING_BASE) {
+ error_report("Received unexpected msg type. Expected %d received %d",
+ VHOST_USER_GET_VRING_BASE, msg.request);
+ return -1;
+ }
- if (msg_request != msg.request) {
- error_report("Received unexpected msg type."
- " Expected %d received %d", msg_request, msg.request);
- return -1;
- }
+ if (msg.size != sizeof(msg.payload.state)) {
+ error_report("Received bad msg size.");
+ return -1;
+ }
- switch (msg_request) {
- case VHOST_USER_GET_FEATURES:
- if (msg.size != sizeof(m.u64)) {
- error_report("Received bad msg size.");
- return -1;
- }
- *((__u64 *) arg) = msg.u64;
- break;
- case VHOST_USER_GET_VRING_BASE:
- if (msg.size != sizeof(m.state)) {
- error_report("Received bad msg size.");
- return -1;
- }
- memcpy(arg, &msg.state, sizeof(struct vhost_vring_state));
- break;
- default:
- error_report("Received unexpected msg type.");
- return -1;
- break;
- }
+ *ring = msg.payload.state;
+
+ return 0;
+}
+
+static int vhost_set_vring_file(struct vhost_dev *dev,
+ VhostUserRequest request,
+ struct vhost_vring_file *file)
+{
+ int fds[VHOST_MEMORY_MAX_NREGIONS];
+ size_t fd_num = 0;
+ VhostUserMsg msg = {
+ .request = request,
+ .flags = VHOST_USER_VERSION,
+ .payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK,
+ .size = sizeof(msg.payload.u64),
+ };
+
+ if (ioeventfd_enabled() && file->fd > 0) {
+ fds[fd_num++] = file->fd;
+ } else {
+ msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
}
+ vhost_user_write(dev, &msg, fds, fd_num);
+
+ return 0;
+}
+
+static int vhost_user_set_vring_kick(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_KICK, file);
+}
+
+static int vhost_user_set_vring_call(struct vhost_dev *dev,
+ struct vhost_vring_file *file)
+{
+ return vhost_set_vring_file(dev, VHOST_USER_SET_VRING_CALL, file);
+}
+
+static int vhost_user_set_u64(struct vhost_dev *dev, int request, uint64_t u64)
+{
+ VhostUserMsg msg = {
+ .request = request,
+ .flags = VHOST_USER_VERSION,
+ .payload.u64 = u64,
+ .size = sizeof(msg.payload.u64),
+ };
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
+ return 0;
+}
+
+static int vhost_user_set_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ return vhost_user_set_u64(dev, VHOST_USER_SET_FEATURES, features);
+}
+
+static int vhost_user_set_protocol_features(struct vhost_dev *dev,
+ uint64_t features)
+{
+ return vhost_user_set_u64(dev, VHOST_USER_SET_PROTOCOL_FEATURES, features);
+}
+
+static int vhost_user_get_u64(struct vhost_dev *dev, int request, uint64_t *u64)
+{
+ VhostUserMsg msg = {
+ .request = request,
+ .flags = VHOST_USER_VERSION,
+ };
+
+ if (vhost_user_one_time_request(request) && dev->vq_index != 0) {
+ return 0;
+ }
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
+ if (vhost_user_read(dev, &msg) < 0) {
+ return 0;
+ }
+
+ if (msg.request != request) {
+ error_report("Received unexpected msg type. Expected %d received %d",
+ request, msg.request);
+ return -1;
+ }
+
+ if (msg.size != sizeof(msg.payload.u64)) {
+ error_report("Received bad msg size.");
+ return -1;
+ }
+
+ *u64 = msg.payload.u64;
+
+ return 0;
+}
+
+static int vhost_user_get_features(struct vhost_dev *dev, uint64_t *features)
+{
+ return vhost_user_get_u64(dev, VHOST_USER_GET_FEATURES, features);
+}
+
+static int vhost_user_set_owner(struct vhost_dev *dev)
+{
+ VhostUserMsg msg = {
+ .request = VHOST_USER_SET_OWNER,
+ .flags = VHOST_USER_VERSION,
+ };
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
+ return 0;
+}
+
+static int vhost_user_reset_device(struct vhost_dev *dev)
+{
+ VhostUserMsg msg = {
+ .request = VHOST_USER_RESET_OWNER,
+ .flags = VHOST_USER_VERSION,
+ };
+
+ vhost_user_write(dev, &msg, NULL, 0);
+
return 0;
}
static int vhost_user_init(struct vhost_dev *dev, void *opaque)
{
+ uint64_t features;
+ int err;
+
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
dev->opaque = opaque;
+ err = vhost_user_get_features(dev, &features);
+ if (err < 0) {
+ return err;
+ }
+
+ if (virtio_has_feature(features, VHOST_USER_F_PROTOCOL_FEATURES)) {
+ dev->backend_features |= 1ULL << VHOST_USER_F_PROTOCOL_FEATURES;
+
+ err = vhost_user_get_u64(dev, VHOST_USER_GET_PROTOCOL_FEATURES,
+ &features);
+ if (err < 0) {
+ return err;
+ }
+
+ dev->protocol_features = features & VHOST_USER_PROTOCOL_FEATURE_MASK;
+ err = vhost_user_set_protocol_features(dev, dev->protocol_features);
+ if (err < 0) {
+ return err;
+ }
+
+    /* query the max number of queues we support if the backend supports multiqueue */
+ if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) {
+ err = vhost_user_get_u64(dev, VHOST_USER_GET_QUEUE_NUM,
+ &dev->max_queues);
+ if (err < 0) {
+ return err;
+ }
+ }
+ }
+
+ if (dev->migration_blocker == NULL &&
+ !virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD)) {
+ error_setg(&dev->migration_blocker,
+ "Migration disabled: vhost-user backend lacks "
+ "VHOST_USER_PROTOCOL_F_LOG_SHMFD feature.");
+ }
+
return 0;
}
@@ -343,9 +566,92 @@ static int vhost_user_cleanup(struct vhost_dev *dev)
return 0;
}
+static int vhost_user_get_vq_index(struct vhost_dev *dev, int idx)
+{
+ assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
+
+ return idx;
+}
+
+static int vhost_user_memslots_limit(struct vhost_dev *dev)
+{
+ return VHOST_MEMORY_MAX_NREGIONS;
+}
+
+static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
+{
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+ return virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_LOG_SHMFD);
+}
+
+static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
+{
+ VhostUserMsg msg = { 0 };
+ int err;
+
+ assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
+
+    /* If the guest supports GUEST_ANNOUNCE, do nothing */
+ if (virtio_has_feature(dev->acked_features, VIRTIO_NET_F_GUEST_ANNOUNCE)) {
+ return 0;
+ }
+
+    /* if the backend supports VHOST_USER_PROTOCOL_F_RARP, ask it to send the RARP */
+ if (virtio_has_feature(dev->protocol_features,
+ VHOST_USER_PROTOCOL_F_RARP)) {
+ msg.request = VHOST_USER_SEND_RARP;
+ msg.flags = VHOST_USER_VERSION;
+ memcpy((char *)&msg.payload.u64, mac_addr, 6);
+ msg.size = sizeof(msg.payload.u64);
+
+ err = vhost_user_write(dev, &msg, NULL, 0);
+ return err;
+ }
+ return -1;
+}
+
+static bool vhost_user_can_merge(struct vhost_dev *dev,
+ uint64_t start1, uint64_t size1,
+ uint64_t start2, uint64_t size2)
+{
+ ram_addr_t ram_addr;
+ int mfd, rfd;
+ MemoryRegion *mr;
+
+ mr = qemu_ram_addr_from_host((void *)(uintptr_t)start1, &ram_addr);
+ assert(mr);
+ mfd = qemu_get_ram_fd(ram_addr);
+
+ mr = qemu_ram_addr_from_host((void *)(uintptr_t)start2, &ram_addr);
+ assert(mr);
+ rfd = qemu_get_ram_fd(ram_addr);
+
+ return mfd == rfd;
+}
+
const VhostOps user_ops = {
.backend_type = VHOST_BACKEND_TYPE_USER,
- .vhost_call = vhost_user_call,
.vhost_backend_init = vhost_user_init,
- .vhost_backend_cleanup = vhost_user_cleanup
- };
+ .vhost_backend_cleanup = vhost_user_cleanup,
+ .vhost_backend_memslots_limit = vhost_user_memslots_limit,
+ .vhost_set_log_base = vhost_user_set_log_base,
+ .vhost_set_mem_table = vhost_user_set_mem_table,
+ .vhost_set_vring_addr = vhost_user_set_vring_addr,
+ .vhost_set_vring_endian = vhost_user_set_vring_endian,
+ .vhost_set_vring_num = vhost_user_set_vring_num,
+ .vhost_set_vring_base = vhost_user_set_vring_base,
+ .vhost_get_vring_base = vhost_user_get_vring_base,
+ .vhost_set_vring_kick = vhost_user_set_vring_kick,
+ .vhost_set_vring_call = vhost_user_set_vring_call,
+ .vhost_set_features = vhost_user_set_features,
+ .vhost_get_features = vhost_user_get_features,
+ .vhost_set_owner = vhost_user_set_owner,
+ .vhost_reset_device = vhost_user_reset_device,
+ .vhost_get_vq_index = vhost_user_get_vq_index,
+ .vhost_set_vring_enable = vhost_user_set_vring_enable,
+ .vhost_requires_shm_log = vhost_user_requires_shm_log,
+ .vhost_migration_done = vhost_user_migration_done,
+ .vhost_backend_can_merge = vhost_user_can_merge,
+};
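
All requests above share one framing: a fixed header (request, flags, size) followed by msg.size payload bytes, with any file descriptors passed out of band via SCM_RIGHTS over the chardev socket. A sketch of the size arithmetic used by vhost_user_write(), written with offsetof as an equivalent form (the macro's literal definition is not shown in this diff):

#include <stddef.h>

/* Sketch: with the QEMU_PACKED layout above, the wire header is exactly
 * the bytes before the payload union, and only msg.size payload bytes
 * follow it on the socket. */
#define VHOST_USER_HDR_SIZE_SKETCH offsetof(VhostUserMsg, payload)

static size_t vhost_user_wire_size(const VhostUserMsg *msg)
{
    return VHOST_USER_HDR_SIZE_SKETCH + msg->size;
}
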
diff --git a/qemu/hw/virtio/vhost.c b/qemu/hw/virtio/vhost.c
index 2712c6fc0..440071815 100644
--- a/qemu/hw/virtio/vhost.c
+++ b/qemu/hw/virtio/vhost.c
@@ -13,11 +13,14 @@
* GNU GPL, version 2 or (at your option) any later version.
*/
+#include "qemu/osdep.h"
+#include "qapi/error.h"
#include "hw/virtio/vhost.h"
#include "hw/hw.h"
#include "qemu/atomic.h"
#include "qemu/range.h"
#include "qemu/error-report.h"
+#include "qemu/memfd.h"
#include <linux/vhost.h>
#include "exec/address-spaces.h"
#include "hw/virtio/virtio-bus.h"
@@ -25,6 +28,23 @@
#include "migration/migration.h"
static struct vhost_log *vhost_log;
+static struct vhost_log *vhost_log_shm;
+
+static unsigned int used_memslots;
+static QLIST_HEAD(, vhost_dev) vhost_devices =
+ QLIST_HEAD_INITIALIZER(vhost_devices);
+
+bool vhost_has_free_slot(void)
+{
+ unsigned int slots_limit = ~0U;
+ struct vhost_dev *hdev;
+
+ QLIST_FOREACH(hdev, &vhost_devices, entry) {
+ unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
+ slots_limit = MIN(slots_limit, r);
+ }
+ return slots_limit > used_memslots;
+}
static void vhost_dev_sync_region(struct vhost_dev *dev,
MemoryRegionSection *section,
@@ -241,6 +261,13 @@ static void vhost_dev_assign_memory(struct vhost_dev *dev,
continue;
}
+ if (dev->vhost_ops->vhost_backend_can_merge &&
+ !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
+ reg->userspace_addr,
+ reg->memory_size)) {
+ continue;
+ }
+
if (merged) {
--to;
assert(to >= 0);
@@ -286,25 +313,46 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)
}
return log_size;
}
-static struct vhost_log *vhost_log_alloc(uint64_t size)
+
+static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
{
- struct vhost_log *log = g_malloc0(sizeof *log + size * sizeof(*(log->log)));
+ struct vhost_log *log;
+ uint64_t logsize = size * sizeof(*(log->log));
+ int fd = -1;
+
+ log = g_new0(struct vhost_log, 1);
+ if (share) {
+ log->log = qemu_memfd_alloc("vhost-log", logsize,
+ F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+ &fd);
+ memset(log->log, 0, logsize);
+ } else {
+ log->log = g_malloc0(logsize);
+ }
log->size = size;
log->refcnt = 1;
+ log->fd = fd;
return log;
}
-static struct vhost_log *vhost_log_get(uint64_t size)
+static struct vhost_log *vhost_log_get(uint64_t size, bool share)
{
- if (!vhost_log || vhost_log->size != size) {
- vhost_log = vhost_log_alloc(size);
+ struct vhost_log *log = share ? vhost_log_shm : vhost_log;
+
+ if (!log || log->size != size) {
+ log = vhost_log_alloc(size, share);
+ if (share) {
+ vhost_log_shm = log;
+ } else {
+ vhost_log = log;
+ }
} else {
- ++vhost_log->refcnt;
+ ++log->refcnt;
}
- return vhost_log;
+ return log;
}
static void vhost_log_put(struct vhost_dev *dev, bool sync)
@@ -321,20 +369,35 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
if (dev->log_size && sync) {
vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
}
+
if (vhost_log == log) {
+ g_free(log->log);
vhost_log = NULL;
+ } else if (vhost_log_shm == log) {
+ qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
+ log->fd);
+ vhost_log_shm = NULL;
}
+
g_free(log);
}
}
-static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
+static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
{
- struct vhost_log *log = vhost_log_get(size);
+ return dev->vhost_ops->vhost_requires_shm_log &&
+ dev->vhost_ops->vhost_requires_shm_log(dev);
+}
+
+static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
+{
+ struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
uint64_t log_base = (uintptr_t)log->log;
int r;
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_LOG_BASE, &log_base);
+    /* Inform the backend of the log switch; this must be done before
+       releasing the current log, to ensure no logging is lost. */
+ r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
assert(r >= 0);
vhost_log_put(dev, true);
dev->log = log;
@@ -457,6 +520,7 @@ static void vhost_set_memory(MemoryListener *listener,
dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
dev->memory_changed = true;
+ used_memslots = dev->mem->nregions;
}
static bool vhost_section(MemoryRegionSection *section)
@@ -500,7 +564,7 @@ static void vhost_commit(MemoryListener *listener)
}
if (!dev->log_enabled) {
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
+ r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
assert(r >= 0);
dev->memory_changed = false;
return;
@@ -513,7 +577,7 @@ static void vhost_commit(MemoryListener *listener)
if (dev->log_size < log_size) {
vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
}
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
+ r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
assert(r >= 0);
/* To log less, can only decrease log size after table update. */
if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
@@ -581,7 +645,7 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
.log_guest_addr = vq->used_phys,
.flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
};
- int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr);
+ int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
if (r < 0) {
return -errno;
}
@@ -595,19 +659,20 @@ static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
if (enable_log) {
features |= 0x1ULL << VHOST_F_LOG_ALL;
}
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features);
+ r = dev->vhost_ops->vhost_set_features(dev, features);
return r < 0 ? -errno : 0;
}
static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
{
- int r, t, i;
+ int r, t, i, idx;
r = vhost_dev_set_features(dev, enable_log);
if (r < 0) {
goto err_features;
}
for (i = 0; i < dev->nvqs; ++i) {
- r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
+ idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+ r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
enable_log);
if (r < 0) {
goto err_vq;
@@ -616,7 +681,8 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
return 0;
err_vq:
for (; i >= 0; --i) {
- t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
+ idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+ t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
dev->log_enabled);
assert(t >= 0);
}
@@ -691,6 +757,27 @@ static void vhost_log_stop(MemoryListener *listener,
/* FIXME: implement */
}
+/* The vhost driver natively knows how to handle the vrings of non
+ * cross-endian legacy devices and modern devices. Only legacy devices
+ * exposed to a bi-endian guest may require the vhost driver to use a
+ * specific endianness.
+ */
+static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
+{
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ return false;
+ }
+#ifdef TARGET_IS_BIENDIAN
+#ifdef HOST_WORDS_BIGENDIAN
+ return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
+#else
+ return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+#endif
+#else
+ return false;
+#endif
+}
+
static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
bool is_big_endian,
int vhost_vq_index)
@@ -700,7 +787,7 @@ static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
.num = is_big_endian
};
- if (!dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ENDIAN, &s)) {
+ if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
return 0;
}
@@ -719,7 +806,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
{
hwaddr s, l, a;
int r;
- int vhost_vq_index = idx - dev->vq_index;
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
struct vhost_vring_file file = {
.index = vhost_vq_index
};
@@ -728,22 +815,20 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
};
struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
- assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
vq->num = state.num = virtio_queue_get_num(vdev, idx);
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state);
+ r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
if (r) {
return -errno;
}
state.num = virtio_queue_get_last_avail_idx(vdev, idx);
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state);
+ r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
if (r) {
return -errno;
}
- if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
- virtio_legacy_is_cross_endian(vdev)) {
+ if (vhost_needs_vring_endian(vdev)) {
r = vhost_virtqueue_set_vring_endian_legacy(dev,
virtio_is_big_endian(vdev),
vhost_vq_index);
@@ -789,7 +874,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
}
file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file);
+ r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
if (r) {
r = -errno;
goto fail_kick;
@@ -798,6 +883,14 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
/* Clear and discard previous events if any. */
event_notifier_test_and_clear(&vq->masked_notifier);
+ /* Init vring in unmasked state, unless guest_notifier_mask
+ * will do it later.
+ */
+ if (!vdev->use_guest_notifier_mask) {
+ /* TODO: check and handle errors. */
+ vhost_virtqueue_mask(dev, vdev, idx, false);
+ }
+
return 0;
fail_kick:
@@ -822,13 +915,13 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
struct vhost_virtqueue *vq,
unsigned idx)
{
- int vhost_vq_index = idx - dev->vq_index;
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
struct vhost_vring_state state = {
.index = vhost_vq_index,
};
int r;
- assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
- r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state);
+
+ r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
if (r < 0) {
fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
fflush(stderr);
@@ -839,8 +932,7 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
/* In the cross-endian case, we need to reset the vring endianness to
* native as legacy devices expect so by default.
*/
- if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
- virtio_legacy_is_cross_endian(vdev)) {
+ if (vhost_needs_vring_endian(vdev)) {
r = vhost_virtqueue_set_vring_endian_legacy(dev,
!virtio_is_big_endian(vdev),
vhost_vq_index);
@@ -875,8 +967,9 @@ static void vhost_eventfd_del(MemoryListener *listener,
static int vhost_virtqueue_init(struct vhost_dev *dev,
struct vhost_virtqueue *vq, int n)
{
+ int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
struct vhost_vring_file file = {
- .index = n,
+ .index = vhost_vq_index,
};
int r = event_notifier_init(&vq->masked_notifier, 0);
if (r < 0) {
@@ -884,7 +977,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
}
file.fd = event_notifier_get_fd(&vq->masked_notifier);
- r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file);
+ r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
if (r) {
r = -errno;
goto fail_call;
@@ -906,6 +999,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
uint64_t features;
int i, r;
+ hdev->migration_blocker = NULL;
+
if (vhost_set_backend_type(hdev, backend_type) < 0) {
close((uintptr_t)opaque);
return -1;
@@ -916,18 +1011,26 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
return -errno;
}
- r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL);
+ if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
+ fprintf(stderr, "vhost backend memory slots limit is less"
+ " than current number of present memory slots\n");
+ close((uintptr_t)opaque);
+ return -1;
+ }
+ QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
+
+ r = hdev->vhost_ops->vhost_set_owner(hdev);
if (r < 0) {
goto fail;
}
- r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features);
+ r = hdev->vhost_ops->vhost_get_features(hdev, &features);
if (r < 0) {
goto fail;
}
for (i = 0; i < hdev->nvqs; ++i) {
- r = vhost_virtqueue_init(hdev, hdev->vqs + i, i);
+ r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
if (r < 0) {
goto fail_vq;
}
@@ -949,12 +1052,21 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
.eventfd_del = vhost_eventfd_del,
.priority = 10
};
- hdev->migration_blocker = NULL;
- if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
- error_setg(&hdev->migration_blocker,
- "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
+
+ if (hdev->migration_blocker == NULL) {
+ if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
+ error_setg(&hdev->migration_blocker,
+ "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
+ } else if (!qemu_memfd_check()) {
+ error_setg(&hdev->migration_blocker,
+ "Migration disabled: failed to allocate shared memory");
+ }
+ }
+
+ if (hdev->migration_blocker != NULL) {
migrate_add_blocker(hdev->migration_blocker);
}
+
hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
hdev->n_mem_sections = 0;
hdev->mem_sections = NULL;
@@ -972,6 +1084,7 @@ fail_vq:
fail:
r = -errno;
hdev->vhost_ops->vhost_backend_cleanup(hdev);
+ QLIST_REMOVE(hdev, entry);
return r;
}
@@ -989,6 +1102,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
g_free(hdev->mem);
g_free(hdev->mem_sections);
hdev->vhost_ops->vhost_backend_cleanup(hdev);
+ QLIST_REMOVE(hdev, entry);
}
/* Stop processing guest IO notifications in qemu.
@@ -1066,18 +1180,17 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
{
struct VirtQueue *vvq = virtio_get_queue(vdev, n);
int r, index = n - hdev->vq_index;
+ struct vhost_vring_file file;
- assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
-
- struct vhost_vring_file file = {
- .index = index
- };
if (mask) {
+ assert(vdev->use_guest_notifier_mask);
file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
} else {
file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
}
- r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file);
+
+ file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
+ r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
assert(r >= 0);
}
@@ -1119,7 +1232,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
if (r < 0) {
goto fail_features;
}
- r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem);
+ r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
if (r < 0) {
r = -errno;
goto fail_mem;
@@ -1138,10 +1251,12 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
uint64_t log_base;
hdev->log_size = vhost_get_log_size(hdev);
- hdev->log = vhost_log_get(hdev->log_size);
+ hdev->log = vhost_log_get(hdev->log_size,
+ vhost_dev_log_is_shared(hdev));
log_base = (uintptr_t)hdev->log->log;
- r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_LOG_BASE,
- hdev->log_size ? &log_base : NULL);
+ r = hdev->vhost_ops->vhost_set_log_base(hdev,
+ hdev->log_size ? log_base : 0,
+ hdev->log);
if (r < 0) {
r = -errno;
goto fail_log;
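
The new vhost_has_free_slot() above gives other subsystems a veto over adding RAM regions that the most constrained active backend could not map (vhost-user, for instance, is capped at VHOST_MEMORY_MAX_NREGIONS = 8). A hedged usage sketch; the caller name is hypothetical and not part of this series:

#include <stdbool.h>
#include "qapi/error.h"

bool vhost_has_free_slot(void);    /* defined in vhost.c above */

/* Hypothetical pre-check a memory hotplug path could run before adding
 * a region that all active vhost backends would then have to map. */
static int vhost_memslot_precheck(Error **errp)
{
    if (!vhost_has_free_slot()) {
        error_setg(errp, "a used vhost backend has no free memory slots left");
        return -1;
    }
    return 0;
}
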
diff --git a/qemu/hw/virtio/virtio-balloon.c b/qemu/hw/virtio/virtio-balloon.c
index 3577b7af9..9dbe68179 100644
--- a/qemu/hw/virtio/virtio-balloon.c
+++ b/qemu/hw/virtio/virtio-balloon.c
@@ -13,12 +13,12 @@
*
*/
+#include "qemu/osdep.h"
#include "qemu/iov.h"
#include "qemu/timer.h"
#include "qemu-common.h"
#include "hw/virtio/virtio.h"
#include "hw/i386/pc.h"
-#include "cpu.h"
#include "sysemu/balloon.h"
#include "hw/virtio/virtio-balloon.h"
#include "sysemu/kvm.h"
@@ -34,12 +34,16 @@
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
+#define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT)
+
static void balloon_page(void *addr, int deflate)
{
#if defined(__linux__)
- if (!kvm_enabled() || kvm_has_sync_mmu())
- qemu_madvise(addr, TARGET_PAGE_SIZE,
+ if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+ kvm_has_sync_mmu())) {
+ qemu_madvise(addr, BALLOON_PAGE_SIZE,
deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+ }
#endif
}
@@ -50,6 +54,7 @@ static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_MINFLT] = "stat-minor-faults",
[VIRTIO_BALLOON_S_MEMFREE] = "stat-free-memory",
[VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
+ [VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
[VIRTIO_BALLOON_S_NR] = NULL
};
@@ -70,7 +75,7 @@ static inline void reset_stats(VirtIOBalloon *dev)
static bool balloon_stats_supported(const VirtIOBalloon *s)
{
VirtIODevice *vdev = VIRTIO_DEVICE(s);
- return virtio_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
+ return virtio_vdev_has_feature(vdev, VIRTIO_BALLOON_F_STATS_VQ);
}
static bool balloon_stats_enabled(const VirtIOBalloon *s)
@@ -98,39 +103,43 @@ static void balloon_stats_poll_cb(void *opaque)
VirtIOBalloon *s = opaque;
VirtIODevice *vdev = VIRTIO_DEVICE(s);
- if (!balloon_stats_supported(s)) {
+ if (s->stats_vq_elem == NULL || !balloon_stats_supported(s)) {
/* re-schedule */
balloon_stats_change_timer(s, s->stats_poll_interval);
return;
}
- virtqueue_push(s->svq, &s->stats_vq_elem, s->stats_vq_offset);
+ virtqueue_push(s->svq, s->stats_vq_elem, s->stats_vq_offset);
virtio_notify(vdev, s->svq);
+ g_free(s->stats_vq_elem);
+ s->stats_vq_elem = NULL;
}
-static void balloon_stats_get_all(Object *obj, struct Visitor *v,
- void *opaque, const char *name, Error **errp)
+static void balloon_stats_get_all(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
{
Error *err = NULL;
VirtIOBalloon *s = opaque;
int i;
- visit_start_struct(v, NULL, "guest-stats", name, 0, &err);
+ visit_start_struct(v, name, NULL, 0, &err);
if (err) {
goto out;
}
- visit_type_int(v, &s->stats_last_update, "last-update", &err);
+ visit_type_int(v, "last-update", &s->stats_last_update, &err);
if (err) {
goto out_end;
}
- visit_start_struct(v, NULL, NULL, "stats", 0, &err);
+ visit_start_struct(v, "stats", NULL, 0, &err);
if (err) {
goto out_end;
}
- for (i = 0; !err && i < VIRTIO_BALLOON_S_NR; i++) {
- visit_type_int64(v, (int64_t *) &s->stats[i], balloon_stat_names[i],
- &err);
+ for (i = 0; i < VIRTIO_BALLOON_S_NR; i++) {
+ visit_type_uint64(v, balloon_stat_names[i], &s->stats[i], &err);
+ if (err) {
+ break;
+ }
}
error_propagate(errp, err);
err = NULL;
@@ -144,23 +153,23 @@ out:
error_propagate(errp, err);
}
-static void balloon_stats_get_poll_interval(Object *obj, struct Visitor *v,
- void *opaque, const char *name,
+static void balloon_stats_get_poll_interval(Object *obj, Visitor *v,
+ const char *name, void *opaque,
Error **errp)
{
VirtIOBalloon *s = opaque;
- visit_type_int(v, &s->stats_poll_interval, name, errp);
+ visit_type_int(v, name, &s->stats_poll_interval, errp);
}
-static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v,
- void *opaque, const char *name,
+static void balloon_stats_set_poll_interval(Object *obj, Visitor *v,
+ const char *name, void *opaque,
Error **errp)
{
VirtIOBalloon *s = opaque;
Error *local_err = NULL;
int64_t value;
- visit_type_int(v, &value, name, &local_err);
+ visit_type_int(v, name, &value, &local_err);
if (local_err) {
error_propagate(errp, local_err);
return;
@@ -203,14 +212,18 @@ static void balloon_stats_set_poll_interval(Object *obj, struct Visitor *v,
static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
- VirtQueueElement elem;
+ VirtQueueElement *elem;
MemoryRegionSection section;
- while (virtqueue_pop(vq, &elem)) {
+ for (;;) {
size_t offset = 0;
uint32_t pfn;
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
+ return;
+ }
- while (iov_to_buf(elem.out_sg, elem.out_num, offset, &pfn, 4) == 4) {
+ while (iov_to_buf(elem->out_sg, elem->out_num, offset, &pfn, 4) == 4) {
ram_addr_t pa;
ram_addr_t addr;
int p = virtio_ldl_p(vdev, &pfn);
@@ -233,23 +246,34 @@ static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue *vq)
memory_region_unref(section.mr);
}
- virtqueue_push(vq, &elem, offset);
+ virtqueue_push(vq, elem, offset);
virtio_notify(vdev, vq);
+ g_free(elem);
}
}
static void virtio_balloon_receive_stats(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
- VirtQueueElement *elem = &s->stats_vq_elem;
+ VirtQueueElement *elem;
VirtIOBalloonStat stat;
size_t offset = 0;
qemu_timeval tv;
- if (!virtqueue_pop(vq, elem)) {
+ elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
+ if (!elem) {
goto out;
}
+ if (s->stats_vq_elem != NULL) {
+ /* This should never happen if the driver follows the spec. */
+ virtqueue_push(vq, s->stats_vq_elem, 0);
+ virtio_notify(vdev, vq);
+ g_free(s->stats_vq_elem);
+ }
+
+ s->stats_vq_elem = elem;
+
/* Initialize the stats to get rid of any stale values. This is only
* needed to handle the case where a guest supports fewer stats than it
     * used to (i.e. it has booted into an old kernel).
@@ -292,6 +316,39 @@ static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t *config_data)
memcpy(config_data, &config, sizeof(struct virtio_balloon_config));
}
+static int build_dimm_list(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_PC_DIMM)) {
+ DeviceState *dev = DEVICE(obj);
+ if (dev->realized) { /* only realized DIMMs matter */
+ *list = g_slist_prepend(*list, dev);
+ }
+ }
+
+ object_child_foreach(obj, build_dimm_list, opaque);
+ return 0;
+}
+
+static ram_addr_t get_current_ram_size(void)
+{
+ GSList *list = NULL, *item;
+ ram_addr_t size = ram_size;
+
+ build_dimm_list(qdev_get_machine(), &list);
+ for (item = list; item; item = g_slist_next(item)) {
+ Object *obj = OBJECT(item->data);
+ if (!strcmp(object_get_typename(obj), TYPE_PC_DIMM)) {
+ size += object_property_get_int(obj, PC_DIMM_SIZE_PROP,
+ &error_abort);
+ }
+ }
+ g_slist_free(list);
+
+ return size;
+}
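
get_current_ram_size() recomputes the balloon's notion of machine RAM as the base ram_size plus every realized pc-dimm collected from the QOM composition tree. A GLib-only stand-alone sketch of the same prepend-then-sum pattern (the sizes below are hypothetical stand-ins for object_property_get_int()):

#include <glib.h>
#include <stdio.h>

int main(void)
{
    GSList *list = NULL, *item;
    guint64 size = 2048ULL << 20;                      /* assumed base ram_size: 2 GiB */
    guint64 dimms[] = { 1024ULL << 20, 512ULL << 20 }; /* hypothetical realized DIMMs */
    gsize i;

    for (i = 0; i < G_N_ELEMENTS(dimms); i++) {
        list = g_slist_prepend(list, &dimms[i]);       /* like build_dimm_list() */
    }
    for (item = list; item; item = g_slist_next(item)) {
        size += *(guint64 *)item->data;                /* like PC_DIMM_SIZE_PROP */
    }
    g_slist_free(list);

    printf("current ram size: %" G_GUINT64_FORMAT " bytes\n", size);
    return 0;
}
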
+
static void virtio_balloon_set_config(VirtIODevice *vdev,
const uint8_t *config_data)
{
@@ -370,6 +427,10 @@ static int virtio_balloon_load_device(VirtIODevice *vdev, QEMUFile *f,
s->num_pages = qemu_get_be32(f);
s->actual = qemu_get_be32(f);
+
+ if (balloon_stats_enabled(s)) {
+ balloon_stats_change_timer(s, s->stats_poll_interval);
+ }
return 0;
}
@@ -412,6 +473,16 @@ static void virtio_balloon_device_unrealize(DeviceState *dev, Error **errp)
virtio_cleanup(vdev);
}
+static void virtio_balloon_device_reset(VirtIODevice *vdev)
+{
+ VirtIOBalloon *s = VIRTIO_BALLOON(vdev);
+
+ if (s->stats_vq_elem != NULL) {
+ g_free(s->stats_vq_elem);
+ s->stats_vq_elem = NULL;
+ }
+}
+
static void virtio_balloon_instance_init(Object *obj)
{
VirtIOBalloon *s = VIRTIO_BALLOON(obj);
@@ -440,6 +511,7 @@ static void virtio_balloon_class_init(ObjectClass *klass, void *data)
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
vdc->realize = virtio_balloon_device_realize;
vdc->unrealize = virtio_balloon_device_unrealize;
+ vdc->reset = virtio_balloon_device_reset;
vdc->get_config = virtio_balloon_get_config;
vdc->set_config = virtio_balloon_set_config;
vdc->get_features = virtio_balloon_get_features;
diff --git a/qemu/hw/virtio/virtio-bus.c b/qemu/hw/virtio/virtio-bus.c
index febda76b9..574f0e23f 100644
--- a/qemu/hw/virtio/virtio-bus.c
+++ b/qemu/hw/virtio/virtio-bus.c
@@ -22,6 +22,7 @@
*
*/
+#include "qemu/osdep.h"
#include "hw/hw.h"
#include "qemu/error-report.h"
#include "hw/qdev.h"
@@ -56,6 +57,9 @@ void virtio_bus_device_plugged(VirtIODevice *vdev, Error **errp)
assert(vdc->get_features != NULL);
vdev->host_features = vdc->get_features(vdev, vdev->host_features,
errp);
+ if (klass->post_plugged != NULL) {
+ klass->post_plugged(qbus->parent, errp);
+ }
}
/* Reset the virtio_bus */
diff --git a/qemu/hw/virtio/virtio-mmio.c b/qemu/hw/virtio/virtio-mmio.c
index 18660b07b..d4cd91f8c 100644
--- a/qemu/hw/virtio/virtio-mmio.c
+++ b/qemu/hw/virtio/virtio-mmio.c
@@ -19,6 +19,7 @@
* with this program; if not, see <http://www.gnu.org/licenses/>.
*/
+#include "qemu/osdep.h"
#include "hw/sysbus.h"
#include "hw/virtio/virtio.h"
#include "qemu/host-utils.h"
diff --git a/qemu/hw/virtio/virtio-pci.c b/qemu/hw/virtio/virtio-pci.c
index c024161f5..bfedbbf17 100644
--- a/qemu/hw/virtio/virtio-pci.c
+++ b/qemu/hw/virtio/virtio-pci.c
@@ -15,7 +15,7 @@
* GNU GPL, version 2 or (at your option) any later version.
*/
-#include <inttypes.h>
+#include "qemu/osdep.h"
#include "standard-headers/linux/virtio_pci.h"
#include "hw/virtio/virtio.h"
@@ -26,6 +26,7 @@
#include "hw/virtio/virtio-balloon.h"
#include "hw/virtio/virtio-input.h"
#include "hw/pci/pci.h"
+#include "qapi/error.h"
#include "qemu/error-report.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
@@ -47,6 +48,7 @@
static void virtio_pci_bus_new(VirtioBusState *bus, size_t bus_size,
VirtIOPCIProxy *dev);
+static void virtio_pci_reset(DeviceState *qdev);
/* virtio device */
/* DeviceState to VirtIOPCIProxy. For use off data-path. TODO: use QOM. */
@@ -86,6 +88,129 @@ static void virtio_pci_save_config(DeviceState *d, QEMUFile *f)
qemu_put_be16(f, vdev->config_vector);
}
+static void virtio_pci_load_modern_queue_state(VirtIOPCIQueue *vq,
+ QEMUFile *f)
+{
+ vq->num = qemu_get_be16(f);
+ vq->enabled = qemu_get_be16(f);
+ vq->desc[0] = qemu_get_be32(f);
+ vq->desc[1] = qemu_get_be32(f);
+ vq->avail[0] = qemu_get_be32(f);
+ vq->avail[1] = qemu_get_be32(f);
+ vq->used[0] = qemu_get_be32(f);
+ vq->used[1] = qemu_get_be32(f);
+}
+
+static bool virtio_pci_has_extra_state(DeviceState *d)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ return proxy->flags & VIRTIO_PCI_FLAG_MIGRATE_EXTRA;
+}
+
+static int get_virtio_pci_modern_state(QEMUFile *f, void *pv, size_t size)
+{
+ VirtIOPCIProxy *proxy = pv;
+ int i;
+
+ proxy->dfselect = qemu_get_be32(f);
+ proxy->gfselect = qemu_get_be32(f);
+ proxy->guest_features[0] = qemu_get_be32(f);
+ proxy->guest_features[1] = qemu_get_be32(f);
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ virtio_pci_load_modern_queue_state(&proxy->vqs[i], f);
+ }
+
+ return 0;
+}
+
+static void virtio_pci_save_modern_queue_state(VirtIOPCIQueue *vq,
+ QEMUFile *f)
+{
+ qemu_put_be16(f, vq->num);
+ qemu_put_be16(f, vq->enabled);
+ qemu_put_be32(f, vq->desc[0]);
+ qemu_put_be32(f, vq->desc[1]);
+ qemu_put_be32(f, vq->avail[0]);
+ qemu_put_be32(f, vq->avail[1]);
+ qemu_put_be32(f, vq->used[0]);
+ qemu_put_be32(f, vq->used[1]);
+}
+
+static void put_virtio_pci_modern_state(QEMUFile *f, void *pv, size_t size)
+{
+ VirtIOPCIProxy *proxy = pv;
+ int i;
+
+ qemu_put_be32(f, proxy->dfselect);
+ qemu_put_be32(f, proxy->gfselect);
+ qemu_put_be32(f, proxy->guest_features[0]);
+ qemu_put_be32(f, proxy->guest_features[1]);
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ virtio_pci_save_modern_queue_state(&proxy->vqs[i], f);
+ }
+}
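
Together, the get/put pair above fixes the extra-state wire format: per queue, num and enabled as big-endian 16-bit values, then the desc/avail/used ring addresses as lo/hi pairs of big-endian 32-bit words. A stand-alone sketch of that encoding against a plain byte buffer (put_be16/put_be32 are local helpers here, not QEMUFile API):

#include <stdint.h>
#include <stdio.h>

static uint8_t *put_be16(uint8_t *p, uint16_t v)
{
    p[0] = v >> 8; p[1] = (uint8_t)v;
    return p + 2;
}

static uint8_t *put_be32(uint8_t *p, uint32_t v)
{
    p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = (uint8_t)v;
    return p + 4;
}

int main(void)
{
    uint8_t buf[2 + 2 + 6 * 4];
    uint8_t *p = buf;
    uint64_t desc = 0x12345678abcdULL;           /* hypothetical ring address */

    p = put_be16(p, 256);                        /* vq->num */
    p = put_be16(p, 1);                          /* vq->enabled */
    p = put_be32(p, (uint32_t)desc);             /* desc[0]: low 32 bits */
    p = put_be32(p, (uint32_t)(desc >> 32));     /* desc[1]: high 32 bits */
    /* avail[0..1] and used[0..1] follow in the same lo/hi order */

    printf("encoded %u of %u bytes\n", (unsigned)(p - buf), (unsigned)sizeof(buf));
    return 0;
}
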
+
+static const VMStateInfo vmstate_info_virtio_pci_modern_state = {
+ .name = "virtqueue_state",
+ .get = get_virtio_pci_modern_state,
+ .put = put_virtio_pci_modern_state,
+};
+
+static bool virtio_pci_modern_state_needed(void *opaque)
+{
+ VirtIOPCIProxy *proxy = opaque;
+
+ return !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+}
+
+static const VMStateDescription vmstate_virtio_pci_modern_state = {
+ .name = "virtio_pci/modern_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_pci_modern_state_needed,
+ .fields = (VMStateField[]) {
+ {
+ .name = "modern_state",
+ .version_id = 0,
+ .field_exists = NULL,
+ .size = 0,
+ .info = &vmstate_info_virtio_pci_modern_state,
+ .flags = VMS_SINGLE,
+ .offset = 0,
+ },
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_pci = {
+ .name = "virtio_pci",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .minimum_version_id_old = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription*[]) {
+ &vmstate_virtio_pci_modern_state,
+ NULL
+ }
+};
+
+static void virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL);
+}
+
+static int virtio_pci_load_extra_state(DeviceState *d, QEMUFile *f)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+ return vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1);
+}
+
static void virtio_pci_save_queue(DeviceState *d, int n, QEMUFile *f)
{
VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -133,6 +258,7 @@ static int virtio_pci_load_queue(DeviceState *d, int n, QEMUFile *f)
if (vector != VIRTIO_NO_VECTOR) {
return msix_vector_use(&proxy->pci_dev, vector);
}
+
return 0;
}
@@ -146,7 +272,10 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
bool legacy = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_LEGACY);
bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+ bool fast_mmio = kvm_ioeventfd_any_length_enabled();
+ bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
MemoryRegion *modern_mr = &proxy->notify.mr;
+ MemoryRegion *modern_notify_mr = &proxy->notify_pio.mr;
MemoryRegion *legacy_mr = &proxy->bar;
hwaddr modern_addr = QEMU_VIRTIO_PCI_QUEUE_MEM_MULT *
virtio_get_queue_index(vq);
@@ -162,8 +291,17 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
}
virtio_queue_set_host_notifier_fd_handler(vq, true, set_handler);
if (modern) {
- memory_region_add_eventfd(modern_mr, modern_addr, 2,
- true, n, notifier);
+ if (fast_mmio) {
+ memory_region_add_eventfd(modern_mr, modern_addr, 0,
+ false, n, notifier);
+ } else {
+ memory_region_add_eventfd(modern_mr, modern_addr, 2,
+ false, n, notifier);
+ }
+ if (modern_pio) {
+ memory_region_add_eventfd(modern_notify_mr, 0, 2,
+ true, n, notifier);
+ }
}
if (legacy) {
memory_region_add_eventfd(legacy_mr, legacy_addr, 2,
@@ -171,8 +309,17 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
}
} else {
if (modern) {
- memory_region_del_eventfd(modern_mr, modern_addr, 2,
- true, n, notifier);
+ if (fast_mmio) {
+ memory_region_del_eventfd(modern_mr, modern_addr, 0,
+ false, n, notifier);
+ } else {
+ memory_region_del_eventfd(modern_mr, modern_addr, 2,
+ false, n, notifier);
+ }
+ if (modern_pio) {
+ memory_region_del_eventfd(modern_notify_mr, 0, 2,
+ true, n, notifier);
+ }
}
if (legacy) {
memory_region_del_eventfd(legacy_mr, legacy_addr, 2,
@@ -259,9 +406,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
case VIRTIO_PCI_QUEUE_PFN:
pa = (hwaddr)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT;
if (pa == 0) {
- virtio_pci_stop_ioeventfd(proxy);
- virtio_reset(vdev);
- msix_unuse_all_vectors(&proxy->pci_dev);
+ virtio_pci_reset(DEVICE(proxy));
}
else
virtio_queue_set_addr(vdev, vdev->queue_sel, pa);
@@ -287,8 +432,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
}
if (vdev->status == 0) {
- virtio_reset(vdev);
- msix_unuse_all_vectors(&proxy->pci_dev);
+ virtio_pci_reset(DEVICE(proxy));
}
/* Linux before 2.6.34 drives the device without enabling
@@ -590,7 +734,7 @@ static int kvm_virtio_pci_vq_vector_use(VirtIOPCIProxy *proxy,
int ret;
if (irqfd->users == 0) {
- ret = kvm_irqchip_add_msi_route(kvm_state, msg);
+ ret = kvm_irqchip_add_msi_route(kvm_state, msg, &proxy->pci_dev);
if (ret < 0) {
return ret;
}
@@ -661,7 +805,7 @@ static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
/* If guest supports masking, set up irqfd now.
* Otherwise, delay until unmasked in the frontend.
*/
- if (k->guest_notifier_mask) {
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
if (ret < 0) {
kvm_virtio_pci_vq_vector_release(proxy, vector);
@@ -677,7 +821,7 @@ undo:
if (vector >= msix_nr_vectors_allocated(dev)) {
continue;
}
- if (k->guest_notifier_mask) {
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
}
kvm_virtio_pci_vq_vector_release(proxy, vector);
@@ -704,7 +848,7 @@ static void kvm_virtio_pci_vector_release(VirtIOPCIProxy *proxy, int nvqs)
/* If guest supports masking, clean up irqfd now.
* Otherwise, it was cleaned when masked in the frontend.
*/
- if (k->guest_notifier_mask) {
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
}
kvm_virtio_pci_vq_vector_release(proxy, vector);
@@ -726,7 +870,8 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
if (proxy->vector_irqfd) {
irqfd = &proxy->vector_irqfd[vector];
if (irqfd->msg.data != msg.data || irqfd->msg.address != msg.address) {
- ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg);
+ ret = kvm_irqchip_update_msi_route(kvm_state, irqfd->virq, msg,
+ &proxy->pci_dev);
if (ret < 0) {
return ret;
}
@@ -736,7 +881,7 @@ static int virtio_pci_vq_vector_unmask(VirtIOPCIProxy *proxy,
/* If guest supports masking, irqfd is already setup, unmask it.
* Otherwise, set it up now.
*/
- if (k->guest_notifier_mask) {
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
k->guest_notifier_mask(vdev, queue_no, false);
/* Test after unmasking to avoid losing events. */
if (k->guest_notifier_pending &&
@@ -759,7 +904,7 @@ static void virtio_pci_vq_vector_mask(VirtIOPCIProxy *proxy,
/* If guest supports masking, keep irqfd but mask it.
* Otherwise, clean it up now.
*/
- if (k->guest_notifier_mask) {
+ if (vdev->use_guest_notifier_mask && k->guest_notifier_mask) {
k->guest_notifier_mask(vdev, queue_no, true);
} else {
kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
@@ -876,7 +1021,9 @@ static int virtio_pci_set_guest_notifier(DeviceState *d, int n, bool assign,
event_notifier_cleanup(notifier);
}
- if (!msix_enabled(&proxy->pci_dev) && vdc->guest_notifier_mask) {
+ if (!msix_enabled(&proxy->pci_dev) &&
+ vdev->use_guest_notifier_mask &&
+ vdc->guest_notifier_mask) {
vdc->guest_notifier_mask(vdev, n, !assign);
}
@@ -1205,8 +1352,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
}
if (vdev->status == 0) {
- virtio_reset(vdev);
- msix_unuse_all_vectors(&proxy->pci_dev);
+ virtio_pci_reset(DEVICE(proxy));
}
break;
@@ -1238,6 +1384,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
proxy->vqs[vdev->queue_sel].avail[0],
((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
proxy->vqs[vdev->queue_sel].used[0]);
+ proxy->vqs[vdev->queue_sel].enabled = 1;
break;
case VIRTIO_PCI_COMMON_Q_DESCLO:
proxy->vqs[vdev->queue_sel].desc[0] = val;
@@ -1280,6 +1427,17 @@ static void virtio_pci_notify_write(void *opaque, hwaddr addr,
}
}
+static void virtio_pci_notify_write_pio(void *opaque, hwaddr addr,
+ uint64_t val, unsigned size)
+{
+ VirtIODevice *vdev = opaque;
+ unsigned queue = val;
+
+ if (queue < VIRTIO_QUEUE_MAX) {
+ virtio_queue_notify(vdev, queue);
+ }
+}
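
Unlike the MMIO notify region, where the queue index is derived from the write offset, the PIO notify region takes the queue number as the written value, which is why a 4-byte I/O BAR suffices. A stand-alone contrast of the two decodings (QEMU_VIRTIO_PCI_QUEUE_MEM_MULT is assumed to be 0x1000, the multiplier this file uses for the MMIO notify layout):

#include <stdint.h>
#include <stdio.h>

#define QUEUE_MEM_MULT 0x1000   /* assumed QEMU_VIRTIO_PCI_QUEUE_MEM_MULT */

static unsigned queue_from_mmio(uint64_t addr) { return addr / QUEUE_MEM_MULT; }
static unsigned queue_from_pio(uint64_t val)   { return (unsigned)val; }

int main(void)
{
    printf("MMIO write at 0x3000 -> queue %u\n", queue_from_mmio(0x3000)); /* 3 */
    printf("PIO  write of 3      -> queue %u\n", queue_from_pio(3));       /* 3 */
    return 0;
}
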
+
static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr,
unsigned size)
{
@@ -1373,6 +1531,16 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy)
},
.endianness = DEVICE_LITTLE_ENDIAN,
};
+ static const MemoryRegionOps notify_pio_ops = {
+ .read = virtio_pci_notify_read,
+ .write = virtio_pci_notify_write_pio,
+ .impl = {
+ .min_access_size = 1,
+ .max_access_size = 4,
+ },
+ .endianness = DEVICE_LITTLE_ENDIAN,
+ };
+
memory_region_init_io(&proxy->common.mr, OBJECT(proxy),
&common_ops,
@@ -1397,30 +1565,60 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy)
virtio_bus_get_device(&proxy->bus),
"virtio-pci-notify",
proxy->notify.size);
+
+ memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy),
+ &notify_pio_ops,
+ virtio_bus_get_device(&proxy->bus),
+ "virtio-pci-notify-pio",
+ proxy->notify.size);
}
static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
VirtIOPCIRegion *region,
- struct virtio_pci_cap *cap)
+ struct virtio_pci_cap *cap,
+ MemoryRegion *mr,
+ uint8_t bar)
{
- memory_region_add_subregion(&proxy->modern_bar,
- region->offset,
- &region->mr);
+ memory_region_add_subregion(mr, region->offset, &region->mr);
cap->cfg_type = region->type;
- cap->bar = proxy->modern_mem_bar;
+ cap->bar = bar;
cap->offset = cpu_to_le32(region->offset);
cap->length = cpu_to_le32(region->size);
virtio_pci_add_mem_cap(proxy, cap);
+
+}
+
+static void virtio_pci_modern_mem_region_map(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region,
+ struct virtio_pci_cap *cap)
+{
+ virtio_pci_modern_region_map(proxy, region, cap,
+ &proxy->modern_bar, proxy->modern_mem_bar);
}
-static void virtio_pci_modern_region_unmap(VirtIOPCIProxy *proxy,
- VirtIOPCIRegion *region)
+static void virtio_pci_modern_io_region_map(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region,
+ struct virtio_pci_cap *cap)
+{
+ virtio_pci_modern_region_map(proxy, region, cap,
+ &proxy->io_bar, proxy->modern_io_bar);
+}
+
+static void virtio_pci_modern_mem_region_unmap(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region)
{
memory_region_del_subregion(&proxy->modern_bar,
&region->mr);
}
+static void virtio_pci_modern_io_region_unmap(VirtIOPCIProxy *proxy,
+ VirtIOPCIRegion *region)
+{
+ memory_region_del_subregion(&proxy->io_bar,
+ &region->mr);
+}
+
/* This is called by virtio-bus just after the device is plugged. */
static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
{
@@ -1428,6 +1626,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
VirtioBusState *bus = &proxy->bus;
bool legacy = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_LEGACY);
bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+ bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
uint8_t *config;
uint32_t size;
VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
@@ -1466,16 +1665,31 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
.cap.cap_len = sizeof cfg,
.cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG,
};
- struct virtio_pci_cfg_cap *cfg_mask;
+ struct virtio_pci_notify_cap notify_pio = {
+ .cap.cap_len = sizeof notify,
+ .notify_off_multiplier = cpu_to_le32(0x0),
+ };
- /* TODO: add io access for speed */
+ struct virtio_pci_cfg_cap *cfg_mask;
virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1);
virtio_pci_modern_regions_init(proxy);
- virtio_pci_modern_region_map(proxy, &proxy->common, &cap);
- virtio_pci_modern_region_map(proxy, &proxy->isr, &cap);
- virtio_pci_modern_region_map(proxy, &proxy->device, &cap);
- virtio_pci_modern_region_map(proxy, &proxy->notify, &notify.cap);
+
+ virtio_pci_modern_mem_region_map(proxy, &proxy->common, &cap);
+ virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap);
+ virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap);
+ virtio_pci_modern_mem_region_map(proxy, &proxy->notify, &notify.cap);
+
+ if (modern_pio) {
+ memory_region_init(&proxy->io_bar, OBJECT(proxy),
+ "virtio-pci-io", 0x4);
+
+ pci_register_bar(&proxy->pci_dev, proxy->modern_io_bar,
+ PCI_BASE_ADDRESS_SPACE_IO, &proxy->io_bar);
+
+ virtio_pci_modern_io_region_map(proxy, &proxy->notify_pio,
+ &notify_pio.cap);
+ }
pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar,
PCI_BASE_ADDRESS_SPACE_MEMORY |
@@ -1491,12 +1705,17 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
pci_set_long(cfg_mask->pci_cfg_data, ~0x0);
}
- if (proxy->nvectors &&
- msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors,
- proxy->msix_bar)) {
- error_report("unable to init msix vectors to %" PRIu32,
- proxy->nvectors);
- proxy->nvectors = 0;
+ if (proxy->nvectors) {
+ int err = msix_init_exclusive_bar(&proxy->pci_dev, proxy->nvectors,
+ proxy->msix_bar);
+ if (err) {
+            /* Notice when a system that supports MSI-X can't initialize it. */
+ if (err != -ENOTSUP) {
+ error_report("unable to init msix vectors to %" PRIu32,
+ proxy->nvectors);
+ }
+ proxy->nvectors = 0;
+ }
}
proxy->pci_dev.config_write = virtio_write_config;
@@ -1505,9 +1724,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
if (legacy) {
size = VIRTIO_PCI_REGION_SIZE(&proxy->pci_dev)
+ virtio_bus_get_vdev_config_len(bus);
- if (size & (size - 1)) {
- size = 1 << qemu_fls(size);
- }
+ size = pow2ceil(size);
memory_region_init_io(&proxy->bar, OBJECT(proxy),
&virtio_pci_config_ops,
@@ -1528,14 +1745,18 @@ static void virtio_pci_device_unplugged(DeviceState *d)
{
VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+ bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
virtio_pci_stop_ioeventfd(proxy);
if (modern) {
- virtio_pci_modern_region_unmap(proxy, &proxy->common);
- virtio_pci_modern_region_unmap(proxy, &proxy->isr);
- virtio_pci_modern_region_unmap(proxy, &proxy->device);
- virtio_pci_modern_region_unmap(proxy, &proxy->notify);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->common);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->device);
+ virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify);
+ if (modern_pio) {
+ virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio);
+ }
}
}
@@ -1555,6 +1776,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
*/
proxy->legacy_io_bar = 0;
proxy->msix_bar = 1;
+ proxy->modern_io_bar = 2;
proxy->modern_mem_bar = 4;
proxy->common.offset = 0x0;
@@ -1574,6 +1796,10 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
QEMU_VIRTIO_PCI_QUEUE_MEM_MULT * VIRTIO_QUEUE_MAX;
proxy->notify.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
+ proxy->notify_pio.offset = 0x0;
+ proxy->notify_pio.size = 0x4;
+ proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
+
/* subclasses can enforce modern, so do this unconditionally */
memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
2 * QEMU_VIRTIO_PCI_QUEUE_MEM_MULT *
@@ -1588,6 +1814,29 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
address_space_init(&proxy->modern_as, &proxy->modern_cfg, "virtio-pci-cfg-as");
+ if (pci_is_express(pci_dev) && pci_bus_is_express(pci_dev->bus) &&
+ !pci_bus_is_root(pci_dev->bus)) {
+ int pos;
+
+ pos = pcie_endpoint_cap_init(pci_dev, 0);
+ assert(pos > 0);
+
+ pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF);
+ assert(pos > 0);
+
+ /*
+ * Indicates that this function complies with revision 1.2 of the
+ * PCI Power Management Interface Specification.
+ */
+ pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3);
+ } else {
+ /*
+ * make future invocations of pci_is_express() return false
+ * and pci_config_size() return PCI_CONFIG_SPACE_SIZE.
+ */
+ pci_dev->cap_present &= ~QEMU_PCI_CAP_EXPRESS;
+ }
+
virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy);
if (k->realize) {
k->realize(proxy, errp);
@@ -1606,9 +1855,15 @@ static void virtio_pci_reset(DeviceState *qdev)
{
VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev);
VirtioBusState *bus = VIRTIO_BUS(&proxy->bus);
+ int i;
+
virtio_pci_stop_ioeventfd(proxy);
virtio_bus_reset(bus);
msix_unuse_all_vectors(&proxy->pci_dev);
+
+ for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+ proxy->vqs[i].enabled = 0;
+ }
}
static Property virtio_pci_properties[] = {
@@ -1618,13 +1873,34 @@ static Property virtio_pci_properties[] = {
VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT, false),
DEFINE_PROP_BIT("disable-modern", VirtIOPCIProxy, flags,
VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT, true),
+ DEFINE_PROP_BIT("migrate-extra", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, true),
+ DEFINE_PROP_BIT("modern-pio-notify", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, false),
+ DEFINE_PROP_BIT("x-disable-pcie", VirtIOPCIProxy, flags,
+ VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, false),
DEFINE_PROP_END_OF_LIST(),
};
+static void virtio_pci_dc_realize(DeviceState *qdev, Error **errp)
+{
+ VirtioPCIClass *vpciklass = VIRTIO_PCI_GET_CLASS(qdev);
+ VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev);
+ PCIDevice *pci_dev = &proxy->pci_dev;
+
+ if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE) &&
+ !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN)) {
+ pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
+ }
+
+ vpciklass->parent_dc_realize(qdev, errp);
+}
+
static void virtio_pci_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+ VirtioPCIClass *vpciklass = VIRTIO_PCI_CLASS(klass);
dc->props = virtio_pci_properties;
k->realize = virtio_pci_realize;
@@ -1632,6 +1908,8 @@ static void virtio_pci_class_init(ObjectClass *klass, void *data)
k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
k->revision = VIRTIO_PCI_ABI_VERSION;
k->class_id = PCI_CLASS_OTHERS;
+ vpciklass->parent_dc_realize = dc->realize;
+ dc->realize = virtio_pci_dc_realize;
dc->reset = virtio_pci_reset;
}
@@ -2009,10 +2287,6 @@ static const TypeInfo virtio_net_pci_info = {
/* virtio-rng-pci */
-static Property virtio_rng_pci_properties[] = {
- DEFINE_PROP_END_OF_LIST(),
-};
-
static void virtio_rng_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp)
{
VirtIORngPCI *vrng = VIRTIO_RNG_PCI(vpci_dev);
@@ -2039,7 +2313,6 @@ static void virtio_rng_pci_class_init(ObjectClass *klass, void *data)
k->realize = virtio_rng_pci_realize;
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
- dc->props = virtio_rng_pci_properties;
pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
pcidev_k->device_id = PCI_DEVICE_ID_VIRTIO_RNG;
@@ -2136,14 +2409,6 @@ static void virtio_tablet_initfn(Object *obj)
TYPE_VIRTIO_TABLET);
}
-static void virtio_host_initfn(Object *obj)
-{
- VirtIOInputHostPCI *dev = VIRTIO_INPUT_HOST_PCI(obj);
-
- virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
- TYPE_VIRTIO_INPUT_HOST);
-}
-
static const TypeInfo virtio_input_pci_info = {
.name = TYPE_VIRTIO_INPUT_PCI,
.parent = TYPE_VIRTIO_PCI,
@@ -2182,12 +2447,22 @@ static const TypeInfo virtio_tablet_pci_info = {
.instance_init = virtio_tablet_initfn,
};
+#ifdef CONFIG_LINUX
+static void virtio_host_initfn(Object *obj)
+{
+ VirtIOInputHostPCI *dev = VIRTIO_INPUT_HOST_PCI(obj);
+
+ virtio_instance_init_common(obj, &dev->vdev, sizeof(dev->vdev),
+ TYPE_VIRTIO_INPUT_HOST);
+}
+
static const TypeInfo virtio_host_pci_info = {
.name = TYPE_VIRTIO_INPUT_HOST_PCI,
.parent = TYPE_VIRTIO_INPUT_PCI,
.instance_size = sizeof(VirtIOInputHostPCI),
.instance_init = virtio_host_initfn,
};
+#endif
/* virtio-pci-bus */
@@ -2211,6 +2486,9 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
k->load_config = virtio_pci_load_config;
k->save_queue = virtio_pci_save_queue;
k->load_queue = virtio_pci_load_queue;
+ k->save_extra_state = virtio_pci_save_extra_state;
+ k->load_extra_state = virtio_pci_load_extra_state;
+ k->has_extra_state = virtio_pci_has_extra_state;
k->query_guest_notifiers = virtio_pci_query_guest_notifiers;
k->set_host_notifier = virtio_pci_set_host_notifier;
k->set_guest_notifiers = virtio_pci_set_guest_notifiers;
@@ -2235,7 +2513,9 @@ static void virtio_pci_register_types(void)
type_register_static(&virtio_keyboard_pci_info);
type_register_static(&virtio_mouse_pci_info);
type_register_static(&virtio_tablet_pci_info);
+#ifdef CONFIG_LINUX
type_register_static(&virtio_host_pci_info);
+#endif
type_register_static(&virtio_pci_bus_info);
type_register_static(&virtio_pci_info);
#ifdef CONFIG_VIRTFS
diff --git a/qemu/hw/virtio/virtio-pci.h b/qemu/hw/virtio/virtio-pci.h
index b6c442f52..e4548c2f9 100644
--- a/qemu/hw/virtio/virtio-pci.h
+++ b/qemu/hw/virtio/virtio-pci.h
@@ -23,7 +23,6 @@
#include "hw/virtio/virtio-scsi.h"
#include "hw/virtio/virtio-balloon.h"
#include "hw/virtio/virtio-bus.h"
-#include "hw/virtio/virtio-9p.h"
#include "hw/virtio/virtio-input.h"
#include "hw/virtio/virtio-gpu.h"
#ifdef CONFIG_VIRTFS
@@ -59,21 +58,35 @@ typedef struct VirtioBusClass VirtioPCIBusClass;
#define VIRTIO_PCI_BUS_CLASS(klass) \
OBJECT_CLASS_CHECK(VirtioPCIBusClass, klass, TYPE_VIRTIO_PCI_BUS)
+enum {
+ VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT,
+ VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT,
+ VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT,
+ VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT,
+ VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT,
+ VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT,
+ VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT,
+};
+
/* Need to activate work-arounds for buggy guests at vmstate load. */
-#define VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT 0
#define VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION \
(1 << VIRTIO_PCI_FLAG_BUS_MASTER_BUG_MIGRATION_BIT)
/* Performance improves when virtqueue kick processing is decoupled from the
* vcpu thread using ioeventfd for some devices. */
-#define VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT 1
#define VIRTIO_PCI_FLAG_USE_IOEVENTFD (1 << VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT)
/* virtio version flags */
-#define VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT 2
-#define VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT 3
#define VIRTIO_PCI_FLAG_DISABLE_LEGACY (1 << VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT)
#define VIRTIO_PCI_FLAG_DISABLE_MODERN (1 << VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT)
+#define VIRTIO_PCI_FLAG_DISABLE_PCIE (1 << VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT)
+
+/* migrate extra state */
+#define VIRTIO_PCI_FLAG_MIGRATE_EXTRA (1 << VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT)
+
+/* have PIO notification for the modern device? */
+#define VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY \
+ (1 << VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT)
typedef struct {
MSIMessage msg;
@@ -94,6 +107,7 @@ typedef struct {
typedef struct VirtioPCIClass {
PCIDeviceClass parent_class;
+ DeviceRealize parent_dc_realize;
void (*realize)(VirtIOPCIProxy *vpci_dev, Error **errp);
} VirtioPCIClass;
@@ -104,6 +118,14 @@ typedef struct VirtIOPCIRegion {
uint32_t type;
} VirtIOPCIRegion;
+typedef struct VirtIOPCIQueue {
+ uint16_t num;
+ bool enabled;
+ uint32_t desc[2];
+ uint32_t avail[2];
+ uint32_t used[2];
+} VirtIOPCIQueue;
+
struct VirtIOPCIProxy {
PCIDevice pci_dev;
MemoryRegion bar;
@@ -111,11 +133,14 @@ struct VirtIOPCIProxy {
VirtIOPCIRegion isr;
VirtIOPCIRegion device;
VirtIOPCIRegion notify;
+ VirtIOPCIRegion notify_pio;
MemoryRegion modern_bar;
+ MemoryRegion io_bar;
MemoryRegion modern_cfg;
AddressSpace modern_as;
uint32_t legacy_io_bar;
uint32_t msix_bar;
+ uint32_t modern_io_bar;
uint32_t modern_mem_bar;
int config_cap;
uint32_t flags;
@@ -124,13 +149,7 @@ struct VirtIOPCIProxy {
uint32_t dfselect;
uint32_t gfselect;
uint32_t guest_features[2];
- struct {
- uint16_t num;
- bool enabled;
- uint32_t desc[2];
- uint32_t avail[2];
- uint32_t used[2];
- } vqs[VIRTIO_QUEUE_MAX];
+ VirtIOPCIQueue vqs[VIRTIO_QUEUE_MAX];
bool ioeventfd_disabled;
bool ioeventfd_started;
@@ -226,7 +245,7 @@ struct VirtIONetPCI {
typedef struct V9fsPCIState {
VirtIOPCIProxy parent_obj;
- V9fsState vdev;
+ V9fsVirtioState vdev;
} V9fsPCIState;
#endif
@@ -267,6 +286,8 @@ struct VirtIOInputHIDPCI {
VirtIOInputHID vdev;
};
+#ifdef CONFIG_LINUX
+
#define TYPE_VIRTIO_INPUT_HOST_PCI "virtio-input-host-pci"
#define VIRTIO_INPUT_HOST_PCI(obj) \
OBJECT_CHECK(VirtIOInputHostPCI, (obj), TYPE_VIRTIO_INPUT_HOST_PCI)
@@ -276,6 +297,8 @@ struct VirtIOInputHostPCI {
VirtIOInputHost vdev;
};
+#endif
+
/*
* virtio-gpu-pci: This extends VirtioPCIProxy.
*/
diff --git a/qemu/hw/virtio/virtio-rng.c b/qemu/hw/virtio/virtio-rng.c
index 97d154191..6b991a764 100644
--- a/qemu/hw/virtio/virtio-rng.c
+++ b/qemu/hw/virtio/virtio-rng.c
@@ -9,6 +9,8 @@
* top-level directory.
*/
+#include "qemu/osdep.h"
+#include "qapi/error.h"
#include "qemu/iov.h"
#include "hw/qdev.h"
#include "hw/virtio/virtio.h"
@@ -43,7 +45,7 @@ static void chr_read(void *opaque, const void *buf, size_t size)
{
VirtIORNG *vrng = opaque;
VirtIODevice *vdev = VIRTIO_DEVICE(vrng);
- VirtQueueElement elem;
+ VirtQueueElement *elem;
size_t len;
int offset;
@@ -55,17 +57,26 @@ static void chr_read(void *opaque, const void *buf, size_t size)
offset = 0;
while (offset < size) {
- if (!virtqueue_pop(vrng->vq, &elem)) {
+ elem = virtqueue_pop(vrng->vq, sizeof(VirtQueueElement));
+ if (!elem) {
break;
}
- len = iov_from_buf(elem.in_sg, elem.in_num,
+ len = iov_from_buf(elem->in_sg, elem->in_num,
0, buf + offset, size - offset);
offset += len;
- virtqueue_push(vrng->vq, &elem, len);
+ virtqueue_push(vrng->vq, elem, len);
trace_virtio_rng_pushed(vrng, len);
+ g_free(elem);
}
virtio_notify(vdev, vrng->vq);
+
+ if (!virtio_queue_empty(vrng->vq)) {
+ /* If we didn't drain the queue, call virtio_rng_process
+ * to take care of asking for more data as appropriate.
+ */
+ virtio_rng_process(vrng);
+ }
}
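
With the new API, virtqueue_pop() heap-allocates the element and the consumer owns it, so chr_read() above must g_free() after each push. A reduced stand-alone sketch of that ownership pattern (pop_one() is a hypothetical stand-in for virtqueue_pop()):

#include <glib.h>
#include <stdio.h>

typedef struct Elem { int id; } Elem;

static int remaining = 2;                 /* pretend two buffers are queued */

static Elem *pop_one(void)                /* hypothetical virtqueue_pop stand-in */
{
    Elem *e;

    if (remaining == 0) {
        return NULL;                      /* ring empty */
    }
    e = g_new0(Elem, 1);                  /* heap-allocated; caller owns it */
    e->id = remaining--;
    return e;
}

int main(void)
{
    Elem *e;

    while ((e = pop_one()) != NULL) {     /* drain, like the loop above */
        printf("pushed element %d back\n", e->id);  /* like virtqueue_push */
        g_free(e);                        /* must free, exactly as chr_read does */
    }
    return 0;
}
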
static void virtio_rng_process(VirtIORNG *vrng)
diff --git a/qemu/hw/virtio/virtio.c b/qemu/hw/virtio/virtio.c
index 788b556a7..30ede3d1c 100644
--- a/qemu/hw/virtio/virtio.c
+++ b/qemu/hw/virtio/virtio.c
@@ -11,8 +11,10 @@
*
*/
-#include <inttypes.h>
-
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "cpu.h"
#include "trace.h"
#include "exec/address-spaces.h"
#include "qemu/error-report.h"
@@ -60,6 +62,7 @@ typedef struct VRingUsed
typedef struct VRing
{
unsigned int num;
+ unsigned int num_default;
unsigned int align;
hwaddr desc;
hwaddr avail;
@@ -69,7 +72,15 @@ typedef struct VRing
struct VirtQueue
{
VRing vring;
+
+ /* Next head to pop */
uint16_t last_avail_idx;
+
+ /* Last avail_idx read from VQ. */
+ uint16_t shadow_avail_idx;
+
+ uint16_t used_idx;
+
/* Last used index value we have signalled on */
uint16_t signalled_used;
@@ -85,6 +96,7 @@ struct VirtQueue
uint16_t vector;
void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+ void (*handle_aio_output)(VirtIODevice *vdev, VirtQueue *vq);
VirtIODevice *vdev;
EventNotifier guest_notifier;
EventNotifier host_notifier;
@@ -106,35 +118,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n)
vring->align);
}
-static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
- int i)
-{
- hwaddr pa;
- pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
- return virtio_ldq_phys(vdev, pa);
-}
-
-static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i)
+static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
+ hwaddr desc_pa, int i)
{
- hwaddr pa;
- pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
- return virtio_ldl_phys(vdev, pa);
-}
-
-static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
- int i)
-{
- hwaddr pa;
- pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
- return virtio_lduw_phys(vdev, pa);
-}
-
-static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
- int i)
-{
- hwaddr pa;
- pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
- return virtio_lduw_phys(vdev, pa);
+ address_space_read(&address_space_memory, desc_pa + i * sizeof(VRingDesc),
+ MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc));
+ virtio_tswap64s(vdev, &desc->addr);
+ virtio_tswap32s(vdev, &desc->len);
+ virtio_tswap16s(vdev, &desc->flags);
+ virtio_tswap16s(vdev, &desc->next);
}
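
vring_desc_read() now pulls the whole 16-byte descriptor in one address_space_read() and byte-swaps the fields in place, replacing the four per-field loads removed above. The layout is the standard split-ring descriptor from the virtio spec; a stand-alone sketch:

#include <stdint.h>
#include <stdio.h>

typedef struct VRingDesc {
    uint64_t addr;    /* guest-physical buffer address */
    uint32_t len;     /* buffer length */
    uint16_t flags;   /* NEXT / WRITE / INDIRECT */
    uint16_t next;    /* chaining index, valid when NEXT is set */
} VRingDesc;

int main(void)
{
    /* One 16-byte read fetches all four fields at once. */
    printf("sizeof(VRingDesc) = %zu\n", sizeof(VRingDesc));   /* 16 */
    return 0;
}
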
static inline uint16_t vring_avail_flags(VirtQueue *vq)
@@ -148,7 +140,8 @@ static inline uint16_t vring_avail_idx(VirtQueue *vq)
{
hwaddr pa;
pa = vq->vring.avail + offsetof(VRingAvail, idx);
- return virtio_lduw_phys(vq->vdev, pa);
+ vq->shadow_avail_idx = virtio_lduw_phys(vq->vdev, pa);
+ return vq->shadow_avail_idx;
}
static inline uint16_t vring_avail_ring(VirtQueue *vq, int i)
@@ -163,18 +156,15 @@ static inline uint16_t vring_get_used_event(VirtQueue *vq)
return vring_avail_ring(vq, vq->vring.num);
}
-static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val)
-{
- hwaddr pa;
- pa = vq->vring.used + offsetof(VRingUsed, ring[i].id);
- virtio_stl_phys(vq->vdev, pa, val);
-}
-
-static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val)
+static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem,
+ int i)
{
hwaddr pa;
- pa = vq->vring.used + offsetof(VRingUsed, ring[i].len);
- virtio_stl_phys(vq->vdev, pa, val);
+ virtio_tswap32s(vq->vdev, &uelem->id);
+ virtio_tswap32s(vq->vdev, &uelem->len);
+ pa = vq->vring.used + offsetof(VRingUsed, ring[i]);
+ address_space_write(&address_space_memory, pa, MEMTXATTRS_UNSPECIFIED,
+ (void *)uelem, sizeof(VRingUsedElem));
}
static uint16_t vring_used_idx(VirtQueue *vq)
@@ -189,6 +179,7 @@ static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val)
hwaddr pa;
pa = vq->vring.used + offsetof(VRingUsed, idx);
virtio_stw_phys(vq->vdev, pa, val);
+ vq->used_idx = val;
}
static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
@@ -220,7 +211,7 @@ static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val)
void virtio_queue_set_notification(VirtQueue *vq, int enable)
{
vq->notification = enable;
- if (virtio_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
vring_set_avail_event(vq, vring_avail_idx(vq));
} else if (enable) {
vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY);
@@ -238,19 +229,23 @@ int virtio_queue_ready(VirtQueue *vq)
return vq->vring.avail != 0;
}
+/* Fetch avail_idx from VQ memory only when we really need to know if
+ * the guest has added some buffers. */
int virtio_queue_empty(VirtQueue *vq)
{
+ if (vq->shadow_avail_idx != vq->last_avail_idx) {
+ return 0;
+ }
+
return vring_avail_idx(vq) == vq->last_avail_idx;
}
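
virtio_queue_empty() first consults shadow_avail_idx, the copy cached by the most recent vring_avail_idx() call, and touches guest memory only when the cache says the ring may be empty. A stand-alone sketch of that fast path (slow_read() stands in for virtio_lduw_phys()):

#include <stdint.h>
#include <stdio.h>

static uint16_t guest_avail_idx = 5;     /* pretend guest memory */
static unsigned slow_reads;

static uint16_t slow_read(void) { slow_reads++; return guest_avail_idx; }

static int queue_empty(uint16_t *shadow, uint16_t last)
{
    if (*shadow != last) {
        return 0;                        /* fast path: no memory access */
    }
    *shadow = slow_read();               /* refresh the cached index */
    return *shadow == last;
}

int main(void)
{
    uint16_t shadow = 5, last = 3;

    printf("empty=%d slow_reads=%u\n", queue_empty(&shadow, last), slow_reads);
    last = 5;
    printf("empty=%d slow_reads=%u\n", queue_empty(&shadow, last), slow_reads);
    return 0;
}
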
-void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
- unsigned int len, unsigned int idx)
+static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len)
{
unsigned int offset;
int i;
- trace_virtqueue_fill(vq, elem, len, idx);
-
offset = 0;
for (i = 0; i < elem->in_num; i++) {
size_t size = MIN(len - offset, elem->in_sg[i].iov_len);
@@ -266,12 +261,29 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
cpu_physical_memory_unmap(elem->out_sg[i].iov_base,
elem->out_sg[i].iov_len,
0, elem->out_sg[i].iov_len);
+}
- idx = (idx + vring_used_idx(vq)) % vq->vring.num;
+void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len)
+{
+ vq->last_avail_idx--;
+ virtqueue_unmap_sg(vq, elem, len);
+}
- /* Get a pointer to the next entry in the used ring. */
- vring_used_ring_id(vq, idx, elem->index);
- vring_used_ring_len(vq, idx, len);
+void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
+ unsigned int len, unsigned int idx)
+{
+ VRingUsedElem uelem;
+
+ trace_virtqueue_fill(vq, elem, len, idx);
+
+ virtqueue_unmap_sg(vq, elem, len);
+
+ idx = (idx + vq->used_idx) % vq->vring.num;
+
+ uelem.id = elem->index;
+ uelem.len = len;
+ vring_used_write(vq, &uelem, idx);
}
void virtqueue_flush(VirtQueue *vq, unsigned int count)
@@ -280,7 +292,7 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count)
/* Make sure buffer is written before we update index. */
smp_wmb();
trace_virtqueue_flush(vq, count);
- old = vring_used_idx(vq);
+ old = vq->used_idx;
new = old + count;
vring_used_idx_set(vq, new);
vq->inuse -= count;
@@ -302,7 +314,7 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
/* Check it isn't doing very strange things with descriptor numbers. */
if (num_heads > vq->vring.num) {
error_report("Guest moved used index from %u to %u",
- idx, vring_avail_idx(vq));
+ idx, vq->shadow_avail_idx);
exit(1);
}
/* On success, callers read a descriptor at vq->last_avail_idx.
@@ -331,18 +343,18 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
return head;
}
-static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
- unsigned int i, unsigned int max)
+static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
+ hwaddr desc_pa, unsigned int max)
{
unsigned int next;
/* If this descriptor says it doesn't chain, we're done. */
- if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
+ if (!(desc->flags & VRING_DESC_F_NEXT)) {
return max;
}
/* Check they're not leading us off end of descriptors. */
- next = vring_desc_next(vdev, desc_pa, i);
+ next = desc->next;
/* Make sure compiler knows to grab that: we don't want it changing! */
smp_wmb();
@@ -351,6 +363,7 @@ static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
exit(1);
}
+ vring_desc_read(vdev, desc, desc_pa, next);
return next;
}
@@ -367,6 +380,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
while (virtqueue_num_heads(vq, idx)) {
VirtIODevice *vdev = vq->vdev;
unsigned int max, num_bufs, indirect = 0;
+ VRingDesc desc;
hwaddr desc_pa;
int i;
@@ -374,9 +388,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
num_bufs = total_bufs;
i = virtqueue_get_head(vq, idx++);
desc_pa = vq->vring.desc;
+ vring_desc_read(vdev, &desc, desc_pa, i);
- if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
- if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ if (desc.len % sizeof(VRingDesc)) {
error_report("Invalid size for indirect buffer table");
exit(1);
}
@@ -389,9 +404,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
/* loop over the indirect descriptor table */
indirect = 1;
- max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
- desc_pa = vring_desc_addr(vdev, desc_pa, i);
+ max = desc.len / sizeof(VRingDesc);
+ desc_pa = desc.addr;
num_bufs = i = 0;
+ vring_desc_read(vdev, &desc, desc_pa, i);
}
do {
@@ -401,15 +417,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
exit(1);
}
- if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
- in_total += vring_desc_len(vdev, desc_pa, i);
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ in_total += desc.len;
} else {
- out_total += vring_desc_len(vdev, desc_pa, i);
+ out_total += desc.len;
}
if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
goto done;
}
- } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
+ } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
if (!indirect)
total_bufs = num_bufs;
@@ -434,98 +450,256 @@ int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes,
return in_bytes <= in_total && out_bytes <= out_total;
}
-void virtqueue_map_sg(struct iovec *sg, hwaddr *addr,
- size_t num_sg, int is_write)
+static void virtqueue_map_desc(unsigned int *p_num_sg, hwaddr *addr, struct iovec *iov,
+ unsigned int max_num_sg, bool is_write,
+ hwaddr pa, size_t sz)
+{
+ unsigned num_sg = *p_num_sg;
+ assert(num_sg <= max_num_sg);
+
+ while (sz) {
+ hwaddr len = sz;
+
+ if (num_sg == max_num_sg) {
+ error_report("virtio: too many write descriptors in indirect table");
+ exit(1);
+ }
+
+ iov[num_sg].iov_base = cpu_physical_memory_map(pa, &len, is_write);
+ iov[num_sg].iov_len = len;
+ addr[num_sg] = pa;
+
+ sz -= len;
+ pa += len;
+ num_sg++;
+ }
+ *p_num_sg = num_sg;
+}
+
+static void virtqueue_map_iovec(struct iovec *sg, hwaddr *addr,
+ unsigned int *num_sg, unsigned int max_size,
+ int is_write)
{
unsigned int i;
hwaddr len;
- if (num_sg > VIRTQUEUE_MAX_SIZE) {
- error_report("virtio: map attempt out of bounds: %zd > %d",
- num_sg, VIRTQUEUE_MAX_SIZE);
- exit(1);
- }
+    /* Note: this function MUST validate its input; some callers
+     * are passing in num_sg values received over the network.
+     */
+ /* TODO: teach all callers that this can fail, and return failure instead
+ * of asserting here.
+ * When we do, we might be able to re-enable NDEBUG below.
+ */
+#ifdef NDEBUG
+#error building with NDEBUG is not supported
+#endif
+ assert(*num_sg <= max_size);
- for (i = 0; i < num_sg; i++) {
+ for (i = 0; i < *num_sg; i++) {
len = sg[i].iov_len;
sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write);
- if (sg[i].iov_base == NULL || len != sg[i].iov_len) {
+ if (!sg[i].iov_base) {
error_report("virtio: error trying to map MMIO memory");
exit(1);
}
+ if (len != sg[i].iov_len) {
+ error_report("virtio: unexpected memory split");
+ exit(1);
+ }
}
}
-int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
+void virtqueue_map(VirtQueueElement *elem)
+{
+ virtqueue_map_iovec(elem->in_sg, elem->in_addr, &elem->in_num,
+ VIRTQUEUE_MAX_SIZE, 1);
+ virtqueue_map_iovec(elem->out_sg, elem->out_addr, &elem->out_num,
+ VIRTQUEUE_MAX_SIZE, 0);
+}
+
+void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num)
+{
+ VirtQueueElement *elem;
+ size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0]));
+ size_t out_addr_ofs = in_addr_ofs + in_num * sizeof(elem->in_addr[0]);
+ size_t out_addr_end = out_addr_ofs + out_num * sizeof(elem->out_addr[0]);
+ size_t in_sg_ofs = QEMU_ALIGN_UP(out_addr_end, __alignof__(elem->in_sg[0]));
+ size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]);
+ size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]);
+
+ assert(sz >= sizeof(VirtQueueElement));
+ elem = g_malloc(out_sg_end);
+ elem->out_num = out_num;
+ elem->in_num = in_num;
+ elem->in_addr = (void *)elem + in_addr_ofs;
+ elem->out_addr = (void *)elem + out_addr_ofs;
+ elem->in_sg = (void *)elem + in_sg_ofs;
+ elem->out_sg = (void *)elem + out_sg_ofs;
+ return elem;
+}
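
virtqueue_alloc_element() packs the element header and its four variable-length arrays into one g_malloc(), rounding each offset up to the member's alignment. A stand-alone sketch of the offset arithmetic (all sizes are assumptions for a 64-bit host; sizeof(VirtQueueElement) is taken as 48 purely for illustration):

#include <stddef.h>
#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))   /* like QEMU_ALIGN_UP */

int main(void)
{
    size_t sz = 48, in_num = 2, out_num = 1;   /* hypothetical inputs */
    size_t addr_sz = 8, iov_sz = 16;           /* hwaddr, struct iovec on 64-bit */

    size_t in_addr_ofs  = ALIGN_UP(sz, addr_sz);            /* 48 */
    size_t out_addr_ofs = in_addr_ofs + in_num * addr_sz;   /* 64 */
    size_t out_addr_end = out_addr_ofs + out_num * addr_sz; /* 72 */
    size_t in_sg_ofs    = ALIGN_UP(out_addr_end, 8);        /* 72 */
    size_t out_sg_ofs   = in_sg_ofs + in_num * iov_sz;      /* 104 */
    size_t out_sg_end   = out_sg_ofs + out_num * iov_sz;    /* 120 */

    printf("one allocation of %zu bytes\n", out_sg_end);
    return 0;
}
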
+
+void *virtqueue_pop(VirtQueue *vq, size_t sz)
{
unsigned int i, head, max;
hwaddr desc_pa = vq->vring.desc;
VirtIODevice *vdev = vq->vdev;
+ VirtQueueElement *elem;
+ unsigned out_num, in_num;
+ hwaddr addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec iov[VIRTQUEUE_MAX_SIZE];
+ VRingDesc desc;
- if (!virtqueue_num_heads(vq, vq->last_avail_idx))
- return 0;
+ if (virtio_queue_empty(vq)) {
+ return NULL;
+ }
+    /* Needed after virtio_queue_empty(); see the comment in
+     * virtqueue_num_heads(). */
+ smp_rmb();
/* When we start there are none of either input nor output. */
- elem->out_num = elem->in_num = 0;
+ out_num = in_num = 0;
max = vq->vring.num;
i = head = virtqueue_get_head(vq, vq->last_avail_idx++);
- if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
vring_set_avail_event(vq, vq->last_avail_idx);
}
- if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
- if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
+ vring_desc_read(vdev, &desc, desc_pa, i);
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ if (desc.len % sizeof(VRingDesc)) {
error_report("Invalid size for indirect buffer table");
exit(1);
}
/* loop over the indirect descriptor table */
- max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
- desc_pa = vring_desc_addr(vdev, desc_pa, i);
+ max = desc.len / sizeof(VRingDesc);
+ desc_pa = desc.addr;
i = 0;
+ vring_desc_read(vdev, &desc, desc_pa, i);
}
/* Collect all the descriptors */
do {
- struct iovec *sg;
-
- if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
- if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) {
- error_report("Too many write descriptors in indirect table");
- exit(1);
- }
- elem->in_addr[elem->in_num] = vring_desc_addr(vdev, desc_pa, i);
- sg = &elem->in_sg[elem->in_num++];
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ virtqueue_map_desc(&in_num, addr + out_num, iov + out_num,
+ VIRTQUEUE_MAX_SIZE - out_num, true, desc.addr, desc.len);
} else {
- if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) {
- error_report("Too many read descriptors in indirect table");
+ if (in_num) {
+ error_report("Incorrect order for descriptors");
exit(1);
}
- elem->out_addr[elem->out_num] = vring_desc_addr(vdev, desc_pa, i);
- sg = &elem->out_sg[elem->out_num++];
+ virtqueue_map_desc(&out_num, addr, iov,
+ VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len);
}
- sg->iov_len = vring_desc_len(vdev, desc_pa, i);
-
/* If we've got too many, that implies a descriptor loop. */
- if ((elem->in_num + elem->out_num) > max) {
+ if ((in_num + out_num) > max) {
error_report("Looped descriptor");
exit(1);
}
- } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
-
- /* Now map what we have collected */
- virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
- virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0);
+ } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
+ /* Now copy what we have collected and mapped */
+ elem = virtqueue_alloc_element(sz, out_num, in_num);
elem->index = head;
+ for (i = 0; i < out_num; i++) {
+ elem->out_addr[i] = addr[i];
+ elem->out_sg[i] = iov[i];
+ }
+ for (i = 0; i < in_num; i++) {
+ elem->in_addr[i] = addr[out_num + i];
+ elem->in_sg[i] = iov[out_num + i];
+ }
vq->inuse++;
trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
- return elem->in_num + elem->out_num;
+ return elem;
+}
+
+/* Reading and writing a structure directly to QEMUFile is *awful*, but
+ * it is what QEMU has always done by mistake. We can change it sooner
+ * or later by bumping the version number of the affected vm states.
+ * In the meantime, since the in-memory layout of VirtQueueElement
+ * has changed, we need to marshal to and from the layout that was
+ * used before the change.
+ */
+typedef struct VirtQueueElementOld {
+ unsigned int index;
+ unsigned int out_num;
+ unsigned int in_num;
+ hwaddr in_addr[VIRTQUEUE_MAX_SIZE];
+ hwaddr out_addr[VIRTQUEUE_MAX_SIZE];
+ struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
+ struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
+} VirtQueueElementOld;
+
+void *qemu_get_virtqueue_element(QEMUFile *f, size_t sz)
+{
+ VirtQueueElement *elem;
+ VirtQueueElementOld data;
+ int i;
+
+ qemu_get_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
+
+ elem = virtqueue_alloc_element(sz, data.out_num, data.in_num);
+ elem->index = data.index;
+
+ for (i = 0; i < elem->in_num; i++) {
+ elem->in_addr[i] = data.in_addr[i];
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ elem->out_addr[i] = data.out_addr[i];
+ }
+
+ for (i = 0; i < elem->in_num; i++) {
+ /* Base is overwritten by virtqueue_map. */
+ elem->in_sg[i].iov_base = 0;
+ elem->in_sg[i].iov_len = data.in_sg[i].iov_len;
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ /* Base is overwritten by virtqueue_map. */
+ elem->out_sg[i].iov_base = 0;
+ elem->out_sg[i].iov_len = data.out_sg[i].iov_len;
+ }
+
+ virtqueue_map(elem);
+ return elem;
+}
+
+void qemu_put_virtqueue_element(QEMUFile *f, VirtQueueElement *elem)
+{
+ VirtQueueElementOld data;
+ int i;
+
+ memset(&data, 0, sizeof(data));
+ data.index = elem->index;
+ data.in_num = elem->in_num;
+ data.out_num = elem->out_num;
+
+ for (i = 0; i < elem->in_num; i++) {
+ data.in_addr[i] = elem->in_addr[i];
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ data.out_addr[i] = elem->out_addr[i];
+ }
+
+ for (i = 0; i < elem->in_num; i++) {
+ /* Base is overwritten by virtqueue_map when loading. Do not
+ * save it, as it would leak the QEMU address space layout. */
+ data.in_sg[i].iov_len = elem->in_sg[i].iov_len;
+ }
+
+ for (i = 0; i < elem->out_num; i++) {
+ /* Do not save iov_base as above. */
+ data.out_sg[i].iov_len = elem->out_sg[i].iov_len;
+ }
+ qemu_put_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld));
}
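
Migration thus keeps the old fixed-size record on the wire, roughly 48 KiB per element however few descriptors it uses, with iov_base zeroed on save and rebuilt by virtqueue_map() on load. A stand-alone sizeof check (64-bit host assumed; hwaddr modelled as uint64_t; VIRTQUEUE_MAX_SIZE is 1024 in QEMU's headers):

#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

#define VIRTQUEUE_MAX_SIZE 1024
typedef uint64_t hwaddr;                 /* stand-in for QEMU's hwaddr */

typedef struct VirtQueueElementOld {
    unsigned int index;
    unsigned int out_num;
    unsigned int in_num;
    hwaddr in_addr[VIRTQUEUE_MAX_SIZE];
    hwaddr out_addr[VIRTQUEUE_MAX_SIZE];
    struct iovec in_sg[VIRTQUEUE_MAX_SIZE];
    struct iovec out_sg[VIRTQUEUE_MAX_SIZE];
} VirtQueueElementOld;

int main(void)
{
    /* Fixed cost per migrated element, regardless of descriptor count. */
    printf("wire size: %zu bytes\n", sizeof(VirtQueueElementOld));
    return 0;
}
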
/* virtio device */
@@ -560,7 +734,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val)
VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
trace_virtio_set_status(vdev, val);
- if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) &&
val & VIRTIO_CONFIG_S_FEATURES_OK) {
int ret = virtio_validate_features(vdev);
@@ -629,10 +803,13 @@ void virtio_reset(void *opaque)
vdev->vq[i].vring.avail = 0;
vdev->vq[i].vring.used = 0;
vdev->vq[i].last_avail_idx = 0;
+ vdev->vq[i].shadow_avail_idx = 0;
+ vdev->vq[i].used_idx = 0;
virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
vdev->vq[i].signalled_used = 0;
vdev->vq[i].signalled_used_valid = false;
vdev->vq[i].notification = true;
+ vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
}
}
@@ -898,7 +1075,7 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
/* virtio-1 compliant devices cannot change the alignment */
- if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
error_report("tried to modify queue alignment for virtio-1 device");
return;
}
@@ -912,7 +1089,17 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align)
virtio_queue_update_rings(vdev, n);
}
-void virtio_queue_notify_vq(VirtQueue *vq)
+static void virtio_queue_notify_aio_vq(VirtQueue *vq)
+{
+ if (vq->vring.desc && vq->handle_aio_output) {
+ VirtIODevice *vdev = vq->vdev;
+
+ trace_virtio_queue_notify(vdev, vq - vdev->vq, vq);
+ vq->handle_aio_output(vdev, vq);
+ }
+}
+
+static void virtio_queue_notify_vq(VirtQueue *vq)
{
if (vq->vring.desc && vq->handle_output) {
VirtIODevice *vdev = vq->vdev;
@@ -964,8 +1151,10 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
abort();
vdev->vq[i].vring.num = queue_size;
+ vdev->vq[i].vring.num_default = queue_size;
vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN;
vdev->vq[i].handle_output = handle_output;
+ vdev->vq[i].handle_aio_output = NULL;
return &vdev->vq[i];
}
@@ -977,6 +1166,7 @@ void virtio_del_queue(VirtIODevice *vdev, int n)
}
vdev->vq[n].vring.num = 0;
+ vdev->vq[n].vring.num_default = 0;
}
void virtio_irq(VirtQueue *vq)
@@ -986,32 +1176,32 @@ void virtio_irq(VirtQueue *vq)
virtio_notify_vector(vq->vdev, vq->vector);
}
-static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq)
+bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq)
{
uint16_t old, new;
bool v;
/* We need to expose used array entries before checking used event. */
smp_mb();
    /* Always notify when the queue is empty (if the feature was acknowledged) */
- if (virtio_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
- !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx) {
+ if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) &&
+ !vq->inuse && virtio_queue_empty(vq)) {
return true;
}
- if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+ if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT);
}
v = vq->signalled_used_valid;
vq->signalled_used_valid = true;
old = vq->signalled_used;
- new = vq->signalled_used = vring_used_idx(vq);
+ new = vq->signalled_used = vq->used_idx;
return !v || vring_need_event(vring_get_used_event(vq), new, old);
}
void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
{
- if (!vring_notify(vdev, vq)) {
+ if (!virtio_should_notify(vdev, vq)) {
return;
}
@@ -1035,7 +1225,7 @@ static bool virtio_device_endian_needed(void *opaque)
VirtIODevice *vdev = opaque;
assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN);
- if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+ if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
return vdev->device_endian != virtio_default_endian();
}
/* Devices conforming to VIRTIO 1.0 or later are always LE. */
@@ -1056,33 +1246,38 @@ static bool virtio_virtqueue_needed(void *opaque)
return virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1);
}
-static void put_virtqueue_state(QEMUFile *f, void *pv, size_t size)
+static bool virtio_ringsize_needed(void *opaque)
{
- VirtIODevice *vdev = pv;
+ VirtIODevice *vdev = opaque;
int i;
for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
- qemu_put_be64(f, vdev->vq[i].vring.avail);
- qemu_put_be64(f, vdev->vq[i].vring.used);
+ if (vdev->vq[i].vring.num != vdev->vq[i].vring.num_default) {
+ return true;
+ }
}
+ return false;
}
-static int get_virtqueue_state(QEMUFile *f, void *pv, size_t size)
+static bool virtio_extra_state_needed(void *opaque)
{
- VirtIODevice *vdev = pv;
- int i;
+ VirtIODevice *vdev = opaque;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
- for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
- vdev->vq[i].vring.avail = qemu_get_be64(f);
- vdev->vq[i].vring.used = qemu_get_be64(f);
- }
- return 0;
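+ /* Only send the subsection when the transport actually carries
+ * extra state (virtio-pci, for example). */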
+ return k->has_extra_state &&
+ k->has_extra_state(qbus->parent);
}
-static VMStateInfo vmstate_info_virtqueue = {
+static const VMStateDescription vmstate_virtqueue = {
.name = "virtqueue_state",
- .get = get_virtqueue_state,
- .put = put_virtqueue_state,
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(vring.avail, struct VirtQueue),
+ VMSTATE_UINT64(vring.used, struct VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
};
static const VMStateDescription vmstate_virtio_virtqueues = {
@@ -1091,12 +1286,74 @@ static const VMStateDescription vmstate_virtio_virtqueues = {
.minimum_version_id = 1,
.needed = &virtio_virtqueue_needed,
.fields = (VMStateField[]) {
+ VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+ VIRTIO_QUEUE_MAX, 0, vmstate_virtqueue, VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_ringsize = {
+ .name = "ringsize_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT32(vring.num_default, struct VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_virtio_ringsize = {
+ .name = "virtio/ringsize",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_ringsize_needed,
+ .fields = (VMStateField[]) {
+ VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice,
+ VIRTIO_QUEUE_MAX, 0, vmstate_ringsize, VirtQueue),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
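+/* Transport-specific migration state is delegated to the bus class;
+ * loading fails when the transport does not provide a hook. */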
+static int get_extra_state(QEMUFile *f, void *pv, size_t size)
+{
+ VirtIODevice *vdev = pv;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ if (!k->load_extra_state) {
+ return -1;
+ } else {
+ return k->load_extra_state(qbus->parent, f);
+ }
+}
+
+static void put_extra_state(QEMUFile *f, void *pv, size_t size)
+{
+ VirtIODevice *vdev = pv;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
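+ /* Only reached when virtio_extra_state_needed() returned true, so
+ * the transport is expected to provide save_extra_state. */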
+ k->save_extra_state(qbus->parent, f);
+}
+
+static const VMStateInfo vmstate_info_extra_state = {
+ .name = "virtqueue_extra_state",
+ .get = get_extra_state,
+ .put = put_extra_state,
+};
+
+static const VMStateDescription vmstate_virtio_extra_state = {
+ .name = "virtio/extra_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = &virtio_extra_state_needed,
+ .fields = (VMStateField[]) {
{
- .name = "virtqueues",
+ .name = "extra_state",
.version_id = 0,
.field_exists = NULL,
.size = 0,
- .info = &vmstate_info_virtqueue,
+ .info = &vmstate_info_extra_state,
.flags = VMS_SINGLE,
.offset = 0,
},
@@ -1138,6 +1395,8 @@ static const VMStateDescription vmstate_virtio = {
&vmstate_virtio_device_endian,
&vmstate_virtio_64bit_features,
&vmstate_virtio_virtqueues,
+ &vmstate_virtio_ringsize,
+ &vmstate_virtio_extra_state,
NULL
}
};
@@ -1264,7 +1523,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
num = qemu_get_be32(f);
if (num > VIRTIO_QUEUE_MAX) {
- error_report("Invalid number of PCI queues: 0x%x", num);
+ error_report("Invalid number of virtqueues: 0x%x", num);
return -1;
}
@@ -1348,6 +1607,8 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id)
vdev->vq[i].last_avail_idx, nheads);
return -1;
}
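+ /* Refresh the cached used and avail indices from guest memory. */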
+ vdev->vq[i].used_idx = vring_used_idx(&vdev->vq[i]);
+ vdev->vq[i].shadow_avail_idx = vring_avail_idx(&vdev->vq[i]);
}
}
@@ -1430,6 +1691,7 @@ void virtio_init(VirtIODevice *vdev, const char *name,
vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
vdev);
vdev->device_endian = virtio_default_endian();
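+ /* Devices may clear this when guest notifier masking cannot be used. */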
+ vdev->use_guest_notifier_mask = true;
}
hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n)
@@ -1460,7 +1722,7 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n)
hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n)
{
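+ /* Each avail ring entry is a 16-bit descriptor index. */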
return offsetof(VRingAvail, ring) +
- sizeof(uint64_t) * vdev->vq[n].vring.num;
+ sizeof(uint16_t) * vdev->vq[n].vring.num;
}
hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n)
@@ -1483,6 +1745,7 @@ uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n)
void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx)
{
vdev->vq[n].last_avail_idx = idx;
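+ /* Keep the cached avail index in sync with the restored value. */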
+ vdev->vq[n].shadow_avail_idx = idx;
}
void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n)
@@ -1512,10 +1775,10 @@ void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign,
bool with_irqfd)
{
if (assign && !with_irqfd) {
- event_notifier_set_handler(&vq->guest_notifier,
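+ /* The guest notifier is not an "external" event source, so its
+ * handler keeps running while external clients are disabled. */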
+ event_notifier_set_handler(&vq->guest_notifier, false,
virtio_queue_guest_notifier_read);
} else {
- event_notifier_set_handler(&vq->guest_notifier, NULL);
+ event_notifier_set_handler(&vq->guest_notifier, false, NULL);
}
if (!assign) {
/* Test and clear notifier before closing it,
@@ -1529,6 +1792,31 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq)
return &vq->guest_notifier;
}
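+/* Drains the host notifier and runs the aio handler when the queue is
+ * serviced from an AioContext. */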
+static void virtio_queue_host_notifier_aio_read(EventNotifier *n)
+{
+ VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
+ if (event_notifier_test_and_clear(n)) {
+ virtio_queue_notify_aio_vq(vq);
+ }
+}
+
+void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx,
+ void (*handle_output)(VirtIODevice *,
+ VirtQueue *))
+{
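+ /* A non-NULL handler attaches the host notifier to ctx as an external
+ * event source; NULL detaches it and drains any pending notification. */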
+ if (handle_output) {
+ vq->handle_aio_output = handle_output;
+ aio_set_event_notifier(ctx, &vq->host_notifier, true,
+ virtio_queue_host_notifier_aio_read);
+ } else {
+ aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL);
+ /* Test and clear the notifier after disabling the event,
+ * in case the poll callback didn't have time to run. */
+ virtio_queue_host_notifier_aio_read(&vq->host_notifier);
+ vq->handle_aio_output = NULL;
+ }
+}
+
static void virtio_queue_host_notifier_read(EventNotifier *n)
{
VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
@@ -1541,10 +1829,10 @@ void virtio_queue_set_host_notifier_fd_handler(VirtQueue *vq, bool assign,
bool set_handler)
{
if (assign && set_handler) {
- event_notifier_set_handler(&vq->host_notifier,
+ event_notifier_set_handler(&vq->host_notifier, true,
virtio_queue_host_notifier_read);
} else {
- event_notifier_set_handler(&vq->host_notifier, NULL);
+ event_notifier_set_handler(&vq->host_notifier, true, NULL);
}
if (!assign) {
/* Test and clear the notifier after disabling the event,