diff options
Diffstat (limited to 'kernel/drivers/vfio')
21 files changed, 8469 insertions, 0 deletions
diff --git a/kernel/drivers/vfio/Kconfig b/kernel/drivers/vfio/Kconfig new file mode 100644 index 000000000..7d092ddc8 --- /dev/null +++ b/kernel/drivers/vfio/Kconfig @@ -0,0 +1,35 @@ +config VFIO_IOMMU_TYPE1 + tristate + depends on VFIO + default n + +config VFIO_IOMMU_SPAPR_TCE + tristate + depends on VFIO && SPAPR_TCE_IOMMU + default n + +config VFIO_SPAPR_EEH + tristate + depends on EEH && VFIO_IOMMU_SPAPR_TCE + default n + +config VFIO_VIRQFD + tristate + depends on VFIO && EVENTFD + default n + +menuconfig VFIO + tristate "VFIO Non-Privileged userspace driver framework" + depends on IOMMU_API + select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU) + select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) + select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES) + select ANON_INODES + help + VFIO provides a framework for secure userspace device drivers. + See Documentation/vfio.txt for more details. + + If you don't know what to do here, say N. + +source "drivers/vfio/pci/Kconfig" +source "drivers/vfio/platform/Kconfig" diff --git a/kernel/drivers/vfio/Makefile b/kernel/drivers/vfio/Makefile new file mode 100644 index 000000000..7b8a31f63 --- /dev/null +++ b/kernel/drivers/vfio/Makefile @@ -0,0 +1,9 @@ +vfio_virqfd-y := virqfd.o + +obj-$(CONFIG_VFIO) += vfio.o +obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o +obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o +obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o +obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o +obj-$(CONFIG_VFIO_PCI) += pci/ +obj-$(CONFIG_VFIO_PLATFORM) += platform/ diff --git a/kernel/drivers/vfio/pci/Kconfig b/kernel/drivers/vfio/pci/Kconfig new file mode 100644 index 000000000..579d83bf5 --- /dev/null +++ b/kernel/drivers/vfio/pci/Kconfig @@ -0,0 +1,27 @@ +config VFIO_PCI + tristate "VFIO support for PCI devices" + depends on VFIO && PCI && EVENTFD + select VFIO_VIRQFD + help + Support for the PCI VFIO bus driver. This is required to make + use of PCI drivers using the VFIO framework. + + If you don't know what to do here, say N. + +config VFIO_PCI_VGA + bool "VFIO PCI support for VGA devices" + depends on VFIO_PCI && X86 && VGA_ARB + help + Support for VGA extension to VFIO PCI. This exposes an additional + region on VGA devices for accessing legacy VGA addresses used by + BIOS and generic video drivers. + + If you don't know what to do here, say N. + +config VFIO_PCI_MMAP + depends on VFIO_PCI + def_bool y if !S390 + +config VFIO_PCI_INTX + depends on VFIO_PCI + def_bool y if !S390 diff --git a/kernel/drivers/vfio/pci/Makefile b/kernel/drivers/vfio/pci/Makefile new file mode 100644 index 000000000..131079255 --- /dev/null +++ b/kernel/drivers/vfio/pci/Makefile @@ -0,0 +1,4 @@ + +vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o + +obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/kernel/drivers/vfio/pci/vfio_pci.c b/kernel/drivers/vfio/pci/vfio_pci.c new file mode 100644 index 000000000..e9851add6 --- /dev/null +++ b/kernel/drivers/vfio/pci/vfio_pci.c @@ -0,0 +1,1215 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/device.h> +#include <linux/eventfd.h> +#include <linux/file.h> +#include <linux/interrupt.h> +#include <linux/iommu.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/pci.h> +#include <linux/pm_runtime.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/vgaarb.h> + +#include "vfio_pci_private.h" + +#define DRIVER_VERSION "0.2" +#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" +#define DRIVER_DESC "VFIO PCI - User Level meta-driver" + +static char ids[1024] __initdata; +module_param_string(ids, ids, sizeof(ids), 0); +MODULE_PARM_DESC(ids, "Initial PCI IDs to add to the vfio driver, format is \"vendor:device[:subvendor[:subdevice[:class[:class_mask]]]]\" and multiple comma separated entries can be specified"); + +static bool nointxmask; +module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(nointxmask, + "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag."); + +#ifdef CONFIG_VFIO_PCI_VGA +static bool disable_vga; +module_param(disable_vga, bool, S_IRUGO); +MODULE_PARM_DESC(disable_vga, "Disable VGA resource access through vfio-pci"); +#endif + +static bool disable_idle_d3; +module_param(disable_idle_d3, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_idle_d3, + "Disable using the PCI D3 low power state for idle, unused devices"); + +static DEFINE_MUTEX(driver_lock); + +static inline bool vfio_vga_disabled(void) +{ +#ifdef CONFIG_VFIO_PCI_VGA + return disable_vga; +#else + return true; +#endif +} + +/* + * Our VGA arbiter participation is limited since we don't know anything + * about the device itself. However, if the device is the only VGA device + * downstream of a bridge and VFIO VGA support is disabled, then we can + * safely return legacy VGA IO and memory as not decoded since the user + * has no way to get to it and routing can be disabled externally at the + * bridge. + */ +static unsigned int vfio_pci_set_vga_decode(void *opaque, bool single_vga) +{ + struct vfio_pci_device *vdev = opaque; + struct pci_dev *tmp = NULL, *pdev = vdev->pdev; + unsigned char max_busnr; + unsigned int decodes; + + if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus)) + return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; + + max_busnr = pci_bus_max_busnr(pdev->bus); + decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM; + + while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) { + if (tmp == pdev || + pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) || + pci_is_root_bus(tmp->bus)) + continue; + + if (tmp->bus->number >= pdev->bus->number && + tmp->bus->number <= max_busnr) { + pci_dev_put(tmp); + decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM; + break; + } + } + + return decodes; +} + +static inline bool vfio_pci_is_vga(struct pci_dev *pdev) +{ + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; +} + +static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); + +static int vfio_pci_enable(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + u16 cmd; + u8 msix_pos; + + pci_set_power_state(pdev, PCI_D0); + + /* Don't allow our initial saved state to include busmaster */ + pci_clear_master(pdev); + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + vdev->reset_works = (pci_reset_function(pdev) == 0); + pci_save_state(pdev); + vdev->pci_saved_state = pci_store_saved_state(pdev); + if (!vdev->pci_saved_state) + pr_debug("%s: Couldn't store %s saved state\n", + __func__, dev_name(&pdev->dev)); + + ret = vfio_config_init(vdev); + if (ret) { + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; + pci_disable_device(pdev); + return ret; + } + + if (likely(!nointxmask)) + vdev->pci_2_3 = pci_intx_mask_supported(pdev); + + pci_read_config_word(pdev, PCI_COMMAND, &cmd); + if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { + cmd &= ~PCI_COMMAND_INTX_DISABLE; + pci_write_config_word(pdev, PCI_COMMAND, cmd); + } + + msix_pos = pdev->msix_cap; + if (msix_pos) { + u16 flags; + u32 table; + + pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags); + pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table); + + vdev->msix_bar = table & PCI_MSIX_TABLE_BIR; + vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET; + vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16; + } else + vdev->msix_bar = 0xFF; + + if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) + vdev->has_vga = true; + + return 0; +} + +static void vfio_pci_disable(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int bar; + + /* Stop the device from further DMA */ + pci_clear_master(pdev); + + vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE | + VFIO_IRQ_SET_ACTION_TRIGGER, + vdev->irq_type, 0, 0, NULL); + + vdev->virq_disabled = false; + + vfio_config_free(vdev); + + for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) { + if (!vdev->barmap[bar]) + continue; + pci_iounmap(pdev, vdev->barmap[bar]); + pci_release_selected_regions(pdev, 1 << bar); + vdev->barmap[bar] = NULL; + } + + vdev->needs_reset = true; + + /* + * If we have saved state, restore it. If we can reset the device, + * even better. Resetting with current state seems better than + * nothing, but saving and restoring current state without reset + * is just busy work. + */ + if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) { + pr_info("%s: Couldn't reload %s saved state\n", + __func__, dev_name(&pdev->dev)); + + if (!vdev->reset_works) + goto out; + + pci_save_state(pdev); + } + + /* + * Disable INTx and MSI, presumably to avoid spurious interrupts + * during reset. Stolen from pci_reset_function() + */ + pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE); + + /* + * Try to reset the device. The success of this is dependent on + * being able to lock the device, which is not always possible. + */ + if (vdev->reset_works && !pci_try_reset_function(pdev)) + vdev->needs_reset = false; + + pci_restore_state(pdev); +out: + pci_disable_device(pdev); + + vfio_pci_try_bus_reset(vdev); + + if (!disable_idle_d3) + pci_set_power_state(pdev, PCI_D3hot); +} + +static void vfio_pci_release(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + + mutex_lock(&driver_lock); + + if (!(--vdev->refcnt)) { + vfio_spapr_pci_eeh_release(vdev->pdev); + vfio_pci_disable(vdev); + } + + mutex_unlock(&driver_lock); + + module_put(THIS_MODULE); +} + +static int vfio_pci_open(void *device_data) +{ + struct vfio_pci_device *vdev = device_data; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&driver_lock); + + if (!vdev->refcnt) { + ret = vfio_pci_enable(vdev); + if (ret) + goto error; + + vfio_spapr_pci_eeh_open(vdev->pdev); + } + vdev->refcnt++; +error: + mutex_unlock(&driver_lock); + if (ret) + module_put(THIS_MODULE); + return ret; +} + +static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) +{ + if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { + u8 pin; + pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); + if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin) + return 1; + + } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { + u8 pos; + u16 flags; + + pos = vdev->pdev->msi_cap; + if (pos) { + pci_read_config_word(vdev->pdev, + pos + PCI_MSI_FLAGS, &flags); + return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1); + } + } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) { + u8 pos; + u16 flags; + + pos = vdev->pdev->msix_cap; + if (pos) { + pci_read_config_word(vdev->pdev, + pos + PCI_MSIX_FLAGS, &flags); + + return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; + } + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { + if (pci_is_pcie(vdev->pdev)) + return 1; + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { + return 1; + } + + return 0; +} + +static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) +{ + (*(int *)data)++; + return 0; +} + +struct vfio_pci_fill_info { + int max; + int cur; + struct vfio_pci_dependent_device *devices; +}; + +static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_fill_info *fill = data; + struct iommu_group *iommu_group; + + if (fill->cur == fill->max) + return -EAGAIN; /* Something changed, try again */ + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ + + fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); + fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); + fill->devices[fill->cur].bus = pdev->bus->number; + fill->devices[fill->cur].devfn = pdev->devfn; + fill->cur++; + iommu_group_put(iommu_group); + return 0; +} + +struct vfio_pci_group_entry { + struct vfio_group *group; + int id; +}; + +struct vfio_pci_group_info { + int count; + struct vfio_pci_group_entry *groups; +}; + +static int vfio_pci_validate_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_group_info *info = data; + struct iommu_group *group; + int id, i; + + group = iommu_group_get(&pdev->dev); + if (!group) + return -EPERM; + + id = iommu_group_id(group); + + for (i = 0; i < info->count; i++) + if (info->groups[i].id == id) + break; + + iommu_group_put(group); + + return (i == info->count) ? -EINVAL : 0; +} + +static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot) +{ + for (; pdev; pdev = pdev->bus->self) + if (pdev->bus == slot->bus) + return (pdev->slot == slot); + return false; +} + +struct vfio_pci_walk_info { + int (*fn)(struct pci_dev *, void *data); + void *data; + struct pci_dev *pdev; + bool slot; + int ret; +}; + +static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data) +{ + struct vfio_pci_walk_info *walk = data; + + if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot)) + walk->ret = walk->fn(pdev, walk->data); + + return walk->ret; +} + +static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, + int (*fn)(struct pci_dev *, + void *data), void *data, + bool slot) +{ + struct vfio_pci_walk_info walk = { + .fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0, + }; + + pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk); + + return walk.ret; +} + +static long vfio_pci_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_pci_device *vdev = device_data; + unsigned long minsz; + + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = VFIO_DEVICE_FLAGS_PCI; + + if (vdev->reset_works) + info.flags |= VFIO_DEVICE_FLAGS_RESET; + + info.num_regions = VFIO_PCI_NUM_REGIONS; + info.num_irqs = VFIO_PCI_NUM_IRQS; + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { + struct pci_dev *pdev = vdev->pdev; + struct vfio_region_info info; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pdev->cfg_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + info.flags = 0; + break; + } + + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (IS_ENABLED(CONFIG_VFIO_PCI_MMAP) && + pci_resource_flags(pdev, info.index) & + IORESOURCE_MEM && info.size >= PAGE_SIZE) + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; + break; + case VFIO_PCI_ROM_REGION_INDEX: + { + void __iomem *io; + size_t size; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = 0; + + /* Report the BAR size, not the ROM size */ + info.size = pci_resource_len(pdev, info.index); + if (!info.size) + break; + + /* Is it really there? */ + io = pci_map_rom(pdev, &size); + if (!io || !size) { + info.size = 0; + break; + } + pci_unmap_rom(pdev, io); + + info.flags = VFIO_REGION_INFO_FLAG_READ; + break; + } + case VFIO_PCI_VGA_REGION_INDEX: + if (!vdev->has_vga) + return -EINVAL; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + + break; + default: + return -EINVAL; + } + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) + break; + /* pass thru to return error */ + default: + return -EINVAL; + } + + info.flags = VFIO_IRQ_INFO_EVENTFD; + + info.count = vfio_pci_get_irq_count(vdev, info.index); + + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= (VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_DEVICE_SET_IRQS) { + struct vfio_irq_set hdr; + u8 *data = NULL; + int ret = 0; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || + hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK)) + return -EINVAL; + + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { + size_t size; + int max = vfio_pci_get_irq_count(vdev, hdr.index); + + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) + size = sizeof(uint8_t); + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) + size = sizeof(int32_t); + else + return -EINVAL; + + if (hdr.argsz - minsz < hdr.count * size || + hdr.start >= max || hdr.start + hdr.count > max) + return -EINVAL; + + data = memdup_user((void __user *)(arg + minsz), + hdr.count * size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + mutex_lock(&vdev->igate); + + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, + hdr.start, hdr.count, data); + + mutex_unlock(&vdev->igate); + kfree(data); + + return ret; + + } else if (cmd == VFIO_DEVICE_RESET) { + return vdev->reset_works ? + pci_try_reset_function(vdev->pdev) : -EINVAL; + + } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset_info, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz) + return -EINVAL; + + hdr.flags = 0; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; + + WARN_ON(!fill.max); /* Should always be at least one */ + + /* + * If there's enough space, fill it now, otherwise return + * -ENOSPC and the number of devices affected. + */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } + + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; + + fill.devices = devices; + + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_fill_devs, + &fill, slot); + + /* + * If a device was removed between counting and filling, + * we may come up short of fill.max. If a device was + * added, we'll have a return of -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; + +reset_info_exit: + if (copy_to_user((void __user *)arg, &hdr, minsz)) + ret = -EFAULT; + + if (!ret) { + if (copy_to_user((void __user *)(arg + minsz), devices, + hdr.count * sizeof(*devices))) + ret = -EFAULT; + } + + kfree(devices); + return ret; + + } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct vfio_pci_group_entry *groups; + struct vfio_pci_group_info info; + bool slot = false; + int i, count = 0, ret = 0; + + minsz = offsetofend(struct vfio_pci_hot_reset, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + /* + * We can't let userspace give us an arbitrarily large + * buffer to copy, so verify how many we think there + * could be. Note groups can have multiple devices so + * one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; + + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; + + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + groups = kcalloc(hdr.count, sizeof(*groups), GFP_KERNEL); + if (!group_fds || !groups) { + kfree(group_fds); + kfree(groups); + return -ENOMEM; + } + + if (copy_from_user(group_fds, (void __user *)(arg + minsz), + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(groups); + return -EFAULT; + } + + /* + * For each group_fd, get the group through the vfio external + * user interface and store the group and iommu ID. This + * ensures the group is held across the reset. + */ + for (i = 0; i < hdr.count; i++) { + struct vfio_group *group; + struct fd f = fdget(group_fds[i]); + if (!f.file) { + ret = -EBADF; + break; + } + + group = vfio_group_get_external_user(f.file); + fdput(f); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + break; + } + + groups[i].group = group; + groups[i].id = vfio_external_user_iommu_id(group); + } + + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; + + info.count = hdr.count; + info.groups = groups; + + /* + * Test whether all the affected devices are contained + * by the set of groups provided by the user. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_validate_devs, + &info, slot); + if (!ret) + /* User has access, do the reset */ + ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : + pci_try_reset_bus(vdev->pdev->bus); + +hot_reset_release: + for (i--; i >= 0; i--) + vfio_group_put_external_user(groups[i].group); + + kfree(groups); + return ret; + } + + return -ENOTTY; +} + +static ssize_t vfio_pci_rw(void *device_data, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct vfio_pci_device *vdev = device_data; + + if (index >= VFIO_PCI_NUM_REGIONS) + return -EINVAL; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_ROM_REGION_INDEX: + if (iswrite) + return -EINVAL; + return vfio_pci_bar_rw(vdev, buf, count, ppos, false); + + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + + case VFIO_PCI_VGA_REGION_INDEX: + return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + } + + return -EINVAL; +} + +static ssize_t vfio_pci_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + if (!count) + return 0; + + return vfio_pci_rw(device_data, buf, count, ppos, false); +} + +static ssize_t vfio_pci_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (!count) + return 0; + + return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); +} + +static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) +{ + struct vfio_pci_device *vdev = device_data; + struct pci_dev *pdev = vdev->pdev; + unsigned int index; + u64 phys_len, req_len, pgoff, req_start; + int ret; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if ((vma->vm_flags & VM_SHARED) == 0) + return -EINVAL; + if (index >= VFIO_PCI_ROM_REGION_INDEX) + return -EINVAL; + if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM)) + return -EINVAL; + + phys_len = pci_resource_len(pdev, index); + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + if (phys_len < PAGE_SIZE || req_start + req_len > phys_len) + return -EINVAL; + + if (index == vdev->msix_bar) { + /* + * Disallow mmaps overlapping the MSI-X table; users don't + * get to touch this directly. We could find somewhere + * else to map the overlap, but page granularity is only + * a recommendation, not a requirement, so the user needs + * to know which bits are real. Requiring them to mmap + * around the table makes that clear. + */ + + /* If neither entirely above nor below, then it overlaps */ + if (!(req_start >= vdev->msix_offset + vdev->msix_size || + req_start + req_len <= vdev->msix_offset)) + return -EINVAL; + } + + /* + * Even though we don't make use of the barmap for the mmap, + * we need to request the region and the barmap tracks that. + */ + if (!vdev->barmap[index]) { + ret = pci_request_selected_regions(pdev, + 1 << index, "vfio-pci"); + if (ret) + return ret; + + vdev->barmap[index] = pci_iomap(pdev, index, 0); + } + + vma->vm_private_data = vdev; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff; + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + req_len, vma->vm_page_prot); +} + +static void vfio_pci_request(void *device_data, unsigned int count) +{ + struct vfio_pci_device *vdev = device_data; + + mutex_lock(&vdev->igate); + + if (vdev->req_trigger) { + if (!(count % 10)) + dev_notice_ratelimited(&vdev->pdev->dev, + "Relaying device request to user (#%u)\n", + count); + eventfd_signal(vdev->req_trigger, 1); + } else if (count == 0) { + dev_warn(&vdev->pdev->dev, + "No device request channel registered, blocked until released by user\n"); + } + + mutex_unlock(&vdev->igate); +} + +static const struct vfio_device_ops vfio_pci_ops = { + .name = "vfio-pci", + .open = vfio_pci_open, + .release = vfio_pci_release, + .ioctl = vfio_pci_ioctl, + .read = vfio_pci_read, + .write = vfio_pci_write, + .mmap = vfio_pci_mmap, + .request = vfio_pci_request, +}; + +static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct vfio_pci_device *vdev; + struct iommu_group *group; + int ret; + + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return -EINVAL; + + group = iommu_group_get(&pdev->dev); + if (!group) + return -EINVAL; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) { + iommu_group_put(group); + return -ENOMEM; + } + + vdev->pdev = pdev; + vdev->irq_type = VFIO_PCI_NUM_IRQS; + mutex_init(&vdev->igate); + spin_lock_init(&vdev->irqlock); + + ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + if (ret) { + iommu_group_put(group); + kfree(vdev); + return ret; + } + + if (vfio_pci_is_vga(pdev)) { + vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); + vga_set_legacy_decoding(pdev, + vfio_pci_set_vga_decode(vdev, false)); + } + + if (!disable_idle_d3) { + /* + * pci-core sets the device power state to an unknown value at + * bootup and after being removed from a driver. The only + * transition it allows from this unknown state is to D0, which + * typically happens when a driver calls pci_enable_device(). + * We're not ready to enable the device yet, but we do want to + * be able to get to D3. Therefore first do a D0 transition + * before going to D3. + */ + pci_set_power_state(pdev, PCI_D0); + pci_set_power_state(pdev, PCI_D3hot); + } + + return ret; +} + +static void vfio_pci_remove(struct pci_dev *pdev) +{ + struct vfio_pci_device *vdev; + + vdev = vfio_del_group_dev(&pdev->dev); + if (!vdev) + return; + + iommu_group_put(pdev->dev.iommu_group); + kfree(vdev); + + if (vfio_pci_is_vga(pdev)) { + vga_client_register(pdev, NULL, NULL, NULL); + vga_set_legacy_decoding(pdev, + VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); + } + + if (!disable_idle_d3) + pci_set_power_state(pdev, PCI_D0); +} + +static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct vfio_pci_device *vdev; + struct vfio_device *device; + + device = vfio_device_get_from_dev(&pdev->dev); + if (device == NULL) + return PCI_ERS_RESULT_DISCONNECT; + + vdev = vfio_device_data(device); + if (vdev == NULL) { + vfio_device_put(device); + return PCI_ERS_RESULT_DISCONNECT; + } + + mutex_lock(&vdev->igate); + + if (vdev->err_trigger) + eventfd_signal(vdev->err_trigger, 1); + + mutex_unlock(&vdev->igate); + + vfio_device_put(device); + + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static struct pci_error_handlers vfio_err_handlers = { + .error_detected = vfio_pci_aer_err_detected, +}; + +static struct pci_driver vfio_pci_driver = { + .name = "vfio-pci", + .id_table = NULL, /* only dynamic ids */ + .probe = vfio_pci_probe, + .remove = vfio_pci_remove, + .err_handler = &vfio_err_handlers, +}; + +struct vfio_devices { + struct vfio_device **devices; + int cur_index; + int max_index; +}; + +static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) +{ + struct vfio_devices *devs = data; + struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); + + if (pci_drv != &vfio_pci_driver) + return -EBUSY; + + if (devs->cur_index == devs->max_index) + return -ENOSPC; + + devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev); + if (!devs->devices[devs->cur_index]) + return -EINVAL; + + devs->cur_index++; + return 0; +} + +/* + * Attempt to do a bus/slot reset if there are devices affected by a reset for + * this device that are needs_reset and all of the affected devices are unused + * (!refcnt). Callers are required to hold driver_lock when calling this to + * prevent device opens and concurrent bus reset attempts. We prevent device + * unbinds by acquiring and holding a reference to the vfio_device. + * + * NB: vfio-core considers a group to be viable even if some devices are + * bound to drivers like pci-stub or pcieport. Here we require all devices + * to be bound to vfio_pci since that's the only way we can be sure they + * stay put. + */ +static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) +{ + struct vfio_devices devs = { .cur_index = 0 }; + int i = 0, ret = -EINVAL; + bool needs_reset = false, slot = false; + struct vfio_pci_device *tmp; + + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return; + + if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &i, slot) || !i) + return; + + devs.max_index = i; + devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL); + if (!devs.devices) + return; + + if (vfio_pci_for_each_slot_or_bus(vdev->pdev, + vfio_pci_get_devs, &devs, slot)) + goto put_devs; + + for (i = 0; i < devs.cur_index; i++) { + tmp = vfio_device_data(devs.devices[i]); + if (tmp->needs_reset) + needs_reset = true; + if (tmp->refcnt) + goto put_devs; + } + + if (needs_reset) + ret = slot ? pci_try_reset_slot(vdev->pdev->slot) : + pci_try_reset_bus(vdev->pdev->bus); + +put_devs: + for (i = 0; i < devs.cur_index; i++) { + tmp = vfio_device_data(devs.devices[i]); + if (!ret) + tmp->needs_reset = false; + + if (!tmp->refcnt && !disable_idle_d3) + pci_set_power_state(tmp->pdev, PCI_D3hot); + + vfio_device_put(devs.devices[i]); + } + + kfree(devs.devices); +} + +static void __exit vfio_pci_cleanup(void) +{ + pci_unregister_driver(&vfio_pci_driver); + vfio_pci_uninit_perm_bits(); +} + +static void __init vfio_pci_fill_ids(void) +{ + char *p, *id; + int rc; + + /* no ids passed actually */ + if (ids[0] == '\0') + return; + + /* add ids specified in the module parameter */ + p = ids; + while ((id = strsep(&p, ","))) { + unsigned int vendor, device, subvendor = PCI_ANY_ID, + subdevice = PCI_ANY_ID, class = 0, class_mask = 0; + int fields; + + if (!strlen(id)) + continue; + + fields = sscanf(id, "%x:%x:%x:%x:%x:%x", + &vendor, &device, &subvendor, &subdevice, + &class, &class_mask); + + if (fields < 2) { + pr_warn("invalid id string \"%s\"\n", id); + continue; + } + + rc = pci_add_dynid(&vfio_pci_driver, vendor, device, + subvendor, subdevice, class, class_mask, 0); + if (rc) + pr_warn("failed to add dynamic id [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x (%d)\n", + vendor, device, subvendor, subdevice, + class, class_mask, rc); + else + pr_info("add [%04hx:%04hx[%04hx:%04hx]] class %#08x/%08x\n", + vendor, device, subvendor, subdevice, + class, class_mask); + } +} + +static int __init vfio_pci_init(void) +{ + int ret; + + /* Allocate shared config space permision data used by all devices */ + ret = vfio_pci_init_perm_bits(); + if (ret) + return ret; + + /* Register and scan for devices */ + ret = pci_register_driver(&vfio_pci_driver); + if (ret) + goto out_driver; + + vfio_pci_fill_ids(); + + return 0; + +out_driver: + vfio_pci_uninit_perm_bits(); + return ret; +} + +module_init(vfio_pci_init); +module_exit(vfio_pci_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/pci/vfio_pci_config.c b/kernel/drivers/vfio/pci/vfio_pci_config.c new file mode 100644 index 000000000..ff75ca31a --- /dev/null +++ b/kernel/drivers/vfio/pci/vfio_pci_config.c @@ -0,0 +1,1602 @@ +/* + * VFIO PCI config space virtualization + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +/* + * This code handles reading and writing of PCI configuration registers. + * This is hairy because we want to allow a lot of flexibility to the + * user driver, but cannot trust it with all of the config fields. + * Tables determine which fields can be read and written, as well as + * which fields are 'virtualized' - special actions and translations to + * make it appear to the user that he has control, when in fact things + * must be negotiated with the underlying OS. + */ + +#include <linux/fs.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/slab.h> + +#include "vfio_pci_private.h" + +#define PCI_CFG_SPACE_SIZE 256 + +/* Useful "pseudo" capabilities */ +#define PCI_CAP_ID_BASIC 0 +#define PCI_CAP_ID_INVALID 0xFF + +#define is_bar(offset) \ + ((offset >= PCI_BASE_ADDRESS_0 && offset < PCI_BASE_ADDRESS_5 + 4) || \ + (offset >= PCI_ROM_ADDRESS && offset < PCI_ROM_ADDRESS + 4)) + +/* + * Lengths of PCI Config Capabilities + * 0: Removed from the user visible capability list + * FF: Variable length + */ +static u8 pci_cap_length[] = { + [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */ + [PCI_CAP_ID_PM] = PCI_PM_SIZEOF, + [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF, + [PCI_CAP_ID_VPD] = PCI_CAP_VPD_SIZEOF, + [PCI_CAP_ID_SLOTID] = 0, /* bridge - don't care */ + [PCI_CAP_ID_MSI] = 0xFF, /* 10, 14, 20, or 24 */ + [PCI_CAP_ID_CHSWP] = 0, /* cpci - not yet */ + [PCI_CAP_ID_PCIX] = 0xFF, /* 8 or 24 */ + [PCI_CAP_ID_HT] = 0xFF, /* hypertransport */ + [PCI_CAP_ID_VNDR] = 0xFF, /* variable */ + [PCI_CAP_ID_DBG] = 0, /* debug - don't care */ + [PCI_CAP_ID_CCRC] = 0, /* cpci - not yet */ + [PCI_CAP_ID_SHPC] = 0, /* hotswap - not yet */ + [PCI_CAP_ID_SSVID] = 0, /* bridge - don't care */ + [PCI_CAP_ID_AGP3] = 0, /* AGP8x - not yet */ + [PCI_CAP_ID_SECDEV] = 0, /* secure device not yet */ + [PCI_CAP_ID_EXP] = 0xFF, /* 20 or 44 */ + [PCI_CAP_ID_MSIX] = PCI_CAP_MSIX_SIZEOF, + [PCI_CAP_ID_SATA] = 0xFF, + [PCI_CAP_ID_AF] = PCI_CAP_AF_SIZEOF, +}; + +/* + * Lengths of PCIe/PCI-X Extended Config Capabilities + * 0: Removed or masked from the user visible capabilty list + * FF: Variable length + */ +static u16 pci_ext_cap_length[] = { + [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND, + [PCI_EXT_CAP_ID_VC] = 0xFF, + [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF, + [PCI_EXT_CAP_ID_PWR] = PCI_EXT_CAP_PWR_SIZEOF, + [PCI_EXT_CAP_ID_RCLD] = 0, /* root only - don't care */ + [PCI_EXT_CAP_ID_RCILC] = 0, /* root only - don't care */ + [PCI_EXT_CAP_ID_RCEC] = 0, /* root only - don't care */ + [PCI_EXT_CAP_ID_MFVC] = 0xFF, + [PCI_EXT_CAP_ID_VC9] = 0xFF, /* same as CAP_ID_VC */ + [PCI_EXT_CAP_ID_RCRB] = 0, /* root only - don't care */ + [PCI_EXT_CAP_ID_VNDR] = 0xFF, + [PCI_EXT_CAP_ID_CAC] = 0, /* obsolete */ + [PCI_EXT_CAP_ID_ACS] = 0xFF, + [PCI_EXT_CAP_ID_ARI] = PCI_EXT_CAP_ARI_SIZEOF, + [PCI_EXT_CAP_ID_ATS] = PCI_EXT_CAP_ATS_SIZEOF, + [PCI_EXT_CAP_ID_SRIOV] = PCI_EXT_CAP_SRIOV_SIZEOF, + [PCI_EXT_CAP_ID_MRIOV] = 0, /* not yet */ + [PCI_EXT_CAP_ID_MCAST] = PCI_EXT_CAP_MCAST_ENDPOINT_SIZEOF, + [PCI_EXT_CAP_ID_PRI] = PCI_EXT_CAP_PRI_SIZEOF, + [PCI_EXT_CAP_ID_AMD_XXX] = 0, /* not yet */ + [PCI_EXT_CAP_ID_REBAR] = 0xFF, + [PCI_EXT_CAP_ID_DPA] = 0xFF, + [PCI_EXT_CAP_ID_TPH] = 0xFF, + [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, + [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ + [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ + [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ +}; + +/* + * Read/Write Permission Bits - one bit for each bit in capability + * Any field can be read if it exists, but what is read depends on + * whether the field is 'virtualized', or just pass thru to the + * hardware. Any virtualized field is also virtualized for writes. + * Writes are only permitted if they have a 1 bit here. + */ +struct perm_bits { + u8 *virt; /* read/write virtual data, not hw */ + u8 *write; /* writeable bits */ + int (*readfn)(struct vfio_pci_device *vdev, int pos, int count, + struct perm_bits *perm, int offset, __le32 *val); + int (*writefn)(struct vfio_pci_device *vdev, int pos, int count, + struct perm_bits *perm, int offset, __le32 val); +}; + +#define NO_VIRT 0 +#define ALL_VIRT 0xFFFFFFFFU +#define NO_WRITE 0 +#define ALL_WRITE 0xFFFFFFFFU + +static int vfio_user_config_read(struct pci_dev *pdev, int offset, + __le32 *val, int count) +{ + int ret = -EINVAL; + u32 tmp_val = 0; + + switch (count) { + case 1: + { + u8 tmp; + ret = pci_user_read_config_byte(pdev, offset, &tmp); + tmp_val = tmp; + break; + } + case 2: + { + u16 tmp; + ret = pci_user_read_config_word(pdev, offset, &tmp); + tmp_val = tmp; + break; + } + case 4: + ret = pci_user_read_config_dword(pdev, offset, &tmp_val); + break; + } + + *val = cpu_to_le32(tmp_val); + + return pcibios_err_to_errno(ret); +} + +static int vfio_user_config_write(struct pci_dev *pdev, int offset, + __le32 val, int count) +{ + int ret = -EINVAL; + u32 tmp_val = le32_to_cpu(val); + + switch (count) { + case 1: + ret = pci_user_write_config_byte(pdev, offset, tmp_val); + break; + case 2: + ret = pci_user_write_config_word(pdev, offset, tmp_val); + break; + case 4: + ret = pci_user_write_config_dword(pdev, offset, tmp_val); + break; + } + + return pcibios_err_to_errno(ret); +} + +static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + __le32 virt = 0; + + memcpy(val, vdev->vconfig + pos, count); + + memcpy(&virt, perm->virt + offset, count); + + /* Any non-virtualized bits? */ + if (cpu_to_le32(~0U >> (32 - (count * 8))) != virt) { + struct pci_dev *pdev = vdev->pdev; + __le32 phys_val = 0; + int ret; + + ret = vfio_user_config_read(pdev, pos, &phys_val, count); + if (ret) + return ret; + + *val = (phys_val & ~virt) | (*val & virt); + } + + return count; +} + +static int vfio_default_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + __le32 virt = 0, write = 0; + + memcpy(&write, perm->write + offset, count); + + if (!write) + return count; /* drop, no writable bits */ + + memcpy(&virt, perm->virt + offset, count); + + /* Virtualized and writable bits go to vconfig */ + if (write & virt) { + __le32 virt_val = 0; + + memcpy(&virt_val, vdev->vconfig + pos, count); + + virt_val &= ~(write & virt); + virt_val |= (val & (write & virt)); + + memcpy(vdev->vconfig + pos, &virt_val, count); + } + + /* Non-virtualzed and writable bits go to hardware */ + if (write & ~virt) { + struct pci_dev *pdev = vdev->pdev; + __le32 phys_val = 0; + int ret; + + ret = vfio_user_config_read(pdev, pos, &phys_val, count); + if (ret) + return ret; + + phys_val &= ~(write & ~virt); + phys_val |= (val & (write & ~virt)); + + ret = vfio_user_config_write(pdev, pos, phys_val, count); + if (ret) + return ret; + } + + return count; +} + +/* Allow direct read from hardware, except for capability next pointer */ +static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + int ret; + + ret = vfio_user_config_read(vdev->pdev, pos, val, count); + if (ret) + return pcibios_err_to_errno(ret); + + if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ + if (offset < 4) + memcpy(val, vdev->vconfig + pos, count); + } else if (pos >= PCI_STD_HEADER_SIZEOF) { /* Std cap mangling */ + if (offset == PCI_CAP_LIST_ID && count > 1) + memcpy(val, vdev->vconfig + pos, + min(PCI_CAP_FLAGS, count)); + else if (offset == PCI_CAP_LIST_NEXT) + memcpy(val, vdev->vconfig + pos, 1); + } + + return count; +} + +/* Raw access skips any kind of virtualization */ +static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + int ret; + + ret = vfio_user_config_write(vdev->pdev, pos, val, count); + if (ret) + return ret; + + return count; +} + +static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + int ret; + + ret = vfio_user_config_read(vdev->pdev, pos, val, count); + if (ret) + return pcibios_err_to_errno(ret); + + return count; +} + +/* Default capability regions to read-only, no-virtualization */ +static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { + [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } +}; +static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = { + [0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } +}; +/* + * Default unassigned regions to raw read-write access. Some devices + * require this to function as they hide registers between the gaps in + * config space (be2net). Like MMIO and I/O port registers, we have + * to trust the hardware isolation. + */ +static struct perm_bits unassigned_perms = { + .readfn = vfio_raw_config_read, + .writefn = vfio_raw_config_write +}; + +static void free_perm_bits(struct perm_bits *perm) +{ + kfree(perm->virt); + kfree(perm->write); + perm->virt = NULL; + perm->write = NULL; +} + +static int alloc_perm_bits(struct perm_bits *perm, int size) +{ + /* + * Round up all permission bits to the next dword, this lets us + * ignore whether a read/write exceeds the defined capability + * structure. We can do this because: + * - Standard config space is already dword aligned + * - Capabilities are all dword alinged (bits 0:1 of next reserved) + * - Express capabilities defined as dword aligned + */ + size = round_up(size, 4); + + /* + * Zero state is + * - All Readable, None Writeable, None Virtualized + */ + perm->virt = kzalloc(size, GFP_KERNEL); + perm->write = kzalloc(size, GFP_KERNEL); + if (!perm->virt || !perm->write) { + free_perm_bits(perm); + return -ENOMEM; + } + + perm->readfn = vfio_default_config_read; + perm->writefn = vfio_default_config_write; + + return 0; +} + +/* + * Helper functions for filling in permission tables + */ +static inline void p_setb(struct perm_bits *p, int off, u8 virt, u8 write) +{ + p->virt[off] = virt; + p->write[off] = write; +} + +/* Handle endian-ness - pci and tables are little-endian */ +static inline void p_setw(struct perm_bits *p, int off, u16 virt, u16 write) +{ + *(__le16 *)(&p->virt[off]) = cpu_to_le16(virt); + *(__le16 *)(&p->write[off]) = cpu_to_le16(write); +} + +/* Handle endian-ness - pci and tables are little-endian */ +static inline void p_setd(struct perm_bits *p, int off, u32 virt, u32 write) +{ + *(__le32 *)(&p->virt[off]) = cpu_to_le32(virt); + *(__le32 *)(&p->write[off]) = cpu_to_le32(write); +} + +/* + * Restore the *real* BARs after we detect a FLR or backdoor reset. + * (backdoor = some device specific technique that we didn't catch) + */ +static void vfio_bar_restore(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + u32 *rbar = vdev->rbar; + int i; + + if (pdev->is_virtfn) + return; + + pr_info("%s: %s reset recovery - restoring bars\n", + __func__, dev_name(&pdev->dev)); + + for (i = PCI_BASE_ADDRESS_0; i <= PCI_BASE_ADDRESS_5; i += 4, rbar++) + pci_user_write_config_dword(pdev, i, *rbar); + + pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar); +} + +static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) +{ + unsigned long flags = pci_resource_flags(pdev, bar); + u32 val; + + if (flags & IORESOURCE_IO) + return cpu_to_le32(PCI_BASE_ADDRESS_SPACE_IO); + + val = PCI_BASE_ADDRESS_SPACE_MEMORY; + + if (flags & IORESOURCE_PREFETCH) + val |= PCI_BASE_ADDRESS_MEM_PREFETCH; + + if (flags & IORESOURCE_MEM_64) + val |= PCI_BASE_ADDRESS_MEM_TYPE_64; + + return cpu_to_le32(val); +} + +/* + * Pretend we're hardware and tweak the values of the *virtual* PCI BARs + * to reflect the hardware capabilities. This implements BAR sizing. + */ +static void vfio_bar_fixup(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int i; + __le32 *bar; + u64 mask; + + bar = (__le32 *)&vdev->vconfig[PCI_BASE_ADDRESS_0]; + + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++, bar++) { + if (!pci_resource_start(pdev, i)) { + *bar = 0; /* Unmapped by host = unimplemented to user */ + continue; + } + + mask = ~(pci_resource_len(pdev, i) - 1); + + *bar &= cpu_to_le32((u32)mask); + *bar |= vfio_generate_bar_flags(pdev, i); + + if (*bar & cpu_to_le32(PCI_BASE_ADDRESS_MEM_TYPE_64)) { + bar++; + *bar &= cpu_to_le32((u32)(mask >> 32)); + i++; + } + } + + bar = (__le32 *)&vdev->vconfig[PCI_ROM_ADDRESS]; + + /* + * NB. we expose the actual BAR size here, regardless of whether + * we can read it. When we report the REGION_INFO for the ROM + * we report what PCI tells us is the actual ROM size. + */ + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { + mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); + mask |= PCI_ROM_ADDRESS_ENABLE; + *bar &= cpu_to_le32((u32)mask); + } else + *bar = 0; + + vdev->bardirty = false; +} + +static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + if (is_bar(offset)) /* pos == offset for basic config */ + vfio_bar_fixup(vdev); + + count = vfio_default_config_read(vdev, pos, count, perm, offset, val); + + /* Mask in virtual memory enable for SR-IOV devices */ + if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) { + u16 cmd = le16_to_cpu(*(__le16 *)&vdev->vconfig[PCI_COMMAND]); + u32 tmp_val = le32_to_cpu(*val); + + tmp_val |= cmd & PCI_COMMAND_MEMORY; + *val = cpu_to_le32(tmp_val); + } + + return count; +} + +static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + struct pci_dev *pdev = vdev->pdev; + __le16 *virt_cmd; + u16 new_cmd = 0; + int ret; + + virt_cmd = (__le16 *)&vdev->vconfig[PCI_COMMAND]; + + if (offset == PCI_COMMAND) { + bool phys_mem, virt_mem, new_mem, phys_io, virt_io, new_io; + u16 phys_cmd; + + ret = pci_user_read_config_word(pdev, PCI_COMMAND, &phys_cmd); + if (ret) + return ret; + + new_cmd = le32_to_cpu(val); + + phys_mem = !!(phys_cmd & PCI_COMMAND_MEMORY); + virt_mem = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_MEMORY); + new_mem = !!(new_cmd & PCI_COMMAND_MEMORY); + + phys_io = !!(phys_cmd & PCI_COMMAND_IO); + virt_io = !!(le16_to_cpu(*virt_cmd) & PCI_COMMAND_IO); + new_io = !!(new_cmd & PCI_COMMAND_IO); + + /* + * If the user is writing mem/io enable (new_mem/io) and we + * think it's already enabled (virt_mem/io), but the hardware + * shows it disabled (phys_mem/io, then the device has + * undergone some kind of backdoor reset and needs to be + * restored before we allow it to enable the bars. + * SR-IOV devices will trigger this, but we catch them later + */ + if ((new_mem && virt_mem && !phys_mem) || + (new_io && virt_io && !phys_io)) + vfio_bar_restore(vdev); + } + + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0) + return count; + + /* + * Save current memory/io enable bits in vconfig to allow for + * the test above next time. + */ + if (offset == PCI_COMMAND) { + u16 mask = PCI_COMMAND_MEMORY | PCI_COMMAND_IO; + + *virt_cmd &= cpu_to_le16(~mask); + *virt_cmd |= cpu_to_le16(new_cmd & mask); + } + + /* Emulate INTx disable */ + if (offset >= PCI_COMMAND && offset <= PCI_COMMAND + 1) { + bool virt_intx_disable; + + virt_intx_disable = !!(le16_to_cpu(*virt_cmd) & + PCI_COMMAND_INTX_DISABLE); + + if (virt_intx_disable && !vdev->virq_disabled) { + vdev->virq_disabled = true; + vfio_pci_intx_mask(vdev); + } else if (!virt_intx_disable && vdev->virq_disabled) { + vdev->virq_disabled = false; + vfio_pci_intx_unmask(vdev); + } + } + + if (is_bar(offset)) + vdev->bardirty = true; + + return count; +} + +/* Permissions for the Basic PCI Header */ +static int __init init_pci_cap_basic_perm(struct perm_bits *perm) +{ + if (alloc_perm_bits(perm, PCI_STD_HEADER_SIZEOF)) + return -ENOMEM; + + perm->readfn = vfio_basic_config_read; + perm->writefn = vfio_basic_config_write; + + /* Virtualized for SR-IOV functions, which just have FFFF */ + p_setw(perm, PCI_VENDOR_ID, (u16)ALL_VIRT, NO_WRITE); + p_setw(perm, PCI_DEVICE_ID, (u16)ALL_VIRT, NO_WRITE); + + /* + * Virtualize INTx disable, we use it internally for interrupt + * control and can emulate it for non-PCI 2.3 devices. + */ + p_setw(perm, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE, (u16)ALL_WRITE); + + /* Virtualize capability list, we might want to skip/disable */ + p_setw(perm, PCI_STATUS, PCI_STATUS_CAP_LIST, NO_WRITE); + + /* No harm to write */ + p_setb(perm, PCI_CACHE_LINE_SIZE, NO_VIRT, (u8)ALL_WRITE); + p_setb(perm, PCI_LATENCY_TIMER, NO_VIRT, (u8)ALL_WRITE); + p_setb(perm, PCI_BIST, NO_VIRT, (u8)ALL_WRITE); + + /* Virtualize all bars, can't touch the real ones */ + p_setd(perm, PCI_BASE_ADDRESS_0, ALL_VIRT, ALL_WRITE); + p_setd(perm, PCI_BASE_ADDRESS_1, ALL_VIRT, ALL_WRITE); + p_setd(perm, PCI_BASE_ADDRESS_2, ALL_VIRT, ALL_WRITE); + p_setd(perm, PCI_BASE_ADDRESS_3, ALL_VIRT, ALL_WRITE); + p_setd(perm, PCI_BASE_ADDRESS_4, ALL_VIRT, ALL_WRITE); + p_setd(perm, PCI_BASE_ADDRESS_5, ALL_VIRT, ALL_WRITE); + p_setd(perm, PCI_ROM_ADDRESS, ALL_VIRT, ALL_WRITE); + + /* Allow us to adjust capability chain */ + p_setb(perm, PCI_CAPABILITY_LIST, (u8)ALL_VIRT, NO_WRITE); + + /* Sometimes used by sw, just virtualize */ + p_setb(perm, PCI_INTERRUPT_LINE, (u8)ALL_VIRT, (u8)ALL_WRITE); + + /* Virtualize interrupt pin to allow hiding INTx */ + p_setb(perm, PCI_INTERRUPT_PIN, (u8)ALL_VIRT, (u8)NO_WRITE); + + return 0; +} + +static int vfio_pm_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0) + return count; + + if (offset == PCI_PM_CTRL) { + pci_power_t state; + + switch (le32_to_cpu(val) & PCI_PM_CTRL_STATE_MASK) { + case 0: + state = PCI_D0; + break; + case 1: + state = PCI_D1; + break; + case 2: + state = PCI_D2; + break; + case 3: + state = PCI_D3hot; + break; + } + + pci_set_power_state(vdev->pdev, state); + } + + return count; +} + +/* Permissions for the Power Management capability */ +static int __init init_pci_cap_pm_perm(struct perm_bits *perm) +{ + if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_PM])) + return -ENOMEM; + + perm->writefn = vfio_pm_config_write; + + /* + * We always virtualize the next field so we can remove + * capabilities from the chain if we want to. + */ + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + + /* + * Power management is defined *per function*, so we can let + * the user change power state, but we trap and initiate the + * change ourselves, so the state bits are read-only. + */ + p_setd(perm, PCI_PM_CTRL, NO_VIRT, ~PCI_PM_CTRL_STATE_MASK); + return 0; +} + +/* Permissions for PCI-X capability */ +static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) +{ + /* Alloc 24, but only 8 are used in v0 */ + if (alloc_perm_bits(perm, PCI_CAP_PCIX_SIZEOF_V2)) + return -ENOMEM; + + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + + p_setw(perm, PCI_X_CMD, NO_VIRT, (u16)ALL_WRITE); + p_setd(perm, PCI_X_ECC_CSR, NO_VIRT, ALL_WRITE); + return 0; +} + +/* Permissions for PCI Express capability */ +static int __init init_pci_cap_exp_perm(struct perm_bits *perm) +{ + /* Alloc larger of two possible sizes */ + if (alloc_perm_bits(perm, PCI_CAP_EXP_ENDPOINT_SIZEOF_V2)) + return -ENOMEM; + + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + + /* + * Allow writes to device control fields (includes FLR!) + * but not to devctl_phantom which could confuse IOMMU + * or to the ARI bit in devctl2 which is set at probe time + */ + p_setw(perm, PCI_EXP_DEVCTL, NO_VIRT, ~PCI_EXP_DEVCTL_PHANTOM); + p_setw(perm, PCI_EXP_DEVCTL2, NO_VIRT, ~PCI_EXP_DEVCTL2_ARI); + return 0; +} + +/* Permissions for Advanced Function capability */ +static int __init init_pci_cap_af_perm(struct perm_bits *perm) +{ + if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_AF])) + return -ENOMEM; + + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + p_setb(perm, PCI_AF_CTRL, NO_VIRT, PCI_AF_CTRL_FLR); + return 0; +} + +/* Permissions for Advanced Error Reporting extended capability */ +static int __init init_pci_ext_cap_err_perm(struct perm_bits *perm) +{ + u32 mask; + + if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_ERR])) + return -ENOMEM; + + /* + * Virtualize the first dword of all express capabilities + * because it includes the next pointer. This lets us later + * remove capabilities from the chain if we need to. + */ + p_setd(perm, 0, ALL_VIRT, NO_WRITE); + + /* Writable bits mask */ + mask = PCI_ERR_UNC_UND | /* Undefined */ + PCI_ERR_UNC_DLP | /* Data Link Protocol */ + PCI_ERR_UNC_SURPDN | /* Surprise Down */ + PCI_ERR_UNC_POISON_TLP | /* Poisoned TLP */ + PCI_ERR_UNC_FCP | /* Flow Control Protocol */ + PCI_ERR_UNC_COMP_TIME | /* Completion Timeout */ + PCI_ERR_UNC_COMP_ABORT | /* Completer Abort */ + PCI_ERR_UNC_UNX_COMP | /* Unexpected Completion */ + PCI_ERR_UNC_RX_OVER | /* Receiver Overflow */ + PCI_ERR_UNC_MALF_TLP | /* Malformed TLP */ + PCI_ERR_UNC_ECRC | /* ECRC Error Status */ + PCI_ERR_UNC_UNSUP | /* Unsupported Request */ + PCI_ERR_UNC_ACSV | /* ACS Violation */ + PCI_ERR_UNC_INTN | /* internal error */ + PCI_ERR_UNC_MCBTLP | /* MC blocked TLP */ + PCI_ERR_UNC_ATOMEG | /* Atomic egress blocked */ + PCI_ERR_UNC_TLPPRE; /* TLP prefix blocked */ + p_setd(perm, PCI_ERR_UNCOR_STATUS, NO_VIRT, mask); + p_setd(perm, PCI_ERR_UNCOR_MASK, NO_VIRT, mask); + p_setd(perm, PCI_ERR_UNCOR_SEVER, NO_VIRT, mask); + + mask = PCI_ERR_COR_RCVR | /* Receiver Error Status */ + PCI_ERR_COR_BAD_TLP | /* Bad TLP Status */ + PCI_ERR_COR_BAD_DLLP | /* Bad DLLP Status */ + PCI_ERR_COR_REP_ROLL | /* REPLAY_NUM Rollover */ + PCI_ERR_COR_REP_TIMER | /* Replay Timer Timeout */ + PCI_ERR_COR_ADV_NFAT | /* Advisory Non-Fatal */ + PCI_ERR_COR_INTERNAL | /* Corrected Internal */ + PCI_ERR_COR_LOG_OVER; /* Header Log Overflow */ + p_setd(perm, PCI_ERR_COR_STATUS, NO_VIRT, mask); + p_setd(perm, PCI_ERR_COR_MASK, NO_VIRT, mask); + + mask = PCI_ERR_CAP_ECRC_GENE | /* ECRC Generation Enable */ + PCI_ERR_CAP_ECRC_CHKE; /* ECRC Check Enable */ + p_setd(perm, PCI_ERR_CAP, NO_VIRT, mask); + return 0; +} + +/* Permissions for Power Budgeting extended capability */ +static int __init init_pci_ext_cap_pwr_perm(struct perm_bits *perm) +{ + if (alloc_perm_bits(perm, pci_ext_cap_length[PCI_EXT_CAP_ID_PWR])) + return -ENOMEM; + + p_setd(perm, 0, ALL_VIRT, NO_WRITE); + + /* Writing the data selector is OK, the info is still read-only */ + p_setb(perm, PCI_PWR_DATA, NO_VIRT, (u8)ALL_WRITE); + return 0; +} + +/* + * Initialize the shared permission tables + */ +void vfio_pci_uninit_perm_bits(void) +{ + free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]); + + free_perm_bits(&cap_perms[PCI_CAP_ID_PM]); + free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]); + free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]); + free_perm_bits(&cap_perms[PCI_CAP_ID_AF]); + + free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_ERR]); + free_perm_bits(&ecap_perms[PCI_EXT_CAP_ID_PWR]); +} + +int __init vfio_pci_init_perm_bits(void) +{ + int ret; + + /* Basic config space */ + ret = init_pci_cap_basic_perm(&cap_perms[PCI_CAP_ID_BASIC]); + + /* Capabilities */ + ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); + cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write; + ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); + cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write; + ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); + ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]); + + /* Extended capabilities */ + ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]); + ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]); + ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write; + + if (ret) + vfio_pci_uninit_perm_bits(); + + return ret; +} + +static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos) +{ + u8 cap; + int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE : + PCI_STD_HEADER_SIZEOF; + cap = vdev->pci_config_map[pos]; + + if (cap == PCI_CAP_ID_BASIC) + return 0; + + /* XXX Can we have to abutting capabilities of the same type? */ + while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap) + pos--; + + return pos; +} + +static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 *val) +{ + /* Update max available queue size from msi_qmax */ + if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { + __le16 *flags; + int start; + + start = vfio_find_cap_start(vdev, pos); + + flags = (__le16 *)&vdev->vconfig[start]; + + *flags &= cpu_to_le16(~PCI_MSI_FLAGS_QMASK); + *flags |= cpu_to_le16(vdev->msi_qmax << 1); + } + + return vfio_default_config_read(vdev, pos, count, perm, offset, val); +} + +static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0) + return count; + + /* Fixup and write configured queue size and enable to hardware */ + if (offset <= PCI_MSI_FLAGS && offset + count >= PCI_MSI_FLAGS) { + __le16 *pflags; + u16 flags; + int start, ret; + + start = vfio_find_cap_start(vdev, pos); + + pflags = (__le16 *)&vdev->vconfig[start + PCI_MSI_FLAGS]; + + flags = le16_to_cpu(*pflags); + + /* MSI is enabled via ioctl */ + if (!is_msi(vdev)) + flags &= ~PCI_MSI_FLAGS_ENABLE; + + /* Check queue size */ + if ((flags & PCI_MSI_FLAGS_QSIZE) >> 4 > vdev->msi_qmax) { + flags &= ~PCI_MSI_FLAGS_QSIZE; + flags |= vdev->msi_qmax << 4; + } + + /* Write back to virt and to hardware */ + *pflags = cpu_to_le16(flags); + ret = pci_user_write_config_word(vdev->pdev, + start + PCI_MSI_FLAGS, + flags); + if (ret) + return pcibios_err_to_errno(ret); + } + + return count; +} + +/* + * MSI determination is per-device, so this routine gets used beyond + * initialization time. Don't add __init + */ +static int init_pci_cap_msi_perm(struct perm_bits *perm, int len, u16 flags) +{ + if (alloc_perm_bits(perm, len)) + return -ENOMEM; + + perm->readfn = vfio_msi_config_read; + perm->writefn = vfio_msi_config_write; + + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + + /* + * The upper byte of the control register is reserved, + * just setup the lower byte. + */ + p_setb(perm, PCI_MSI_FLAGS, (u8)ALL_VIRT, (u8)ALL_WRITE); + p_setd(perm, PCI_MSI_ADDRESS_LO, ALL_VIRT, ALL_WRITE); + if (flags & PCI_MSI_FLAGS_64BIT) { + p_setd(perm, PCI_MSI_ADDRESS_HI, ALL_VIRT, ALL_WRITE); + p_setw(perm, PCI_MSI_DATA_64, (u16)ALL_VIRT, (u16)ALL_WRITE); + if (flags & PCI_MSI_FLAGS_MASKBIT) { + p_setd(perm, PCI_MSI_MASK_64, NO_VIRT, ALL_WRITE); + p_setd(perm, PCI_MSI_PENDING_64, NO_VIRT, ALL_WRITE); + } + } else { + p_setw(perm, PCI_MSI_DATA_32, (u16)ALL_VIRT, (u16)ALL_WRITE); + if (flags & PCI_MSI_FLAGS_MASKBIT) { + p_setd(perm, PCI_MSI_MASK_32, NO_VIRT, ALL_WRITE); + p_setd(perm, PCI_MSI_PENDING_32, NO_VIRT, ALL_WRITE); + } + } + return 0; +} + +/* Determine MSI CAP field length; initialize msi_perms on 1st call per vdev */ +static int vfio_msi_cap_len(struct vfio_pci_device *vdev, u8 pos) +{ + struct pci_dev *pdev = vdev->pdev; + int len, ret; + u16 flags; + + ret = pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &flags); + if (ret) + return pcibios_err_to_errno(ret); + + len = 10; /* Minimum size */ + if (flags & PCI_MSI_FLAGS_64BIT) + len += 4; + if (flags & PCI_MSI_FLAGS_MASKBIT) + len += 10; + + if (vdev->msi_perm) + return len; + + vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL); + if (!vdev->msi_perm) + return -ENOMEM; + + ret = init_pci_cap_msi_perm(vdev->msi_perm, len, flags); + if (ret) + return ret; + + return len; +} + +/* Determine extended capability length for VC (2 & 9) and MFVC */ +static int vfio_vc_cap_len(struct vfio_pci_device *vdev, u16 pos) +{ + struct pci_dev *pdev = vdev->pdev; + u32 tmp; + int ret, evcc, phases, vc_arb; + int len = PCI_CAP_VC_BASE_SIZEOF; + + ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP1, &tmp); + if (ret) + return pcibios_err_to_errno(ret); + + evcc = tmp & PCI_VC_CAP1_EVCC; /* extended vc count */ + ret = pci_read_config_dword(pdev, pos + PCI_VC_PORT_CAP2, &tmp); + if (ret) + return pcibios_err_to_errno(ret); + + if (tmp & PCI_VC_CAP2_128_PHASE) + phases = 128; + else if (tmp & PCI_VC_CAP2_64_PHASE) + phases = 64; + else if (tmp & PCI_VC_CAP2_32_PHASE) + phases = 32; + else + phases = 0; + + vc_arb = phases * 4; + + /* + * Port arbitration tables are root & switch only; + * function arbitration tables are function 0 only. + * In either case, we'll never let user write them so + * we don't care how big they are + */ + len += (1 + evcc) * PCI_CAP_VC_PER_VC_SIZEOF; + if (vc_arb) { + len = round_up(len, 16); + len += vc_arb / 8; + } + return len; +} + +static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) +{ + struct pci_dev *pdev = vdev->pdev; + u32 dword; + u16 word; + u8 byte; + int ret; + + switch (cap) { + case PCI_CAP_ID_MSI: + return vfio_msi_cap_len(vdev, pos); + case PCI_CAP_ID_PCIX: + ret = pci_read_config_word(pdev, pos + PCI_X_CMD, &word); + if (ret) + return pcibios_err_to_errno(ret); + + if (PCI_X_CMD_VERSION(word)) { + /* Test for extended capabilities */ + pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); + vdev->extended_caps = (dword != 0); + return PCI_CAP_PCIX_SIZEOF_V2; + } else + return PCI_CAP_PCIX_SIZEOF_V0; + case PCI_CAP_ID_VNDR: + /* length follows next field */ + ret = pci_read_config_byte(pdev, pos + PCI_CAP_FLAGS, &byte); + if (ret) + return pcibios_err_to_errno(ret); + + return byte; + case PCI_CAP_ID_EXP: + /* Test for extended capabilities */ + pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); + vdev->extended_caps = (dword != 0); + + /* length based on version */ + if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) + return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1; + else + return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2; + case PCI_CAP_ID_HT: + ret = pci_read_config_byte(pdev, pos + 3, &byte); + if (ret) + return pcibios_err_to_errno(ret); + + return (byte & HT_3BIT_CAP_MASK) ? + HT_CAP_SIZEOF_SHORT : HT_CAP_SIZEOF_LONG; + case PCI_CAP_ID_SATA: + ret = pci_read_config_byte(pdev, pos + PCI_SATA_REGS, &byte); + if (ret) + return pcibios_err_to_errno(ret); + + byte &= PCI_SATA_REGS_MASK; + if (byte == PCI_SATA_REGS_INLINE) + return PCI_SATA_SIZEOF_LONG; + else + return PCI_SATA_SIZEOF_SHORT; + default: + pr_warn("%s: %s unknown length for pci cap 0x%x@0x%x\n", + dev_name(&pdev->dev), __func__, cap, pos); + } + + return 0; +} + +static int vfio_ext_cap_len(struct vfio_pci_device *vdev, u16 ecap, u16 epos) +{ + struct pci_dev *pdev = vdev->pdev; + u8 byte; + u32 dword; + int ret; + + switch (ecap) { + case PCI_EXT_CAP_ID_VNDR: + ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); + if (ret) + return pcibios_err_to_errno(ret); + + return dword >> PCI_VSEC_HDR_LEN_SHIFT; + case PCI_EXT_CAP_ID_VC: + case PCI_EXT_CAP_ID_VC9: + case PCI_EXT_CAP_ID_MFVC: + return vfio_vc_cap_len(vdev, epos); + case PCI_EXT_CAP_ID_ACS: + ret = pci_read_config_byte(pdev, epos + PCI_ACS_CAP, &byte); + if (ret) + return pcibios_err_to_errno(ret); + + if (byte & PCI_ACS_EC) { + int bits; + + ret = pci_read_config_byte(pdev, + epos + PCI_ACS_EGRESS_BITS, + &byte); + if (ret) + return pcibios_err_to_errno(ret); + + bits = byte ? round_up(byte, 32) : 256; + return 8 + (bits / 8); + } + return 8; + + case PCI_EXT_CAP_ID_REBAR: + ret = pci_read_config_byte(pdev, epos + PCI_REBAR_CTRL, &byte); + if (ret) + return pcibios_err_to_errno(ret); + + byte &= PCI_REBAR_CTRL_NBAR_MASK; + byte >>= PCI_REBAR_CTRL_NBAR_SHIFT; + + return 4 + (byte * 8); + case PCI_EXT_CAP_ID_DPA: + ret = pci_read_config_byte(pdev, epos + PCI_DPA_CAP, &byte); + if (ret) + return pcibios_err_to_errno(ret); + + byte &= PCI_DPA_CAP_SUBSTATE_MASK; + return PCI_DPA_BASE_SIZEOF + byte + 1; + case PCI_EXT_CAP_ID_TPH: + ret = pci_read_config_dword(pdev, epos + PCI_TPH_CAP, &dword); + if (ret) + return pcibios_err_to_errno(ret); + + if ((dword & PCI_TPH_CAP_LOC_MASK) == PCI_TPH_LOC_CAP) { + int sts; + + sts = dword & PCI_TPH_CAP_ST_MASK; + sts >>= PCI_TPH_CAP_ST_SHIFT; + return PCI_TPH_BASE_SIZEOF + (sts * 2) + 2; + } + return PCI_TPH_BASE_SIZEOF; + default: + pr_warn("%s: %s unknown length for pci ecap 0x%x@0x%x\n", + dev_name(&pdev->dev), __func__, ecap, epos); + } + + return 0; +} + +static int vfio_fill_vconfig_bytes(struct vfio_pci_device *vdev, + int offset, int size) +{ + struct pci_dev *pdev = vdev->pdev; + int ret = 0; + + /* + * We try to read physical config space in the largest chunks + * we can, assuming that all of the fields support dword access. + * pci_save_state() makes this same assumption and seems to do ok. + */ + while (size) { + int filled; + + if (size >= 4 && !(offset % 4)) { + __le32 *dwordp = (__le32 *)&vdev->vconfig[offset]; + u32 dword; + + ret = pci_read_config_dword(pdev, offset, &dword); + if (ret) + return ret; + *dwordp = cpu_to_le32(dword); + filled = 4; + } else if (size >= 2 && !(offset % 2)) { + __le16 *wordp = (__le16 *)&vdev->vconfig[offset]; + u16 word; + + ret = pci_read_config_word(pdev, offset, &word); + if (ret) + return ret; + *wordp = cpu_to_le16(word); + filled = 2; + } else { + u8 *byte = &vdev->vconfig[offset]; + ret = pci_read_config_byte(pdev, offset, byte); + if (ret) + return ret; + filled = 1; + } + + offset += filled; + size -= filled; + } + + return ret; +} + +static int vfio_cap_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + u8 *map = vdev->pci_config_map; + u16 status; + u8 pos, *prev, cap; + int loops, ret, caps = 0; + + /* Any capabilities? */ + ret = pci_read_config_word(pdev, PCI_STATUS, &status); + if (ret) + return ret; + + if (!(status & PCI_STATUS_CAP_LIST)) + return 0; /* Done */ + + ret = pci_read_config_byte(pdev, PCI_CAPABILITY_LIST, &pos); + if (ret) + return ret; + + /* Mark the previous position in case we want to skip a capability */ + prev = &vdev->vconfig[PCI_CAPABILITY_LIST]; + + /* We can bound our loop, capabilities are dword aligned */ + loops = (PCI_CFG_SPACE_SIZE - PCI_STD_HEADER_SIZEOF) / PCI_CAP_SIZEOF; + while (pos && loops--) { + u8 next; + int i, len = 0; + + ret = pci_read_config_byte(pdev, pos, &cap); + if (ret) + return ret; + + ret = pci_read_config_byte(pdev, + pos + PCI_CAP_LIST_NEXT, &next); + if (ret) + return ret; + + if (cap <= PCI_CAP_ID_MAX) { + len = pci_cap_length[cap]; + if (len == 0xFF) { /* Variable length */ + len = vfio_cap_len(vdev, cap, pos); + if (len < 0) + return len; + } + } + + if (!len) { + pr_info("%s: %s hiding cap 0x%x\n", + __func__, dev_name(&pdev->dev), cap); + *prev = next; + pos = next; + continue; + } + + /* Sanity check, do we overlap other capabilities? */ + for (i = 0; i < len; i++) { + if (likely(map[pos + i] == PCI_CAP_ID_INVALID)) + continue; + + pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n", + __func__, dev_name(&pdev->dev), + pos + i, map[pos + i], cap); + } + + memset(map + pos, cap, len); + ret = vfio_fill_vconfig_bytes(vdev, pos, len); + if (ret) + return ret; + + prev = &vdev->vconfig[pos + PCI_CAP_LIST_NEXT]; + pos = next; + caps++; + } + + /* If we didn't fill any capabilities, clear the status flag */ + if (!caps) { + __le16 *vstatus = (__le16 *)&vdev->vconfig[PCI_STATUS]; + *vstatus &= ~cpu_to_le16(PCI_STATUS_CAP_LIST); + } + + return 0; +} + +static int vfio_ecap_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + u8 *map = vdev->pci_config_map; + u16 epos; + __le32 *prev = NULL; + int loops, ret, ecaps = 0; + + if (!vdev->extended_caps) + return 0; + + epos = PCI_CFG_SPACE_SIZE; + + loops = (pdev->cfg_size - PCI_CFG_SPACE_SIZE) / PCI_CAP_SIZEOF; + + while (loops-- && epos >= PCI_CFG_SPACE_SIZE) { + u32 header; + u16 ecap; + int i, len = 0; + bool hidden = false; + + ret = pci_read_config_dword(pdev, epos, &header); + if (ret) + return ret; + + ecap = PCI_EXT_CAP_ID(header); + + if (ecap <= PCI_EXT_CAP_ID_MAX) { + len = pci_ext_cap_length[ecap]; + if (len == 0xFF) { + len = vfio_ext_cap_len(vdev, ecap, epos); + if (len < 0) + return ret; + } + } + + if (!len) { + pr_info("%s: %s hiding ecap 0x%x@0x%x\n", + __func__, dev_name(&pdev->dev), ecap, epos); + + /* If not the first in the chain, we can skip over it */ + if (prev) { + u32 val = epos = PCI_EXT_CAP_NEXT(header); + *prev &= cpu_to_le32(~(0xffcU << 20)); + *prev |= cpu_to_le32(val << 20); + continue; + } + + /* + * Otherwise, fill in a placeholder, the direct + * readfn will virtualize this automatically + */ + len = PCI_CAP_SIZEOF; + hidden = true; + } + + for (i = 0; i < len; i++) { + if (likely(map[epos + i] == PCI_CAP_ID_INVALID)) + continue; + + pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n", + __func__, dev_name(&pdev->dev), + epos + i, map[epos + i], ecap); + } + + /* + * Even though ecap is 2 bytes, we're currently a long way + * from exceeding 1 byte capabilities. If we ever make it + * up to 0xFF we'll need to up this to a two-byte, byte map. + */ + BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID); + + memset(map + epos, ecap, len); + ret = vfio_fill_vconfig_bytes(vdev, epos, len); + if (ret) + return ret; + + /* + * If we're just using this capability to anchor the list, + * hide the real ID. Only count real ecaps. XXX PCI spec + * indicates to use cap id = 0, version = 0, next = 0 if + * ecaps are absent, hope users check all the way to next. + */ + if (hidden) + *(__le32 *)&vdev->vconfig[epos] &= + cpu_to_le32((0xffcU << 20)); + else + ecaps++; + + prev = (__le32 *)&vdev->vconfig[epos]; + epos = PCI_EXT_CAP_NEXT(header); + } + + if (!ecaps) + *(u32 *)&vdev->vconfig[PCI_CFG_SPACE_SIZE] = 0; + + return 0; +} + +/* + * For each device we allocate a pci_config_map that indicates the + * capability occupying each dword and thus the struct perm_bits we + * use for read and write. We also allocate a virtualized config + * space which tracks reads and writes to bits that we emulate for + * the user. Initial values filled from device. + * + * Using shared stuct perm_bits between all vfio-pci devices saves + * us from allocating cfg_size buffers for virt and write for every + * device. We could remove vconfig and allocate individual buffers + * for each area requring emulated bits, but the array of pointers + * would be comparable in size (at least for standard config space). + */ +int vfio_config_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + u8 *map, *vconfig; + int ret; + + /* + * Config space, caps and ecaps are all dword aligned, so we could + * use one byte per dword to record the type. However, there are + * no requiremenst on the length of a capability, so the gap between + * capabilities needs byte granularity. + */ + map = kmalloc(pdev->cfg_size, GFP_KERNEL); + if (!map) + return -ENOMEM; + + vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL); + if (!vconfig) { + kfree(map); + return -ENOMEM; + } + + vdev->pci_config_map = map; + vdev->vconfig = vconfig; + + memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF); + memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID, + pdev->cfg_size - PCI_STD_HEADER_SIZEOF); + + ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF); + if (ret) + goto out; + + vdev->bardirty = true; + + /* + * XXX can we just pci_load_saved_state/pci_restore_state? + * may need to rebuild vconfig after that + */ + + /* For restore after reset */ + vdev->rbar[0] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_0]); + vdev->rbar[1] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_1]); + vdev->rbar[2] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_2]); + vdev->rbar[3] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_3]); + vdev->rbar[4] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_4]); + vdev->rbar[5] = le32_to_cpu(*(__le32 *)&vconfig[PCI_BASE_ADDRESS_5]); + vdev->rbar[6] = le32_to_cpu(*(__le32 *)&vconfig[PCI_ROM_ADDRESS]); + + if (pdev->is_virtfn) { + *(__le16 *)&vconfig[PCI_VENDOR_ID] = cpu_to_le16(pdev->vendor); + *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device); + } + + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX)) + vconfig[PCI_INTERRUPT_PIN] = 0; + + ret = vfio_cap_init(vdev); + if (ret) + goto out; + + ret = vfio_ecap_init(vdev); + if (ret) + goto out; + + return 0; + +out: + kfree(map); + vdev->pci_config_map = NULL; + kfree(vconfig); + vdev->vconfig = NULL; + return pcibios_err_to_errno(ret); +} + +void vfio_config_free(struct vfio_pci_device *vdev) +{ + kfree(vdev->vconfig); + vdev->vconfig = NULL; + kfree(vdev->pci_config_map); + vdev->pci_config_map = NULL; + kfree(vdev->msi_perm); + vdev->msi_perm = NULL; +} + +/* + * Find the remaining number of bytes in a dword that match the given + * position. Stop at either the end of the capability or the dword boundary. + */ +static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev, + loff_t pos) +{ + u8 cap = vdev->pci_config_map[pos]; + size_t i; + + for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++) + /* nop */; + + return i; +} + +static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + struct pci_dev *pdev = vdev->pdev; + struct perm_bits *perm; + __le32 val = 0; + int cap_start = 0, offset; + u8 cap_id; + ssize_t ret; + + if (*ppos < 0 || *ppos >= pdev->cfg_size || + *ppos + count > pdev->cfg_size) + return -EFAULT; + + /* + * Chop accesses into aligned chunks containing no more than a + * single capability. Caller increments to the next chunk. + */ + count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos)); + if (count >= 4 && !(*ppos % 4)) + count = 4; + else if (count >= 2 && !(*ppos % 2)) + count = 2; + else + count = 1; + + ret = count; + + cap_id = vdev->pci_config_map[*ppos]; + + if (cap_id == PCI_CAP_ID_INVALID) { + perm = &unassigned_perms; + cap_start = *ppos; + } else { + if (*ppos >= PCI_CFG_SPACE_SIZE) { + WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); + + perm = &ecap_perms[cap_id]; + cap_start = vfio_find_cap_start(vdev, *ppos); + } else { + WARN_ON(cap_id > PCI_CAP_ID_MAX); + + perm = &cap_perms[cap_id]; + + if (cap_id == PCI_CAP_ID_MSI) + perm = vdev->msi_perm; + + if (cap_id > PCI_CAP_ID_BASIC) + cap_start = vfio_find_cap_start(vdev, *ppos); + } + } + + WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC); + WARN_ON(cap_start > *ppos); + + offset = *ppos - cap_start; + + if (iswrite) { + if (!perm->writefn) + return ret; + + if (copy_from_user(&val, buf, count)) + return -EFAULT; + + ret = perm->writefn(vdev, *ppos, count, perm, offset, val); + } else { + if (perm->readfn) { + ret = perm->readfn(vdev, *ppos, count, + perm, offset, &val); + if (ret < 0) + return ret; + } + + if (copy_to_user(buf, &val, count)) + return -EFAULT; + } + + return ret; +} + +ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + size_t done = 0; + int ret = 0; + loff_t pos = *ppos; + + pos &= VFIO_PCI_OFFSET_MASK; + + while (count) { + ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite); + if (ret < 0) + return ret; + + count -= ret; + done += ret; + buf += ret; + pos += ret; + } + + *ppos += done; + + return done; +} diff --git a/kernel/drivers/vfio/pci/vfio_pci_intrs.c b/kernel/drivers/vfio/pci/vfio_pci_intrs.c new file mode 100644 index 000000000..1f577b4ac --- /dev/null +++ b/kernel/drivers/vfio/pci/vfio_pci_intrs.c @@ -0,0 +1,675 @@ +/* + * VFIO PCI interrupt handling + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include <linux/device.h> +#include <linux/interrupt.h> +#include <linux/eventfd.h> +#include <linux/msi.h> +#include <linux/pci.h> +#include <linux/file.h> +#include <linux/vfio.h> +#include <linux/wait.h> +#include <linux/slab.h> + +#include "vfio_pci_private.h" + +/* + * INTx + */ +static void vfio_send_intx_eventfd(void *opaque, void *unused) +{ + struct vfio_pci_device *vdev = opaque; + + if (likely(is_intx(vdev) && !vdev->virq_disabled)) + eventfd_signal(vdev->ctx[0].trigger, 1); +} + +void vfio_pci_intx_mask(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + unsigned long flags; + + spin_lock_irqsave(&vdev->irqlock, flags); + + /* + * Masking can come from interrupt, ioctl, or config space + * via INTx disable. The latter means this can get called + * even when not using intx delivery. In this case, just + * try to have the physical bit follow the virtual bit. + */ + if (unlikely(!is_intx(vdev))) { + if (vdev->pci_2_3) + pci_intx(pdev, 0); + } else if (!vdev->ctx[0].masked) { + /* + * Can't use check_and_mask here because we always want to + * mask, not just when something is pending. + */ + if (vdev->pci_2_3) + pci_intx(pdev, 0); + else + disable_irq_nosync(pdev->irq); + + vdev->ctx[0].masked = true; + } + + spin_unlock_irqrestore(&vdev->irqlock, flags); +} + +/* + * If this is triggered by an eventfd, we can't call eventfd_signal + * or else we'll deadlock on the eventfd wait queue. Return >0 when + * a signal is necessary, which can then be handled via a work queue + * or directly depending on the caller. + */ +static int vfio_pci_intx_unmask_handler(void *opaque, void *unused) +{ + struct vfio_pci_device *vdev = opaque; + struct pci_dev *pdev = vdev->pdev; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&vdev->irqlock, flags); + + /* + * Unmasking comes from ioctl or config, so again, have the + * physical bit follow the virtual even when not using INTx. + */ + if (unlikely(!is_intx(vdev))) { + if (vdev->pci_2_3) + pci_intx(pdev, 1); + } else if (vdev->ctx[0].masked && !vdev->virq_disabled) { + /* + * A pending interrupt here would immediately trigger, + * but we can avoid that overhead by just re-sending + * the interrupt to the user. + */ + if (vdev->pci_2_3) { + if (!pci_check_and_unmask_intx(pdev)) + ret = 1; + } else + enable_irq(pdev->irq); + + vdev->ctx[0].masked = (ret > 0); + } + + spin_unlock_irqrestore(&vdev->irqlock, flags); + + return ret; +} + +void vfio_pci_intx_unmask(struct vfio_pci_device *vdev) +{ + if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0) + vfio_send_intx_eventfd(vdev, NULL); +} + +static irqreturn_t vfio_intx_handler(int irq, void *dev_id) +{ + struct vfio_pci_device *vdev = dev_id; + unsigned long flags; + int ret = IRQ_NONE; + + spin_lock_irqsave(&vdev->irqlock, flags); + + if (!vdev->pci_2_3) { + disable_irq_nosync(vdev->pdev->irq); + vdev->ctx[0].masked = true; + ret = IRQ_HANDLED; + } else if (!vdev->ctx[0].masked && /* may be shared */ + pci_check_and_mask_intx(vdev->pdev)) { + vdev->ctx[0].masked = true; + ret = IRQ_HANDLED; + } + + spin_unlock_irqrestore(&vdev->irqlock, flags); + + if (ret == IRQ_HANDLED) + vfio_send_intx_eventfd(vdev, NULL); + + return ret; +} + +static int vfio_intx_enable(struct vfio_pci_device *vdev) +{ + if (!is_irq_none(vdev)) + return -EINVAL; + + if (!vdev->pdev->irq) + return -ENODEV; + + vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + if (!vdev->ctx) + return -ENOMEM; + + vdev->num_ctx = 1; + + /* + * If the virtual interrupt is masked, restore it. Devices + * supporting DisINTx can be masked at the hardware level + * here, non-PCI-2.3 devices will have to wait until the + * interrupt is enabled. + */ + vdev->ctx[0].masked = vdev->virq_disabled; + if (vdev->pci_2_3) + pci_intx(vdev->pdev, !vdev->ctx[0].masked); + + vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX; + + return 0; +} + +static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd) +{ + struct pci_dev *pdev = vdev->pdev; + unsigned long irqflags = IRQF_SHARED; + struct eventfd_ctx *trigger; + unsigned long flags; + int ret; + + if (vdev->ctx[0].trigger) { + free_irq(pdev->irq, vdev); + kfree(vdev->ctx[0].name); + eventfd_ctx_put(vdev->ctx[0].trigger); + vdev->ctx[0].trigger = NULL; + } + + if (fd < 0) /* Disable only */ + return 0; + + vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)", + pci_name(pdev)); + if (!vdev->ctx[0].name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + kfree(vdev->ctx[0].name); + return PTR_ERR(trigger); + } + + vdev->ctx[0].trigger = trigger; + + if (!vdev->pci_2_3) + irqflags = 0; + + ret = request_irq(pdev->irq, vfio_intx_handler, + irqflags, vdev->ctx[0].name, vdev); + if (ret) { + vdev->ctx[0].trigger = NULL; + kfree(vdev->ctx[0].name); + eventfd_ctx_put(trigger); + return ret; + } + + /* + * INTx disable will stick across the new irq setup, + * disable_irq won't. + */ + spin_lock_irqsave(&vdev->irqlock, flags); + if (!vdev->pci_2_3 && vdev->ctx[0].masked) + disable_irq_nosync(pdev->irq); + spin_unlock_irqrestore(&vdev->irqlock, flags); + + return 0; +} + +static void vfio_intx_disable(struct vfio_pci_device *vdev) +{ + vfio_intx_set_signal(vdev, -1); + vfio_virqfd_disable(&vdev->ctx[0].unmask); + vfio_virqfd_disable(&vdev->ctx[0].mask); + vdev->irq_type = VFIO_PCI_NUM_IRQS; + vdev->num_ctx = 0; + kfree(vdev->ctx); +} + +/* + * MSI/MSI-X + */ +static irqreturn_t vfio_msihandler(int irq, void *arg) +{ + struct eventfd_ctx *trigger = arg; + + eventfd_signal(trigger, 1); + return IRQ_HANDLED; +} + +static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!is_irq_none(vdev)) + return -EINVAL; + + vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + if (!vdev->ctx) + return -ENOMEM; + + if (msix) { + int i; + + vdev->msix = kzalloc(nvec * sizeof(struct msix_entry), + GFP_KERNEL); + if (!vdev->msix) { + kfree(vdev->ctx); + return -ENOMEM; + } + + for (i = 0; i < nvec; i++) + vdev->msix[i].entry = i; + + ret = pci_enable_msix_range(pdev, vdev->msix, 1, nvec); + if (ret < nvec) { + if (ret > 0) + pci_disable_msix(pdev); + kfree(vdev->msix); + kfree(vdev->ctx); + return ret; + } + } else { + ret = pci_enable_msi_range(pdev, 1, nvec); + if (ret < nvec) { + if (ret > 0) + pci_disable_msi(pdev); + kfree(vdev->ctx); + return ret; + } + } + + vdev->num_ctx = nvec; + vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX : + VFIO_PCI_MSI_IRQ_INDEX; + + if (!msix) { + /* + * Compute the virtual hardware field for max msi vectors - + * it is the log base 2 of the number of vectors. + */ + vdev->msi_qmax = fls(nvec * 2 - 1) - 1; + } + + return 0; +} + +static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, + int vector, int fd, bool msix) +{ + struct pci_dev *pdev = vdev->pdev; + int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector; + char *name = msix ? "vfio-msix" : "vfio-msi"; + struct eventfd_ctx *trigger; + int ret; + + if (vector >= vdev->num_ctx) + return -EINVAL; + + if (vdev->ctx[vector].trigger) { + free_irq(irq, vdev->ctx[vector].trigger); + kfree(vdev->ctx[vector].name); + eventfd_ctx_put(vdev->ctx[vector].trigger); + vdev->ctx[vector].trigger = NULL; + } + + if (fd < 0) + return 0; + + vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)", + name, vector, pci_name(pdev)); + if (!vdev->ctx[vector].name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + kfree(vdev->ctx[vector].name); + return PTR_ERR(trigger); + } + + /* + * The MSIx vector table resides in device memory which may be cleared + * via backdoor resets. We don't allow direct access to the vector + * table so even if a userspace driver attempts to save/restore around + * such a reset it would be unsuccessful. To avoid this, restore the + * cached value of the message prior to enabling. + */ + if (msix) { + struct msi_msg msg; + + get_cached_msi_msg(irq, &msg); + pci_write_msi_msg(irq, &msg); + } + + ret = request_irq(irq, vfio_msihandler, 0, + vdev->ctx[vector].name, trigger); + if (ret) { + kfree(vdev->ctx[vector].name); + eventfd_ctx_put(trigger); + return ret; + } + + vdev->ctx[vector].trigger = trigger; + + return 0; +} + +static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start, + unsigned count, int32_t *fds, bool msix) +{ + int i, j, ret = 0; + + if (start + count > vdev->num_ctx) + return -EINVAL; + + for (i = 0, j = start; i < count && !ret; i++, j++) { + int fd = fds ? fds[i] : -1; + ret = vfio_msi_set_vector_signal(vdev, j, fd, msix); + } + + if (ret) { + for (--j; j >= start; j--) + vfio_msi_set_vector_signal(vdev, j, -1, msix); + } + + return ret; +} + +static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix) +{ + struct pci_dev *pdev = vdev->pdev; + int i; + + vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix); + + for (i = 0; i < vdev->num_ctx; i++) { + vfio_virqfd_disable(&vdev->ctx[i].unmask); + vfio_virqfd_disable(&vdev->ctx[i].mask); + } + + if (msix) { + pci_disable_msix(vdev->pdev); + kfree(vdev->msix); + } else + pci_disable_msi(pdev); + + vdev->irq_type = VFIO_PCI_NUM_IRQS; + vdev->num_ctx = 0; + kfree(vdev->ctx); +} + +/* + * IOCTL support + */ +static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (!is_intx(vdev) || start != 0 || count != 1) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + vfio_pci_intx_unmask(vdev); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t unmask = *(uint8_t *)data; + if (unmask) + vfio_pci_intx_unmask(vdev); + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t fd = *(int32_t *)data; + if (fd >= 0) + return vfio_virqfd_enable((void *) vdev, + vfio_pci_intx_unmask_handler, + vfio_send_intx_eventfd, NULL, + &vdev->ctx[0].unmask, fd); + + vfio_virqfd_disable(&vdev->ctx[0].unmask); + } + + return 0; +} + +static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (!is_intx(vdev) || start != 0 || count != 1) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + vfio_pci_intx_mask(vdev); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + if (mask) + vfio_pci_intx_mask(vdev); + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + return -ENOTTY; /* XXX implement me */ + } + + return 0; +} + +static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { + vfio_intx_disable(vdev); + return 0; + } + + if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t fd = *(int32_t *)data; + int ret; + + if (is_intx(vdev)) + return vfio_intx_set_signal(vdev, fd); + + ret = vfio_intx_enable(vdev); + if (ret) + return ret; + + ret = vfio_intx_set_signal(vdev, fd); + if (ret) + vfio_intx_disable(vdev); + + return ret; + } + + if (!is_intx(vdev)) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + vfio_send_intx_eventfd(vdev, NULL); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + if (trigger) + vfio_send_intx_eventfd(vdev, NULL); + } + return 0; +} + +static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + int i; + bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX) ? true : false; + + if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) { + vfio_msi_disable(vdev, msix); + return 0; + } + + if (!(irq_is(vdev, index) || is_irq_none(vdev))) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t *fds = data; + int ret; + + if (vdev->irq_type == index) + return vfio_msi_set_block(vdev, start, count, + fds, msix); + + ret = vfio_msi_enable(vdev, start + count, msix); + if (ret) + return ret; + + ret = vfio_msi_set_block(vdev, start, count, fds, msix); + if (ret) + vfio_msi_disable(vdev, msix); + + return ret; + } + + if (!irq_is(vdev, index) || start + count > vdev->num_ctx) + return -EINVAL; + + for (i = start; i < start + count; i++) { + if (!vdev->ctx[i].trigger) + continue; + if (flags & VFIO_IRQ_SET_DATA_NONE) { + eventfd_signal(vdev->ctx[i].trigger, 1); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t *bools = data; + if (bools[i - start]) + eventfd_signal(vdev->ctx[i].trigger, 1); + } + } + return 0; +} + +static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, + uint32_t flags, void *data) +{ + int32_t fd = *(int32_t *)data; + + if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) + return -EINVAL; + + /* DATA_NONE/DATA_BOOL enables loopback testing */ + if (flags & VFIO_IRQ_SET_DATA_NONE) { + if (*ctx) + eventfd_signal(*ctx, 1); + return 0; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + if (trigger && *ctx) + eventfd_signal(*ctx, 1); + return 0; + } + + /* Handle SET_DATA_EVENTFD */ + if (fd == -1) { + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; + return 0; + } else if (fd >= 0) { + struct eventfd_ctx *efdctx; + efdctx = eventfd_ctx_fdget(fd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = efdctx; + return 0; + } else + return -EINVAL; +} + +static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_ERR_IRQ_INDEX) + return -EINVAL; + + /* + * We should sanitize start & count, but that wasn't caught + * originally, so this IRQ index must forever ignore them :-( + */ + + return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); +} + +static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data); +} + +int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, + unsigned index, unsigned start, unsigned count, + void *data) +{ + int (*func)(struct vfio_pci_device *vdev, unsigned index, + unsigned start, unsigned count, uint32_t flags, + void *data) = NULL; + + switch (index) { + case VFIO_PCI_INTX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + func = vfio_pci_set_intx_mask; + break; + case VFIO_IRQ_SET_ACTION_UNMASK: + func = vfio_pci_set_intx_unmask; + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_intx_trigger; + break; + } + break; + case VFIO_PCI_MSI_IRQ_INDEX: + case VFIO_PCI_MSIX_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + case VFIO_IRQ_SET_ACTION_UNMASK: + /* XXX Need masking support exported */ + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_msi_trigger; + break; + } + break; + case VFIO_PCI_ERR_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + if (pci_is_pcie(vdev->pdev)) + func = vfio_pci_set_err_trigger; + break; + } + break; + case VFIO_PCI_REQ_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_req_trigger; + break; + } + break; + } + + if (!func) + return -ENOTTY; + + return func(vdev, index, start, count, flags, data); +} diff --git a/kernel/drivers/vfio/pci/vfio_pci_private.h b/kernel/drivers/vfio/pci/vfio_pci_private.h new file mode 100644 index 000000000..ae0e1b4c1 --- /dev/null +++ b/kernel/drivers/vfio/pci/vfio_pci_private.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include <linux/mutex.h> +#include <linux/pci.h> + +#ifndef VFIO_PCI_PRIVATE_H +#define VFIO_PCI_PRIVATE_H + +#define VFIO_PCI_OFFSET_SHIFT 40 + +#define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) +#define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) + +struct vfio_pci_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; +}; + +struct vfio_pci_device { + struct pci_dev *pdev; + void __iomem *barmap[PCI_STD_RESOURCE_END + 1]; + u8 *pci_config_map; + u8 *vconfig; + struct perm_bits *msi_perm; + spinlock_t irqlock; + struct mutex igate; + struct msix_entry *msix; + struct vfio_pci_irq_ctx *ctx; + int num_ctx; + int irq_type; + u8 msi_qmax; + u8 msix_bar; + u16 msix_size; + u32 msix_offset; + u32 rbar[7]; + bool pci_2_3; + bool virq_disabled; + bool reset_works; + bool extended_caps; + bool bardirty; + bool has_vga; + bool needs_reset; + struct pci_saved_state *pci_saved_state; + int refcnt; + struct eventfd_ctx *err_trigger; + struct eventfd_ctx *req_trigger; +}; + +#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) +#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) +#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) +#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) +#define irq_is(vdev, type) (vdev->irq_type == type) + +extern void vfio_pci_intx_mask(struct vfio_pci_device *vdev); +extern void vfio_pci_intx_unmask(struct vfio_pci_device *vdev); + +extern int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, + uint32_t flags, unsigned index, + unsigned start, unsigned count, void *data); + +extern ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +extern int vfio_pci_init_perm_bits(void); +extern void vfio_pci_uninit_perm_bits(void); + +extern int vfio_config_init(struct vfio_pci_device *vdev); +extern void vfio_config_free(struct vfio_pci_device *vdev); +#endif /* VFIO_PCI_PRIVATE_H */ diff --git a/kernel/drivers/vfio/pci/vfio_pci_rdwr.c b/kernel/drivers/vfio/pci/vfio_pci_rdwr.c new file mode 100644 index 000000000..210db24d2 --- /dev/null +++ b/kernel/drivers/vfio/pci/vfio_pci_rdwr.c @@ -0,0 +1,238 @@ +/* + * VFIO PCI I/O Port & MMIO access + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include <linux/fs.h> +#include <linux/pci.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/vgaarb.h> + +#include "vfio_pci_private.h" + +/* + * Read or write from an __iomem region (MMIO or I/O port) with an excluded + * range which is inaccessible. The excluded range drops writes and fills + * reads with -1. This is intended for handling MSI-X vector tables and + * leftover space for ROM BARs. + */ +static ssize_t do_io_rw(void __iomem *io, char __user *buf, + loff_t off, size_t count, size_t x_start, + size_t x_end, bool iswrite) +{ + ssize_t done = 0; + + while (count) { + size_t fillable, filled; + + if (off < x_start) + fillable = min(count, (size_t)(x_start - off)); + else if (off >= x_end) + fillable = count; + else + fillable = 0; + + if (fillable >= 4 && !(off % 4)) { + __le32 val; + + if (iswrite) { + if (copy_from_user(&val, buf, 4)) + return -EFAULT; + + iowrite32(le32_to_cpu(val), io + off); + } else { + val = cpu_to_le32(ioread32(io + off)); + + if (copy_to_user(buf, &val, 4)) + return -EFAULT; + } + + filled = 4; + } else if (fillable >= 2 && !(off % 2)) { + __le16 val; + + if (iswrite) { + if (copy_from_user(&val, buf, 2)) + return -EFAULT; + + iowrite16(le16_to_cpu(val), io + off); + } else { + val = cpu_to_le16(ioread16(io + off)); + + if (copy_to_user(buf, &val, 2)) + return -EFAULT; + } + + filled = 2; + } else if (fillable) { + u8 val; + + if (iswrite) { + if (copy_from_user(&val, buf, 1)) + return -EFAULT; + + iowrite8(val, io + off); + } else { + val = ioread8(io + off); + + if (copy_to_user(buf, &val, 1)) + return -EFAULT; + } + + filled = 1; + } else { + /* Fill reads with -1, drop writes */ + filled = min(count, (size_t)(x_end - off)); + if (!iswrite) { + u8 val = 0xFF; + size_t i; + + for (i = 0; i < filled; i++) + if (copy_to_user(buf + i, &val, 1)) + return -EFAULT; + } + } + + count -= filled; + done += filled; + off += filled; + buf += filled; + } + + return done; +} + +ssize_t vfio_pci_bar_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + struct pci_dev *pdev = vdev->pdev; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + int bar = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + size_t x_start = 0, x_end = 0; + resource_size_t end; + void __iomem *io; + ssize_t done; + + if (!pci_resource_start(pdev, bar)) + return -EINVAL; + + end = pci_resource_len(pdev, bar); + + if (pos >= end) + return -EINVAL; + + count = min(count, (size_t)(end - pos)); + + if (bar == PCI_ROM_RESOURCE) { + /* + * The ROM can fill less space than the BAR, so we start the + * excluded range at the end of the actual ROM. This makes + * filling large ROM BARs much faster. + */ + io = pci_map_rom(pdev, &x_start); + if (!io) + return -ENOMEM; + x_end = end; + } else if (!vdev->barmap[bar]) { + int ret; + + ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); + if (ret) + return ret; + + io = pci_iomap(pdev, bar, 0); + if (!io) { + pci_release_selected_regions(pdev, 1 << bar); + return -ENOMEM; + } + + vdev->barmap[bar] = io; + } else + io = vdev->barmap[bar]; + + if (bar == vdev->msix_bar) { + x_start = vdev->msix_offset; + x_end = vdev->msix_offset + vdev->msix_size; + } + + done = do_io_rw(io, buf, pos, count, x_start, x_end, iswrite); + + if (done >= 0) + *ppos += done; + + if (bar == PCI_ROM_RESOURCE) + pci_unmap_rom(pdev, io); + + return done; +} + +ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + int ret; + loff_t off, pos = *ppos & VFIO_PCI_OFFSET_MASK; + void __iomem *iomem = NULL; + unsigned int rsrc; + bool is_ioport; + ssize_t done; + + if (!vdev->has_vga) + return -EINVAL; + + switch (pos) { + case 0xa0000 ... 0xbffff: + count = min(count, (size_t)(0xc0000 - pos)); + iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1); + off = pos - 0xa0000; + rsrc = VGA_RSRC_LEGACY_MEM; + is_ioport = false; + break; + case 0x3b0 ... 0x3bb: + count = min(count, (size_t)(0x3bc - pos)); + iomem = ioport_map(0x3b0, 0x3bb - 0x3b0 + 1); + off = pos - 0x3b0; + rsrc = VGA_RSRC_LEGACY_IO; + is_ioport = true; + break; + case 0x3c0 ... 0x3df: + count = min(count, (size_t)(0x3e0 - pos)); + iomem = ioport_map(0x3c0, 0x3df - 0x3c0 + 1); + off = pos - 0x3c0; + rsrc = VGA_RSRC_LEGACY_IO; + is_ioport = true; + break; + default: + return -EINVAL; + } + + if (!iomem) + return -ENOMEM; + + ret = vga_get_interruptible(vdev->pdev, rsrc); + if (ret) { + is_ioport ? ioport_unmap(iomem) : iounmap(iomem); + return ret; + } + + done = do_io_rw(iomem, buf, off, count, 0, 0, iswrite); + + vga_put(vdev->pdev, rsrc); + + is_ioport ? ioport_unmap(iomem) : iounmap(iomem); + + if (done >= 0) + *ppos += done; + + return done; +} diff --git a/kernel/drivers/vfio/platform/Kconfig b/kernel/drivers/vfio/platform/Kconfig new file mode 100644 index 000000000..9a4403e2a --- /dev/null +++ b/kernel/drivers/vfio/platform/Kconfig @@ -0,0 +1,20 @@ +config VFIO_PLATFORM + tristate "VFIO support for platform devices" + depends on VFIO && EVENTFD && ARM + select VFIO_VIRQFD + help + Support for platform devices with VFIO. This is required to make + use of platform devices present on the system using the VFIO + framework. + + If you don't know what to do here, say N. + +config VFIO_AMBA + tristate "VFIO support for AMBA devices" + depends on VFIO_PLATFORM && ARM_AMBA + help + Support for ARM AMBA devices with VFIO. This is required to make + use of ARM AMBA devices present on the system using the VFIO + framework. + + If you don't know what to do here, say N. diff --git a/kernel/drivers/vfio/platform/Makefile b/kernel/drivers/vfio/platform/Makefile new file mode 100644 index 000000000..81de144c0 --- /dev/null +++ b/kernel/drivers/vfio/platform/Makefile @@ -0,0 +1,8 @@ + +vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o + +obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o + +vfio-amba-y := vfio_amba.o + +obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o diff --git a/kernel/drivers/vfio/platform/vfio_amba.c b/kernel/drivers/vfio/platform/vfio_amba.c new file mode 100644 index 000000000..ff0331f72 --- /dev/null +++ b/kernel/drivers/vfio/platform/vfio_amba.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis <a.motakis@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <linux/amba/bus.h> + +#include "vfio_platform_private.h" + +#define DRIVER_VERSION "0.10" +#define DRIVER_AUTHOR "Antonios Motakis <a.motakis@virtualopensystems.com>" +#define DRIVER_DESC "VFIO for AMBA devices - User Level meta-driver" + +/* probing devices from the AMBA bus */ + +static struct resource *get_amba_resource(struct vfio_platform_device *vdev, + int i) +{ + struct amba_device *adev = (struct amba_device *) vdev->opaque; + + if (i == 0) + return &adev->res; + + return NULL; +} + +static int get_amba_irq(struct vfio_platform_device *vdev, int i) +{ + struct amba_device *adev = (struct amba_device *) vdev->opaque; + int ret = 0; + + if (i < AMBA_NR_IRQS) + ret = adev->irq[i]; + + /* zero is an unset IRQ for AMBA devices */ + return ret ? ret : -ENXIO; +} + +static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid); + if (!vdev->name) { + kfree(vdev); + return -ENOMEM; + } + + vdev->opaque = (void *) adev; + vdev->flags = VFIO_DEVICE_FLAGS_AMBA; + vdev->get_resource = get_amba_resource; + vdev->get_irq = get_amba_irq; + + ret = vfio_platform_probe_common(vdev, &adev->dev); + if (ret) { + kfree(vdev->name); + kfree(vdev); + } + + return ret; +} + +static int vfio_amba_remove(struct amba_device *adev) +{ + struct vfio_platform_device *vdev; + + vdev = vfio_platform_remove_common(&adev->dev); + if (vdev) { + kfree(vdev->name); + kfree(vdev); + return 0; + } + + return -EINVAL; +} + +static struct amba_id pl330_ids[] = { + { 0, 0 }, +}; + +MODULE_DEVICE_TABLE(amba, pl330_ids); + +static struct amba_driver vfio_amba_driver = { + .probe = vfio_amba_probe, + .remove = vfio_amba_remove, + .id_table = pl330_ids, + .drv = { + .name = "vfio-amba", + .owner = THIS_MODULE, + }, +}; + +module_amba_driver(vfio_amba_driver); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/platform/vfio_platform.c b/kernel/drivers/vfio/platform/vfio_platform.c new file mode 100644 index 000000000..cef645c83 --- /dev/null +++ b/kernel/drivers/vfio/platform/vfio_platform.c @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis <a.motakis@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <linux/platform_device.h> + +#include "vfio_platform_private.h" + +#define DRIVER_VERSION "0.10" +#define DRIVER_AUTHOR "Antonios Motakis <a.motakis@virtualopensystems.com>" +#define DRIVER_DESC "VFIO for platform devices - User Level meta-driver" + +/* probing devices from the linux platform bus */ + +static struct resource *get_platform_resource(struct vfio_platform_device *vdev, + int num) +{ + struct platform_device *dev = (struct platform_device *) vdev->opaque; + int i; + + for (i = 0; i < dev->num_resources; i++) { + struct resource *r = &dev->resource[i]; + + if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) { + if (!num) + return r; + + num--; + } + } + return NULL; +} + +static int get_platform_irq(struct vfio_platform_device *vdev, int i) +{ + struct platform_device *pdev = (struct platform_device *) vdev->opaque; + + return platform_get_irq(pdev, i); +} + +static int vfio_platform_probe(struct platform_device *pdev) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); + if (!vdev) + return -ENOMEM; + + vdev->opaque = (void *) pdev; + vdev->name = pdev->name; + vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM; + vdev->get_resource = get_platform_resource; + vdev->get_irq = get_platform_irq; + + ret = vfio_platform_probe_common(vdev, &pdev->dev); + if (ret) + kfree(vdev); + + return ret; +} + +static int vfio_platform_remove(struct platform_device *pdev) +{ + struct vfio_platform_device *vdev; + + vdev = vfio_platform_remove_common(&pdev->dev); + if (vdev) { + kfree(vdev); + return 0; + } + + return -EINVAL; +} + +static struct platform_driver vfio_platform_driver = { + .probe = vfio_platform_probe, + .remove = vfio_platform_remove, + .driver = { + .name = "vfio-platform", + .owner = THIS_MODULE, + }, +}; + +module_platform_driver(vfio_platform_driver); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/platform/vfio_platform_common.c b/kernel/drivers/vfio/platform/vfio_platform_common.c new file mode 100644 index 000000000..abcff7a1a --- /dev/null +++ b/kernel/drivers/vfio/platform/vfio_platform_common.c @@ -0,0 +1,521 @@ +/* + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis <a.motakis@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> + +#include "vfio_platform_private.h" + +static DEFINE_MUTEX(driver_lock); + +static int vfio_platform_regions_init(struct vfio_platform_device *vdev) +{ + int cnt = 0, i; + + while (vdev->get_resource(vdev, cnt)) + cnt++; + + vdev->regions = kcalloc(cnt, sizeof(struct vfio_platform_region), + GFP_KERNEL); + if (!vdev->regions) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + struct resource *res = + vdev->get_resource(vdev, i); + + if (!res) + goto err; + + vdev->regions[i].addr = res->start; + vdev->regions[i].size = resource_size(res); + vdev->regions[i].flags = 0; + + switch (resource_type(res)) { + case IORESOURCE_MEM: + vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO; + vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ; + if (!(res->flags & IORESOURCE_READONLY)) + vdev->regions[i].flags |= + VFIO_REGION_INFO_FLAG_WRITE; + + /* + * Only regions addressed with PAGE granularity may be + * MMAPed securely. + */ + if (!(vdev->regions[i].addr & ~PAGE_MASK) && + !(vdev->regions[i].size & ~PAGE_MASK)) + vdev->regions[i].flags |= + VFIO_REGION_INFO_FLAG_MMAP; + + break; + case IORESOURCE_IO: + vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO; + break; + default: + goto err; + } + } + + vdev->num_regions = cnt; + + return 0; +err: + kfree(vdev->regions); + return -EINVAL; +} + +static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev) +{ + int i; + + for (i = 0; i < vdev->num_regions; i++) + iounmap(vdev->regions[i].ioaddr); + + vdev->num_regions = 0; + kfree(vdev->regions); +} + +static void vfio_platform_release(void *device_data) +{ + struct vfio_platform_device *vdev = device_data; + + mutex_lock(&driver_lock); + + if (!(--vdev->refcnt)) { + vfio_platform_regions_cleanup(vdev); + vfio_platform_irq_cleanup(vdev); + } + + mutex_unlock(&driver_lock); + + module_put(THIS_MODULE); +} + +static int vfio_platform_open(void *device_data) +{ + struct vfio_platform_device *vdev = device_data; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + mutex_lock(&driver_lock); + + if (!vdev->refcnt) { + ret = vfio_platform_regions_init(vdev); + if (ret) + goto err_reg; + + ret = vfio_platform_irq_init(vdev); + if (ret) + goto err_irq; + } + + vdev->refcnt++; + + mutex_unlock(&driver_lock); + return 0; + +err_irq: + vfio_platform_regions_cleanup(vdev); +err_reg: + mutex_unlock(&driver_lock); + module_put(THIS_MODULE); + return ret; +} + +static long vfio_platform_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_platform_device *vdev = device_data; + unsigned long minsz; + + if (cmd == VFIO_DEVICE_GET_INFO) { + struct vfio_device_info info; + + minsz = offsetofend(struct vfio_device_info, num_irqs); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = vdev->flags; + info.num_regions = vdev->num_regions; + info.num_irqs = vdev->num_irqs; + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { + struct vfio_region_info info; + + minsz = offsetofend(struct vfio_region_info, offset); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= vdev->num_regions) + return -EINVAL; + + /* map offset to the physical address */ + info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index); + info.size = vdev->regions[info.index].size; + info.flags = vdev->regions[info.index].flags; + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { + struct vfio_irq_info info; + + minsz = offsetofend(struct vfio_irq_info, count); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + if (info.index >= vdev->num_irqs) + return -EINVAL; + + info.flags = vdev->irqs[info.index].flags; + info.count = vdev->irqs[info.index].count; + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_DEVICE_SET_IRQS) { + struct vfio_irq_set hdr; + u8 *data = NULL; + int ret = 0; + + minsz = offsetofend(struct vfio_irq_set, count); + + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz) + return -EINVAL; + + if (hdr.index >= vdev->num_irqs) + return -EINVAL; + + if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK)) + return -EINVAL; + + if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { + size_t size; + + if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) + size = sizeof(uint8_t); + else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) + size = sizeof(int32_t); + else + return -EINVAL; + + if (hdr.argsz - minsz < size) + return -EINVAL; + + data = memdup_user((void __user *)(arg + minsz), size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + mutex_lock(&vdev->igate); + + ret = vfio_platform_set_irqs_ioctl(vdev, hdr.flags, hdr.index, + hdr.start, hdr.count, data); + mutex_unlock(&vdev->igate); + kfree(data); + + return ret; + + } else if (cmd == VFIO_DEVICE_RESET) + return -EINVAL; + + return -ENOTTY; +} + +static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg, + char __user *buf, size_t count, + loff_t off) +{ + unsigned int done = 0; + + if (!reg.ioaddr) { + reg.ioaddr = + ioremap_nocache(reg.addr, reg.size); + + if (!reg.ioaddr) + return -ENOMEM; + } + + while (count) { + size_t filled; + + if (count >= 4 && !(off % 4)) { + u32 val; + + val = ioread32(reg.ioaddr + off); + if (copy_to_user(buf, &val, 4)) + goto err; + + filled = 4; + } else if (count >= 2 && !(off % 2)) { + u16 val; + + val = ioread16(reg.ioaddr + off); + if (copy_to_user(buf, &val, 2)) + goto err; + + filled = 2; + } else { + u8 val; + + val = ioread8(reg.ioaddr + off); + if (copy_to_user(buf, &val, 1)) + goto err; + + filled = 1; + } + + + count -= filled; + done += filled; + off += filled; + buf += filled; + } + + return done; +err: + return -EFAULT; +} + +static ssize_t vfio_platform_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_platform_device *vdev = device_data; + unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); + loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; + + if (index >= vdev->num_regions) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ)) + return -EINVAL; + + if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO) + return vfio_platform_read_mmio(vdev->regions[index], + buf, count, off); + else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO) + return -EINVAL; /* not implemented */ + + return -EINVAL; +} + +static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg, + const char __user *buf, size_t count, + loff_t off) +{ + unsigned int done = 0; + + if (!reg.ioaddr) { + reg.ioaddr = + ioremap_nocache(reg.addr, reg.size); + + if (!reg.ioaddr) + return -ENOMEM; + } + + while (count) { + size_t filled; + + if (count >= 4 && !(off % 4)) { + u32 val; + + if (copy_from_user(&val, buf, 4)) + goto err; + iowrite32(val, reg.ioaddr + off); + + filled = 4; + } else if (count >= 2 && !(off % 2)) { + u16 val; + + if (copy_from_user(&val, buf, 2)) + goto err; + iowrite16(val, reg.ioaddr + off); + + filled = 2; + } else { + u8 val; + + if (copy_from_user(&val, buf, 1)) + goto err; + iowrite8(val, reg.ioaddr + off); + + filled = 1; + } + + count -= filled; + done += filled; + off += filled; + buf += filled; + } + + return done; +err: + return -EFAULT; +} + +static ssize_t vfio_platform_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_platform_device *vdev = device_data; + unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos); + loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK; + + if (index >= vdev->num_regions) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE)) + return -EINVAL; + + if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO) + return vfio_platform_write_mmio(vdev->regions[index], + buf, count, off); + else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO) + return -EINVAL; /* not implemented */ + + return -EINVAL; +} + +static int vfio_platform_mmap_mmio(struct vfio_platform_region region, + struct vm_area_struct *vma) +{ + u64 req_len, pgoff, req_start; + + req_len = vma->vm_end - vma->vm_start; + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + req_start = pgoff << PAGE_SHIFT; + + if (region.size < PAGE_SIZE || req_start + req_len > region.size) + return -EINVAL; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff; + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + req_len, vma->vm_page_prot); +} + +static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma) +{ + struct vfio_platform_device *vdev = device_data; + unsigned int index; + + index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT); + + if (vma->vm_end < vma->vm_start) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + if (index >= vdev->num_regions) + return -EINVAL; + if (vma->vm_start & ~PAGE_MASK) + return -EINVAL; + if (vma->vm_end & ~PAGE_MASK) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP)) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ) + && (vma->vm_flags & VM_READ)) + return -EINVAL; + + if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE) + && (vma->vm_flags & VM_WRITE)) + return -EINVAL; + + vma->vm_private_data = vdev; + + if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO) + return vfio_platform_mmap_mmio(vdev->regions[index], vma); + + else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO) + return -EINVAL; /* not implemented */ + + return -EINVAL; +} + +static const struct vfio_device_ops vfio_platform_ops = { + .name = "vfio-platform", + .open = vfio_platform_open, + .release = vfio_platform_release, + .ioctl = vfio_platform_ioctl, + .read = vfio_platform_read, + .write = vfio_platform_write, + .mmap = vfio_platform_mmap, +}; + +int vfio_platform_probe_common(struct vfio_platform_device *vdev, + struct device *dev) +{ + struct iommu_group *group; + int ret; + + if (!vdev) + return -EINVAL; + + group = iommu_group_get(dev); + if (!group) { + pr_err("VFIO: No IOMMU group for device %s\n", vdev->name); + return -EINVAL; + } + + ret = vfio_add_group_dev(dev, &vfio_platform_ops, vdev); + if (ret) { + iommu_group_put(group); + return ret; + } + + mutex_init(&vdev->igate); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_platform_probe_common); + +struct vfio_platform_device *vfio_platform_remove_common(struct device *dev) +{ + struct vfio_platform_device *vdev; + + vdev = vfio_del_group_dev(dev); + if (vdev) + iommu_group_put(dev->iommu_group); + + return vdev; +} +EXPORT_SYMBOL_GPL(vfio_platform_remove_common); diff --git a/kernel/drivers/vfio/platform/vfio_platform_irq.c b/kernel/drivers/vfio/platform/vfio_platform_irq.c new file mode 100644 index 000000000..88bba57b3 --- /dev/null +++ b/kernel/drivers/vfio/platform/vfio_platform_irq.c @@ -0,0 +1,336 @@ +/* + * VFIO platform devices interrupt handling + * + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis <a.motakis@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/eventfd.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/vfio.h> +#include <linux/irq.h> + +#include "vfio_platform_private.h" + +static void vfio_platform_mask(struct vfio_platform_irq *irq_ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&irq_ctx->lock, flags); + + if (!irq_ctx->masked) { + disable_irq_nosync(irq_ctx->hwirq); + irq_ctx->masked = true; + } + + spin_unlock_irqrestore(&irq_ctx->lock, flags); +} + +static int vfio_platform_mask_handler(void *opaque, void *unused) +{ + struct vfio_platform_irq *irq_ctx = opaque; + + vfio_platform_mask(irq_ctx); + + return 0; +} + +static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, + void *data) +{ + if (start != 0 || count != 1) + return -EINVAL; + + if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE)) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t fd = *(int32_t *)data; + + if (fd >= 0) + return vfio_virqfd_enable((void *) &vdev->irqs[index], + vfio_platform_mask_handler, + NULL, NULL, + &vdev->irqs[index].mask, fd); + + vfio_virqfd_disable(&vdev->irqs[index].mask); + return 0; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + vfio_platform_mask(&vdev->irqs[index]); + + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + vfio_platform_mask(&vdev->irqs[index]); + } + + return 0; +} + +static void vfio_platform_unmask(struct vfio_platform_irq *irq_ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&irq_ctx->lock, flags); + + if (irq_ctx->masked) { + enable_irq(irq_ctx->hwirq); + irq_ctx->masked = false; + } + + spin_unlock_irqrestore(&irq_ctx->lock, flags); +} + +static int vfio_platform_unmask_handler(void *opaque, void *unused) +{ + struct vfio_platform_irq *irq_ctx = opaque; + + vfio_platform_unmask(irq_ctx); + + return 0; +} + +static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, + void *data) +{ + if (start != 0 || count != 1) + return -EINVAL; + + if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE)) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t fd = *(int32_t *)data; + + if (fd >= 0) + return vfio_virqfd_enable((void *) &vdev->irqs[index], + vfio_platform_unmask_handler, + NULL, NULL, + &vdev->irqs[index].unmask, + fd); + + vfio_virqfd_disable(&vdev->irqs[index].unmask); + return 0; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + vfio_platform_unmask(&vdev->irqs[index]); + + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t unmask = *(uint8_t *)data; + + if (unmask) + vfio_platform_unmask(&vdev->irqs[index]); + } + + return 0; +} + +static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id) +{ + struct vfio_platform_irq *irq_ctx = dev_id; + unsigned long flags; + int ret = IRQ_NONE; + + spin_lock_irqsave(&irq_ctx->lock, flags); + + if (!irq_ctx->masked) { + ret = IRQ_HANDLED; + + /* automask maskable interrupts */ + disable_irq_nosync(irq_ctx->hwirq); + irq_ctx->masked = true; + } + + spin_unlock_irqrestore(&irq_ctx->lock, flags); + + if (ret == IRQ_HANDLED) + eventfd_signal(irq_ctx->trigger, 1); + + return ret; +} + +static irqreturn_t vfio_irq_handler(int irq, void *dev_id) +{ + struct vfio_platform_irq *irq_ctx = dev_id; + + eventfd_signal(irq_ctx->trigger, 1); + + return IRQ_HANDLED; +} + +static int vfio_set_trigger(struct vfio_platform_device *vdev, int index, + int fd, irq_handler_t handler) +{ + struct vfio_platform_irq *irq = &vdev->irqs[index]; + struct eventfd_ctx *trigger; + int ret; + + if (irq->trigger) { + free_irq(irq->hwirq, irq); + kfree(irq->name); + eventfd_ctx_put(irq->trigger); + irq->trigger = NULL; + } + + if (fd < 0) /* Disable only */ + return 0; + + irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)", + irq->hwirq, vdev->name); + if (!irq->name) + return -ENOMEM; + + trigger = eventfd_ctx_fdget(fd); + if (IS_ERR(trigger)) { + kfree(irq->name); + return PTR_ERR(trigger); + } + + irq->trigger = trigger; + + irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN); + ret = request_irq(irq->hwirq, handler, 0, irq->name, irq); + if (ret) { + kfree(irq->name); + eventfd_ctx_put(trigger); + irq->trigger = NULL; + return ret; + } + + if (!irq->masked) + enable_irq(irq->hwirq); + + return 0; +} + +static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, + void *data) +{ + struct vfio_platform_irq *irq = &vdev->irqs[index]; + irq_handler_t handler; + + if (vdev->irqs[index].flags & VFIO_IRQ_INFO_AUTOMASKED) + handler = vfio_automasked_irq_handler; + else + handler = vfio_irq_handler; + + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE)) + return vfio_set_trigger(vdev, index, -1, handler); + + if (start != 0 || count != 1) + return -EINVAL; + + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + int32_t fd = *(int32_t *)data; + + return vfio_set_trigger(vdev, index, fd, handler); + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + handler(irq->hwirq, irq); + + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + handler(irq->hwirq, irq); + } + + return 0; +} + +int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev, + uint32_t flags, unsigned index, unsigned start, + unsigned count, void *data) +{ + int (*func)(struct vfio_platform_device *vdev, unsigned index, + unsigned start, unsigned count, uint32_t flags, + void *data) = NULL; + + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_MASK: + func = vfio_platform_set_irq_mask; + break; + case VFIO_IRQ_SET_ACTION_UNMASK: + func = vfio_platform_set_irq_unmask; + break; + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_platform_set_irq_trigger; + break; + } + + if (!func) + return -ENOTTY; + + return func(vdev, index, start, count, flags, data); +} + +int vfio_platform_irq_init(struct vfio_platform_device *vdev) +{ + int cnt = 0, i; + + while (vdev->get_irq(vdev, cnt) >= 0) + cnt++; + + vdev->irqs = kcalloc(cnt, sizeof(struct vfio_platform_irq), GFP_KERNEL); + if (!vdev->irqs) + return -ENOMEM; + + for (i = 0; i < cnt; i++) { + int hwirq = vdev->get_irq(vdev, i); + + if (hwirq < 0) + goto err; + + spin_lock_init(&vdev->irqs[i].lock); + + vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD; + + if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK) + vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE + | VFIO_IRQ_INFO_AUTOMASKED; + + vdev->irqs[i].count = 1; + vdev->irqs[i].hwirq = hwirq; + vdev->irqs[i].masked = false; + } + + vdev->num_irqs = cnt; + + return 0; +err: + kfree(vdev->irqs); + return -EINVAL; +} + +void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev) +{ + int i; + + for (i = 0; i < vdev->num_irqs; i++) + vfio_set_trigger(vdev, i, -1, NULL); + + vdev->num_irqs = 0; + kfree(vdev->irqs); +} diff --git a/kernel/drivers/vfio/platform/vfio_platform_private.h b/kernel/drivers/vfio/platform/vfio_platform_private.h new file mode 100644 index 000000000..5d31e0473 --- /dev/null +++ b/kernel/drivers/vfio/platform/vfio_platform_private.h @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2013 - Virtual Open Systems + * Author: Antonios Motakis <a.motakis@virtualopensystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef VFIO_PLATFORM_PRIVATE_H +#define VFIO_PLATFORM_PRIVATE_H + +#include <linux/types.h> +#include <linux/interrupt.h> + +#define VFIO_PLATFORM_OFFSET_SHIFT 40 +#define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 1) + +#define VFIO_PLATFORM_OFFSET_TO_INDEX(off) \ + (off >> VFIO_PLATFORM_OFFSET_SHIFT) + +#define VFIO_PLATFORM_INDEX_TO_OFFSET(index) \ + ((u64)(index) << VFIO_PLATFORM_OFFSET_SHIFT) + +struct vfio_platform_irq { + u32 flags; + u32 count; + int hwirq; + char *name; + struct eventfd_ctx *trigger; + bool masked; + spinlock_t lock; + struct virqfd *unmask; + struct virqfd *mask; +}; + +struct vfio_platform_region { + u64 addr; + resource_size_t size; + u32 flags; + u32 type; +#define VFIO_PLATFORM_REGION_TYPE_MMIO 1 +#define VFIO_PLATFORM_REGION_TYPE_PIO 2 + void __iomem *ioaddr; +}; + +struct vfio_platform_device { + struct vfio_platform_region *regions; + u32 num_regions; + struct vfio_platform_irq *irqs; + u32 num_irqs; + int refcnt; + struct mutex igate; + + /* + * These fields should be filled by the bus specific binder + */ + void *opaque; + const char *name; + uint32_t flags; + /* callbacks to discover device resources */ + struct resource* + (*get_resource)(struct vfio_platform_device *vdev, int i); + int (*get_irq)(struct vfio_platform_device *vdev, int i); +}; + +extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, + struct device *dev); +extern struct vfio_platform_device *vfio_platform_remove_common + (struct device *dev); + +extern int vfio_platform_irq_init(struct vfio_platform_device *vdev); +extern void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); + +extern int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev, + uint32_t flags, unsigned index, + unsigned start, unsigned count, + void *data); + +#endif /* VFIO_PLATFORM_PRIVATE_H */ diff --git a/kernel/drivers/vfio/vfio.c b/kernel/drivers/vfio/vfio.c new file mode 100644 index 000000000..e1278fe04 --- /dev/null +++ b/kernel/drivers/vfio/vfio.c @@ -0,0 +1,1612 @@ +/* + * VFIO core + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include <linux/cdev.h> +#include <linux/compat.h> +#include <linux/device.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/fs.h> +#include <linux/idr.h> +#include <linux/iommu.h> +#include <linux/list.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/wait.h> + +#define DRIVER_VERSION "0.3" +#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" +#define DRIVER_DESC "VFIO - User Level meta-driver" + +static struct vfio { + struct class *class; + struct list_head iommu_drivers_list; + struct mutex iommu_drivers_lock; + struct list_head group_list; + struct idr group_idr; + struct mutex group_lock; + struct cdev group_cdev; + dev_t group_devt; + wait_queue_head_t release_q; +} vfio; + +struct vfio_iommu_driver { + const struct vfio_iommu_driver_ops *ops; + struct list_head vfio_next; +}; + +struct vfio_container { + struct kref kref; + struct list_head group_list; + struct rw_semaphore group_lock; + struct vfio_iommu_driver *iommu_driver; + void *iommu_data; +}; + +struct vfio_unbound_dev { + struct device *dev; + struct list_head unbound_next; +}; + +struct vfio_group { + struct kref kref; + int minor; + atomic_t container_users; + struct iommu_group *iommu_group; + struct vfio_container *container; + struct list_head device_list; + struct mutex device_lock; + struct device *dev; + struct notifier_block nb; + struct list_head vfio_next; + struct list_head container_next; + struct list_head unbound_list; + struct mutex unbound_lock; + atomic_t opened; +}; + +struct vfio_device { + struct kref kref; + struct device *dev; + const struct vfio_device_ops *ops; + struct vfio_group *group; + struct list_head group_next; + void *device_data; +}; + +/** + * IOMMU driver registration + */ +int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) +{ + struct vfio_iommu_driver *driver, *tmp; + + driver = kzalloc(sizeof(*driver), GFP_KERNEL); + if (!driver) + return -ENOMEM; + + driver->ops = ops; + + mutex_lock(&vfio.iommu_drivers_lock); + + /* Check for duplicates */ + list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { + if (tmp->ops == ops) { + mutex_unlock(&vfio.iommu_drivers_lock); + kfree(driver); + return -EINVAL; + } + } + + list_add(&driver->vfio_next, &vfio.iommu_drivers_list); + + mutex_unlock(&vfio.iommu_drivers_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); + +void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) +{ + struct vfio_iommu_driver *driver; + + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { + if (driver->ops == ops) { + list_del(&driver->vfio_next); + mutex_unlock(&vfio.iommu_drivers_lock); + kfree(driver); + return; + } + } + mutex_unlock(&vfio.iommu_drivers_lock); +} +EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); + +/** + * Group minor allocation/free - both called with vfio.group_lock held + */ +static int vfio_alloc_group_minor(struct vfio_group *group) +{ + return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL); +} + +static void vfio_free_group_minor(int minor) +{ + idr_remove(&vfio.group_idr, minor); +} + +static int vfio_iommu_group_notifier(struct notifier_block *nb, + unsigned long action, void *data); +static void vfio_group_get(struct vfio_group *group); + +/** + * Container objects - containers are created when /dev/vfio/vfio is + * opened, but their lifecycle extends until the last user is done, so + * it's freed via kref. Must support container/group/device being + * closed in any order. + */ +static void vfio_container_get(struct vfio_container *container) +{ + kref_get(&container->kref); +} + +static void vfio_container_release(struct kref *kref) +{ + struct vfio_container *container; + container = container_of(kref, struct vfio_container, kref); + + kfree(container); +} + +static void vfio_container_put(struct vfio_container *container) +{ + kref_put(&container->kref, vfio_container_release); +} + +static void vfio_group_unlock_and_free(struct vfio_group *group) +{ + mutex_unlock(&vfio.group_lock); + /* + * Unregister outside of lock. A spurious callback is harmless now + * that the group is no longer in vfio.group_list. + */ + iommu_group_unregister_notifier(group->iommu_group, &group->nb); + kfree(group); +} + +/** + * Group objects - create, release, get, put, search + */ +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) +{ + struct vfio_group *group, *tmp; + struct device *dev; + int ret, minor; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + kref_init(&group->kref); + INIT_LIST_HEAD(&group->device_list); + mutex_init(&group->device_lock); + INIT_LIST_HEAD(&group->unbound_list); + mutex_init(&group->unbound_lock); + atomic_set(&group->container_users, 0); + atomic_set(&group->opened, 0); + group->iommu_group = iommu_group; + + group->nb.notifier_call = vfio_iommu_group_notifier; + + /* + * blocking notifiers acquire a rwsem around registering and hold + * it around callback. Therefore, need to register outside of + * vfio.group_lock to avoid A-B/B-A contention. Our callback won't + * do anything unless it can find the group in vfio.group_list, so + * no harm in registering early. + */ + ret = iommu_group_register_notifier(iommu_group, &group->nb); + if (ret) { + kfree(group); + return ERR_PTR(ret); + } + + mutex_lock(&vfio.group_lock); + + /* Did we race creating this group? */ + list_for_each_entry(tmp, &vfio.group_list, vfio_next) { + if (tmp->iommu_group == iommu_group) { + vfio_group_get(tmp); + vfio_group_unlock_and_free(group); + return tmp; + } + } + + minor = vfio_alloc_group_minor(group); + if (minor < 0) { + vfio_group_unlock_and_free(group); + return ERR_PTR(minor); + } + + dev = device_create(vfio.class, NULL, + MKDEV(MAJOR(vfio.group_devt), minor), + group, "%d", iommu_group_id(iommu_group)); + if (IS_ERR(dev)) { + vfio_free_group_minor(minor); + vfio_group_unlock_and_free(group); + return (struct vfio_group *)dev; /* ERR_PTR */ + } + + group->minor = minor; + group->dev = dev; + + list_add(&group->vfio_next, &vfio.group_list); + + mutex_unlock(&vfio.group_lock); + + return group; +} + +/* called with vfio.group_lock held */ +static void vfio_group_release(struct kref *kref) +{ + struct vfio_group *group = container_of(kref, struct vfio_group, kref); + struct vfio_unbound_dev *unbound, *tmp; + struct iommu_group *iommu_group = group->iommu_group; + + WARN_ON(!list_empty(&group->device_list)); + + list_for_each_entry_safe(unbound, tmp, + &group->unbound_list, unbound_next) { + list_del(&unbound->unbound_next); + kfree(unbound); + } + + device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor)); + list_del(&group->vfio_next); + vfio_free_group_minor(group->minor); + vfio_group_unlock_and_free(group); + iommu_group_put(iommu_group); +} + +static void vfio_group_put(struct vfio_group *group) +{ + kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock); +} + +/* Assume group_lock or group reference is held */ +static void vfio_group_get(struct vfio_group *group) +{ + kref_get(&group->kref); +} + +/* + * Not really a try as we will sleep for mutex, but we need to make + * sure the group pointer is valid under lock and get a reference. + */ +static struct vfio_group *vfio_group_try_get(struct vfio_group *group) +{ + struct vfio_group *target = group; + + mutex_lock(&vfio.group_lock); + list_for_each_entry(group, &vfio.group_list, vfio_next) { + if (group == target) { + vfio_group_get(group); + mutex_unlock(&vfio.group_lock); + return group; + } + } + mutex_unlock(&vfio.group_lock); + + return NULL; +} + +static +struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group; + + mutex_lock(&vfio.group_lock); + list_for_each_entry(group, &vfio.group_list, vfio_next) { + if (group->iommu_group == iommu_group) { + vfio_group_get(group); + mutex_unlock(&vfio.group_lock); + return group; + } + } + mutex_unlock(&vfio.group_lock); + + return NULL; +} + +static struct vfio_group *vfio_group_get_from_minor(int minor) +{ + struct vfio_group *group; + + mutex_lock(&vfio.group_lock); + group = idr_find(&vfio.group_idr, minor); + if (!group) { + mutex_unlock(&vfio.group_lock); + return NULL; + } + vfio_group_get(group); + mutex_unlock(&vfio.group_lock); + + return group; +} + +/** + * Device objects - create, release, get, put, search + */ +static +struct vfio_device *vfio_group_create_device(struct vfio_group *group, + struct device *dev, + const struct vfio_device_ops *ops, + void *device_data) +{ + struct vfio_device *device; + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) + return ERR_PTR(-ENOMEM); + + kref_init(&device->kref); + device->dev = dev; + device->group = group; + device->ops = ops; + device->device_data = device_data; + dev_set_drvdata(dev, device); + + /* No need to get group_lock, caller has group reference */ + vfio_group_get(group); + + mutex_lock(&group->device_lock); + list_add(&device->group_next, &group->device_list); + mutex_unlock(&group->device_lock); + + return device; +} + +static void vfio_device_release(struct kref *kref) +{ + struct vfio_device *device = container_of(kref, + struct vfio_device, kref); + struct vfio_group *group = device->group; + + list_del(&device->group_next); + mutex_unlock(&group->device_lock); + + dev_set_drvdata(device->dev, NULL); + + kfree(device); + + /* vfio_del_group_dev may be waiting for this device */ + wake_up(&vfio.release_q); +} + +/* Device reference always implies a group reference */ +void vfio_device_put(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); + vfio_group_put(group); +} +EXPORT_SYMBOL_GPL(vfio_device_put); + +static void vfio_device_get(struct vfio_device *device) +{ + vfio_group_get(device->group); + kref_get(&device->kref); +} + +static struct vfio_device *vfio_group_get_device(struct vfio_group *group, + struct device *dev) +{ + struct vfio_device *device; + + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (device->dev == dev) { + vfio_device_get(device); + mutex_unlock(&group->device_lock); + return device; + } + } + mutex_unlock(&group->device_lock); + return NULL; +} + +/* + * Whitelist some drivers that we know are safe (no dma) or just sit on + * a device. It's not always practical to leave a device within a group + * driverless as it could get re-bound to something unsafe. + */ +static const char * const vfio_driver_whitelist[] = { "pci-stub", "pcieport" }; + +static bool vfio_whitelisted_driver(struct device_driver *drv) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) { + if (!strcmp(drv->name, vfio_driver_whitelist[i])) + return true; + } + + return false; +} + +/* + * A vfio group is viable for use by userspace if all devices are in + * one of the following states: + * - driver-less + * - bound to a vfio driver + * - bound to a whitelisted driver + * + * We use two methods to determine whether a device is bound to a vfio + * driver. The first is to test whether the device exists in the vfio + * group. The second is to test if the device exists on the group + * unbound_list, indicating it's in the middle of transitioning from + * a vfio driver to driver-less. + */ +static int vfio_dev_viable(struct device *dev, void *data) +{ + struct vfio_group *group = data; + struct vfio_device *device; + struct device_driver *drv = ACCESS_ONCE(dev->driver); + struct vfio_unbound_dev *unbound; + int ret = -EINVAL; + + mutex_lock(&group->unbound_lock); + list_for_each_entry(unbound, &group->unbound_list, unbound_next) { + if (dev == unbound->dev) { + ret = 0; + break; + } + } + mutex_unlock(&group->unbound_lock); + + if (!ret || !drv || vfio_whitelisted_driver(drv)) + return 0; + + device = vfio_group_get_device(group, dev); + if (device) { + vfio_device_put(device); + return 0; + } + + return ret; +} + +/** + * Async device support + */ +static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) +{ + struct vfio_device *device; + + /* Do we already know about it? We shouldn't */ + device = vfio_group_get_device(group, dev); + if (WARN_ON_ONCE(device)) { + vfio_device_put(device); + return 0; + } + + /* Nothing to do for idle groups */ + if (!atomic_read(&group->container_users)) + return 0; + + /* TODO Prevent device auto probing */ + WARN("Device %s added to live group %d!\n", dev_name(dev), + iommu_group_id(group->iommu_group)); + + return 0; +} + +static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev) +{ + /* We don't care what happens when the group isn't in use */ + if (!atomic_read(&group->container_users)) + return 0; + + return vfio_dev_viable(dev, group); +} + +static int vfio_iommu_group_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct vfio_group *group = container_of(nb, struct vfio_group, nb); + struct device *dev = data; + struct vfio_unbound_dev *unbound; + + /* + * Need to go through a group_lock lookup to get a reference or we + * risk racing a group being removed. Ignore spurious notifies. + */ + group = vfio_group_try_get(group); + if (!group) + return NOTIFY_OK; + + switch (action) { + case IOMMU_GROUP_NOTIFY_ADD_DEVICE: + vfio_group_nb_add_dev(group, dev); + break; + case IOMMU_GROUP_NOTIFY_DEL_DEVICE: + /* + * Nothing to do here. If the device is in use, then the + * vfio sub-driver should block the remove callback until + * it is unused. If the device is unused or attached to a + * stub driver, then it should be released and we don't + * care that it will be going away. + */ + break; + case IOMMU_GROUP_NOTIFY_BIND_DRIVER: + pr_debug("%s: Device %s, group %d binding to driver\n", + __func__, dev_name(dev), + iommu_group_id(group->iommu_group)); + break; + case IOMMU_GROUP_NOTIFY_BOUND_DRIVER: + pr_debug("%s: Device %s, group %d bound to driver %s\n", + __func__, dev_name(dev), + iommu_group_id(group->iommu_group), dev->driver->name); + BUG_ON(vfio_group_nb_verify(group, dev)); + break; + case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER: + pr_debug("%s: Device %s, group %d unbinding from driver %s\n", + __func__, dev_name(dev), + iommu_group_id(group->iommu_group), dev->driver->name); + break; + case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER: + pr_debug("%s: Device %s, group %d unbound from driver\n", + __func__, dev_name(dev), + iommu_group_id(group->iommu_group)); + /* + * XXX An unbound device in a live group is ok, but we'd + * really like to avoid the above BUG_ON by preventing other + * drivers from binding to it. Once that occurs, we have to + * stop the system to maintain isolation. At a minimum, we'd + * want a toggle to disable driver auto probe for this device. + */ + + mutex_lock(&group->unbound_lock); + list_for_each_entry(unbound, + &group->unbound_list, unbound_next) { + if (dev == unbound->dev) { + list_del(&unbound->unbound_next); + kfree(unbound); + break; + } + } + mutex_unlock(&group->unbound_lock); + break; + } + + vfio_group_put(group); + return NOTIFY_OK; +} + +/** + * VFIO driver API + */ +int vfio_add_group_dev(struct device *dev, + const struct vfio_device_ops *ops, void *device_data) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + struct vfio_device *device; + + iommu_group = iommu_group_get(dev); + if (!iommu_group) + return -EINVAL; + + group = vfio_group_get_from_iommu(iommu_group); + if (!group) { + group = vfio_create_group(iommu_group); + if (IS_ERR(group)) { + iommu_group_put(iommu_group); + return PTR_ERR(group); + } + } else { + /* + * A found vfio_group already holds a reference to the + * iommu_group. A created vfio_group keeps the reference. + */ + iommu_group_put(iommu_group); + } + + device = vfio_group_get_device(group, dev); + if (device) { + WARN(1, "Device %s already exists on group %d\n", + dev_name(dev), iommu_group_id(iommu_group)); + vfio_device_put(device); + vfio_group_put(group); + return -EBUSY; + } + + device = vfio_group_create_device(group, dev, ops, device_data); + if (IS_ERR(device)) { + vfio_group_put(group); + return PTR_ERR(device); + } + + /* + * Drop all but the vfio_device reference. The vfio_device holds + * a reference to the vfio_group, which holds a reference to the + * iommu_group. + */ + vfio_group_put(group); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_add_group_dev); + +/** + * Get a reference to the vfio_device for a device that is known to + * be bound to a vfio driver. The driver implicitly holds a + * vfio_device reference between vfio_add_group_dev and + * vfio_del_group_dev. We can therefore use drvdata to increment + * that reference from the struct device. This additional + * reference must be released by calling vfio_device_put. + */ +struct vfio_device *vfio_device_get_from_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + + vfio_device_get(device); + + return device; +} +EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); + +/* + * Caller must hold a reference to the vfio_device + */ +void *vfio_device_data(struct vfio_device *device) +{ + return device->device_data; +} +EXPORT_SYMBOL_GPL(vfio_device_data); + +/* Given a referenced group, check if it contains the device */ +static bool vfio_dev_present(struct vfio_group *group, struct device *dev) +{ + struct vfio_device *device; + + device = vfio_group_get_device(group, dev); + if (!device) + return false; + + vfio_device_put(device); + return true; +} + +/* + * Decrement the device reference count and wait for the device to be + * removed. Open file descriptors for the device... */ +void *vfio_del_group_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + struct vfio_group *group = device->group; + void *device_data = device->device_data; + struct vfio_unbound_dev *unbound; + unsigned int i = 0; + long ret; + bool interrupted = false; + + /* + * The group exists so long as we have a device reference. Get + * a group reference and use it to scan for the device going away. + */ + vfio_group_get(group); + + /* + * When the device is removed from the group, the group suddenly + * becomes non-viable; the device has a driver (until the unbind + * completes), but it's not present in the group. This is bad news + * for any external users that need to re-acquire a group reference + * in order to match and release their existing reference. To + * solve this, we track such devices on the unbound_list to bridge + * the gap until they're fully unbound. + */ + unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); + if (unbound) { + unbound->dev = dev; + mutex_lock(&group->unbound_lock); + list_add(&unbound->unbound_next, &group->unbound_list); + mutex_unlock(&group->unbound_lock); + } + WARN_ON(!unbound); + + vfio_device_put(device); + + /* + * If the device is still present in the group after the above + * 'put', then it is in use and we need to request it from the + * bus driver. The driver may in turn need to request the + * device from the user. We send the request on an arbitrary + * interval with counter to allow the driver to take escalating + * measures to release the device if it has the ability to do so. + */ + do { + device = vfio_group_get_device(group, dev); + if (!device) + break; + + if (device->ops->request) + device->ops->request(device_data, i++); + + vfio_device_put(device); + + if (interrupted) { + ret = wait_event_timeout(vfio.release_q, + !vfio_dev_present(group, dev), HZ * 10); + } else { + ret = wait_event_interruptible_timeout(vfio.release_q, + !vfio_dev_present(group, dev), HZ * 10); + if (ret == -ERESTARTSYS) { + interrupted = true; + dev_warn(dev, + "Device is currently in use, task" + " \"%s\" (%d) " + "blocked until device is released", + current->comm, task_pid_nr(current)); + } + } + } while (ret <= 0); + + vfio_group_put(group); + + return device_data; +} +EXPORT_SYMBOL_GPL(vfio_del_group_dev); + +/** + * VFIO base fd, /dev/vfio/vfio + */ +static long vfio_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_iommu_driver *driver; + long ret = 0; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + + switch (arg) { + /* No base extensions yet */ + default: + /* + * If no driver is set, poll all registered drivers for + * extensions and return the first positive result. If + * a driver is already set, further queries will be passed + * only to that driver. + */ + if (!driver) { + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { + if (!try_module_get(driver->ops->owner)) + continue; + + ret = driver->ops->ioctl(NULL, + VFIO_CHECK_EXTENSION, + arg); + module_put(driver->ops->owner); + if (ret > 0) + break; + } + mutex_unlock(&vfio.iommu_drivers_lock); + } else + ret = driver->ops->ioctl(container->iommu_data, + VFIO_CHECK_EXTENSION, arg); + } + + up_read(&container->group_lock); + + return ret; +} + +/* hold write lock on container->group_lock */ +static int __vfio_container_attach_groups(struct vfio_container *container, + struct vfio_iommu_driver *driver, + void *data) +{ + struct vfio_group *group; + int ret = -ENODEV; + + list_for_each_entry(group, &container->group_list, container_next) { + ret = driver->ops->attach_group(data, group->iommu_group); + if (ret) + goto unwind; + } + + return ret; + +unwind: + list_for_each_entry_continue_reverse(group, &container->group_list, + container_next) { + driver->ops->detach_group(data, group->iommu_group); + } + + return ret; +} + +static long vfio_ioctl_set_iommu(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_iommu_driver *driver; + long ret = -ENODEV; + + down_write(&container->group_lock); + + /* + * The container is designed to be an unprivileged interface while + * the group can be assigned to specific users. Therefore, only by + * adding a group to a container does the user get the privilege of + * enabling the iommu, which may allocate finite resources. There + * is no unset_iommu, but by removing all the groups from a container, + * the container is deprivileged and returns to an unset state. + */ + if (list_empty(&container->group_list) || container->iommu_driver) { + up_write(&container->group_lock); + return -EINVAL; + } + + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { + void *data; + + if (!try_module_get(driver->ops->owner)) + continue; + + /* + * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, + * so test which iommu driver reported support for this + * extension and call open on them. We also pass them the + * magic, allowing a single driver to support multiple + * interfaces if they'd like. + */ + if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { + module_put(driver->ops->owner); + continue; + } + + /* module reference holds the driver we're working on */ + mutex_unlock(&vfio.iommu_drivers_lock); + + data = driver->ops->open(arg); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + module_put(driver->ops->owner); + goto skip_drivers_unlock; + } + + ret = __vfio_container_attach_groups(container, driver, data); + if (!ret) { + container->iommu_driver = driver; + container->iommu_data = data; + } else { + driver->ops->release(data); + module_put(driver->ops->owner); + } + + goto skip_drivers_unlock; + } + + mutex_unlock(&vfio.iommu_drivers_lock); +skip_drivers_unlock: + up_write(&container->group_lock); + + return ret; +} + +static long vfio_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver; + void *data; + long ret = -EINVAL; + + if (!container) + return ret; + + switch (cmd) { + case VFIO_GET_API_VERSION: + ret = VFIO_API_VERSION; + break; + case VFIO_CHECK_EXTENSION: + ret = vfio_ioctl_check_extension(container, arg); + break; + case VFIO_SET_IOMMU: + ret = vfio_ioctl_set_iommu(container, arg); + break; + default: + down_read(&container->group_lock); + + driver = container->iommu_driver; + data = container->iommu_data; + + if (driver) /* passthrough all unrecognized ioctls */ + ret = driver->ops->ioctl(data, cmd, arg); + + up_read(&container->group_lock); + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long vfio_fops_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return vfio_fops_unl_ioctl(filep, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static int vfio_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_container *container; + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return -ENOMEM; + + INIT_LIST_HEAD(&container->group_list); + init_rwsem(&container->group_lock); + kref_init(&container->kref); + + filep->private_data = container; + + return 0; +} + +static int vfio_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_container *container = filep->private_data; + + filep->private_data = NULL; + + vfio_container_put(container); + + return 0; +} + +/* + * Once an iommu driver is set, we optionally pass read/write/mmap + * on to the driver, allowing management interfaces beyond ioctl. + */ +static ssize_t vfio_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->read)) + ret = driver->ops->read(container->iommu_data, + buf, count, ppos); + + up_read(&container->group_lock); + + return ret; +} + +static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver; + ssize_t ret = -EINVAL; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->write)) + ret = driver->ops->write(container->iommu_data, + buf, count, ppos); + + up_read(&container->group_lock); + + return ret; +} + +static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver; + int ret = -EINVAL; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->mmap)) + ret = driver->ops->mmap(container->iommu_data, vma); + + up_read(&container->group_lock); + + return ret; +} + +static const struct file_operations vfio_fops = { + .owner = THIS_MODULE, + .open = vfio_fops_open, + .release = vfio_fops_release, + .read = vfio_fops_read, + .write = vfio_fops_write, + .unlocked_ioctl = vfio_fops_unl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vfio_fops_compat_ioctl, +#endif + .mmap = vfio_fops_mmap, +}; + +/** + * VFIO Group fd, /dev/vfio/$GROUP + */ +static void __vfio_group_unset_container(struct vfio_group *group) +{ + struct vfio_container *container = group->container; + struct vfio_iommu_driver *driver; + + down_write(&container->group_lock); + + driver = container->iommu_driver; + if (driver) + driver->ops->detach_group(container->iommu_data, + group->iommu_group); + + group->container = NULL; + list_del(&group->container_next); + + /* Detaching the last group deprivileges a container, remove iommu */ + if (driver && list_empty(&container->group_list)) { + driver->ops->release(container->iommu_data); + module_put(driver->ops->owner); + container->iommu_driver = NULL; + container->iommu_data = NULL; + } + + up_write(&container->group_lock); + + vfio_container_put(container); +} + +/* + * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or + * if there was no container to unset. Since the ioctl is called on + * the group, we know that still exists, therefore the only valid + * transition here is 1->0. + */ +static int vfio_group_unset_container(struct vfio_group *group) +{ + int users = atomic_cmpxchg(&group->container_users, 1, 0); + + if (!users) + return -EINVAL; + if (users != 1) + return -EBUSY; + + __vfio_group_unset_container(group); + + return 0; +} + +/* + * When removing container users, anything that removes the last user + * implicitly removes the group from the container. That is, if the + * group file descriptor is closed, as well as any device file descriptors, + * the group is free. + */ +static void vfio_group_try_dissolve_container(struct vfio_group *group) +{ + if (0 == atomic_dec_if_positive(&group->container_users)) + __vfio_group_unset_container(group); +} + +static int vfio_group_set_container(struct vfio_group *group, int container_fd) +{ + struct fd f; + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret = 0; + + if (atomic_read(&group->container_users)) + return -EINVAL; + + f = fdget(container_fd); + if (!f.file) + return -EBADF; + + /* Sanity check, is this really our fd? */ + if (f.file->f_op != &vfio_fops) { + fdput(f); + return -EINVAL; + } + + container = f.file->private_data; + WARN_ON(!container); /* fget ensures we don't race vfio_release */ + + down_write(&container->group_lock); + + driver = container->iommu_driver; + if (driver) { + ret = driver->ops->attach_group(container->iommu_data, + group->iommu_group); + if (ret) + goto unlock_out; + } + + group->container = container; + list_add(&group->container_next, &container->group_list); + + /* Get a reference on the container and mark a user within the group */ + vfio_container_get(container); + atomic_inc(&group->container_users); + +unlock_out: + up_write(&container->group_lock); + fdput(f); + return ret; +} + +static bool vfio_group_viable(struct vfio_group *group) +{ + return (iommu_group_for_each_dev(group->iommu_group, + group, vfio_dev_viable) == 0); +} + +static const struct file_operations vfio_device_fops; + +static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +{ + struct vfio_device *device; + struct file *filep; + int ret = -ENODEV; + + if (0 == atomic_read(&group->container_users) || + !group->container->iommu_driver || !vfio_group_viable(group)) + return -EINVAL; + + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (strcmp(dev_name(device->dev), buf)) + continue; + + ret = device->ops->open(device->device_data); + if (ret) + break; + /* + * We can't use anon_inode_getfd() because we need to modify + * the f_mode flags directly to allow more than just ioctls + */ + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) { + device->ops->release(device->device_data); + break; + } + + filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, + device, O_RDWR); + if (IS_ERR(filep)) { + put_unused_fd(ret); + ret = PTR_ERR(filep); + device->ops->release(device->device_data); + break; + } + + /* + * TODO: add an anon_inode interface to do this. + * Appears to be missing by lack of need rather than + * explicitly prevented. Now there's need. + */ + filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + vfio_device_get(device); + atomic_inc(&group->container_users); + + fd_install(ret, filep); + break; + } + mutex_unlock(&group->device_lock); + + return ret; +} + +static long vfio_group_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_group *group = filep->private_data; + long ret = -ENOTTY; + + switch (cmd) { + case VFIO_GROUP_GET_STATUS: + { + struct vfio_group_status status; + unsigned long minsz; + + minsz = offsetofend(struct vfio_group_status, flags); + + if (copy_from_user(&status, (void __user *)arg, minsz)) + return -EFAULT; + + if (status.argsz < minsz) + return -EINVAL; + + status.flags = 0; + + if (vfio_group_viable(group)) + status.flags |= VFIO_GROUP_FLAGS_VIABLE; + + if (group->container) + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET; + + if (copy_to_user((void __user *)arg, &status, minsz)) + return -EFAULT; + + ret = 0; + break; + } + case VFIO_GROUP_SET_CONTAINER: + { + int fd; + + if (get_user(fd, (int __user *)arg)) + return -EFAULT; + + if (fd < 0) + return -EINVAL; + + ret = vfio_group_set_container(group, fd); + break; + } + case VFIO_GROUP_UNSET_CONTAINER: + ret = vfio_group_unset_container(group); + break; + case VFIO_GROUP_GET_DEVICE_FD: + { + char *buf; + + buf = strndup_user((const char __user *)arg, PAGE_SIZE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + ret = vfio_group_get_device_fd(group, buf); + kfree(buf); + break; + } + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static long vfio_group_fops_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return vfio_group_fops_unl_ioctl(filep, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static int vfio_group_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_group *group; + int opened; + + group = vfio_group_get_from_minor(iminor(inode)); + if (!group) + return -ENODEV; + + /* Do we need multiple instances of the group open? Seems not. */ + opened = atomic_cmpxchg(&group->opened, 0, 1); + if (opened) { + vfio_group_put(group); + return -EBUSY; + } + + /* Is something still in use from a previous open? */ + if (group->container) { + atomic_dec(&group->opened); + vfio_group_put(group); + return -EBUSY; + } + + filep->private_data = group; + + return 0; +} + +static int vfio_group_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = filep->private_data; + + filep->private_data = NULL; + + vfio_group_try_dissolve_container(group); + + atomic_dec(&group->opened); + + vfio_group_put(group); + + return 0; +} + +static const struct file_operations vfio_group_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vfio_group_fops_unl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vfio_group_fops_compat_ioctl, +#endif + .open = vfio_group_fops_open, + .release = vfio_group_fops_release, +}; + +/** + * VFIO Device fd + */ +static int vfio_device_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_device *device = filep->private_data; + + device->ops->release(device->device_data); + + vfio_group_try_dissolve_container(device->group); + + vfio_device_put(device); + + return 0; +} + +static long vfio_device_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_device *device = filep->private_data; + + if (unlikely(!device->ops->ioctl)) + return -EINVAL; + + return device->ops->ioctl(device->device_data, cmd, arg); +} + +static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_device *device = filep->private_data; + + if (unlikely(!device->ops->read)) + return -EINVAL; + + return device->ops->read(device->device_data, buf, count, ppos); +} + +static ssize_t vfio_device_fops_write(struct file *filep, + const char __user *buf, + size_t count, loff_t *ppos) +{ + struct vfio_device *device = filep->private_data; + + if (unlikely(!device->ops->write)) + return -EINVAL; + + return device->ops->write(device->device_data, buf, count, ppos); +} + +static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) +{ + struct vfio_device *device = filep->private_data; + + if (unlikely(!device->ops->mmap)) + return -EINVAL; + + return device->ops->mmap(device->device_data, vma); +} + +#ifdef CONFIG_COMPAT +static long vfio_device_fops_compat_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + arg = (unsigned long)compat_ptr(arg); + return vfio_device_fops_unl_ioctl(filep, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static const struct file_operations vfio_device_fops = { + .owner = THIS_MODULE, + .release = vfio_device_fops_release, + .read = vfio_device_fops_read, + .write = vfio_device_fops_write, + .unlocked_ioctl = vfio_device_fops_unl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vfio_device_fops_compat_ioctl, +#endif + .mmap = vfio_device_fops_mmap, +}; + +/** + * External user API, exported by symbols to be linked dynamically. + * + * The protocol includes: + * 1. do normal VFIO init operation: + * - opening a new container; + * - attaching group(s) to it; + * - setting an IOMMU driver for a container. + * When IOMMU is set for a container, all groups in it are + * considered ready to use by an external user. + * + * 2. User space passes a group fd to an external user. + * The external user calls vfio_group_get_external_user() + * to verify that: + * - the group is initialized; + * - IOMMU is set for it. + * If both checks passed, vfio_group_get_external_user() + * increments the container user counter to prevent + * the VFIO group from disposal before KVM exits. + * + * 3. The external user calls vfio_external_user_iommu_id() + * to know an IOMMU ID. + * + * 4. When the external KVM finishes, it calls + * vfio_group_put_external_user() to release the VFIO group. + * This call decrements the container user counter. + */ +struct vfio_group *vfio_group_get_external_user(struct file *filep) +{ + struct vfio_group *group = filep->private_data; + + if (filep->f_op != &vfio_group_fops) + return ERR_PTR(-EINVAL); + + if (!atomic_inc_not_zero(&group->container_users)) + return ERR_PTR(-EINVAL); + + if (!group->container->iommu_driver || + !vfio_group_viable(group)) { + atomic_dec(&group->container_users); + return ERR_PTR(-EINVAL); + } + + vfio_group_get(group); + + return group; +} +EXPORT_SYMBOL_GPL(vfio_group_get_external_user); + +void vfio_group_put_external_user(struct vfio_group *group) +{ + vfio_group_put(group); + vfio_group_try_dissolve_container(group); +} +EXPORT_SYMBOL_GPL(vfio_group_put_external_user); + +int vfio_external_user_iommu_id(struct vfio_group *group) +{ + return iommu_group_id(group->iommu_group); +} +EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id); + +long vfio_external_check_extension(struct vfio_group *group, unsigned long arg) +{ + return vfio_ioctl_check_extension(group->container, arg); +} +EXPORT_SYMBOL_GPL(vfio_external_check_extension); + +/** + * Module/class support + */ +static char *vfio_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); +} + +static struct miscdevice vfio_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &vfio_fops, + .nodename = "vfio/vfio", + .mode = S_IRUGO | S_IWUGO, +}; + +static int __init vfio_init(void) +{ + int ret; + + idr_init(&vfio.group_idr); + mutex_init(&vfio.group_lock); + mutex_init(&vfio.iommu_drivers_lock); + INIT_LIST_HEAD(&vfio.group_list); + INIT_LIST_HEAD(&vfio.iommu_drivers_list); + init_waitqueue_head(&vfio.release_q); + + ret = misc_register(&vfio_dev); + if (ret) { + pr_err("vfio: misc device register failed\n"); + return ret; + } + + /* /dev/vfio/$GROUP */ + vfio.class = class_create(THIS_MODULE, "vfio"); + if (IS_ERR(vfio.class)) { + ret = PTR_ERR(vfio.class); + goto err_class; + } + + vfio.class->devnode = vfio_devnode; + + ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK, "vfio"); + if (ret) + goto err_alloc_chrdev; + + cdev_init(&vfio.group_cdev, &vfio_group_fops); + ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK); + if (ret) + goto err_cdev_add; + + pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); + + /* + * Attempt to load known iommu-drivers. This gives us a working + * environment without the user needing to explicitly load iommu + * drivers. + */ + request_module_nowait("vfio_iommu_type1"); + request_module_nowait("vfio_iommu_spapr_tce"); + + return 0; + +err_cdev_add: + unregister_chrdev_region(vfio.group_devt, MINORMASK); +err_alloc_chrdev: + class_destroy(vfio.class); + vfio.class = NULL; +err_class: + misc_deregister(&vfio_dev); + return ret; +} + +static void __exit vfio_cleanup(void) +{ + WARN_ON(!list_empty(&vfio.group_list)); + + idr_destroy(&vfio.group_idr); + cdev_del(&vfio.group_cdev); + unregister_chrdev_region(vfio.group_devt, MINORMASK); + class_destroy(vfio.class); + vfio.class = NULL; + misc_deregister(&vfio_dev); +} + +module_init(vfio_init); +module_exit(vfio_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); diff --git a/kernel/drivers/vfio/vfio_iommu_spapr_tce.c b/kernel/drivers/vfio/vfio_iommu_spapr_tce.c new file mode 100644 index 000000000..730b4ef3e --- /dev/null +++ b/kernel/drivers/vfio/vfio_iommu_spapr_tce.c @@ -0,0 +1,392 @@ +/* + * VFIO: IOMMU DMA mapping support for TCE on POWER + * + * Copyright (C) 2013 IBM Corp. All rights reserved. + * Author: Alexey Kardashevskiy <aik@ozlabs.ru> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio_iommu_type1.c: + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + */ + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/err.h> +#include <linux/vfio.h> +#include <asm/iommu.h> +#include <asm/tce.h> + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "aik@ozlabs.ru" +#define DRIVER_DESC "VFIO IOMMU SPAPR TCE" + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group); + +/* + * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation + * + * This code handles mapping and unmapping of user data buffers + * into DMA'ble space using the IOMMU + */ + +/* + * The container descriptor supports only a single group per container. + * Required by the API as the container is not supplied with the IOMMU group + * at the moment of initialization. + */ +struct tce_container { + struct mutex lock; + struct iommu_table *tbl; + bool enabled; +}; + +static int tce_iommu_enable(struct tce_container *container) +{ + int ret = 0; + unsigned long locked, lock_limit, npages; + struct iommu_table *tbl = container->tbl; + + if (!container->tbl) + return -ENXIO; + + if (!current->mm) + return -ESRCH; /* process exited */ + + if (container->enabled) + return -EBUSY; + + /* + * When userspace pages are mapped into the IOMMU, they are effectively + * locked memory, so, theoretically, we need to update the accounting + * of locked pages on each map and unmap. For powerpc, the map unmap + * paths can be very hot, though, and the accounting would kill + * performance, especially since it would be difficult to impossible + * to handle the accounting in real mode only. + * + * To address that, rather than precisely accounting every page, we + * instead account for a worst case on locked memory when the iommu is + * enabled and disabled. The worst case upper bound on locked memory + * is the size of the whole iommu window, which is usually relatively + * small (compared to total memory sizes) on POWER hardware. + * + * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, + * that would effectively kill the guest at random points, much better + * enforcing the limit based on the max that the guest can map. + */ + down_write(¤t->mm->mmap_sem); + npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", + rlimit(RLIMIT_MEMLOCK)); + ret = -ENOMEM; + } else { + + current->mm->locked_vm += npages; + container->enabled = true; + } + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void tce_iommu_disable(struct tce_container *container) +{ + if (!container->enabled) + return; + + container->enabled = false; + + if (!container->tbl || !current->mm) + return; + + down_write(¤t->mm->mmap_sem); + current->mm->locked_vm -= (container->tbl->it_size << + IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; + up_write(¤t->mm->mmap_sem); +} + +static void *tce_iommu_open(unsigned long arg) +{ + struct tce_container *container; + + if (arg != VFIO_SPAPR_TCE_IOMMU) { + pr_err("tce_vfio: Wrong IOMMU type\n"); + return ERR_PTR(-EINVAL); + } + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return ERR_PTR(-ENOMEM); + + mutex_init(&container->lock); + + return container; +} + +static void tce_iommu_release(void *iommu_data) +{ + struct tce_container *container = iommu_data; + + WARN_ON(container->tbl && !container->tbl->it_group); + tce_iommu_disable(container); + + if (container->tbl && container->tbl->it_group) + tce_iommu_detach_group(iommu_data, container->tbl->it_group); + + mutex_destroy(&container->lock); + + kfree(container); +} + +static long tce_iommu_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct tce_container *container = iommu_data; + unsigned long minsz; + long ret; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + switch (arg) { + case VFIO_SPAPR_TCE_IOMMU: + ret = 1; + break; + default: + ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg); + break; + } + + return (ret < 0) ? 0 : ret; + + case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { + struct vfio_iommu_spapr_tce_info info; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_spapr_tce_info, + dma32_window_size); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K; + info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K; + info.flags = 0; + + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + + return 0; + } + case VFIO_IOMMU_MAP_DMA: { + struct vfio_iommu_type1_dma_map param; + struct iommu_table *tbl = container->tbl; + unsigned long tce, i; + + if (!tbl) + return -ENXIO; + + BUG_ON(!tbl->it_group); + + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE)) + return -EINVAL; + + if ((param.size & ~IOMMU_PAGE_MASK_4K) || + (param.vaddr & ~IOMMU_PAGE_MASK_4K)) + return -EINVAL; + + /* iova is checked by the IOMMU API */ + tce = param.vaddr; + if (param.flags & VFIO_DMA_MAP_FLAG_READ) + tce |= TCE_PCI_READ; + if (param.flags & VFIO_DMA_MAP_FLAG_WRITE) + tce |= TCE_PCI_WRITE; + + ret = iommu_tce_put_param_check(tbl, param.iova, tce); + if (ret) + return ret; + + for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) { + ret = iommu_put_tce_user_mode(tbl, + (param.iova >> IOMMU_PAGE_SHIFT_4K) + i, + tce); + if (ret) + break; + tce += IOMMU_PAGE_SIZE_4K; + } + if (ret) + iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT_4K, i); + + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_UNMAP_DMA: { + struct vfio_iommu_type1_dma_unmap param; + struct iommu_table *tbl = container->tbl; + + if (WARN_ON(!tbl)) + return -ENXIO; + + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, + size); + + if (copy_from_user(¶m, (void __user *)arg, minsz)) + return -EFAULT; + + if (param.argsz < minsz) + return -EINVAL; + + /* No flag is supported now */ + if (param.flags) + return -EINVAL; + + if (param.size & ~IOMMU_PAGE_MASK_4K) + return -EINVAL; + + ret = iommu_tce_clear_param_check(tbl, param.iova, 0, + param.size >> IOMMU_PAGE_SHIFT_4K); + if (ret) + return ret; + + ret = iommu_clear_tces_and_put_pages(tbl, + param.iova >> IOMMU_PAGE_SHIFT_4K, + param.size >> IOMMU_PAGE_SHIFT_4K); + iommu_flush_tce(tbl); + + return ret; + } + case VFIO_IOMMU_ENABLE: + mutex_lock(&container->lock); + ret = tce_iommu_enable(container); + mutex_unlock(&container->lock); + return ret; + + + case VFIO_IOMMU_DISABLE: + mutex_lock(&container->lock); + tce_iommu_disable(container); + mutex_unlock(&container->lock); + return 0; + case VFIO_EEH_PE_OP: + if (!container->tbl || !container->tbl->it_group) + return -ENODEV; + + return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group, + cmd, arg); + } + + return -ENOTTY; +} + +static int tce_iommu_attach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + int ret; + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + + /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + if (container->tbl) { + pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n", + iommu_group_id(container->tbl->it_group), + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else if (container->enabled) { + pr_err("tce_vfio: attaching group #%u to enabled container\n", + iommu_group_id(iommu_group)); + ret = -EBUSY; + } else { + ret = iommu_take_ownership(tbl); + if (!ret) + container->tbl = tbl; + } + + mutex_unlock(&container->lock); + + return ret; +} + +static void tce_iommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct tce_container *container = iommu_data; + struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group); + + BUG_ON(!tbl); + mutex_lock(&container->lock); + if (tbl != container->tbl) { + pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n", + iommu_group_id(iommu_group), + iommu_group_id(tbl->it_group)); + } else { + if (container->enabled) { + pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n", + iommu_group_id(tbl->it_group)); + tce_iommu_disable(container); + } + + /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n", + iommu_group_id(iommu_group), iommu_group); */ + container->tbl = NULL; + iommu_release_ownership(tbl); + } + mutex_unlock(&container->lock); +} + +const struct vfio_iommu_driver_ops tce_iommu_driver_ops = { + .name = "iommu-vfio-powerpc", + .owner = THIS_MODULE, + .open = tce_iommu_open, + .release = tce_iommu_release, + .ioctl = tce_iommu_ioctl, + .attach_group = tce_iommu_attach_group, + .detach_group = tce_iommu_detach_group, +}; + +static int __init tce_iommu_init(void) +{ + return vfio_register_iommu_driver(&tce_iommu_driver_ops); +} + +static void __exit tce_iommu_cleanup(void) +{ + vfio_unregister_iommu_driver(&tce_iommu_driver_ops); +} + +module_init(tce_iommu_init); +module_exit(tce_iommu_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); + diff --git a/kernel/drivers/vfio/vfio_iommu_type1.c b/kernel/drivers/vfio/vfio_iommu_type1.c new file mode 100644 index 000000000..57d8c37a0 --- /dev/null +++ b/kernel/drivers/vfio/vfio_iommu_type1.c @@ -0,0 +1,1054 @@ +/* + * VFIO: IOMMU DMA mapping support for Type1 IOMMU + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + * + * We arbitrarily define a Type1 IOMMU as one matching the below code. + * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel + * VT-d, but that makes it harder to re-use as theoretically anyone + * implementing a similar IOMMU could make use of this. We expect the + * IOMMU to support the IOMMU API and have few to no restrictions around + * the IOVA range that can be mapped. The Type1 IOMMU is currently + * optimized for relatively static mappings of a userspace process with + * userpsace pages pinned into memory. We also assume devices and IOMMU + * domains are PCI based as the IOMMU API is still centered around a + * device/bus interface rather than a group interface. + */ + +#include <linux/compat.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/iommu.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/rbtree.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <linux/workqueue.h> + +#define DRIVER_VERSION "0.2" +#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" +#define DRIVER_DESC "Type1 IOMMU driver for VFIO" + +static bool allow_unsafe_interrupts; +module_param_named(allow_unsafe_interrupts, + allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(allow_unsafe_interrupts, + "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); + +static bool disable_hugepages; +module_param_named(disable_hugepages, + disable_hugepages, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_hugepages, + "Disable VFIO IOMMU support for IOMMU hugepages."); + +struct vfio_iommu { + struct list_head domain_list; + struct mutex lock; + struct rb_root dma_list; + bool v2; + bool nesting; +}; + +struct vfio_domain { + struct iommu_domain *domain; + struct list_head next; + struct list_head group_list; + int prot; /* IOMMU_CACHE */ + bool fgsp; /* Fine-grained super pages */ +}; + +struct vfio_dma { + struct rb_node node; + dma_addr_t iova; /* Device address */ + unsigned long vaddr; /* Process virtual addr */ + size_t size; /* Map size (bytes) */ + int prot; /* IOMMU_READ/WRITE */ +}; + +struct vfio_group { + struct iommu_group *iommu_group; + struct list_head next; +}; + +/* + * This code handles mapping and unmapping of user data buffers + * into DMA'ble space using the IOMMU + */ + +static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, + dma_addr_t start, size_t size) +{ + struct rb_node *node = iommu->dma_list.rb_node; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start + size <= dma->iova) + node = node->rb_left; + else if (start >= dma->iova + dma->size) + node = node->rb_right; + else + return dma; + } + + return NULL; +} + +static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) +{ + struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; + struct vfio_dma *dma; + + while (*link) { + parent = *link; + dma = rb_entry(parent, struct vfio_dma, node); + + if (new->iova + new->size <= dma->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &iommu->dma_list); +} + +static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) +{ + rb_erase(&old->node, &iommu->dma_list); +} + +struct vwork { + struct mm_struct *mm; + long npage; + struct work_struct work; +}; + +/* delayed decrement/increment for locked_vm */ +static void vfio_lock_acct_bg(struct work_struct *work) +{ + struct vwork *vwork = container_of(work, struct vwork, work); + struct mm_struct *mm; + + mm = vwork->mm; + down_write(&mm->mmap_sem); + mm->locked_vm += vwork->npage; + up_write(&mm->mmap_sem); + mmput(mm); + kfree(vwork); +} + +static void vfio_lock_acct(long npage) +{ + struct vwork *vwork; + struct mm_struct *mm; + + if (!current->mm || !npage) + return; /* process exited or nothing to do */ + + if (down_write_trylock(¤t->mm->mmap_sem)) { + current->mm->locked_vm += npage; + up_write(¤t->mm->mmap_sem); + return; + } + + /* + * Couldn't get mmap_sem lock, so must setup to update + * mm->locked_vm later. If locked_vm were atomic, we + * wouldn't need this silliness + */ + vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); + if (!vwork) + return; + mm = get_task_mm(current); + if (!mm) { + kfree(vwork); + return; + } + INIT_WORK(&vwork->work, vfio_lock_acct_bg); + vwork->mm = mm; + vwork->npage = npage; + schedule_work(&vwork->work); +} + +/* + * Some mappings aren't backed by a struct page, for example an mmap'd + * MMIO range for our own or another device. These use a different + * pfn conversion and shouldn't be tracked as locked pages. + */ +static bool is_invalid_reserved_pfn(unsigned long pfn) +{ + if (pfn_valid(pfn)) { + bool reserved; + struct page *tail = pfn_to_page(pfn); + struct page *head = compound_head(tail); + reserved = !!(PageReserved(head)); + if (head != tail) { + /* + * "head" is not a dangling pointer + * (compound_head takes care of that) + * but the hugepage may have been split + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. + */ + smp_rmb(); + if (PageTail(tail)) + return reserved; + } + return PageReserved(tail); + } + + return true; +} + +static int put_pfn(unsigned long pfn, int prot) +{ + if (!is_invalid_reserved_pfn(pfn)) { + struct page *page = pfn_to_page(pfn); + if (prot & IOMMU_WRITE) + SetPageDirty(page); + put_page(page); + return 1; + } + return 0; +} + +static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) +{ + struct page *page[1]; + struct vm_area_struct *vma; + int ret = -EFAULT; + + if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { + *pfn = page_to_pfn(page[0]); + return 0; + } + + down_read(¤t->mm->mmap_sem); + + vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); + + if (vma && vma->vm_flags & VM_PFNMAP) { + *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + if (is_invalid_reserved_pfn(*pfn)) + ret = 0; + } + + up_read(¤t->mm->mmap_sem); + + return ret; +} + +/* + * Attempt to pin pages. We really don't want to track all the pfns and + * the iommu can only map chunks of consecutive pfns anyway, so get the + * first page and all consecutive pages with the same locking. + */ +static long vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) +{ + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, i; + bool rsvd; + + if (!current->mm) + return -ENODEV; + + ret = vaddr_get_pfn(vaddr, prot, pfn_base); + if (ret) + return ret; + + rsvd = is_invalid_reserved_pfn(*pfn_base); + + if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + if (unlikely(disable_hugepages)) { + if (!rsvd) + vfio_lock_acct(1); + return 1; + } + + /* Lock all the consecutive pages from pfn_base */ + for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { + unsigned long pfn = 0; + + ret = vaddr_get_pfn(vaddr, prot, &pfn); + if (ret) + break; + + if (pfn != *pfn_base + i || + rsvd != is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, prot); + break; + } + + if (!rsvd && !lock_cap && + current->mm->locked_vm + i + 1 > limit) { + put_pfn(pfn, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + break; + } + } + + if (!rsvd) + vfio_lock_acct(i); + + return i; +} + +static long vfio_unpin_pages(unsigned long pfn, long npage, + int prot, bool do_accounting) +{ + unsigned long unlocked = 0; + long i; + + for (i = 0; i < npage; i++) + unlocked += put_pfn(pfn++, prot); + + if (do_accounting) + vfio_lock_acct(-unlocked); + + return unlocked; +} + +static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) +{ + dma_addr_t iova = dma->iova, end = dma->iova + dma->size; + struct vfio_domain *domain, *d; + long unlocked = 0; + + if (!dma->size) + return; + /* + * We use the IOMMU to track the physical addresses, otherwise we'd + * need a much more complicated tracking system. Unfortunately that + * means we need to use one of the iommu domains to figure out the + * pfns to unpin. The rest need to be unmapped in advance so we have + * no iommu translations remaining when the pages are unpinned. + */ + domain = d = list_first_entry(&iommu->domain_list, + struct vfio_domain, next); + + list_for_each_entry_continue(d, &iommu->domain_list, next) { + iommu_unmap(d->domain, dma->iova, dma->size); + cond_resched(); + } + + while (iova < end) { + size_t unmapped, len; + phys_addr_t phys, next; + + phys = iommu_iova_to_phys(domain->domain, iova); + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; + } + + /* + * To optimize for fewer iommu_unmap() calls, each of which + * may require hardware cache flushing, try to find the + * largest contiguous physical memory chunk to unmap. + */ + for (len = PAGE_SIZE; + !domain->fgsp && iova + len < end; len += PAGE_SIZE) { + next = iommu_iova_to_phys(domain->domain, iova + len); + if (next != phys + len) + break; + } + + unmapped = iommu_unmap(domain->domain, iova, len); + if (WARN_ON(!unmapped)) + break; + + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); + iova += unmapped; + + cond_resched(); + } + + vfio_lock_acct(-unlocked); +} + +static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) +{ + vfio_unmap_unpin(iommu, dma); + vfio_unlink_dma(iommu, dma); + kfree(dma); +} + +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) +{ + struct vfio_domain *domain; + unsigned long bitmap = PAGE_MASK; + + mutex_lock(&iommu->lock); + list_for_each_entry(domain, &iommu->domain_list, next) + bitmap &= domain->domain->ops->pgsize_bitmap; + mutex_unlock(&iommu->lock); + + return bitmap; +} + +static int vfio_dma_do_unmap(struct vfio_iommu *iommu, + struct vfio_iommu_type1_dma_unmap *unmap) +{ + uint64_t mask; + struct vfio_dma *dma; + size_t unmapped = 0; + int ret = 0; + + mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; + + if (unmap->iova & mask) + return -EINVAL; + if (!unmap->size || unmap->size & mask) + return -EINVAL; + + WARN_ON(mask & PAGE_MASK); + + mutex_lock(&iommu->lock); + + /* + * vfio-iommu-type1 (v1) - User mappings were coalesced together to + * avoid tracking individual mappings. This means that the granularity + * of the original mapping was lost and the user was allowed to attempt + * to unmap any range. Depending on the contiguousness of physical + * memory and page sizes supported by the IOMMU, arbitrary unmaps may + * or may not have worked. We only guaranteed unmap granularity + * matching the original mapping; even though it was untracked here, + * the original mappings are reflected in IOMMU mappings. This + * resulted in a couple unusual behaviors. First, if a range is not + * able to be unmapped, ex. a set of 4k pages that was mapped as a + * 2M hugepage into the IOMMU, the unmap ioctl returns success but with + * a zero sized unmap. Also, if an unmap request overlaps the first + * address of a hugepage, the IOMMU will unmap the entire hugepage. + * This also returns success and the returned unmap size reflects the + * actual size unmapped. + * + * We attempt to maintain compatibility with this "v1" interface, but + * we take control out of the hands of the IOMMU. Therefore, an unmap + * request offset from the beginning of the original mapping will + * return success with zero sized unmap. And an unmap request covering + * the first iova of mapping will unmap the entire range. + * + * The v2 version of this interface intends to be more deterministic. + * Unmap requests must fully cover previous mappings. Multiple + * mappings may still be unmaped by specifying large ranges, but there + * must not be any previous mappings bisected by the range. An error + * will be returned if these conditions are not met. The v2 interface + * will only return success and a size of zero if there were no + * mappings within the range. + */ + if (iommu->v2) { + dma = vfio_find_dma(iommu, unmap->iova, 0); + if (dma && dma->iova != unmap->iova) { + ret = -EINVAL; + goto unlock; + } + dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0); + if (dma && dma->iova + dma->size != unmap->iova + unmap->size) { + ret = -EINVAL; + goto unlock; + } + } + + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { + if (!iommu->v2 && unmap->iova > dma->iova) + break; + unmapped += dma->size; + vfio_remove_dma(iommu, dma); + } + +unlock: + mutex_unlock(&iommu->lock); + + /* Report how much was unmapped */ + unmap->size = unmapped; + + return ret; +} + +/* + * Turns out AMD IOMMU has a page table bug where it won't map large pages + * to a region that previously mapped smaller pages. This should be fixed + * soon, so this is just a temporary workaround to break mappings down into + * PAGE_SIZE. Better to map smaller pages than nothing. + */ +static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + long i; + int ret; + + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { + ret = iommu_map(domain->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + PAGE_SIZE, prot | domain->prot); + if (ret) + break; + } + + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) + iommu_unmap(domain->domain, iova, PAGE_SIZE); + + return ret; +} + +static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + struct vfio_domain *d; + int ret; + + list_for_each_entry(d, &iommu->domain_list, next) { + ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot | d->prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(d, iova, pfn, npage, prot)) + goto unwind; + } + + cond_resched(); + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) + iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); + + return ret; +} + +static int vfio_dma_do_map(struct vfio_iommu *iommu, + struct vfio_iommu_type1_dma_map *map) +{ + dma_addr_t iova = map->iova; + unsigned long vaddr = map->vaddr; + size_t size = map->size; + long npage; + int ret = 0, prot = 0; + uint64_t mask; + struct vfio_dma *dma; + unsigned long pfn; + + /* Verify that none of our __u64 fields overflow */ + if (map->size != size || map->vaddr != vaddr || map->iova != iova) + return -EINVAL; + + mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; + + WARN_ON(mask & PAGE_MASK); + + /* READ/WRITE from device perspective */ + if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) + prot |= IOMMU_WRITE; + if (map->flags & VFIO_DMA_MAP_FLAG_READ) + prot |= IOMMU_READ; + + if (!prot || !size || (size | iova | vaddr) & mask) + return -EINVAL; + + /* Don't allow IOVA or virtual address wrap */ + if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) + return -EINVAL; + + mutex_lock(&iommu->lock); + + if (vfio_find_dma(iommu, iova, size)) { + mutex_unlock(&iommu->lock); + return -EEXIST; + } + + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + mutex_unlock(&iommu->lock); + return -ENOMEM; + } + + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + + /* Insert zero-sized and grow as we map chunks of it */ + vfio_link_dma(iommu, dma); + + while (size) { + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages(vaddr + dma->size, + size >> PAGE_SHIFT, prot, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } + + /* Map it! */ + ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot); + if (ret) { + vfio_unpin_pages(pfn, npage, prot, true); + break; + } + + size -= npage << PAGE_SHIFT; + dma->size += npage << PAGE_SHIFT; + } + + if (ret) + vfio_remove_dma(iommu, dma); + + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_bus_type(struct device *dev, void *data) +{ + struct bus_type **bus = data; + + if (*bus && *bus != dev->bus) + return -EINVAL; + + *bus = dev->bus; + + return 0; +} + +static int vfio_iommu_replay(struct vfio_iommu *iommu, + struct vfio_domain *domain) +{ + struct vfio_domain *d; + struct rb_node *n; + int ret; + + /* Arbitrarily pick the first domain in the list for lookups */ + d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); + n = rb_first(&iommu->dma_list); + + /* If there's not a domain, there better not be any mappings */ + if (WARN_ON(n && !d)) + return -EINVAL; + + for (; n; n = rb_next(n)) { + struct vfio_dma *dma; + dma_addr_t iova; + + dma = rb_entry(n, struct vfio_dma, node); + iova = dma->iova; + + while (iova < dma->iova + dma->size) { + phys_addr_t phys = iommu_iova_to_phys(d->domain, iova); + size_t size; + + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; + } + + size = PAGE_SIZE; + + while (iova + size < dma->iova + dma->size && + phys + size == iommu_iova_to_phys(d->domain, + iova + size)) + size += PAGE_SIZE; + + ret = iommu_map(domain->domain, iova, phys, + size, dma->prot | domain->prot); + if (ret) + return ret; + + iova += size; + } + } + + return 0; +} + +/* + * We change our unmap behavior slightly depending on whether the IOMMU + * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage + * for practically any contiguous power-of-two mapping we give it. This means + * we don't need to look for contiguous chunks ourselves to make unmapping + * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d + * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks + * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when + * hugetlbfs is in use. + */ +static void vfio_test_domain_fgsp(struct vfio_domain *domain) +{ + struct page *pages; + int ret, order = get_order(PAGE_SIZE * 2); + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!pages) + return; + + ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, + IOMMU_READ | IOMMU_WRITE | domain->prot); + if (!ret) { + size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); + + if (unmapped == PAGE_SIZE) + iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE); + else + domain->fgsp = true; + } + + __free_pages(pages, order); +} + +static int vfio_iommu_type1_attach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct vfio_iommu *iommu = iommu_data; + struct vfio_group *group, *g; + struct vfio_domain *domain, *d; + struct bus_type *bus = NULL; + int ret; + + mutex_lock(&iommu->lock); + + list_for_each_entry(d, &iommu->domain_list, next) { + list_for_each_entry(g, &d->group_list, next) { + if (g->iommu_group != iommu_group) + continue; + + mutex_unlock(&iommu->lock); + return -EINVAL; + } + } + + group = kzalloc(sizeof(*group), GFP_KERNEL); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!group || !domain) { + ret = -ENOMEM; + goto out_free; + } + + group->iommu_group = iommu_group; + + /* Determine bus_type in order to allocate a domain */ + ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type); + if (ret) + goto out_free; + + domain->domain = iommu_domain_alloc(bus); + if (!domain->domain) { + ret = -EIO; + goto out_free; + } + + if (iommu->nesting) { + int attr = 1; + + ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING, + &attr); + if (ret) + goto out_domain; + } + + ret = iommu_attach_group(domain->domain, iommu_group); + if (ret) + goto out_domain; + + INIT_LIST_HEAD(&domain->group_list); + list_add(&group->next, &domain->group_list); + + if (!allow_unsafe_interrupts && + !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) { + pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", + __func__); + ret = -EPERM; + goto out_detach; + } + + if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY)) + domain->prot |= IOMMU_CACHE; + + /* + * Try to match an existing compatible domain. We don't want to + * preclude an IOMMU driver supporting multiple bus_types and being + * able to include different bus_types in the same IOMMU domain, so + * we test whether the domains use the same iommu_ops rather than + * testing if they're on the same bus_type. + */ + list_for_each_entry(d, &iommu->domain_list, next) { + if (d->domain->ops == domain->domain->ops && + d->prot == domain->prot) { + iommu_detach_group(domain->domain, iommu_group); + if (!iommu_attach_group(d->domain, iommu_group)) { + list_add(&group->next, &d->group_list); + iommu_domain_free(domain->domain); + kfree(domain); + mutex_unlock(&iommu->lock); + return 0; + } + + ret = iommu_attach_group(domain->domain, iommu_group); + if (ret) + goto out_domain; + } + } + + vfio_test_domain_fgsp(domain); + + /* replay mappings on new domains */ + ret = vfio_iommu_replay(iommu, domain); + if (ret) + goto out_detach; + + list_add(&domain->next, &iommu->domain_list); + + mutex_unlock(&iommu->lock); + + return 0; + +out_detach: + iommu_detach_group(domain->domain, iommu_group); +out_domain: + iommu_domain_free(domain->domain); +out_free: + kfree(domain); + kfree(group); + mutex_unlock(&iommu->lock); + return ret; +} + +static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) +{ + struct rb_node *node; + + while ((node = rb_first(&iommu->dma_list))) + vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); +} + +static void vfio_iommu_type1_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *domain; + struct vfio_group *group; + + mutex_lock(&iommu->lock); + + list_for_each_entry(domain, &iommu->domain_list, next) { + list_for_each_entry(group, &domain->group_list, next) { + if (group->iommu_group != iommu_group) + continue; + + iommu_detach_group(domain->domain, iommu_group); + list_del(&group->next); + kfree(group); + /* + * Group ownership provides privilege, if the group + * list is empty, the domain goes away. If it's the + * last domain, then all the mappings go away too. + */ + if (list_empty(&domain->group_list)) { + if (list_is_singular(&iommu->domain_list)) + vfio_iommu_unmap_unpin_all(iommu); + iommu_domain_free(domain->domain); + list_del(&domain->next); + kfree(domain); + } + goto done; + } + } + +done: + mutex_unlock(&iommu->lock); +} + +static void *vfio_iommu_type1_open(unsigned long arg) +{ + struct vfio_iommu *iommu; + + iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); + if (!iommu) + return ERR_PTR(-ENOMEM); + + switch (arg) { + case VFIO_TYPE1_IOMMU: + break; + case VFIO_TYPE1_NESTING_IOMMU: + iommu->nesting = true; + case VFIO_TYPE1v2_IOMMU: + iommu->v2 = true; + break; + default: + kfree(iommu); + return ERR_PTR(-EINVAL); + } + + INIT_LIST_HEAD(&iommu->domain_list); + iommu->dma_list = RB_ROOT; + mutex_init(&iommu->lock); + + return iommu; +} + +static void vfio_iommu_type1_release(void *iommu_data) +{ + struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *domain, *domain_tmp; + struct vfio_group *group, *group_tmp; + + vfio_iommu_unmap_unpin_all(iommu); + + list_for_each_entry_safe(domain, domain_tmp, + &iommu->domain_list, next) { + list_for_each_entry_safe(group, group_tmp, + &domain->group_list, next) { + iommu_detach_group(domain->domain, group->iommu_group); + list_del(&group->next); + kfree(group); + } + iommu_domain_free(domain->domain); + list_del(&domain->next); + kfree(domain); + } + + kfree(iommu); +} + +static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) +{ + struct vfio_domain *domain; + int ret = 1; + + mutex_lock(&iommu->lock); + list_for_each_entry(domain, &iommu->domain_list, next) { + if (!(domain->prot & IOMMU_CACHE)) { + ret = 0; + break; + } + } + mutex_unlock(&iommu->lock); + + return ret; +} + +static long vfio_iommu_type1_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + struct vfio_iommu *iommu = iommu_data; + unsigned long minsz; + + if (cmd == VFIO_CHECK_EXTENSION) { + switch (arg) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_TYPE1_NESTING_IOMMU: + return 1; + case VFIO_DMA_CC_IOMMU: + if (!iommu) + return 0; + return vfio_domains_have_iommu_cache(iommu); + default: + return 0; + } + } else if (cmd == VFIO_IOMMU_GET_INFO) { + struct vfio_iommu_type1_info info; + + minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + info.flags = 0; + + info.iova_pgsizes = vfio_pgsize_bitmap(iommu); + + return copy_to_user((void __user *)arg, &info, minsz); + + } else if (cmd == VFIO_IOMMU_MAP_DMA) { + struct vfio_iommu_type1_dma_map map; + uint32_t mask = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + + if (copy_from_user(&map, (void __user *)arg, minsz)) + return -EFAULT; + + if (map.argsz < minsz || map.flags & ~mask) + return -EINVAL; + + return vfio_dma_do_map(iommu, &map); + + } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { + struct vfio_iommu_type1_dma_unmap unmap; + long ret; + + minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + + if (copy_from_user(&unmap, (void __user *)arg, minsz)) + return -EFAULT; + + if (unmap.argsz < minsz || unmap.flags) + return -EINVAL; + + ret = vfio_dma_do_unmap(iommu, &unmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz); + } + + return -ENOTTY; +} + +static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { + .name = "vfio-iommu-type1", + .owner = THIS_MODULE, + .open = vfio_iommu_type1_open, + .release = vfio_iommu_type1_release, + .ioctl = vfio_iommu_type1_ioctl, + .attach_group = vfio_iommu_type1_attach_group, + .detach_group = vfio_iommu_type1_detach_group, +}; + +static int __init vfio_iommu_type1_init(void) +{ + return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); +} + +static void __exit vfio_iommu_type1_cleanup(void) +{ + vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1); +} + +module_init(vfio_iommu_type1_init); +module_exit(vfio_iommu_type1_cleanup); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/vfio_spapr_eeh.c b/kernel/drivers/vfio/vfio_spapr_eeh.c new file mode 100644 index 000000000..5fa42db76 --- /dev/null +++ b/kernel/drivers/vfio/vfio_spapr_eeh.c @@ -0,0 +1,100 @@ +/* + * EEH functionality support for VFIO devices. The feature is only + * available on sPAPR compatible platforms. + * + * Copyright Gavin Shan, IBM Corporation 2014. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/vfio.h> +#include <asm/eeh.h> + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Gavin Shan, IBM Corporation" +#define DRIVER_DESC "VFIO IOMMU SPAPR EEH" + +/* We might build address mapping here for "fast" path later */ +void vfio_spapr_pci_eeh_open(struct pci_dev *pdev) +{ + eeh_dev_open(pdev); +} +EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_open); + +void vfio_spapr_pci_eeh_release(struct pci_dev *pdev) +{ + eeh_dev_release(pdev); +} +EXPORT_SYMBOL_GPL(vfio_spapr_pci_eeh_release); + +long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, + unsigned int cmd, unsigned long arg) +{ + struct eeh_pe *pe; + struct vfio_eeh_pe_op op; + unsigned long minsz; + long ret = -EINVAL; + + switch (cmd) { + case VFIO_CHECK_EXTENSION: + if (arg == VFIO_EEH) + ret = eeh_enabled() ? 1 : 0; + else + ret = 0; + break; + case VFIO_EEH_PE_OP: + pe = eeh_iommu_group_to_pe(group); + if (!pe) + return -ENODEV; + + minsz = offsetofend(struct vfio_eeh_pe_op, op); + if (copy_from_user(&op, (void __user *)arg, minsz)) + return -EFAULT; + if (op.argsz < minsz || op.flags) + return -EINVAL; + + switch (op.op) { + case VFIO_EEH_PE_DISABLE: + ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE); + break; + case VFIO_EEH_PE_ENABLE: + ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE); + break; + case VFIO_EEH_PE_UNFREEZE_IO: + ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO); + break; + case VFIO_EEH_PE_UNFREEZE_DMA: + ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA); + break; + case VFIO_EEH_PE_GET_STATE: + ret = eeh_pe_get_state(pe); + break; + case VFIO_EEH_PE_RESET_DEACTIVATE: + ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE); + break; + case VFIO_EEH_PE_RESET_HOT: + ret = eeh_pe_reset(pe, EEH_RESET_HOT); + break; + case VFIO_EEH_PE_RESET_FUNDAMENTAL: + ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL); + break; + case VFIO_EEH_PE_CONFIGURE: + ret = eeh_pe_configure(pe); + break; + default: + ret = -EINVAL; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/virqfd.c b/kernel/drivers/vfio/virqfd.c new file mode 100644 index 000000000..27c89cd5d --- /dev/null +++ b/kernel/drivers/vfio/virqfd.c @@ -0,0 +1,226 @@ +/* + * VFIO generic eventfd code for IRQFD support. + * Derived from drivers/vfio/pci/vfio_pci_intrs.c + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/vfio.h> +#include <linux/eventfd.h> +#include <linux/file.h> +#include <linux/module.h> +#include <linux/slab.h> + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" +#define DRIVER_DESC "IRQFD support for VFIO bus drivers" + +static struct workqueue_struct *vfio_irqfd_cleanup_wq; +static DEFINE_SPINLOCK(virqfd_lock); + +static int __init vfio_virqfd_init(void) +{ + vfio_irqfd_cleanup_wq = + create_singlethread_workqueue("vfio-irqfd-cleanup"); + if (!vfio_irqfd_cleanup_wq) + return -ENOMEM; + + return 0; +} + +static void __exit vfio_virqfd_exit(void) +{ + destroy_workqueue(vfio_irqfd_cleanup_wq); +} + +static void virqfd_deactivate(struct virqfd *virqfd) +{ + queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown); +} + +static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct virqfd *virqfd = container_of(wait, struct virqfd, wait); + unsigned long flags = (unsigned long)key; + + if (flags & POLLIN) { + /* An event has been signaled, call function */ + if ((!virqfd->handler || + virqfd->handler(virqfd->opaque, virqfd->data)) && + virqfd->thread) + schedule_work(&virqfd->inject); + } + + if (flags & POLLHUP) { + unsigned long flags; + spin_lock_irqsave(&virqfd_lock, flags); + + /* + * The eventfd is closing, if the virqfd has not yet been + * queued for release, as determined by testing whether the + * virqfd pointer to it is still valid, queue it now. As + * with kvm irqfds, we know we won't race against the virqfd + * going away because we hold the lock to get here. + */ + if (*(virqfd->pvirqfd) == virqfd) { + *(virqfd->pvirqfd) = NULL; + virqfd_deactivate(virqfd); + } + + spin_unlock_irqrestore(&virqfd_lock, flags); + } + + return 0; +} + +static void virqfd_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct virqfd *virqfd = container_of(pt, struct virqfd, pt); + add_wait_queue(wqh, &virqfd->wait); +} + +static void virqfd_shutdown(struct work_struct *work) +{ + struct virqfd *virqfd = container_of(work, struct virqfd, shutdown); + u64 cnt; + + eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt); + flush_work(&virqfd->inject); + eventfd_ctx_put(virqfd->eventfd); + + kfree(virqfd); +} + +static void virqfd_inject(struct work_struct *work) +{ + struct virqfd *virqfd = container_of(work, struct virqfd, inject); + if (virqfd->thread) + virqfd->thread(virqfd->opaque, virqfd->data); +} + +int vfio_virqfd_enable(void *opaque, + int (*handler)(void *, void *), + void (*thread)(void *, void *), + void *data, struct virqfd **pvirqfd, int fd) +{ + struct fd irqfd; + struct eventfd_ctx *ctx; + struct virqfd *virqfd; + int ret = 0; + unsigned int events; + + virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); + if (!virqfd) + return -ENOMEM; + + virqfd->pvirqfd = pvirqfd; + virqfd->opaque = opaque; + virqfd->handler = handler; + virqfd->thread = thread; + virqfd->data = data; + + INIT_WORK(&virqfd->shutdown, virqfd_shutdown); + INIT_WORK(&virqfd->inject, virqfd_inject); + + irqfd = fdget(fd); + if (!irqfd.file) { + ret = -EBADF; + goto err_fd; + } + + ctx = eventfd_ctx_fileget(irqfd.file); + if (IS_ERR(ctx)) { + ret = PTR_ERR(ctx); + goto err_ctx; + } + + virqfd->eventfd = ctx; + + /* + * virqfds can be released by closing the eventfd or directly + * through ioctl. These are both done through a workqueue, so + * we update the pointer to the virqfd under lock to avoid + * pushing multiple jobs to release the same virqfd. + */ + spin_lock_irq(&virqfd_lock); + + if (*pvirqfd) { + spin_unlock_irq(&virqfd_lock); + ret = -EBUSY; + goto err_busy; + } + *pvirqfd = virqfd; + + spin_unlock_irq(&virqfd_lock); + + /* + * Install our own custom wake-up handling so we are notified via + * a callback whenever someone signals the underlying eventfd. + */ + init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup); + init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc); + + events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt); + + /* + * Check if there was an event already pending on the eventfd + * before we registered and trigger it as if we didn't miss it. + */ + if (events & POLLIN) { + if ((!handler || handler(opaque, data)) && thread) + schedule_work(&virqfd->inject); + } + + /* + * Do not drop the file until the irqfd is fully initialized, + * otherwise we might race against the POLLHUP. + */ + fdput(irqfd); + + return 0; +err_busy: + eventfd_ctx_put(ctx); +err_ctx: + fdput(irqfd); +err_fd: + kfree(virqfd); + + return ret; +} +EXPORT_SYMBOL_GPL(vfio_virqfd_enable); + +void vfio_virqfd_disable(struct virqfd **pvirqfd) +{ + unsigned long flags; + + spin_lock_irqsave(&virqfd_lock, flags); + + if (*pvirqfd) { + virqfd_deactivate(*pvirqfd); + *pvirqfd = NULL; + } + + spin_unlock_irqrestore(&virqfd_lock, flags); + + /* + * Block until we know all outstanding shutdown jobs have completed. + * Even if we don't queue the job, flush the wq to be sure it's + * been released. + */ + flush_workqueue(vfio_irqfd_cleanup_wq); +} +EXPORT_SYMBOL_GPL(vfio_virqfd_disable); + +module_init(vfio_virqfd_init); +module_exit(vfio_virqfd_exit); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); |