author:    José Pekkarinen <jose.pekkarinen@nokia.com>  2016-04-11 10:41:07 +0300
committer: José Pekkarinen <jose.pekkarinen@nokia.com>  2016-04-13 08:17:18 +0300
commit:    e09b41010ba33a20a87472ee821fa407a5b8da36
tree:      d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/drivers/vfio
parent:    f93b97fd65072de626c074dbe099a1fff05ce060
These changes are the raw update to linux-4.4.6-rt14. The kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.
During the rebase, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
The colliding hunks were dropped because their logic was already present
in the source.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/drivers/vfio')
21 files changed, 1647 insertions, 201 deletions
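A recurring fix in the diff below: several ioctl handlers in vfio_pci.c and
vfio_platform_common.c used to return the result of copy_to_user() directly.
copy_to_user() returns the number of bytes it could not copy, not an errno,
so a partial copy surfaced to userspace as a positive value instead of an
error. Each call site is converted to report -EFAULT on any partial copy.
A minimal sketch of the pattern; the names demo_info and demo_get_info are
hypothetical stand-ins for the VFIO ioctl paths, only the idiom is taken
from the diff:

	#include <linux/types.h>
	#include <linux/uaccess.h>
	#include <linux/errno.h>

	struct demo_info {
		__u32 argsz;
		__u32 flags;
	};

	static long demo_get_info(void __user *arg, struct demo_info *info,
				  unsigned long minsz)
	{
		/*
		 * Before: return copy_to_user(arg, info, minsz);
		 * which yields the count of uncopied bytes (> 0 on a
		 * partial copy), never a negative errno.
		 */
		return copy_to_user(arg, info, minsz) ? -EFAULT : 0;
	}
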
diff --git a/kernel/drivers/vfio/Kconfig b/kernel/drivers/vfio/Kconfig index 7d092ddc8..850d86ca6 100644 --- a/kernel/drivers/vfio/Kconfig +++ b/kernel/drivers/vfio/Kconfig @@ -21,7 +21,7 @@ config VFIO_VIRQFD menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" depends on IOMMU_API - select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU) + select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM_SMMU || ARM_SMMU_V3) select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) select VFIO_SPAPR_EEH if (PPC_POWERNV || PPC_PSERIES) select ANON_INODES @@ -33,3 +33,4 @@ menuconfig VFIO source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "virt/lib/Kconfig" diff --git a/kernel/drivers/vfio/pci/Kconfig b/kernel/drivers/vfio/pci/Kconfig index 579d83bf5..02912f180 100644 --- a/kernel/drivers/vfio/pci/Kconfig +++ b/kernel/drivers/vfio/pci/Kconfig @@ -2,6 +2,7 @@ config VFIO_PCI tristate "VFIO support for PCI devices" depends on VFIO && PCI && EVENTFD select VFIO_VIRQFD + select IRQ_BYPASS_MANAGER help Support for the PCI VFIO bus driver. This is required to make use of PCI drivers using the VFIO framework. diff --git a/kernel/drivers/vfio/pci/vfio_pci.c b/kernel/drivers/vfio/pci/vfio_pci.c index e9851add6..9982cb176 100644 --- a/kernel/drivers/vfio/pci/vfio_pci.c +++ b/kernel/drivers/vfio/pci/vfio_pci.c @@ -446,7 +446,8 @@ static long vfio_pci_ioctl(void *device_data, info.num_regions = VFIO_PCI_NUM_REGIONS; info.num_irqs = VFIO_PCI_NUM_IRQS; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct pci_dev *pdev = vdev->pdev; @@ -520,7 +521,8 @@ static long vfio_pci_ioctl(void *device_data, return -EINVAL; } - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -555,7 +557,8 @@ static long vfio_pci_ioctl(void *device_data, else info.flags |= VFIO_IRQ_INFO_NORESIZE; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; @@ -1035,7 +1038,7 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, return PCI_ERS_RESULT_CAN_RECOVER; } -static struct pci_error_handlers vfio_err_handlers = { +static const struct pci_error_handlers vfio_err_handlers = { .error_detected = vfio_pci_aer_err_detected, }; @@ -1056,19 +1059,21 @@ struct vfio_devices { static int vfio_pci_get_devs(struct pci_dev *pdev, void *data) { struct vfio_devices *devs = data; - struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver); - - if (pci_drv != &vfio_pci_driver) - return -EBUSY; + struct vfio_device *device; if (devs->cur_index == devs->max_index) return -ENOSPC; - devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev); - if (!devs->devices[devs->cur_index]) + device = vfio_device_get_from_dev(&pdev->dev); + if (!device) return -EINVAL; - devs->cur_index++; + if (pci_dev_driver(pdev) != &vfio_pci_driver) { + vfio_device_put(device); + return -EBUSY; + } + + devs->devices[devs->cur_index++] = device; return 0; } diff --git a/kernel/drivers/vfio/pci/vfio_pci_config.c b/kernel/drivers/vfio/pci/vfio_pci_config.c index ff75ca31a..fe2b470d7 100644 --- a/kernel/drivers/vfio/pci/vfio_pci_config.c +++ b/kernel/drivers/vfio/pci/vfio_pci_config.c @@ -46,7 +46,7 @@ * 0: Removed from the user visible capability list * FF: Variable length */ -static u8 pci_cap_length[] = { +static const u8 pci_cap_length[PCI_CAP_ID_MAX + 1] = { [PCI_CAP_ID_BASIC] = PCI_STD_HEADER_SIZEOF, /* pci config header */ [PCI_CAP_ID_PM] = PCI_PM_SIZEOF, [PCI_CAP_ID_AGP] = PCI_AGP_SIZEOF, @@ -74,7 +74,7 @@ static u8 pci_cap_length[] = { * 0: Removed or masked from the user visible capabilty list * FF: Variable length */ -static u16 pci_ext_cap_length[] = { +static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { [PCI_EXT_CAP_ID_ERR] = PCI_ERR_ROOT_COMMAND, [PCI_EXT_CAP_ID_VC] = 0xFF, [PCI_EXT_CAP_ID_DSN] = PCI_EXT_CAP_DSN_SIZEOF, @@ -671,6 +671,73 @@ static int __init init_pci_cap_pm_perm(struct perm_bits *perm) return 0; } +static int vfio_vpd_config_write(struct vfio_pci_device *vdev, int pos, + int count, struct perm_bits *perm, + int offset, __le32 val) +{ + struct pci_dev *pdev = vdev->pdev; + __le16 *paddr = (__le16 *)(vdev->vconfig + pos - offset + PCI_VPD_ADDR); + __le32 *pdata = (__le32 *)(vdev->vconfig + pos - offset + PCI_VPD_DATA); + u16 addr; + u32 data; + + /* + * Write through to emulation. If the write includes the upper byte + * of PCI_VPD_ADDR, then the PCI_VPD_ADDR_F bit is written and we + * have work to do. + */ + count = vfio_default_config_write(vdev, pos, count, perm, offset, val); + if (count < 0 || offset > PCI_VPD_ADDR + 1 || + offset + count <= PCI_VPD_ADDR + 1) + return count; + + addr = le16_to_cpu(*paddr); + + if (addr & PCI_VPD_ADDR_F) { + data = le32_to_cpu(*pdata); + if (pci_write_vpd(pdev, addr & ~PCI_VPD_ADDR_F, 4, &data) != 4) + return count; + } else { + if (pci_read_vpd(pdev, addr, 4, &data) != 4) + return count; + *pdata = cpu_to_le32(data); + } + + /* + * Toggle PCI_VPD_ADDR_F in the emulated PCI_VPD_ADDR register to + * signal completion. If an error occurs above, we assume that not + * toggling this bit will induce a driver timeout. 
+ */ + addr ^= PCI_VPD_ADDR_F; + *paddr = cpu_to_le16(addr); + + return count; +} + +/* Permissions for Vital Product Data capability */ +static int __init init_pci_cap_vpd_perm(struct perm_bits *perm) +{ + if (alloc_perm_bits(perm, pci_cap_length[PCI_CAP_ID_VPD])) + return -ENOMEM; + + perm->writefn = vfio_vpd_config_write; + + /* + * We always virtualize the next field so we can remove + * capabilities from the chain if we want to. + */ + p_setb(perm, PCI_CAP_LIST_NEXT, (u8)ALL_VIRT, NO_WRITE); + + /* + * Both the address and data registers are virtualized to + * enable access through the pci_vpd_read/write functions + */ + p_setw(perm, PCI_VPD_ADDR, (u16)ALL_VIRT, (u16)ALL_WRITE); + p_setd(perm, PCI_VPD_DATA, ALL_VIRT, ALL_WRITE); + + return 0; +} + /* Permissions for PCI-X capability */ static int __init init_pci_cap_pcix_perm(struct perm_bits *perm) { @@ -790,6 +857,7 @@ void vfio_pci_uninit_perm_bits(void) free_perm_bits(&cap_perms[PCI_CAP_ID_BASIC]); free_perm_bits(&cap_perms[PCI_CAP_ID_PM]); + free_perm_bits(&cap_perms[PCI_CAP_ID_VPD]); free_perm_bits(&cap_perms[PCI_CAP_ID_PCIX]); free_perm_bits(&cap_perms[PCI_CAP_ID_EXP]); free_perm_bits(&cap_perms[PCI_CAP_ID_AF]); @@ -807,7 +875,7 @@ int __init vfio_pci_init_perm_bits(void) /* Capabilities */ ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]); - cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write; + ret |= init_pci_cap_vpd_perm(&cap_perms[PCI_CAP_ID_VPD]); ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]); cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write; ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]); diff --git a/kernel/drivers/vfio/pci/vfio_pci_intrs.c b/kernel/drivers/vfio/pci/vfio_pci_intrs.c index a21d8e1e3..7373e9e4b 100644 --- a/kernel/drivers/vfio/pci/vfio_pci_intrs.c +++ b/kernel/drivers/vfio/pci/vfio_pci_intrs.c @@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, if (vdev->ctx[vector].trigger) { free_irq(irq, vdev->ctx[vector].trigger); + irq_bypass_unregister_producer(&vdev->ctx[vector].producer); kfree(vdev->ctx[vector].name); eventfd_ctx_put(vdev->ctx[vector].trigger); vdev->ctx[vector].trigger = NULL; @@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev, return ret; } + vdev->ctx[vector].producer.token = trigger; + vdev->ctx[vector].producer.irq = irq; + ret = irq_bypass_register_producer(&vdev->ctx[vector].producer); + if (unlikely(ret)) + dev_info(&pdev->dev, + "irq bypass producer (token %p) registration fails: %d\n", + vdev->ctx[vector].producer.token, ret); + vdev->ctx[vector].trigger = trigger; return 0; diff --git a/kernel/drivers/vfio/pci/vfio_pci_private.h b/kernel/drivers/vfio/pci/vfio_pci_private.h index ae0e1b4c1..0e7394f8f 100644 --- a/kernel/drivers/vfio/pci/vfio_pci_private.h +++ b/kernel/drivers/vfio/pci/vfio_pci_private.h @@ -13,6 +13,7 @@ #include <linux/mutex.h> #include <linux/pci.h> +#include <linux/irqbypass.h> #ifndef VFIO_PCI_PRIVATE_H #define VFIO_PCI_PRIVATE_H @@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx { struct virqfd *mask; char *name; bool masked; + struct irq_bypass_producer producer; }; struct vfio_pci_device { diff --git a/kernel/drivers/vfio/platform/Kconfig b/kernel/drivers/vfio/platform/Kconfig index 9a4403e2a..bb3012878 100644 --- a/kernel/drivers/vfio/platform/Kconfig +++ b/kernel/drivers/vfio/platform/Kconfig @@ -1,6 +1,6 @@ config VFIO_PLATFORM tristate "VFIO support for platform devices" - depends on VFIO && EVENTFD && ARM + depends on VFIO && EVENTFD && (ARM || ARM64) 
select VFIO_VIRQFD help Support for platform devices with VFIO. This is required to make @@ -18,3 +18,5 @@ config VFIO_AMBA framework. If you don't know what to do here, say N. + +source "drivers/vfio/platform/reset/Kconfig" diff --git a/kernel/drivers/vfio/platform/Makefile b/kernel/drivers/vfio/platform/Makefile index 81de144c0..41a6224f5 100644 --- a/kernel/drivers/vfio/platform/Makefile +++ b/kernel/drivers/vfio/platform/Makefile @@ -1,8 +1,12 @@ - -vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o +vfio-platform-base-y := vfio_platform_common.o vfio_platform_irq.o +vfio-platform-y := vfio_platform.o obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o +obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform-base.o +obj-$(CONFIG_VFIO_PLATFORM) += reset/ vfio-amba-y := vfio_amba.o obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o +obj-$(CONFIG_VFIO_AMBA) += vfio-platform-base.o +obj-$(CONFIG_VFIO_AMBA) += reset/ diff --git a/kernel/drivers/vfio/platform/reset/Kconfig b/kernel/drivers/vfio/platform/reset/Kconfig new file mode 100644 index 000000000..70cccc582 --- /dev/null +++ b/kernel/drivers/vfio/platform/reset/Kconfig @@ -0,0 +1,15 @@ +config VFIO_PLATFORM_CALXEDAXGMAC_RESET + tristate "VFIO support for calxeda xgmac reset" + depends on VFIO_PLATFORM + help + Enables the VFIO platform driver to handle reset for Calxeda xgmac + + If you don't know what to do here, say N. + +config VFIO_PLATFORM_AMDXGBE_RESET + tristate "VFIO support for AMD XGBE reset" + depends on VFIO_PLATFORM + help + Enables the VFIO platform driver to handle reset for AMD XGBE + + If you don't know what to do here, say N. diff --git a/kernel/drivers/vfio/platform/reset/Makefile b/kernel/drivers/vfio/platform/reset/Makefile new file mode 100644 index 000000000..93f4e2326 --- /dev/null +++ b/kernel/drivers/vfio/platform/reset/Makefile @@ -0,0 +1,7 @@ +vfio-platform-calxedaxgmac-y := vfio_platform_calxedaxgmac.o +vfio-platform-amdxgbe-y := vfio_platform_amdxgbe.o + +ccflags-y += -Idrivers/vfio/platform + +obj-$(CONFIG_VFIO_PLATFORM_CALXEDAXGMAC_RESET) += vfio-platform-calxedaxgmac.o +obj-$(CONFIG_VFIO_PLATFORM_AMDXGBE_RESET) += vfio-platform-amdxgbe.o diff --git a/kernel/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c b/kernel/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c new file mode 100644 index 000000000..da5356f48 --- /dev/null +++ b/kernel/drivers/vfio/platform/reset/vfio_platform_amdxgbe.c @@ -0,0 +1,127 @@ +/* + * VFIO platform driver specialized for AMD xgbe reset + * reset code is inherited from AMD xgbe native driver + * + * Copyright (c) 2015 Linaro Ltd. + * www.linaro.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <uapi/linux/mdio.h> +#include <linux/delay.h> + +#include "vfio_platform_private.h" + +#define DMA_MR 0x3000 +#define MAC_VR 0x0110 +#define DMA_ISR 0x3008 +#define MAC_ISR 0x00b0 +#define PCS_MMD_SELECT 0xff +#define MDIO_AN_INT 0x8002 +#define MDIO_AN_INTMASK 0x8001 + +static unsigned int xmdio_read(void *ioaddr, unsigned int mmd, + unsigned int reg) +{ + unsigned int mmd_address, value; + + mmd_address = (mmd << 16) | ((reg) & 0xffff); + iowrite32(mmd_address >> 8, ioaddr + (PCS_MMD_SELECT << 2)); + value = ioread32(ioaddr + ((mmd_address & 0xff) << 2)); + return value; +} + +static void xmdio_write(void *ioaddr, unsigned int mmd, + unsigned int reg, unsigned int value) +{ + unsigned int mmd_address; + + mmd_address = (mmd << 16) | ((reg) & 0xffff); + iowrite32(mmd_address >> 8, ioaddr + (PCS_MMD_SELECT << 2)); + iowrite32(value, ioaddr + ((mmd_address & 0xff) << 2)); +} + +int vfio_platform_amdxgbe_reset(struct vfio_platform_device *vdev) +{ + struct vfio_platform_region *xgmac_regs = &vdev->regions[0]; + struct vfio_platform_region *xpcs_regs = &vdev->regions[1]; + u32 dma_mr_value, pcs_value, value; + unsigned int count; + + if (!xgmac_regs->ioaddr) { + xgmac_regs->ioaddr = + ioremap_nocache(xgmac_regs->addr, xgmac_regs->size); + if (!xgmac_regs->ioaddr) + return -ENOMEM; + } + if (!xpcs_regs->ioaddr) { + xpcs_regs->ioaddr = + ioremap_nocache(xpcs_regs->addr, xpcs_regs->size); + if (!xpcs_regs->ioaddr) + return -ENOMEM; + } + + /* reset the PHY through MDIO*/ + pcs_value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_PCS, MDIO_CTRL1); + pcs_value |= MDIO_CTRL1_RESET; + xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_PCS, MDIO_CTRL1, pcs_value); + + count = 50; + do { + msleep(20); + pcs_value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_PCS, + MDIO_CTRL1); + } while ((pcs_value & MDIO_CTRL1_RESET) && --count); + + if (pcs_value & MDIO_CTRL1_RESET) + pr_warn("%s XGBE PHY reset timeout\n", __func__); + + /* disable auto-negotiation */ + value = xmdio_read(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_CTRL1); + value &= ~MDIO_AN_CTRL1_ENABLE; + xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_CTRL1, value); + + /* disable AN IRQ */ + xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_AN_INTMASK, 0); + + /* clear AN IRQ */ + xmdio_write(xpcs_regs->ioaddr, MDIO_MMD_AN, MDIO_AN_INT, 0); + + /* MAC software reset */ + dma_mr_value = ioread32(xgmac_regs->ioaddr + DMA_MR); + dma_mr_value |= 0x1; + iowrite32(dma_mr_value, xgmac_regs->ioaddr + DMA_MR); + + usleep_range(10, 15); + + count = 2000; + while (count-- && (ioread32(xgmac_regs->ioaddr + DMA_MR) & 1)) + usleep_range(500, 600); + + if (!count) + pr_warn("%s MAC SW reset failed\n", __func__); + + return 0; +} + +module_vfio_reset_handler("amd,xgbe-seattle-v1a", vfio_platform_amdxgbe_reset); + +MODULE_VERSION("0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Eric Auger <eric.auger@linaro.org>"); +MODULE_DESCRIPTION("Reset support for AMD xgbe vfio platform device"); diff --git a/kernel/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c b/kernel/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c new file mode 100644 index 000000000..e3d3d948e --- /dev/null +++ b/kernel/drivers/vfio/platform/reset/vfio_platform_calxedaxgmac.c @@ -0,0 +1,85 @@ +/* + * VFIO platform driver specialized for Calxeda xgmac reset + * reset code is inherited from calxeda xgmac native driver + * + * Copyright 2010-2011 Calxeda, Inc. 
+ * Copyright (c) 2015 Linaro Ltd. + * www.linaro.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> + +#include "vfio_platform_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "Eric Auger <eric.auger@linaro.org>" +#define DRIVER_DESC "Reset support for Calxeda xgmac vfio platform device" + +/* XGMAC Register definitions */ +#define XGMAC_CONTROL 0x00000000 /* MAC Configuration */ + +/* DMA Control and Status Registers */ +#define XGMAC_DMA_CONTROL 0x00000f18 /* Ctrl (Operational Mode) */ +#define XGMAC_DMA_INTR_ENA 0x00000f1c /* Interrupt Enable */ + +/* DMA Control registe defines */ +#define DMA_CONTROL_ST 0x00002000 /* Start/Stop Transmission */ +#define DMA_CONTROL_SR 0x00000002 /* Start/Stop Receive */ + +/* Common MAC defines */ +#define MAC_ENABLE_TX 0x00000008 /* Transmitter Enable */ +#define MAC_ENABLE_RX 0x00000004 /* Receiver Enable */ + +static inline void xgmac_mac_disable(void __iomem *ioaddr) +{ + u32 value = readl(ioaddr + XGMAC_DMA_CONTROL); + + value &= ~(DMA_CONTROL_ST | DMA_CONTROL_SR); + writel(value, ioaddr + XGMAC_DMA_CONTROL); + + value = readl(ioaddr + XGMAC_CONTROL); + value &= ~(MAC_ENABLE_TX | MAC_ENABLE_RX); + writel(value, ioaddr + XGMAC_CONTROL); +} + +int vfio_platform_calxedaxgmac_reset(struct vfio_platform_device *vdev) +{ + struct vfio_platform_region *reg = &vdev->regions[0]; + + if (!reg->ioaddr) { + reg->ioaddr = + ioremap_nocache(reg->addr, reg->size); + if (!reg->ioaddr) + return -ENOMEM; + } + + /* disable IRQ */ + writel(0, reg->ioaddr + XGMAC_DMA_INTR_ENA); + + /* Disable the MAC core */ + xgmac_mac_disable(reg->ioaddr); + + return 0; +} + +module_vfio_reset_handler("calxeda,hb-xgmac", vfio_platform_calxedaxgmac_reset); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/platform/vfio_amba.c b/kernel/drivers/vfio/platform/vfio_amba.c index ff0331f72..a66479bd0 100644 --- a/kernel/drivers/vfio/platform/vfio_amba.c +++ b/kernel/drivers/vfio/platform/vfio_amba.c @@ -67,6 +67,7 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) vdev->flags = VFIO_DEVICE_FLAGS_AMBA; vdev->get_resource = get_amba_resource; vdev->get_irq = get_amba_irq; + vdev->parent_module = THIS_MODULE; ret = vfio_platform_probe_common(vdev, &adev->dev); if (ret) { diff --git a/kernel/drivers/vfio/platform/vfio_platform.c b/kernel/drivers/vfio/platform/vfio_platform.c index cef645c83..b1cc3a768 100644 --- a/kernel/drivers/vfio/platform/vfio_platform.c +++ b/kernel/drivers/vfio/platform/vfio_platform.c @@ -65,6 +65,7 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM; vdev->get_resource = get_platform_resource; vdev->get_irq = get_platform_irq; + vdev->parent_module = THIS_MODULE; ret = vfio_platform_probe_common(vdev, 
&pdev->dev); if (ret) @@ -91,7 +92,6 @@ static struct platform_driver vfio_platform_driver = { .remove = vfio_platform_remove, .driver = { .name = "vfio-platform", - .owner = THIS_MODULE, }, }; diff --git a/kernel/drivers/vfio/platform/vfio_platform_common.c b/kernel/drivers/vfio/platform/vfio_platform_common.c index abcff7a1a..e65b142d3 100644 --- a/kernel/drivers/vfio/platform/vfio_platform_common.c +++ b/kernel/drivers/vfio/platform/vfio_platform_common.c @@ -23,8 +23,49 @@ #include "vfio_platform_private.h" +#define DRIVER_VERSION "0.10" +#define DRIVER_AUTHOR "Antonios Motakis <a.motakis@virtualopensystems.com>" +#define DRIVER_DESC "VFIO platform base module" + +static LIST_HEAD(reset_list); static DEFINE_MUTEX(driver_lock); +static vfio_platform_reset_fn_t vfio_platform_lookup_reset(const char *compat, + struct module **module) +{ + struct vfio_platform_reset_node *iter; + vfio_platform_reset_fn_t reset_fn = NULL; + + mutex_lock(&driver_lock); + list_for_each_entry(iter, &reset_list, link) { + if (!strcmp(iter->compat, compat) && + try_module_get(iter->owner)) { + *module = iter->owner; + reset_fn = iter->reset; + break; + } + } + mutex_unlock(&driver_lock); + return reset_fn; +} + +static void vfio_platform_get_reset(struct vfio_platform_device *vdev) +{ + vdev->reset = vfio_platform_lookup_reset(vdev->compat, + &vdev->reset_module); + if (!vdev->reset) { + request_module("vfio-reset:%s", vdev->compat); + vdev->reset = vfio_platform_lookup_reset(vdev->compat, + &vdev->reset_module); + } +} + +static void vfio_platform_put_reset(struct vfio_platform_device *vdev) +{ + if (vdev->reset) + module_put(vdev->reset_module); +} + static int vfio_platform_regions_init(struct vfio_platform_device *vdev) { int cnt = 0, i; @@ -100,13 +141,19 @@ static void vfio_platform_release(void *device_data) mutex_lock(&driver_lock); if (!(--vdev->refcnt)) { + if (vdev->reset) { + dev_info(vdev->device, "reset\n"); + vdev->reset(vdev); + } else { + dev_warn(vdev->device, "no reset function found!\n"); + } vfio_platform_regions_cleanup(vdev); vfio_platform_irq_cleanup(vdev); } mutex_unlock(&driver_lock); - module_put(THIS_MODULE); + module_put(vdev->parent_module); } static int vfio_platform_open(void *device_data) @@ -114,7 +161,7 @@ static int vfio_platform_open(void *device_data) struct vfio_platform_device *vdev = device_data; int ret; - if (!try_module_get(THIS_MODULE)) + if (!try_module_get(vdev->parent_module)) return -ENODEV; mutex_lock(&driver_lock); @@ -127,6 +174,13 @@ static int vfio_platform_open(void *device_data) ret = vfio_platform_irq_init(vdev); if (ret) goto err_irq; + + if (vdev->reset) { + dev_info(vdev->device, "reset\n"); + vdev->reset(vdev); + } else { + dev_warn(vdev->device, "no reset function found!\n"); + } } vdev->refcnt++; @@ -159,11 +213,14 @@ static long vfio_platform_ioctl(void *device_data, if (info.argsz < minsz) return -EINVAL; + if (vdev->reset) + vdev->flags |= VFIO_DEVICE_FLAGS_RESET; info.flags = vdev->flags; info.num_regions = vdev->num_regions; info.num_irqs = vdev->num_irqs; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { struct vfio_region_info info; @@ -184,7 +241,8 @@ static long vfio_platform_ioctl(void *device_data, info.size = vdev->regions[info.index].size; info.flags = vdev->regions[info.index].flags; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? 
+ -EFAULT : 0; } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { struct vfio_irq_info info; @@ -203,7 +261,8 @@ static long vfio_platform_ioctl(void *device_data, info.flags = vdev->irqs[info.index].flags; info.count = vdev->irqs[info.index].count; - return copy_to_user((void __user *)arg, &info, minsz); + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; @@ -252,23 +311,27 @@ static long vfio_platform_ioctl(void *device_data, return ret; - } else if (cmd == VFIO_DEVICE_RESET) - return -EINVAL; + } else if (cmd == VFIO_DEVICE_RESET) { + if (vdev->reset) + return vdev->reset(vdev); + else + return -EINVAL; + } return -ENOTTY; } -static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg, +static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, char __user *buf, size_t count, loff_t off) { unsigned int done = 0; - if (!reg.ioaddr) { - reg.ioaddr = - ioremap_nocache(reg.addr, reg.size); + if (!reg->ioaddr) { + reg->ioaddr = + ioremap_nocache(reg->addr, reg->size); - if (!reg.ioaddr) + if (!reg->ioaddr) return -ENOMEM; } @@ -278,7 +341,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg, if (count >= 4 && !(off % 4)) { u32 val; - val = ioread32(reg.ioaddr + off); + val = ioread32(reg->ioaddr + off); if (copy_to_user(buf, &val, 4)) goto err; @@ -286,7 +349,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg, } else if (count >= 2 && !(off % 2)) { u16 val; - val = ioread16(reg.ioaddr + off); + val = ioread16(reg->ioaddr + off); if (copy_to_user(buf, &val, 2)) goto err; @@ -294,7 +357,7 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg, } else { u8 val; - val = ioread8(reg.ioaddr + off); + val = ioread8(reg->ioaddr + off); if (copy_to_user(buf, &val, 1)) goto err; @@ -327,7 +390,7 @@ static ssize_t vfio_platform_read(void *device_data, char __user *buf, return -EINVAL; if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO) - return vfio_platform_read_mmio(vdev->regions[index], + return vfio_platform_read_mmio(&vdev->regions[index], buf, count, off); else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO) return -EINVAL; /* not implemented */ @@ -335,17 +398,17 @@ static ssize_t vfio_platform_read(void *device_data, char __user *buf, return -EINVAL; } -static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg, +static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, const char __user *buf, size_t count, loff_t off) { unsigned int done = 0; - if (!reg.ioaddr) { - reg.ioaddr = - ioremap_nocache(reg.addr, reg.size); + if (!reg->ioaddr) { + reg->ioaddr = + ioremap_nocache(reg->addr, reg->size); - if (!reg.ioaddr) + if (!reg->ioaddr) return -ENOMEM; } @@ -357,7 +420,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg, if (copy_from_user(&val, buf, 4)) goto err; - iowrite32(val, reg.ioaddr + off); + iowrite32(val, reg->ioaddr + off); filled = 4; } else if (count >= 2 && !(off % 2)) { @@ -365,7 +428,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg, if (copy_from_user(&val, buf, 2)) goto err; - iowrite16(val, reg.ioaddr + off); + iowrite16(val, reg->ioaddr + off); filled = 2; } else { @@ -373,7 +436,7 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg, if (copy_from_user(&val, buf, 1)) goto err; - iowrite8(val, reg.ioaddr + off); + iowrite8(val, reg->ioaddr + off); filled = 1; } @@ -403,7 +466,7 
@@ static ssize_t vfio_platform_write(void *device_data, const char __user *buf, return -EINVAL; if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO) - return vfio_platform_write_mmio(vdev->regions[index], + return vfio_platform_write_mmio(&vdev->regions[index], buf, count, off); else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO) return -EINVAL; /* not implemented */ @@ -490,6 +553,14 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, if (!vdev) return -EINVAL; + ret = device_property_read_string(dev, "compatible", &vdev->compat); + if (ret) { + pr_err("VFIO: cannot retrieve compat for %s\n", vdev->name); + return -EINVAL; + } + + vdev->device = dev; + group = iommu_group_get(dev); if (!group) { pr_err("VFIO: No IOMMU group for device %s\n", vdev->name); @@ -502,6 +573,8 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, return ret; } + vfio_platform_get_reset(vdev); + mutex_init(&vdev->igate); return 0; @@ -513,9 +586,43 @@ struct vfio_platform_device *vfio_platform_remove_common(struct device *dev) struct vfio_platform_device *vdev; vdev = vfio_del_group_dev(dev); - if (vdev) + + if (vdev) { + vfio_platform_put_reset(vdev); iommu_group_put(dev->iommu_group); + } return vdev; } EXPORT_SYMBOL_GPL(vfio_platform_remove_common); + +void __vfio_platform_register_reset(struct vfio_platform_reset_node *node) +{ + mutex_lock(&driver_lock); + list_add(&node->link, &reset_list); + mutex_unlock(&driver_lock); +} +EXPORT_SYMBOL_GPL(__vfio_platform_register_reset); + +void vfio_platform_unregister_reset(const char *compat, + vfio_platform_reset_fn_t fn) +{ + struct vfio_platform_reset_node *iter, *temp; + + mutex_lock(&driver_lock); + list_for_each_entry_safe(iter, temp, &reset_list, link) { + if (!strcmp(iter->compat, compat) && (iter->reset == fn)) { + list_del(&iter->link); + break; + } + } + + mutex_unlock(&driver_lock); + +} +EXPORT_SYMBOL_GPL(vfio_platform_unregister_reset); + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/kernel/drivers/vfio/platform/vfio_platform_irq.c b/kernel/drivers/vfio/platform/vfio_platform_irq.c index 88bba57b3..46d4750f4 100644 --- a/kernel/drivers/vfio/platform/vfio_platform_irq.c +++ b/kernel/drivers/vfio/platform/vfio_platform_irq.c @@ -185,6 +185,7 @@ static int vfio_set_trigger(struct vfio_platform_device *vdev, int index, int ret; if (irq->trigger) { + irq_clear_status_flags(irq->hwirq, IRQ_NOAUTOEN); free_irq(irq->hwirq, irq); kfree(irq->name); eventfd_ctx_put(irq->trigger); diff --git a/kernel/drivers/vfio/platform/vfio_platform_private.h b/kernel/drivers/vfio/platform/vfio_platform_private.h index 5d31e0473..42816dd28 100644 --- a/kernel/drivers/vfio/platform/vfio_platform_private.h +++ b/kernel/drivers/vfio/platform/vfio_platform_private.h @@ -56,6 +56,10 @@ struct vfio_platform_device { u32 num_irqs; int refcnt; struct mutex igate; + struct module *parent_module; + const char *compat; + struct module *reset_module; + struct device *device; /* * These fields should be filled by the bus specific binder @@ -67,6 +71,16 @@ struct vfio_platform_device { struct resource* (*get_resource)(struct vfio_platform_device *vdev, int i); int (*get_irq)(struct vfio_platform_device *vdev, int i); + int (*reset)(struct vfio_platform_device *vdev); +}; + +typedef int (*vfio_platform_reset_fn_t)(struct vfio_platform_device *vdev); + +struct vfio_platform_reset_node { + struct list_head link; + char *compat; + struct module 
*owner; + vfio_platform_reset_fn_t reset; }; extern int vfio_platform_probe_common(struct vfio_platform_device *vdev, @@ -82,4 +96,29 @@ extern int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev, unsigned start, unsigned count, void *data); +extern void __vfio_platform_register_reset(struct vfio_platform_reset_node *n); +extern void vfio_platform_unregister_reset(const char *compat, + vfio_platform_reset_fn_t fn); +#define vfio_platform_register_reset(__compat, __reset) \ +static struct vfio_platform_reset_node __reset ## _node = { \ + .owner = THIS_MODULE, \ + .compat = __compat, \ + .reset = __reset, \ +}; \ +__vfio_platform_register_reset(&__reset ## _node) + +#define module_vfio_reset_handler(compat, reset) \ +MODULE_ALIAS("vfio-reset:" compat); \ +static int __init reset ## _module_init(void) \ +{ \ + vfio_platform_register_reset(compat, reset); \ + return 0; \ +}; \ +static void __exit reset ## _module_exit(void) \ +{ \ + vfio_platform_unregister_reset(compat, reset); \ +}; \ +module_init(reset ## _module_init); \ +module_exit(reset ## _module_exit) + #endif /* VFIO_PLATFORM_PRIVATE_H */ diff --git a/kernel/drivers/vfio/vfio.c b/kernel/drivers/vfio/vfio.c index e1278fe04..6070b793c 100644 --- a/kernel/drivers/vfio/vfio.c +++ b/kernel/drivers/vfio/vfio.c @@ -25,6 +25,7 @@ #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/pci.h> #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/slab.h> @@ -438,16 +439,33 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, } /* - * Whitelist some drivers that we know are safe (no dma) or just sit on - * a device. It's not always practical to leave a device within a group - * driverless as it could get re-bound to something unsafe. + * Some drivers, like pci-stub, are only used to prevent other drivers from + * claiming a device and are therefore perfectly legitimate for a user owned + * group. The pci-stub driver has no dependencies on DMA or the IOVA mapping + * of the device, but it does prevent the user from having direct access to + * the device, which is useful in some circumstances. + * + * We also assume that we can include PCI interconnect devices, ie. bridges. + * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge + * then all of the downstream devices will be part of the same IOMMU group as + * the bridge. Thus, if placing the bridge into the user owned IOVA space + * breaks anything, it only does so for user owned devices downstream. Note + * that error notification via MSI can be affected for platforms that handle + * MSI within the same IOVA space as DMA. */ -static const char * const vfio_driver_whitelist[] = { "pci-stub", "pcieport" }; +static const char * const vfio_driver_whitelist[] = { "pci-stub" }; -static bool vfio_whitelisted_driver(struct device_driver *drv) +static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv) { int i; + if (dev_is_pci(dev)) { + struct pci_dev *pdev = to_pci_dev(dev); + + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return true; + } + for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) { if (!strcmp(drv->name, vfio_driver_whitelist[i])) return true; @@ -462,6 +480,7 @@ static bool vfio_whitelisted_driver(struct device_driver *drv) * - driver-less * - bound to a vfio driver * - bound to a whitelisted driver + * - a PCI interconnect device * * We use two methods to determine whether a device is bound to a vfio * driver. 
The first is to test whether the device exists in the vfio @@ -486,7 +505,7 @@ static int vfio_dev_viable(struct device *dev, void *data) } mutex_unlock(&group->unbound_lock); - if (!ret || !drv || vfio_whitelisted_driver(drv)) + if (!ret || !drv || vfio_dev_whitelisted(dev, drv)) return 0; device = vfio_group_get_device(group, dev); @@ -517,7 +536,7 @@ static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev) return 0; /* TODO Prevent device auto probing */ - WARN("Device %s added to live group %d!\n", dev_name(dev), + WARN(1, "Device %s added to live group %d!\n", dev_name(dev), iommu_group_id(group->iommu_group)); return 0; @@ -661,23 +680,52 @@ int vfio_add_group_dev(struct device *dev, EXPORT_SYMBOL_GPL(vfio_add_group_dev); /** - * Get a reference to the vfio_device for a device that is known to - * be bound to a vfio driver. The driver implicitly holds a - * vfio_device reference between vfio_add_group_dev and - * vfio_del_group_dev. We can therefore use drvdata to increment - * that reference from the struct device. This additional - * reference must be released by calling vfio_device_put. + * Get a reference to the vfio_device for a device. Even if the + * caller thinks they own the device, they could be racing with a + * release call path, so we can't trust drvdata for the shortcut. + * Go the long way around, from the iommu_group to the vfio_group + * to the vfio_device. */ struct vfio_device *vfio_device_get_from_dev(struct device *dev) { - struct vfio_device *device = dev_get_drvdata(dev); + struct iommu_group *iommu_group; + struct vfio_group *group; + struct vfio_device *device; + + iommu_group = iommu_group_get(dev); + if (!iommu_group) + return NULL; + + group = vfio_group_get_from_iommu(iommu_group); + iommu_group_put(iommu_group); + if (!group) + return NULL; - vfio_device_get(device); + device = vfio_group_get_device(group, dev); + vfio_group_put(group); return device; } EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); +static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, + char *buf) +{ + struct vfio_device *it, *device = NULL; + + mutex_lock(&group->device_lock); + list_for_each_entry(it, &group->device_list, group_next) { + if (!strcmp(dev_name(it->dev), buf)) { + device = it; + vfio_device_get(device); + break; + } + } + mutex_unlock(&group->device_lock); + + return device; +} + /* * Caller must hold a reference to the vfio_device */ @@ -1187,53 +1235,53 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) { struct vfio_device *device; struct file *filep; - int ret = -ENODEV; + int ret; if (0 == atomic_read(&group->container_users) || !group->container->iommu_driver || !vfio_group_viable(group)) return -EINVAL; - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (strcmp(dev_name(device->dev), buf)) - continue; + device = vfio_device_get_from_name(group, buf); + if (!device) + return -ENODEV; - ret = device->ops->open(device->device_data); - if (ret) - break; - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) { - device->ops->release(device->device_data); - break; - } + ret = device->ops->open(device->device_data); + if (ret) { + vfio_device_put(device); + return ret; + } - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - device, O_RDWR); - if (IS_ERR(filep)) { - put_unused_fd(ret); - ret = 
PTR_ERR(filep); - device->ops->release(device->device_data); - break; - } + /* + * We can't use anon_inode_getfd() because we need to modify + * the f_mode flags directly to allow more than just ioctls + */ + ret = get_unused_fd_flags(O_CLOEXEC); + if (ret < 0) { + device->ops->release(device->device_data); + vfio_device_put(device); + return ret; + } - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. - */ - filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, + device, O_RDWR); + if (IS_ERR(filep)) { + put_unused_fd(ret); + ret = PTR_ERR(filep); + device->ops->release(device->device_data); + vfio_device_put(device); + return ret; + } - vfio_device_get(device); - atomic_inc(&group->container_users); + /* + * TODO: add an anon_inode interface to do this. + * Appears to be missing by lack of need rather than + * explicitly prevented. Now there's need. + */ + filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - fd_install(ret, filep); - break; - } - mutex_unlock(&group->device_lock); + atomic_inc(&group->container_users); + + fd_install(ret, filep); return ret; } diff --git a/kernel/drivers/vfio/vfio_iommu_spapr_tce.c b/kernel/drivers/vfio/vfio_iommu_spapr_tce.c index 730b4ef3e..0582b72ef 100644 --- a/kernel/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/kernel/drivers/vfio/vfio_iommu_spapr_tce.c @@ -19,8 +19,10 @@ #include <linux/uaccess.h> #include <linux/err.h> #include <linux/vfio.h> +#include <linux/vmalloc.h> #include <asm/iommu.h> #include <asm/tce.h> +#include <asm/mmu_context.h> #define DRIVER_VERSION "0.1" #define DRIVER_AUTHOR "aik@ozlabs.ru" @@ -29,6 +31,51 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); +static long try_increment_locked_vm(long npages) +{ + long ret = 0, locked, lock_limit; + + if (!current || !current->mm) + return -ESRCH; /* process exited */ + + if (!npages) + return 0; + + down_write(¤t->mm->mmap_sem); + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += npages; + + pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, + npages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + up_write(¤t->mm->mmap_sem); + + return ret; +} + +static void decrement_locked_vm(long npages) +{ + if (!current || !current->mm || !npages) + return; /* process exited */ + + down_write(¤t->mm->mmap_sem); + if (WARN_ON_ONCE(npages > current->mm->locked_vm)) + npages = current->mm->locked_vm; + current->mm->locked_vm -= npages; + pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, + npages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK)); + up_write(¤t->mm->mmap_sem); +} + /* * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation * @@ -36,6 +83,11 @@ static void tce_iommu_detach_group(void *iommu_data, * into DMA'ble space using the IOMMU */ +struct tce_iommu_group { + struct list_head next; + struct iommu_group *grp; +}; + /* * The container descriptor supports only a single group per container. 
* Required by the API as the container is not supplied with the IOMMU group @@ -43,18 +95,140 @@ static void tce_iommu_detach_group(void *iommu_data, */ struct tce_container { struct mutex lock; - struct iommu_table *tbl; bool enabled; + bool v2; + unsigned long locked_pages; + struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; + struct list_head group_list; }; +static long tce_iommu_unregister_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + struct mm_iommu_table_group_mem_t *mem; + + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) + return -EINVAL; + + mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT); + if (!mem) + return -ENOENT; + + return mm_iommu_put(mem); +} + +static long tce_iommu_register_pages(struct tce_container *container, + __u64 vaddr, __u64 size) +{ + long ret = 0; + struct mm_iommu_table_group_mem_t *mem = NULL; + unsigned long entries = size >> PAGE_SHIFT; + + if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || + ((vaddr + size) < vaddr)) + return -EINVAL; + + ret = mm_iommu_get(vaddr, entries, &mem); + if (ret) + return ret; + + container->enabled = true; + + return 0; +} + +static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) +{ + unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * + tbl->it_size, PAGE_SIZE); + unsigned long *uas; + long ret; + + BUG_ON(tbl->it_userspace); + + ret = try_increment_locked_vm(cb >> PAGE_SHIFT); + if (ret) + return ret; + + uas = vzalloc(cb); + if (!uas) { + decrement_locked_vm(cb >> PAGE_SHIFT); + return -ENOMEM; + } + tbl->it_userspace = uas; + + return 0; +} + +static void tce_iommu_userspace_view_free(struct iommu_table *tbl) +{ + unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * + tbl->it_size, PAGE_SIZE); + + if (!tbl->it_userspace) + return; + + vfree(tbl->it_userspace); + tbl->it_userspace = NULL; + decrement_locked_vm(cb >> PAGE_SHIFT); +} + +static bool tce_page_is_contained(struct page *page, unsigned page_shift) +{ + /* + * Check that the TCE table granularity is not bigger than the size of + * a page we just found. Otherwise the hardware can get access to + * a bigger memory chunk that it should. 
+ */ + return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; +} + +static inline bool tce_groups_attached(struct tce_container *container) +{ + return !list_empty(&container->group_list); +} + +static long tce_iommu_find_table(struct tce_container *container, + phys_addr_t ioba, struct iommu_table **ptbl) +{ + long i; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = container->tables[i]; + + if (tbl) { + unsigned long entry = ioba >> tbl->it_page_shift; + unsigned long start = tbl->it_offset; + unsigned long end = start + tbl->it_size; + + if ((start <= entry) && (entry < end)) { + *ptbl = tbl; + return i; + } + } + } + + return -1; +} + +static int tce_iommu_find_free_table(struct tce_container *container) +{ + int i; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (!container->tables[i]) + return i; + } + + return -ENOSPC; +} + static int tce_iommu_enable(struct tce_container *container) { int ret = 0; - unsigned long locked, lock_limit, npages; - struct iommu_table *tbl = container->tbl; - - if (!container->tbl) - return -ENXIO; + unsigned long locked; + struct iommu_table_group *table_group; + struct tce_iommu_group *tcegrp; if (!current->mm) return -ESRCH; /* process exited */ @@ -79,21 +253,38 @@ static int tce_iommu_enable(struct tce_container *container) * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits, * that would effectively kill the guest at random points, much better * enforcing the limit based on the max that the guest can map. + * + * Unfortunately at the moment it counts whole tables, no matter how + * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups + * each with 2GB DMA window, 8GB will be counted here. The reason for + * this is that we cannot tell here the amount of RAM used by the guest + * as this information is only available from KVM and VFIO is + * KVM agnostic. + * + * So we do not allow enabling a container without a group attached + * as there is no way to know how much we should increment + * the locked_vm counter. 
*/ - down_write(¤t->mm->mmap_sem); - npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - locked = current->mm->locked_vm + npages; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n", - rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - } else { + if (!tce_groups_attached(container)) + return -ENODEV; - current->mm->locked_vm += npages; - container->enabled = true; - } - up_write(¤t->mm->mmap_sem); + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); + if (!table_group) + return -ENODEV; + + if (!table_group->tce32_size) + return -EPERM; + + locked = table_group->tce32_size >> PAGE_SHIFT; + ret = try_increment_locked_vm(locked); + if (ret) + return ret; + + container->locked_pages = locked; + + container->enabled = true; return ret; } @@ -105,20 +296,17 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!container->tbl || !current->mm) + if (!current->mm) return; - down_write(¤t->mm->mmap_sem); - current->mm->locked_vm -= (container->tbl->it_size << - IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT; - up_write(¤t->mm->mmap_sem); + decrement_locked_vm(container->locked_pages); } static void *tce_iommu_open(unsigned long arg) { struct tce_container *container; - if (arg != VFIO_SPAPR_TCE_IOMMU) { + if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) { pr_err("tce_vfio: Wrong IOMMU type\n"); return ERR_PTR(-EINVAL); } @@ -128,36 +316,411 @@ static void *tce_iommu_open(unsigned long arg) return ERR_PTR(-ENOMEM); mutex_init(&container->lock); + INIT_LIST_HEAD_RCU(&container->group_list); + + container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU; return container; } +static int tce_iommu_clear(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long pages); +static void tce_iommu_free_table(struct iommu_table *tbl); + static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; + struct iommu_table_group *table_group; + struct tce_iommu_group *tcegrp; + long i; + + while (tce_groups_attached(container)) { + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); + tce_iommu_detach_group(iommu_data, tcegrp->grp); + } - WARN_ON(container->tbl && !container->tbl->it_group); - tce_iommu_disable(container); + /* + * If VFIO created a table, it was not disposed + * by tce_iommu_detach_group() so do it now. 
+ */ + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbl = container->tables[i]; + + if (!tbl) + continue; - if (container->tbl && container->tbl->it_group) - tce_iommu_detach_group(iommu_data, container->tbl->it_group); + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + tce_iommu_free_table(tbl); + } + tce_iommu_disable(container); mutex_destroy(&container->lock); kfree(container); } +static void tce_iommu_unuse_page(struct tce_container *container, + unsigned long hpa) +{ + struct page *page; + + page = pfn_to_page(hpa >> PAGE_SHIFT); + put_page(page); +} + +static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size, + unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem) +{ + long ret = 0; + struct mm_iommu_table_group_mem_t *mem; + + mem = mm_iommu_lookup(tce, size); + if (!mem) + return -EINVAL; + + ret = mm_iommu_ua_to_hpa(mem, tce, phpa); + if (ret) + return -EINVAL; + + *pmem = mem; + + return 0; +} + +static void tce_iommu_unuse_page_v2(struct iommu_table *tbl, + unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + int ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua || !current || !current->mm) + return; + + ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl), + &hpa, &mem); + if (ret) + pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n", + __func__, *pua, entry, ret); + if (mem) + mm_iommu_mapped_dec(mem); + + *pua = 0; +} + +static int tce_iommu_clear(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long pages) +{ + unsigned long oldhpa; + long ret; + enum dma_data_direction direction; + + for ( ; pages; --pages, ++entry) { + direction = DMA_NONE; + oldhpa = 0; + ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction); + if (ret) + continue; + + if (direction == DMA_NONE) + continue; + + if (container->v2) { + tce_iommu_unuse_page_v2(tbl, entry); + continue; + } + + tce_iommu_unuse_page(container, oldhpa); + } + + return 0; +} + +static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) +{ + struct page *page = NULL; + enum dma_data_direction direction = iommu_tce_direction(tce); + + if (get_user_pages_fast(tce & PAGE_MASK, 1, + direction != DMA_TO_DEVICE, &page) != 1) + return -EFAULT; + + *hpa = __pa((unsigned long) page_address(page)); + + return 0; +} + +static long tce_iommu_build(struct tce_container *container, + struct iommu_table *tbl, + unsigned long entry, unsigned long tce, unsigned long pages, + enum dma_data_direction direction) +{ + long i, ret = 0; + struct page *page; + unsigned long hpa; + enum dma_data_direction dirtmp; + + for (i = 0; i < pages; ++i) { + unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; + + ret = tce_iommu_use_page(tce, &hpa); + if (ret) + break; + + page = pfn_to_page(hpa >> PAGE_SHIFT); + if (!tce_page_is_contained(page, tbl->it_page_shift)) { + ret = -EPERM; + break; + } + + hpa |= offset; + dirtmp = direction; + ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + if (ret) { + tce_iommu_unuse_page(container, hpa); + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", + __func__, entry << tbl->it_page_shift, + tce, ret); + break; + } + + if (dirtmp != DMA_NONE) + tce_iommu_unuse_page(container, hpa); + + tce += IOMMU_PAGE_SIZE(tbl); + } + + if (ret) + tce_iommu_clear(container, tbl, entry, i); + + return ret; +} + +static long tce_iommu_build_v2(struct tce_container *container, + struct iommu_table 
*tbl, + unsigned long entry, unsigned long tce, unsigned long pages, + enum dma_data_direction direction) +{ + long i, ret = 0; + struct page *page; + unsigned long hpa; + enum dma_data_direction dirtmp; + + for (i = 0; i < pages; ++i) { + struct mm_iommu_table_group_mem_t *mem = NULL; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, + entry + i); + + ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl), + &hpa, &mem); + if (ret) + break; + + page = pfn_to_page(hpa >> PAGE_SHIFT); + if (!tce_page_is_contained(page, tbl->it_page_shift)) { + ret = -EPERM; + break; + } + + /* Preserve offset within IOMMU page */ + hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK; + dirtmp = direction; + + /* The registered region is being unregistered */ + if (mm_iommu_mapped_inc(mem)) + break; + + ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp); + if (ret) { + /* dirtmp cannot be DMA_NONE here */ + tce_iommu_unuse_page_v2(tbl, entry + i); + pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n", + __func__, entry << tbl->it_page_shift, + tce, ret); + break; + } + + if (dirtmp != DMA_NONE) + tce_iommu_unuse_page_v2(tbl, entry + i); + + *pua = tce; + + tce += IOMMU_PAGE_SIZE(tbl); + } + + if (ret) + tce_iommu_clear(container, tbl, entry, i); + + return ret; +} + +static long tce_iommu_create_table(struct tce_container *container, + struct iommu_table_group *table_group, + int num, + __u32 page_shift, + __u64 window_size, + __u32 levels, + struct iommu_table **ptbl) +{ + long ret, table_size; + + table_size = table_group->ops->get_table_size(page_shift, window_size, + levels); + if (!table_size) + return -EINVAL; + + ret = try_increment_locked_vm(table_size >> PAGE_SHIFT); + if (ret) + return ret; + + ret = table_group->ops->create_table(table_group, num, + page_shift, window_size, levels, ptbl); + + WARN_ON(!ret && !(*ptbl)->it_ops->free); + WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); + + if (!ret && container->v2) { + ret = tce_iommu_userspace_view_alloc(*ptbl); + if (ret) + (*ptbl)->it_ops->free(*ptbl); + } + + if (ret) + decrement_locked_vm(table_size >> PAGE_SHIFT); + + return ret; +} + +static void tce_iommu_free_table(struct iommu_table *tbl) +{ + unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; + + tce_iommu_userspace_view_free(tbl); + tbl->it_ops->free(tbl); + decrement_locked_vm(pages); +} + +static long tce_iommu_create_window(struct tce_container *container, + __u32 page_shift, __u64 window_size, __u32 levels, + __u64 *start_addr) +{ + struct tce_iommu_group *tcegrp; + struct iommu_table_group *table_group; + struct iommu_table *tbl = NULL; + long ret, num; + + num = tce_iommu_find_free_table(container); + if (num < 0) + return num; + + /* Get the first group for ops::create_table */ + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); + if (!table_group) + return -EFAULT; + + if (!(table_group->pgsizes & (1ULL << page_shift))) + return -EINVAL; + + if (!table_group->ops->set_window || !table_group->ops->unset_window || + !table_group->ops->get_table_size || + !table_group->ops->create_table) + return -EPERM; + + /* Create TCE table */ + ret = tce_iommu_create_table(container, table_group, num, + page_shift, window_size, levels, &tbl); + if (ret) + return ret; + + BUG_ON(!tbl->it_ops->free); + + /* + * Program the table to every group. + * Groups have been tested for compatibility at the attach time. 
+ */ + list_for_each_entry(tcegrp, &container->group_list, next) { + table_group = iommu_group_get_iommudata(tcegrp->grp); + + ret = table_group->ops->set_window(table_group, num, tbl); + if (ret) + goto unset_exit; + } + + container->tables[num] = tbl; + + /* Return start address assigned by platform in create_table() */ + *start_addr = tbl->it_offset << tbl->it_page_shift; + + return 0; + +unset_exit: + list_for_each_entry(tcegrp, &container->group_list, next) { + table_group = iommu_group_get_iommudata(tcegrp->grp); + table_group->ops->unset_window(table_group, num); + } + tce_iommu_free_table(tbl); + + return ret; +} + +static long tce_iommu_remove_window(struct tce_container *container, + __u64 start_addr) +{ + struct iommu_table_group *table_group = NULL; + struct iommu_table *tbl; + struct tce_iommu_group *tcegrp; + int num; + + num = tce_iommu_find_table(container, start_addr, &tbl); + if (num < 0) + return -EINVAL; + + BUG_ON(!tbl->it_size); + + /* Detach groups from IOMMUs */ + list_for_each_entry(tcegrp, &container->group_list, next) { + table_group = iommu_group_get_iommudata(tcegrp->grp); + + /* + * SPAPR TCE IOMMU exposes the default DMA window to + * the guest via dma32_window_start/size of + * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow + * the userspace to remove this window, some do not so + * here we check for the platform capability. + */ + if (!table_group->ops || !table_group->ops->unset_window) + return -EPERM; + + table_group->ops->unset_window(table_group, num); + } + + /* Free table */ + tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); + tce_iommu_free_table(tbl); + container->tables[num] = NULL; + + return 0; +} + static long tce_iommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { struct tce_container *container = iommu_data; - unsigned long minsz; + unsigned long minsz, ddwsz; long ret; switch (cmd) { case VFIO_CHECK_EXTENSION: switch (arg) { case VFIO_SPAPR_TCE_IOMMU: + case VFIO_SPAPR_TCE_v2_IOMMU: ret = 1; break; default: @@ -169,9 +732,17 @@ static long tce_iommu_ioctl(void *iommu_data, case VFIO_IOMMU_SPAPR_TCE_GET_INFO: { struct vfio_iommu_spapr_tce_info info; - struct iommu_table *tbl = container->tbl; + struct tce_iommu_group *tcegrp; + struct iommu_table_group *table_group; + + if (!tce_groups_attached(container)) + return -ENXIO; + + tcegrp = list_first_entry(&container->group_list, + struct tce_iommu_group, next); + table_group = iommu_group_get_iommudata(tcegrp->grp); - if (WARN_ON(!tbl)) + if (!table_group) return -ENXIO; minsz = offsetofend(struct vfio_iommu_spapr_tce_info, @@ -183,9 +754,24 @@ static long tce_iommu_ioctl(void *iommu_data, if (info.argsz < minsz) return -EINVAL; - info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K; - info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K; + info.dma32_window_start = table_group->tce32_start; + info.dma32_window_size = table_group->tce32_size; info.flags = 0; + memset(&info.ddw, 0, sizeof(info.ddw)); + + if (table_group->max_dynamic_windows_supported && + container->v2) { + info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW; + info.ddw.pgsizes = table_group->pgsizes; + info.ddw.max_dynamic_windows_supported = + table_group->max_dynamic_windows_supported; + info.ddw.levels = table_group->max_levels; + } + + ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw); + + if (info.argsz >= ddwsz) + minsz = ddwsz; if (copy_to_user((void __user *)arg, &info, minsz)) return -EFAULT; @@ -194,13 +780,12 @@ static long tce_iommu_ioctl(void *iommu_data, } case 
 	case VFIO_IOMMU_MAP_DMA: {
 		struct vfio_iommu_type1_dma_map param;
-		struct iommu_table *tbl = container->tbl;
-		unsigned long tce, i;
+		struct iommu_table *tbl = NULL;
+		long num;
+		enum dma_data_direction direction;
 
-		if (!tbl)
-			return -ENXIO;
-
-		BUG_ON(!tbl->it_group);
+		if (!container->enabled)
+			return -EPERM;
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
@@ -214,32 +799,43 @@ static long tce_iommu_ioctl(void *iommu_data,
 				VFIO_DMA_MAP_FLAG_WRITE))
 			return -EINVAL;
 
-		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
-		    (param.vaddr & ~IOMMU_PAGE_MASK_4K))
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
 			return -EINVAL;
 
 		/* iova is checked by the IOMMU API */
-		tce = param.vaddr;
-		if (param.flags & VFIO_DMA_MAP_FLAG_READ)
-			tce |= TCE_PCI_READ;
-		if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
-			tce |= TCE_PCI_WRITE;
+		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_BIDIRECTIONAL;
+			else
+				direction = DMA_TO_DEVICE;
+		} else {
+			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
+				direction = DMA_FROM_DEVICE;
+			else
+				return -EINVAL;
+		}
 
-		ret = iommu_tce_put_param_check(tbl, param.iova, tce);
+		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
 		if (ret)
 			return ret;
 
-		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
-			ret = iommu_put_tce_user_mode(tbl,
-					(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
-					tce);
-			if (ret)
-				break;
-			tce += IOMMU_PAGE_SIZE_4K;
-		}
-		if (ret)
-			iommu_clear_tces_and_put_pages(tbl,
-					param.iova >> IOMMU_PAGE_SHIFT_4K, i);
+		if (container->v2)
+			ret = tce_iommu_build_v2(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
+		else
+			ret = tce_iommu_build(container, tbl,
+					param.iova >> tbl->it_page_shift,
+					param.vaddr,
+					param.size >> tbl->it_page_shift,
+					direction);
 
 		iommu_flush_tce(tbl);
 
@@ -247,10 +843,11 @@ static long tce_iommu_ioctl(void *iommu_data,
 	}
 	case VFIO_IOMMU_UNMAP_DMA: {
 		struct vfio_iommu_type1_dma_unmap param;
-		struct iommu_table *tbl = container->tbl;
+		struct iommu_table *tbl = NULL;
+		long num;
 
-		if (WARN_ON(!tbl))
-			return -ENXIO;
+		if (!container->enabled)
+			return -EPERM;
 
 		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
 				size);
@@ -265,22 +862,81 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (param.flags)
 			return -EINVAL;
 
-		if (param.size & ~IOMMU_PAGE_MASK_4K)
+		num = tce_iommu_find_table(container, param.iova, &tbl);
+		if (num < 0)
+			return -ENXIO;
+
+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
 			return -EINVAL;
 
 		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+				param.size >> tbl->it_page_shift);
 		if (ret)
 			return ret;
 
-		ret = iommu_clear_tces_and_put_pages(tbl,
-				param.iova >> IOMMU_PAGE_SHIFT_4K,
-				param.size >> IOMMU_PAGE_SHIFT_4K);
+		ret = tce_iommu_clear(container, tbl,
+				param.iova >> tbl->it_page_shift,
+				param.size >> tbl->it_page_shift);
 		iommu_flush_tce(tbl);
 
 		return ret;
 	}
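Both cases above are driven through the generic type1 structures; the READ/WRITE flags are what select the DMA direction in the new code. A hedged sketch of a map followed by an unmap (not part of the patch; vaddr/iova/size are hypothetical and must be aligned to the owning window's IOMMU page size):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int map_unmap(int container, void *vaddr, unsigned long iova,
		unsigned long size)
{
	struct vfio_iommu_type1_dma_map map;
	struct vfio_iommu_type1_dma_unmap unmap;

	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	/* READ|WRITE becomes DMA_BIDIRECTIONAL in the kernel */
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (unsigned long)vaddr;
	map.iova = iova;
	map.size = size;
	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map))
		return -1;

	memset(&unmap, 0, sizeof(unmap));
	unmap.argsz = sizeof(unmap);
	unmap.iova = iova;
	unmap.size = size;
	return ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
}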
+	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_register_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
+		struct vfio_iommu_spapr_register_memory param;
+
+		if (!container->v2)
+			break;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
+				size);
+
+		if (copy_from_user(&param, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (param.argsz < minsz)
+			return -EINVAL;
+
+		/* No flag is supported now */
+		if (param.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+		ret = tce_iommu_unregister_pages(container, param.vaddr,
+				param.size);
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
 	case VFIO_IOMMU_ENABLE:
+		if (container->v2)
+			break;
+
 		mutex_lock(&container->lock);
 		ret = tce_iommu_enable(container);
 		mutex_unlock(&container->lock);
@@ -288,48 +944,280 @@ static long tce_iommu_ioctl(void *iommu_data,
 
 	case VFIO_IOMMU_DISABLE:
+		if (container->v2)
+			break;
+
 		mutex_lock(&container->lock);
 		tce_iommu_disable(container);
 		mutex_unlock(&container->lock);
 		return 0;
-	case VFIO_EEH_PE_OP:
-		if (!container->tbl || !container->tbl->it_group)
-			return -ENODEV;
 
-		return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
-						  cmd, arg);
+	case VFIO_EEH_PE_OP: {
+		struct tce_iommu_group *tcegrp;
+
+		ret = 0;
+		list_for_each_entry(tcegrp, &container->group_list, next) {
+			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
+					cmd, arg);
+			if (ret)
+				return ret;
+		}
+		return ret;
+	}
+
+	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
+		struct vfio_iommu_spapr_tce_create create;
+
+		if (!container->v2)
+			break;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
+				start_addr);
+
+		if (copy_from_user(&create, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (create.argsz < minsz)
+			return -EINVAL;
+
+		if (create.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_create_window(container, create.page_shift,
+				create.window_size, create.levels,
+				&create.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
+			ret = -EFAULT;
+
+		return ret;
+	}
+	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
+		struct vfio_iommu_spapr_tce_remove remove;
+
+		if (!container->v2)
+			break;
+
+		if (!tce_groups_attached(container))
+			return -ENXIO;
+
+		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
+				start_addr);
+
+		if (copy_from_user(&remove, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (remove.argsz < minsz)
+			return -EINVAL;
+
+		if (remove.flags)
+			return -EINVAL;
+
+		mutex_lock(&container->lock);
+
+		ret = tce_iommu_remove_window(container, remove.start_addr);
+
+		mutex_unlock(&container->lock);
+
+		return ret;
+	}
 	}
 
 	return -ENOTTY;
 }
 
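The v2 ioctls above fit together as a lifecycle: memory is registered once (pinned and accounted against the locked-vm limit), then windows can come and go cheaply. A hedged end-to-end sketch (not part of the patch; error handling elided, sizes and shifts hypothetical, and a 64K page size is assumed to be present in ddw.pgsizes):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int ddw_roundtrip(int container, void *buf, unsigned long size)
{
	struct vfio_iommu_spapr_register_memory reg;
	struct vfio_iommu_spapr_tce_create create;
	struct vfio_iommu_spapr_tce_remove remove;

	memset(&reg, 0, sizeof(reg));
	reg.argsz = sizeof(reg);
	reg.vaddr = (unsigned long)buf;
	reg.size = size;
	if (ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg))
		return -1;

	memset(&create, 0, sizeof(create));
	create.argsz = sizeof(create);
	create.page_shift = 16;         /* 64K IOMMU pages, hypothetical */
	create.window_size = 1UL << 30; /* 1G window, hypothetical */
	create.levels = 1;
	if (ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create))
		goto unreg; /* a failed create still unregisters below */

	/* create.start_addr now holds the bus address the platform chose */

	memset(&remove, 0, sizeof(remove));
	remove.argsz = sizeof(remove);
	remove.start_addr = create.start_addr;
	ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);

unreg:
	return ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
}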
+static void tce_iommu_release_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	int i;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+		tce_iommu_userspace_view_free(tbl);
+		if (tbl->it_map)
+			iommu_release_ownership(tbl);
+
+		container->tables[i] = NULL;
+	}
+}
+
+static int tce_iommu_take_ownership(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	int i, j, rc = 0;
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		struct iommu_table *tbl = table_group->tables[i];
+
+		if (!tbl || !tbl->it_map)
+			continue;
+
+		rc = tce_iommu_userspace_view_alloc(tbl);
+		if (!rc)
+			rc = iommu_take_ownership(tbl);
+
+		if (rc) {
+			for (j = 0; j < i; ++j)
+				iommu_release_ownership(
+						table_group->tables[j]);
+
+			return rc;
+		}
+	}
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		container->tables[i] = table_group->tables[i];
+
+	return 0;
+}
+
+static void tce_iommu_release_ownership_ddw(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	long i;
+
+	if (!table_group->ops->unset_window) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		table_group->ops->unset_window(table_group, i);
+
+	table_group->ops->release_ownership(table_group);
+}
+
+static long tce_iommu_take_ownership_ddw(struct tce_container *container,
+		struct iommu_table_group *table_group)
+{
+	long i, ret = 0;
+	struct iommu_table *tbl = NULL;
+
+	if (!table_group->ops->create_table || !table_group->ops->set_window ||
+			!table_group->ops->release_ownership) {
+		WARN_ON_ONCE(1);
+		return -EFAULT;
+	}
+
+	table_group->ops->take_ownership(table_group);
+
+	/*
+	 * If this is the first group attached, check if there is
+	 * a default DMA window and create one if there is none,
+	 * as userspace expects it to exist.
+	 */
+	if (!tce_groups_attached(container) && !container->tables[0]) {
+		ret = tce_iommu_create_table(container,
+				table_group,
+				0, /* window number */
+				IOMMU_PAGE_SHIFT_4K,
+				table_group->tce32_size,
+				1, /* default levels */
+				&tbl);
+		if (ret)
+			goto release_exit;
+		else
+			container->tables[0] = tbl;
+	}
+
+	/* Set all windows to the new group */
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+		tbl = container->tables[i];
+
+		if (!tbl)
+			continue;
+
+		/* Set the default window to a new group */
+		ret = table_group->ops->set_window(table_group, i, tbl);
+		if (ret)
+			goto release_exit;
+	}
+
+	return 0;
+
+release_exit:
+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+		table_group->ops->unset_window(table_group, i);
+
+	table_group->ops->release_ownership(table_group);
+
+	return ret;
+}
+
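tce_iommu_attach_group() below is the kernel side of the usual VFIO handshake; which ownership path it picks depends only on whether the platform supplies table_group ops. The userspace sequence itself is unchanged. A hedged sketch (not part of the patch; the group number and viability checks are hypothetical or omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int spapr_container_setup(void)
{
	int container = open("/dev/vfio/vfio", O_RDWR);
	int group = open("/dev/vfio/26", O_RDWR); /* hypothetical group */

	if (container < 0 || group < 0)
		return -1;

	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container))
		return -1;

	/*
	 * Setting the backend attaches the container's groups, i.e. this
	 * is what ends up in tce_iommu_attach_group(). Prefer v2, fall
	 * back to v1.
	 */
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_SPAPR_TCE_v2_IOMMU) == 1)
		return ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);

	return ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
}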
 static int tce_iommu_attach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
 	int ret;
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+	struct iommu_table_group *table_group;
+	struct tce_iommu_group *tcegrp = NULL;
 
-	BUG_ON(!tbl);
 	mutex_lock(&container->lock);
 
 	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
 			iommu_group_id(iommu_group), iommu_group); */
-	if (container->tbl) {
-		pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-				iommu_group_id(container->tbl->it_group),
-				iommu_group_id(iommu_group));
-		ret = -EBUSY;
-	} else if (container->enabled) {
-		pr_err("tce_vfio: attaching group #%u to enabled container\n",
-				iommu_group_id(iommu_group));
+	table_group = iommu_group_get_iommudata(iommu_group);
+
+	if (tce_groups_attached(container) && (!table_group->ops ||
+			!table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership)) {
 		ret = -EBUSY;
-	} else {
-		ret = iommu_take_ownership(tbl);
-		if (!ret)
-			container->tbl = tbl;
+		goto unlock_exit;
+	}
+
+	/* Check if new group has the same iommu_ops (i.e. compatible) */
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		struct iommu_table_group *table_group_tmp;
+
+		if (tcegrp->grp == iommu_group) {
+			pr_warn("tce_vfio: Group %d is already attached\n",
+					iommu_group_id(iommu_group));
+			ret = -EBUSY;
+			goto unlock_exit;
+		}
+		table_group_tmp = iommu_group_get_iommudata(tcegrp->grp);
+		if (table_group_tmp->ops != table_group->ops) {
+			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
+					iommu_group_id(iommu_group),
+					iommu_group_id(tcegrp->grp));
+			ret = -EPERM;
+			goto unlock_exit;
+		}
+	}
+
+	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
+	if (!tcegrp) {
+		ret = -ENOMEM;
+		goto unlock_exit;
 	}
 
+	if (!table_group->ops || !table_group->ops->take_ownership ||
+			!table_group->ops->release_ownership)
+		ret = tce_iommu_take_ownership(container, table_group);
+	else
+		ret = tce_iommu_take_ownership_ddw(container, table_group);
+
+	if (!ret) {
+		tcegrp->grp = iommu_group;
+		list_add(&tcegrp->next, &container->group_list);
+	}
+
+unlock_exit:
+	if (ret && tcegrp)
+		kfree(tcegrp);
+
 	mutex_unlock(&container->lock);
 
 	return ret;
@@ -339,26 +1227,37 @@ static void tce_iommu_detach_group(void *iommu_data,
 		struct iommu_group *iommu_group)
 {
 	struct tce_container *container = iommu_data;
-	struct iommu_table *tbl = iommu_group_get_iommudata(iommu_group);
+	struct iommu_table_group *table_group;
+	bool found = false;
+	struct tce_iommu_group *tcegrp;
 
-	BUG_ON(!tbl);
 	mutex_lock(&container->lock);
-	if (tbl != container->tbl) {
-		pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
-				iommu_group_id(iommu_group),
-				iommu_group_id(tbl->it_group));
-	} else {
-		if (container->enabled) {
-			pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-					iommu_group_id(tbl->it_group));
-			tce_iommu_disable(container);
+
+	list_for_each_entry(tcegrp, &container->group_list, next) {
+		if (tcegrp->grp == iommu_group) {
+			found = true;
+			break;
 		}
+	}
 
-		/* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-				iommu_group_id(iommu_group), iommu_group); */
-		container->tbl = NULL;
-		iommu_release_ownership(tbl);
+	if (!found) {
+		pr_warn("tce_vfio: detaching unattached group #%u\n",
+				iommu_group_id(iommu_group));
+		goto unlock_exit;
 	}
+
+	list_del(&tcegrp->next);
+	kfree(tcegrp);
+
+	table_group = iommu_group_get_iommudata(iommu_group);
+	BUG_ON(!table_group);
+
+	if (!table_group->ops || !table_group->ops->release_ownership)
+		tce_iommu_release_ownership(container, table_group);
+	else
+		tce_iommu_release_ownership_ddw(container, table_group);
+
+unlock_exit:
 	mutex_unlock(&container->lock);
 }
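The vfio_iommu_type1 hunk below papers over IOMMUs whose smallest supported page is below PAGE_SIZE. The bit trick is compact enough to misread, so here is a standalone check of it (not part of the patch; masks and sizes hypothetical, 64K host pages with an IOMMU that supports 4K and 2M):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 1UL << 16; /* 64K host pages */
	unsigned long page_mask = ~(page_size - 1);
	unsigned long bitmap = (1UL << 12) | (1UL << 21); /* IOMMU: 4K, 2M */

	if (bitmap & ~page_mask) {   /* sub-PAGE_SIZE bits present (4K) */
		bitmap &= page_mask; /* hide them from userspace... */
		bitmap |= page_size; /* ...and advertise PAGE_SIZE instead */
	}

	printf("user-visible pgsizes: %#lx\n", bitmap); /* 2M | 64K */
	return 0;
}

The effect is that userspace can keep mapping PAGE_SIZE-aligned buffers while the IOMMU driver remains free to use the smaller native size internally.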
diff --git a/kernel/drivers/vfio/vfio_iommu_type1.c b/kernel/drivers/vfio/vfio_iommu_type1.c index 57d8c37a0..ecb826eef 100644 --- a/kernel/drivers/vfio/vfio_iommu_type1.c +++ b/kernel/drivers/vfio/vfio_iommu_type1.c
@@ -403,13 +403,26 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
 {
 	struct vfio_domain *domain;
-	unsigned long bitmap = PAGE_MASK;
+	unsigned long bitmap = ULONG_MAX;
 
 	mutex_lock(&iommu->lock);
 	list_for_each_entry(domain, &iommu->domain_list, next)
 		bitmap &= domain->domain->ops->pgsize_bitmap;
 	mutex_unlock(&iommu->lock);
 
+	/*
+	 * In case the IOMMU supports page sizes smaller than PAGE_SIZE
+	 * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
+	 * That way the user will be able to map/unmap buffers whose size/
+	 * start address is aligned with PAGE_SIZE. Pinning code uses that
+	 * granularity while iommu driver can use the sub-PAGE_SIZE size
+	 * to map the buffer.
+	 */
+	if (bitmap & ~PAGE_MASK) {
+		bitmap &= PAGE_MASK;
+		bitmap |= PAGE_SIZE;
+	}
+
 	return bitmap;
 }
 
@@ -986,7 +999,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 
 		info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
-		return copy_to_user((void __user *)arg, &info, minsz);
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
 
 	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
 		struct vfio_iommu_type1_dma_map map;
@@ -1019,7 +1033,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		return copy_to_user((void __user *)arg, &unmap, minsz);
+		return copy_to_user((void __user *)arg, &unmap, minsz) ?
+			-EFAULT : 0;
 	}
 
 	return -ENOTTY;
diff --git a/kernel/drivers/vfio/vfio_spapr_eeh.c b/kernel/drivers/vfio/vfio_spapr_eeh.c index 5fa42db76..38edeb472 100644 --- a/kernel/drivers/vfio/vfio_spapr_eeh.c +++ b/kernel/drivers/vfio/vfio_spapr_eeh.c
@@ -85,6 +85,16 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 	case VFIO_EEH_PE_CONFIGURE:
 		ret = eeh_pe_configure(pe);
 		break;
+	case VFIO_EEH_PE_INJECT_ERR:
+		minsz = offsetofend(struct vfio_eeh_pe_op, err.mask);
+		if (op.argsz < minsz)
+			return -EINVAL;
+		if (copy_from_user(&op, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		ret = eeh_pe_inject_err(pe, op.err.type, op.err.func,
+					op.err.addr, op.err.mask);
+		break;
 	default:
 		ret = -EINVAL;
 	}
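The new VFIO_EEH_PE_INJECT_ERR sub-op takes its parameters through the err union member of vfio_eeh_pe_op, and the argsz check above only covers up to err.mask. A hedged sketch of an injection call (not part of the patch; all err.* values are hypothetical placeholders and the EEH setup around it is omitted):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int inject_err(int container)
{
	struct vfio_eeh_pe_op op;

	memset(&op, 0, sizeof(op));
	op.argsz = sizeof(op); /* covers the err union */
	op.op = VFIO_EEH_PE_INJECT_ERR;
	op.err.type = 0; /* hypothetical: error type understood by firmware */
	op.err.func = 0; /* hypothetical: error function */
	op.err.addr = 0;
	op.err.mask = 0;

	/* VFIO_EEH_PE_OP is issued on the SPAPR container fd */
	return ioctl(container, VFIO_EEH_PE_OP, &op);
}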