diff options
Diffstat (limited to 'kernel/drivers/misc/mic')
59 files changed, 15176 insertions, 1254 deletions
diff --git a/kernel/drivers/misc/mic/Kconfig b/kernel/drivers/misc/mic/Kconfig index cc4eef040..40677df7f 100644 --- a/kernel/drivers/misc/mic/Kconfig +++ b/kernel/drivers/misc/mic/Kconfig @@ -15,11 +15,28 @@ config INTEL_MIC_BUS OS and tools for MIC to use with this driver are available from <http://software.intel.com/en-us/mic-developer>. +comment "SCIF Bus Driver" + +config SCIF_BUS + tristate "SCIF Bus Driver" + depends on 64BIT && PCI && X86 && X86_DEV_DMA_OPS + help + This option is selected by any driver which registers a + device or driver on the SCIF Bus, such as CONFIG_INTEL_MIC_HOST + and CONFIG_INTEL_MIC_CARD. + + If you are building a host/card kernel with an Intel MIC device + then say M (recommended) or Y, else say N. If unsure say N. + + More information about the Intel MIC family as well as the Linux + OS and tools for MIC to use with this driver are available from + <http://software.intel.com/en-us/mic-developer>. + comment "Intel MIC Host Driver" config INTEL_MIC_HOST tristate "Intel MIC Host Driver" - depends on 64BIT && PCI && X86 && INTEL_MIC_BUS + depends on 64BIT && PCI && X86 && INTEL_MIC_BUS && SCIF_BUS && MIC_COSM select VHOST_RING help This enables Host Driver support for the Intel Many Integrated @@ -39,7 +56,7 @@ comment "Intel MIC Card Driver" config INTEL_MIC_CARD tristate "Intel MIC Card Driver" - depends on 64BIT && X86 && INTEL_MIC_BUS + depends on 64BIT && X86 && INTEL_MIC_BUS && SCIF_BUS && MIC_COSM select VIRTIO help This enables card driver support for the Intel Many Integrated @@ -52,3 +69,41 @@ config INTEL_MIC_CARD For more information see <http://software.intel.com/en-us/mic-developer>. + +comment "SCIF Driver" + +config SCIF + tristate "SCIF Driver" + depends on 64BIT && PCI && X86 && SCIF_BUS && IOMMU_SUPPORT + select IOMMU_IOVA + help + This enables SCIF Driver support for the Intel Many Integrated + Core (MIC) family of PCIe form factor coprocessor devices that + run a 64 bit Linux OS. 
The Symmetric Communication Interface + (SCIF (pronounced as skiff)) is a low level communications API + across PCIe currently implemented for MIC. + + If you are building a host kernel with an Intel MIC device then + say M (recommended) or Y, else say N. If unsure say N. + + More information about the Intel MIC family as well as the Linux + OS and tools for MIC to use with this driver are available from + <http://software.intel.com/en-us/mic-developer>. + +comment "Intel MIC Coprocessor State Management (COSM) Drivers" + +config MIC_COSM + tristate "Intel MIC Coprocessor State Management (COSM) Drivers" + depends on 64BIT && PCI && X86 && SCIF + help + This enables COSM driver support for the Intel Many + Integrated Core (MIC) family of PCIe form factor coprocessor + devices. COSM drivers implement functions such as boot, + shutdown, reset and reboot of MIC devices. + + If you are building a host kernel with an Intel MIC device then + say M (recommended) or Y, else say N. If unsure say N. + + More information about the Intel MIC family as well as the Linux + OS and tools for MIC to use with this driver are available from + <http://software.intel.com/en-us/mic-developer>. diff --git a/kernel/drivers/misc/mic/Makefile b/kernel/drivers/misc/mic/Makefile index e9bf14875..e288a1106 100644 --- a/kernel/drivers/misc/mic/Makefile +++ b/kernel/drivers/misc/mic/Makefile @@ -4,4 +4,7 @@ # obj-$(CONFIG_INTEL_MIC_HOST) += host/ obj-$(CONFIG_INTEL_MIC_CARD) += card/ -obj-$(CONFIG_INTEL_MIC_BUS) += bus/ +obj-y += bus/ +obj-$(CONFIG_SCIF) += scif/ +obj-$(CONFIG_MIC_COSM) += cosm/ +obj-$(CONFIG_MIC_COSM) += cosm_client/ diff --git a/kernel/drivers/misc/mic/bus/Makefile b/kernel/drivers/misc/mic/bus/Makefile index d85c7f2a0..761842b0d 100644 --- a/kernel/drivers/misc/mic/bus/Makefile +++ b/kernel/drivers/misc/mic/bus/Makefile @@ -3,3 +3,5 @@ # Copyright(c) 2014, Intel Corporation. 
# obj-$(CONFIG_INTEL_MIC_BUS) += mic_bus.o +obj-$(CONFIG_SCIF_BUS) += scif_bus.o +obj-$(CONFIG_MIC_COSM) += cosm_bus.o diff --git a/kernel/drivers/misc/mic/bus/cosm_bus.c b/kernel/drivers/misc/mic/bus/cosm_bus.c new file mode 100644 index 000000000..d31d6c6e6 --- /dev/null +++ b/kernel/drivers/misc/mic/bus/cosm_bus.c @@ -0,0 +1,141 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Intel MIC COSM Bus Driver + */ +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/idr.h> +#include "cosm_bus.h" + +/* Unique numbering for cosm devices. 
*/ +static DEFINE_IDA(cosm_index_ida); + +static int cosm_dev_probe(struct device *d) +{ + struct cosm_device *dev = dev_to_cosm(d); + struct cosm_driver *drv = drv_to_cosm(dev->dev.driver); + + return drv->probe(dev); +} + +static int cosm_dev_remove(struct device *d) +{ + struct cosm_device *dev = dev_to_cosm(d); + struct cosm_driver *drv = drv_to_cosm(dev->dev.driver); + + drv->remove(dev); + return 0; +} + +static struct bus_type cosm_bus = { + .name = "cosm_bus", + .probe = cosm_dev_probe, + .remove = cosm_dev_remove, +}; + +int cosm_register_driver(struct cosm_driver *driver) +{ + driver->driver.bus = &cosm_bus; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL_GPL(cosm_register_driver); + +void cosm_unregister_driver(struct cosm_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(cosm_unregister_driver); + +static inline void cosm_release_dev(struct device *d) +{ + struct cosm_device *cdev = dev_to_cosm(d); + + kfree(cdev); +} + +struct cosm_device * +cosm_register_device(struct device *pdev, struct cosm_hw_ops *hw_ops) +{ + struct cosm_device *cdev; + int ret; + + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) + return ERR_PTR(-ENOMEM); + + cdev->dev.parent = pdev; + cdev->dev.release = cosm_release_dev; + cdev->hw_ops = hw_ops; + dev_set_drvdata(&cdev->dev, cdev); + cdev->dev.bus = &cosm_bus; + + /* Assign a unique device index and hence name */ + ret = ida_simple_get(&cosm_index_ida, 0, 0, GFP_KERNEL); + if (ret < 0) + goto free_cdev; + + cdev->index = ret; + cdev->dev.id = ret; + dev_set_name(&cdev->dev, "cosm-dev%u", cdev->index); + + ret = device_register(&cdev->dev); + if (ret) + goto ida_remove; + return cdev; +ida_remove: + ida_simple_remove(&cosm_index_ida, cdev->index); +free_cdev: + put_device(&cdev->dev); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(cosm_register_device); + +void cosm_unregister_device(struct cosm_device *dev) +{ + int index = dev->index; /* save for after device release */ + + 
device_unregister(&dev->dev); + ida_simple_remove(&cosm_index_ida, index); +} +EXPORT_SYMBOL_GPL(cosm_unregister_device); + +struct cosm_device *cosm_find_cdev_by_id(int id) +{ + struct device *dev = subsys_find_device_by_id(&cosm_bus, id, NULL); + + return dev ? container_of(dev, struct cosm_device, dev) : NULL; +} +EXPORT_SYMBOL_GPL(cosm_find_cdev_by_id); + +static int __init cosm_init(void) +{ + return bus_register(&cosm_bus); +} + +static void __exit cosm_exit(void) +{ + bus_unregister(&cosm_bus); + ida_destroy(&cosm_index_ida); +} + +core_initcall(cosm_init); +module_exit(cosm_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) MIC card OS state management bus driver"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/misc/mic/bus/cosm_bus.h b/kernel/drivers/misc/mic/bus/cosm_bus.h new file mode 100644 index 000000000..f7c57f266 --- /dev/null +++ b/kernel/drivers/misc/mic/bus/cosm_bus.h @@ -0,0 +1,134 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Intel MIC COSM Bus Driver + */ +#ifndef _COSM_BUS_H_ +#define _COSM_BUS_H_ + +#include <linux/scif.h> +#include <linux/mic_common.h> +#include "../common/mic_dev.h" + +/** + * cosm_device - representation of a cosm device + * + * @attr_group: Pointer to list of sysfs attribute groups. + * @sdev: Device for sysfs entries. + * @state: MIC state. 
+ * @shutdown_status: MIC status reported by card for shutdown/crashes. + * @shutdown_status_int: Internal shutdown status maintained by the driver + * @cosm_mutex: Mutex for synchronizing access to data structures. + * @reset_trigger_work: Work for triggering reset requests. + * @scif_work: Work for handling per device SCIF connections + * @cmdline: Kernel command line. + * @firmware: Firmware file name. + * @ramdisk: Ramdisk file name. + * @bootmode: Boot mode i.e. "linux" or "elf" for flash updates. + * @log_buf_addr: Log buffer address for MIC. + * @log_buf_len: Log buffer length address for MIC. + * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes. + * @hw_ops: the hardware bus ops for this device. + * @dev: underlying device. + * @index: unique position on the cosm bus + * @dbg_dir: debug fs directory + * @newepd: new endpoint from scif accept to be assigned to this cdev + * @epd: SCIF endpoint for this cdev + * @heartbeat_watchdog_enable: if heartbeat watchdog is enabled for this cdev + * @sysfs_heartbeat_enable: sysfs setting for disabling heartbeat notification + */ +struct cosm_device { + const struct attribute_group **attr_group; + struct device *sdev; + u8 state; + u8 shutdown_status; + u8 shutdown_status_int; + struct mutex cosm_mutex; + struct work_struct reset_trigger_work; + struct work_struct scif_work; + char *cmdline; + char *firmware; + char *ramdisk; + char *bootmode; + void *log_buf_addr; + int *log_buf_len; + struct kernfs_node *state_sysfs; + struct cosm_hw_ops *hw_ops; + struct device dev; + int index; + struct dentry *dbg_dir; + scif_epd_t newepd; + scif_epd_t epd; + bool heartbeat_watchdog_enable; + bool sysfs_heartbeat_enable; +}; + +/** + * cosm_driver - operations for a cosm driver + * + * @driver: underlying device driver (populate name and owner). + * @probe: the function to call when a device is found. Returns 0 or -errno. + * @remove: the function to call when a device is removed. 
+ */ +struct cosm_driver { + struct device_driver driver; + int (*probe)(struct cosm_device *dev); + void (*remove)(struct cosm_device *dev); +}; + +/** + * cosm_hw_ops - cosm bus ops + * + * @reset: trigger MIC reset + * @force_reset: force MIC reset + * @post_reset: inform MIC reset is complete + * @ready: is MIC ready for OS download + * @start: boot MIC + * @stop: prepare MIC for reset + * @family: return MIC HW family string + * @stepping: return MIC HW stepping string + * @aper: return MIC PCIe aperture + */ +struct cosm_hw_ops { + void (*reset)(struct cosm_device *cdev); + void (*force_reset)(struct cosm_device *cdev); + void (*post_reset)(struct cosm_device *cdev, enum mic_states state); + bool (*ready)(struct cosm_device *cdev); + int (*start)(struct cosm_device *cdev, int id); + void (*stop)(struct cosm_device *cdev, bool force); + ssize_t (*family)(struct cosm_device *cdev, char *buf); + ssize_t (*stepping)(struct cosm_device *cdev, char *buf); + struct mic_mw *(*aper)(struct cosm_device *cdev); +}; + +struct cosm_device * +cosm_register_device(struct device *pdev, struct cosm_hw_ops *hw_ops); +void cosm_unregister_device(struct cosm_device *dev); +int cosm_register_driver(struct cosm_driver *drv); +void cosm_unregister_driver(struct cosm_driver *drv); +struct cosm_device *cosm_find_cdev_by_id(int id); + +static inline struct cosm_device *dev_to_cosm(struct device *dev) +{ + return container_of(dev, struct cosm_device, dev); +} + +static inline struct cosm_driver *drv_to_cosm(struct device_driver *drv) +{ + return container_of(drv, struct cosm_driver, driver); +} +#endif /* _COSM_BUS_H */ diff --git a/kernel/drivers/misc/mic/bus/mic_bus.c b/kernel/drivers/misc/mic/bus/mic_bus.c index 961ae90aa..be37890ab 100644 --- a/kernel/drivers/misc/mic/bus/mic_bus.c +++ b/kernel/drivers/misc/mic/bus/mic_bus.c @@ -25,9 +25,6 @@ #include <linux/idr.h> #include <linux/mic_bus.h> -/* Unique numbering for mbus devices. 
*/ -static DEFINE_IDA(mbus_index_ida); - static ssize_t device_show(struct device *d, struct device_attribute *attr, char *buf) { @@ -147,7 +144,8 @@ static void mbus_release_dev(struct device *d) struct mbus_device * mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops, - struct mbus_hw_ops *hw_ops, void __iomem *mmio_va) + struct mbus_hw_ops *hw_ops, int index, + void __iomem *mmio_va) { int ret; struct mbus_device *mbdev; @@ -166,13 +164,7 @@ mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops, mbdev->dev.release = mbus_release_dev; mbdev->hw_ops = hw_ops; mbdev->dev.bus = &mic_bus; - - /* Assign a unique device index and hence name. */ - ret = ida_simple_get(&mbus_index_ida, 0, 0, GFP_KERNEL); - if (ret < 0) - goto free_mbdev; - - mbdev->index = ret; + mbdev->index = index; dev_set_name(&mbdev->dev, "mbus-dev%u", mbdev->index); /* * device_register() causes the bus infrastructure to look for a @@ -180,22 +172,17 @@ mbus_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops, */ ret = device_register(&mbdev->dev); if (ret) - goto ida_remove; + goto free_mbdev; return mbdev; -ida_remove: - ida_simple_remove(&mbus_index_ida, mbdev->index); free_mbdev: - kfree(mbdev); + put_device(&mbdev->dev); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(mbus_register_device); void mbus_unregister_device(struct mbus_device *mbdev) { - int index = mbdev->index; /* save for after device release */ - device_unregister(&mbdev->dev); - ida_simple_remove(&mbus_index_ida, index); } EXPORT_SYMBOL_GPL(mbus_unregister_device); @@ -207,7 +194,6 @@ static int __init mbus_init(void) static void __exit mbus_exit(void) { bus_unregister(&mic_bus); - ida_destroy(&mbus_index_ida); } core_initcall(mbus_init); diff --git a/kernel/drivers/misc/mic/bus/scif_bus.c b/kernel/drivers/misc/mic/bus/scif_bus.c new file mode 100644 index 000000000..ff6e01c25 --- /dev/null +++ b/kernel/drivers/misc/mic/bus/scif_bus.c @@ -0,0 +1,209 @@ +/* + * 
Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel Symmetric Communications Interface Bus driver. + */ +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/idr.h> +#include <linux/dma-mapping.h> + +#include "scif_bus.h" + +static ssize_t device_show(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct scif_hw_dev *dev = dev_to_scif(d); + + return sprintf(buf, "0x%04x\n", dev->id.device); +} +static DEVICE_ATTR_RO(device); + +static ssize_t vendor_show(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct scif_hw_dev *dev = dev_to_scif(d); + + return sprintf(buf, "0x%04x\n", dev->id.vendor); +} +static DEVICE_ATTR_RO(vendor); + +static ssize_t modalias_show(struct device *d, + struct device_attribute *attr, char *buf) +{ + struct scif_hw_dev *dev = dev_to_scif(d); + + return sprintf(buf, "scif:d%08Xv%08X\n", + dev->id.device, dev->id.vendor); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *scif_dev_attrs[] = { + &dev_attr_device.attr, + &dev_attr_vendor.attr, + &dev_attr_modalias.attr, + NULL, +}; +ATTRIBUTE_GROUPS(scif_dev); + +static inline int scif_id_match(const struct scif_hw_dev *dev, + const struct scif_hw_dev_id *id) +{ + if (id->device != dev->id.device && id->device != SCIF_DEV_ANY_ID) + return 0; + + return id->vendor == SCIF_DEV_ANY_ID || id->vendor == dev->id.vendor; +} + +/* + * This looks through all the IDs a driver claims to support. 
If any of them + * match, we return 1 and the kernel will call scif_dev_probe(). + */ +static int scif_dev_match(struct device *dv, struct device_driver *dr) +{ + unsigned int i; + struct scif_hw_dev *dev = dev_to_scif(dv); + const struct scif_hw_dev_id *ids; + + ids = drv_to_scif(dr)->id_table; + for (i = 0; ids[i].device; i++) + if (scif_id_match(dev, &ids[i])) + return 1; + return 0; +} + +static int scif_uevent(struct device *dv, struct kobj_uevent_env *env) +{ + struct scif_hw_dev *dev = dev_to_scif(dv); + + return add_uevent_var(env, "MODALIAS=scif:d%08Xv%08X", + dev->id.device, dev->id.vendor); +} + +static int scif_dev_probe(struct device *d) +{ + struct scif_hw_dev *dev = dev_to_scif(d); + struct scif_driver *drv = drv_to_scif(dev->dev.driver); + + return drv->probe(dev); +} + +static int scif_dev_remove(struct device *d) +{ + struct scif_hw_dev *dev = dev_to_scif(d); + struct scif_driver *drv = drv_to_scif(dev->dev.driver); + + drv->remove(dev); + return 0; +} + +static struct bus_type scif_bus = { + .name = "scif_bus", + .match = scif_dev_match, + .dev_groups = scif_dev_groups, + .uevent = scif_uevent, + .probe = scif_dev_probe, + .remove = scif_dev_remove, +}; + +int scif_register_driver(struct scif_driver *driver) +{ + driver->driver.bus = &scif_bus; + return driver_register(&driver->driver); +} +EXPORT_SYMBOL_GPL(scif_register_driver); + +void scif_unregister_driver(struct scif_driver *driver) +{ + driver_unregister(&driver->driver); +} +EXPORT_SYMBOL_GPL(scif_unregister_driver); + +static void scif_release_dev(struct device *d) +{ + struct scif_hw_dev *sdev = dev_to_scif(d); + + kfree(sdev); +} + +struct scif_hw_dev * +scif_register_device(struct device *pdev, int id, struct dma_map_ops *dma_ops, + struct scif_hw_ops *hw_ops, u8 dnode, u8 snode, + struct mic_mw *mmio, struct mic_mw *aper, void *dp, + void __iomem *rdp, struct dma_chan **chan, int num_chan, + bool card_rel_da) +{ + int ret; + struct scif_hw_dev *sdev; + + sdev = kzalloc(sizeof(*sdev), 
GFP_KERNEL); + if (!sdev) + return ERR_PTR(-ENOMEM); + + sdev->dev.parent = pdev; + sdev->id.device = id; + sdev->id.vendor = SCIF_DEV_ANY_ID; + sdev->dev.archdata.dma_ops = dma_ops; + sdev->dev.release = scif_release_dev; + sdev->hw_ops = hw_ops; + sdev->dnode = dnode; + sdev->snode = snode; + dev_set_drvdata(&sdev->dev, sdev); + sdev->dev.bus = &scif_bus; + sdev->mmio = mmio; + sdev->aper = aper; + sdev->dp = dp; + sdev->rdp = rdp; + sdev->dev.dma_mask = &sdev->dev.coherent_dma_mask; + dma_set_mask(&sdev->dev, DMA_BIT_MASK(64)); + sdev->dma_ch = chan; + sdev->num_dma_ch = num_chan; + sdev->card_rel_da = card_rel_da; + dev_set_name(&sdev->dev, "scif-dev%u", sdev->dnode); + /* + * device_register() causes the bus infrastructure to look for a + * matching driver. + */ + ret = device_register(&sdev->dev); + if (ret) + goto free_sdev; + return sdev; +free_sdev: + put_device(&sdev->dev); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(scif_register_device); + +void scif_unregister_device(struct scif_hw_dev *sdev) +{ + device_unregister(&sdev->dev); +} +EXPORT_SYMBOL_GPL(scif_unregister_device); + +static int __init scif_init(void) +{ + return bus_register(&scif_bus); +} + +static void __exit scif_exit(void) +{ + bus_unregister(&scif_bus); +} + +core_initcall(scif_init); +module_exit(scif_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) SCIF Bus driver"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/misc/mic/bus/scif_bus.h b/kernel/drivers/misc/mic/bus/scif_bus.h new file mode 100644 index 000000000..94f29ac60 --- /dev/null +++ b/kernel/drivers/misc/mic/bus/scif_bus.h @@ -0,0 +1,133 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel Symmetric Communications Interface Bus driver. + */ +#ifndef _SCIF_BUS_H_ +#define _SCIF_BUS_H_ +/* + * Everything a scif driver needs to work with any particular scif + * hardware abstraction layer. + */ +#include <linux/dma-mapping.h> + +#include <linux/mic_common.h> +#include "../common/mic_dev.h" + +struct scif_hw_dev_id { + u32 device; + u32 vendor; +}; + +#define MIC_SCIF_DEV 1 +#define SCIF_DEV_ANY_ID 0xffffffff + +/** + * scif_hw_dev - representation of a hardware device abstracted for scif + * @hw_ops: the hardware ops supported by this device + * @id: the device type identification (used to match it with a driver) + * @mmio: MMIO memory window + * @aper: Aperture memory window + * @dev: underlying device + * @dnode - The destination node which this device will communicate with. + * @snode - The source node for this device. + * @dp - Self device page + * @rdp - Remote device page + * @dma_ch - Array of DMA channels + * @num_dma_ch - Number of DMA channels available + * @card_rel_da - Set to true if DMA addresses programmed in the DMA engine + * are relative to the card point of view + */ +struct scif_hw_dev { + struct scif_hw_ops *hw_ops; + struct scif_hw_dev_id id; + struct mic_mw *mmio; + struct mic_mw *aper; + struct device dev; + u8 dnode; + u8 snode; + void *dp; + void __iomem *rdp; + struct dma_chan **dma_ch; + int num_dma_ch; + bool card_rel_da; +}; + +/** + * scif_driver - operations for a scif I/O driver + * @driver: underlying device driver (populate name and owner). + * @id_table: the ids serviced by this driver. + * @probe: the function to call when a device is found. Returns 0 or -errno. + * @remove: the function to call when a device is removed. 
+ */ +struct scif_driver { + struct device_driver driver; + const struct scif_hw_dev_id *id_table; + int (*probe)(struct scif_hw_dev *dev); + void (*remove)(struct scif_hw_dev *dev); +}; + +/** + * scif_hw_ops - Hardware operations for accessing a SCIF device on the SCIF bus. + * + * @next_db: Obtain the next available doorbell. + * @request_irq: Request an interrupt on a particular doorbell. + * @free_irq: Free an interrupt requested previously. + * @ack_interrupt: acknowledge an interrupt in the ISR. + * @send_intr: Send an interrupt to the remote node on a specified doorbell. + * @send_p2p_intr: Send an interrupt to the peer node on a specified doorbell + * which is specifically targeted for a peer to peer node. + * @ioremap: Map a buffer with the specified physical address and length. + * @iounmap: Unmap a buffer previously mapped. + */ +struct scif_hw_ops { + int (*next_db)(struct scif_hw_dev *sdev); + struct mic_irq * (*request_irq)(struct scif_hw_dev *sdev, + irqreturn_t (*func)(int irq, + void *data), + const char *name, void *data, + int db); + void (*free_irq)(struct scif_hw_dev *sdev, + struct mic_irq *cookie, void *data); + void (*ack_interrupt)(struct scif_hw_dev *sdev, int num); + void (*send_intr)(struct scif_hw_dev *sdev, int db); + void (*send_p2p_intr)(struct scif_hw_dev *sdev, int db, + struct mic_mw *mw); + void __iomem * (*ioremap)(struct scif_hw_dev *sdev, + phys_addr_t pa, size_t len); + void (*iounmap)(struct scif_hw_dev *sdev, void __iomem *va); +}; + +int scif_register_driver(struct scif_driver *driver); +void scif_unregister_driver(struct scif_driver *driver); +struct scif_hw_dev * +scif_register_device(struct device *pdev, int id, + struct dma_map_ops *dma_ops, + struct scif_hw_ops *hw_ops, u8 dnode, u8 snode, + struct mic_mw *mmio, struct mic_mw *aper, + void *dp, void __iomem *rdp, + struct dma_chan **chan, int num_chan, + bool card_rel_da); +void scif_unregister_device(struct scif_hw_dev *sdev); + +static inline struct scif_hw_dev 
*dev_to_scif(struct device *dev) +{ + return container_of(dev, struct scif_hw_dev, dev); +} + +static inline struct scif_driver *drv_to_scif(struct device_driver *drv) +{ + return container_of(drv, struct scif_driver, driver); +} +#endif /* _SCIF_BUS_H */ diff --git a/kernel/drivers/misc/mic/card/mic_device.c b/kernel/drivers/misc/mic/card/mic_device.c index 83819eee5..d0edaf7e0 100644 --- a/kernel/drivers/misc/mic/card/mic_device.c +++ b/kernel/drivers/misc/mic/card/mic_device.c @@ -28,6 +28,8 @@ #include <linux/pci.h> #include <linux/interrupt.h> #include <linux/reboot.h> +#include <linux/dmaengine.h> +#include <linux/kmod.h> #include <linux/mic_common.h> #include "../common/mic_dev.h" @@ -35,71 +37,6 @@ #include "mic_virtio.h" static struct mic_driver *g_drv; -static struct mic_irq *shutdown_cookie; - -static void mic_notify_host(u8 state) -{ - struct mic_driver *mdrv = g_drv; - struct mic_bootparam __iomem *bootparam = mdrv->dp; - - iowrite8(state, &bootparam->shutdown_status); - dev_dbg(mdrv->dev, "%s %d system_state %d\n", - __func__, __LINE__, state); - mic_send_intr(&mdrv->mdev, ioread8(&bootparam->c2h_shutdown_db)); -} - -static int mic_panic_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct mic_driver *mdrv = g_drv; - struct mic_bootparam __iomem *bootparam = mdrv->dp; - - iowrite8(-1, &bootparam->h2c_config_db); - iowrite8(-1, &bootparam->h2c_shutdown_db); - mic_notify_host(MIC_CRASHED); - return NOTIFY_DONE; -} - -static struct notifier_block mic_panic = { - .notifier_call = mic_panic_event, -}; - -static irqreturn_t mic_shutdown_isr(int irq, void *data) -{ - struct mic_driver *mdrv = g_drv; - struct mic_bootparam __iomem *bootparam = mdrv->dp; - - mic_ack_interrupt(&g_drv->mdev); - if (ioread8(&bootparam->shutdown_card)) - orderly_poweroff(true); - return IRQ_HANDLED; -} - -static int mic_shutdown_init(void) -{ - int rc = 0; - struct mic_driver *mdrv = g_drv; - struct mic_bootparam __iomem *bootparam = mdrv->dp; - int 
shutdown_db; - - shutdown_db = mic_next_card_db(); - shutdown_cookie = mic_request_card_irq(mic_shutdown_isr, NULL, - "Shutdown", mdrv, shutdown_db); - if (IS_ERR(shutdown_cookie)) - rc = PTR_ERR(shutdown_cookie); - else - iowrite8(shutdown_db, &bootparam->h2c_shutdown_db); - return rc; -} - -static void mic_shutdown_uninit(void) -{ - struct mic_driver *mdrv = g_drv; - struct mic_bootparam __iomem *bootparam = mdrv->dp; - - iowrite8(-1, &bootparam->h2c_shutdown_db); - mic_free_card_irq(shutdown_cookie, mdrv); -} static int __init mic_dp_init(void) { @@ -240,6 +177,111 @@ static void mic_uninit_irq(void) kfree(mdrv->irq_info.irq_usage_count); } +static inline struct mic_driver *scdev_to_mdrv(struct scif_hw_dev *scdev) +{ + return dev_get_drvdata(scdev->dev.parent); +} + +static struct mic_irq * +___mic_request_irq(struct scif_hw_dev *scdev, + irqreturn_t (*func)(int irq, void *data), + const char *name, void *data, + int db) +{ + return mic_request_card_irq(func, NULL, name, data, db); +} + +static void +___mic_free_irq(struct scif_hw_dev *scdev, + struct mic_irq *cookie, void *data) +{ + return mic_free_card_irq(cookie, data); +} + +static void ___mic_ack_interrupt(struct scif_hw_dev *scdev, int num) +{ + struct mic_driver *mdrv = scdev_to_mdrv(scdev); + + mic_ack_interrupt(&mdrv->mdev); +} + +static int ___mic_next_db(struct scif_hw_dev *scdev) +{ + return mic_next_card_db(); +} + +static void ___mic_send_intr(struct scif_hw_dev *scdev, int db) +{ + struct mic_driver *mdrv = scdev_to_mdrv(scdev); + + mic_send_intr(&mdrv->mdev, db); +} + +static void ___mic_send_p2p_intr(struct scif_hw_dev *scdev, int db, + struct mic_mw *mw) +{ + mic_send_p2p_intr(db, mw); +} + +static void __iomem * +___mic_ioremap(struct scif_hw_dev *scdev, + phys_addr_t pa, size_t len) +{ + struct mic_driver *mdrv = scdev_to_mdrv(scdev); + + return mic_card_map(&mdrv->mdev, pa, len); +} + +static void ___mic_iounmap(struct scif_hw_dev *scdev, void __iomem *va) +{ + struct mic_driver *mdrv = 
scdev_to_mdrv(scdev); + + mic_card_unmap(&mdrv->mdev, va); +} + +static struct scif_hw_ops scif_hw_ops = { + .request_irq = ___mic_request_irq, + .free_irq = ___mic_free_irq, + .ack_interrupt = ___mic_ack_interrupt, + .next_db = ___mic_next_db, + .send_intr = ___mic_send_intr, + .send_p2p_intr = ___mic_send_p2p_intr, + .ioremap = ___mic_ioremap, + .iounmap = ___mic_iounmap, +}; + +static int mic_request_dma_chans(struct mic_driver *mdrv) +{ + dma_cap_mask_t mask; + struct dma_chan *chan; + + request_module("mic_x100_dma"); + dma_cap_zero(mask); + dma_cap_set(DMA_MEMCPY, mask); + + do { + chan = dma_request_channel(mask, NULL, NULL); + if (chan) { + mdrv->dma_ch[mdrv->num_dma_ch++] = chan; + if (mdrv->num_dma_ch >= MIC_MAX_DMA_CHAN) + break; + } + } while (chan); + dev_info(mdrv->dev, "DMA channels # %d\n", mdrv->num_dma_ch); + return mdrv->num_dma_ch; +} + +static void mic_free_dma_chans(struct mic_driver *mdrv) +{ + int i = 0; + + for (i = 0; i < mdrv->num_dma_ch; i++) { + dma_release_channel(mdrv->dma_ch[i]); + mdrv->dma_ch[i] = NULL; + } + mdrv->num_dma_ch = 0; +} + /* * mic_driver_init - MIC driver initialization tasks. * @@ -248,13 +290,11 @@ static void mic_uninit_irq(void) int __init mic_driver_init(struct mic_driver *mdrv) { int rc; + struct mic_bootparam __iomem *bootparam; + u8 node_id; g_drv = mdrv; - /* - * Unloading the card module is not supported. The MIC card module - * handles fundamental operations like host/card initiated shutdowns - * and informing the host about card crashes and cannot be unloaded. - */ + /* Unloading the card module is not supported. 
*/ if (!try_module_get(mdrv->dev->driver->owner)) { rc = -ENODEV; goto done; @@ -265,18 +305,31 @@ int __init mic_driver_init(struct mic_driver *mdrv) rc = mic_init_irq(); if (rc) goto dp_uninit; - rc = mic_shutdown_init(); - if (rc) + if (!mic_request_dma_chans(mdrv)) { + rc = -ENODEV; goto irq_uninit; + } rc = mic_devices_init(mdrv); if (rc) - goto shutdown_uninit; + goto dma_free; + bootparam = mdrv->dp; + node_id = ioread8(&bootparam->node_id); + mdrv->scdev = scif_register_device(mdrv->dev, MIC_SCIF_DEV, + NULL, &scif_hw_ops, + 0, node_id, &mdrv->mdev.mmio, NULL, + NULL, mdrv->dp, mdrv->dma_ch, + mdrv->num_dma_ch, true); + if (IS_ERR(mdrv->scdev)) { + rc = PTR_ERR(mdrv->scdev); + goto device_uninit; + } mic_create_card_debug_dir(mdrv); - atomic_notifier_chain_register(&panic_notifier_list, &mic_panic); done: return rc; -shutdown_uninit: - mic_shutdown_uninit(); +device_uninit: + mic_devices_uninit(mdrv); +dma_free: + mic_free_dma_chans(mdrv); irq_uninit: mic_uninit_irq(); dp_uninit: @@ -294,14 +347,9 @@ put: void mic_driver_uninit(struct mic_driver *mdrv) { mic_delete_card_debug_dir(mdrv); + scif_unregister_device(mdrv->scdev); mic_devices_uninit(mdrv); - /* - * Inform the host about the shutdown status i.e. poweroff/restart etc. - * The module cannot be unloaded so the only code path to call - * mic_devices_uninit(..) is the shutdown callback. 
- */ - mic_notify_host(system_state); - mic_shutdown_uninit(); + mic_free_dma_chans(mdrv); mic_uninit_irq(); mic_dp_uninit(); module_put(mdrv->dev->driver->owner); diff --git a/kernel/drivers/misc/mic/card/mic_device.h b/kernel/drivers/misc/mic/card/mic_device.h index 844be8fc9..1dbf83c41 100644 --- a/kernel/drivers/misc/mic/card/mic_device.h +++ b/kernel/drivers/misc/mic/card/mic_device.h @@ -29,9 +29,9 @@ #include <linux/workqueue.h> #include <linux/io.h> -#include <linux/irqreturn.h> #include <linux/interrupt.h> #include <linux/mic_bus.h> +#include "../bus/scif_bus.h" /** * struct mic_intr_info - Contains h/w specific interrupt sources info @@ -73,6 +73,9 @@ struct mic_device { * @irq_info: The OS specific irq information * @intr_info: H/W specific interrupt information. * @dma_mbdev: dma device on the MIC virtual bus. + * @dma_ch - Array of DMA channels + * @num_dma_ch - Number of DMA channels available + * @scdev: SCIF device on the SCIF virtual bus. */ struct mic_driver { char name[20]; @@ -84,6 +87,9 @@ struct mic_driver { struct mic_irq_info irq_info; struct mic_intr_info intr_info; struct mbus_device *dma_mbdev; + struct dma_chan *dma_ch[MIC_MAX_DMA_CHAN]; + int num_dma_ch; + struct scif_hw_dev *scdev; }; /** @@ -122,10 +128,11 @@ void mic_driver_uninit(struct mic_driver *mdrv); int mic_next_card_db(void); struct mic_irq * mic_request_card_irq(irq_handler_t handler, irq_handler_t thread_fn, - const char *name, void *data, int intr_src); + const char *name, void *data, int db); void mic_free_card_irq(struct mic_irq *cookie, void *data); u32 mic_read_spad(struct mic_device *mdev, unsigned int idx); void mic_send_intr(struct mic_device *mdev, int doorbell); +void mic_send_p2p_intr(int doorbell, struct mic_mw *mw); int mic_db_to_irq(struct mic_driver *mdrv, int db); u32 mic_ack_interrupt(struct mic_device *mdev); void mic_hw_intr_init(struct mic_driver *mdrv); diff --git a/kernel/drivers/misc/mic/card/mic_x100.c b/kernel/drivers/misc/mic/card/mic_x100.c index 
e98e537d6..b2958ce23 100644 --- a/kernel/drivers/misc/mic/card/mic_x100.c +++ b/kernel/drivers/misc/mic/card/mic_x100.c @@ -70,6 +70,41 @@ void mic_send_intr(struct mic_device *mdev, int doorbell) (MIC_X100_SBOX_SDBIC0 + (4 * doorbell))); } +/* + * mic_x100_send_sbox_intr - Send an MIC_X100_SBOX interrupt to MIC. + */ +static void mic_x100_send_sbox_intr(struct mic_mw *mw, int doorbell) +{ + u64 apic_icr_offset = MIC_X100_SBOX_APICICR0 + doorbell * 8; + u32 apicicr_low = mic_mmio_read(mw, MIC_X100_SBOX_BASE_ADDRESS + + apic_icr_offset); + + /* for MIC we need to make sure we "hit" the send_icr bit (13) */ + apicicr_low = (apicicr_low | (1 << 13)); + /* + * Ensure that the interrupt is ordered w.r.t. previous stores + * to main memory. Fence instructions are not implemented in X100 + * since execution is in order but a compiler barrier is still + * required. + */ + wmb(); + mic_mmio_write(mw, apicicr_low, + MIC_X100_SBOX_BASE_ADDRESS + apic_icr_offset); +} + +static void mic_x100_send_rdmasr_intr(struct mic_mw *mw, int doorbell) +{ + int rdmasr_offset = MIC_X100_SBOX_RDMASR0 + (doorbell << 2); + /* + * Ensure that the interrupt is ordered w.r.t. previous stores + * to main memory. Fence instructions are not implemented in X100 + * since execution is in order but a compiler barrier is still + * required. + */ + wmb(); + mic_mmio_write(mw, 0, MIC_X100_SBOX_BASE_ADDRESS + rdmasr_offset); +} + /** * mic_ack_interrupt - Device specific interrupt handling. * @mdev: pointer to mic_device instance @@ -91,6 +126,18 @@ static inline int mic_get_rdmasr_irq(int index) return MIC_X100_RDMASR_IRQ_BASE + index; } +void mic_send_p2p_intr(int db, struct mic_mw *mw) +{ + int rdmasr_index; + + if (db < MIC_X100_NUM_SBOX_IRQ) { + mic_x100_send_sbox_intr(mw, db); + } else { + rdmasr_index = db - MIC_X100_NUM_SBOX_IRQ; + mic_x100_send_rdmasr_intr(mw, rdmasr_index); + } +} + /** * mic_hw_intr_init - Initialize h/w specific interrupt * information. 
@@ -113,11 +160,15 @@ void mic_hw_intr_init(struct mic_driver *mdrv) int mic_db_to_irq(struct mic_driver *mdrv, int db) { int rdmasr_index; + + /* + * The total number of doorbell interrupts on the card is 16. Indices + * 0-7 fall in the SBOX category and 8-15 fall in the RDMASR category. + */ if (db < MIC_X100_NUM_SBOX_IRQ) { return mic_get_sbox_irq(db); } else { - rdmasr_index = db - MIC_X100_NUM_SBOX_IRQ + - MIC_X100_RDMASR_IRQ_BASE; + rdmasr_index = db - MIC_X100_NUM_SBOX_IRQ; return mic_get_rdmasr_irq(rdmasr_index); } } @@ -210,7 +261,7 @@ static int __init mic_probe(struct platform_device *pdev) mic_hw_intr_init(mdrv); platform_set_drvdata(pdev, mdrv); mdrv->dma_mbdev = mbus_register_device(mdrv->dev, MBUS_DEV_DMA_MIC, - NULL, &mbus_hw_ops, + NULL, &mbus_hw_ops, 0, mdrv->mdev.mmio.va); if (IS_ERR(mdrv->dma_mbdev)) { rc = PTR_ERR(mdrv->dma_mbdev); @@ -243,10 +294,16 @@ static void mic_platform_shutdown(struct platform_device *pdev) mic_remove(pdev); } +static u64 mic_dma_mask = DMA_BIT_MASK(64); + static struct platform_device mic_platform_dev = { .name = mic_driver_name, .id = 0, .num_resources = 0, + .dev = { + .dma_mask = &mic_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(64), + }, }; static struct platform_driver __refdata mic_platform_driver = { diff --git a/kernel/drivers/misc/mic/card/mic_x100.h b/kernel/drivers/misc/mic/card/mic_x100.h index d66ea5563..7e2224934 100644 --- a/kernel/drivers/misc/mic/card/mic_x100.h +++ b/kernel/drivers/misc/mic/card/mic_x100.h @@ -35,6 +35,7 @@ #define MIC_X100_SBOX_SDBIC0 0x0000CC90 #define MIC_X100_SBOX_SDBIC0_DBREQ_BIT 0x80000000 #define MIC_X100_SBOX_RDMASR0 0x0000B180 +#define MIC_X100_SBOX_APICICR0 0x0000A9D0 #define MIC_X100_MAX_DOORBELL_IDX 8 diff --git a/kernel/drivers/misc/mic/common/mic_dev.h b/kernel/drivers/misc/mic/common/mic_dev.h index 92999c2bb..50776772e 100644 --- a/kernel/drivers/misc/mic/common/mic_dev.h +++ b/kernel/drivers/misc/mic/common/mic_dev.h @@ -21,6 +21,19 @@ #ifndef __MIC_DEV_H__
#define __MIC_DEV_H__ +/* The maximum number of MIC devices supported in a single host system. */ +#define MIC_MAX_NUM_DEVS 128 + +/** + * enum mic_hw_family - The hardware family to which a device belongs. + */ +enum mic_hw_family { + MIC_FAMILY_X100 = 0, + MIC_FAMILY_X200, + MIC_FAMILY_UNKNOWN, + MIC_FAMILY_LAST +}; + /** * struct mic_mw - MIC memory window * @@ -48,4 +61,7 @@ struct mic_mw { #define MIC_VIRTIO_PARAM_DEV_REMOVE 0x1 #define MIC_VIRTIO_PARAM_CONFIG_CHANGED 0x2 +/* Maximum number of DMA channels */ +#define MIC_MAX_DMA_CHAN 4 + #endif diff --git a/kernel/drivers/misc/mic/cosm/Makefile b/kernel/drivers/misc/mic/cosm/Makefile new file mode 100644 index 000000000..b85d4d49d --- /dev/null +++ b/kernel/drivers/misc/mic/cosm/Makefile @@ -0,0 +1,10 @@ +# +# Makefile - Intel MIC Coprocessor State Management (COSM) Driver +# Copyright(c) 2015, Intel Corporation. +# +obj-$(CONFIG_MIC_COSM) += mic_cosm.o + +mic_cosm-objs := cosm_main.o +mic_cosm-objs += cosm_debugfs.o +mic_cosm-objs += cosm_sysfs.o +mic_cosm-objs += cosm_scif_server.o diff --git a/kernel/drivers/misc/mic/cosm/cosm_debugfs.c b/kernel/drivers/misc/mic/cosm/cosm_debugfs.c new file mode 100644 index 000000000..216cb3cd2 --- /dev/null +++ b/kernel/drivers/misc/mic/cosm/cosm_debugfs.c @@ -0,0 +1,156 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + * Intel MIC Coprocessor State Management (COSM) Driver + * + */ + +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/io.h> +#include "cosm_main.h" + +/* Debugfs parent dir */ +static struct dentry *cosm_dbg; + +/** + * cosm_log_buf_show - Display MIC kernel log buffer + * + * log_buf addr/len is read from System.map by user space + * and populated in sysfs entries. + */ +static int cosm_log_buf_show(struct seq_file *s, void *unused) +{ + void __iomem *log_buf_va; + int __iomem *log_buf_len_va; + struct cosm_device *cdev = s->private; + void *kva; + int size; + u64 aper_offset; + + if (!cdev || !cdev->log_buf_addr || !cdev->log_buf_len) + goto done; + + mutex_lock(&cdev->cosm_mutex); + switch (cdev->state) { + case MIC_BOOTING: + case MIC_ONLINE: + case MIC_SHUTTING_DOWN: + break; + default: + goto unlock; + } + + /* + * Card kernel will never be relocated and any kernel text/data mapping + * can be translated to phys address by subtracting __START_KERNEL_map. + */ + aper_offset = (u64)cdev->log_buf_len - __START_KERNEL_map; + log_buf_len_va = cdev->hw_ops->aper(cdev)->va + aper_offset; + aper_offset = (u64)cdev->log_buf_addr - __START_KERNEL_map; + log_buf_va = cdev->hw_ops->aper(cdev)->va + aper_offset; + + size = ioread32(log_buf_len_va); + kva = kmalloc(size, GFP_KERNEL); + if (!kva) + goto unlock; + + memcpy_fromio(kva, log_buf_va, size); + seq_write(s, kva, size); + kfree(kva); +unlock: + mutex_unlock(&cdev->cosm_mutex); +done: + return 0; +} + +static int cosm_log_buf_open(struct inode *inode, struct file *file) +{ + return single_open(file, cosm_log_buf_show, inode->i_private); +} + +static const struct file_operations log_buf_ops = { + .owner = THIS_MODULE, + .open = cosm_log_buf_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +/** + * cosm_force_reset_show - Force MIC reset + * + * Invokes the force_reset COSM bus op instead of the standard reset + * op in case a force reset of the MIC device is 
required + */ +static int cosm_force_reset_show(struct seq_file *s, void *pos) +{ + struct cosm_device *cdev = s->private; + + cosm_stop(cdev, true); + return 0; +} + +static int cosm_force_reset_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, cosm_force_reset_show, inode->i_private); +} + +static const struct file_operations force_reset_ops = { + .owner = THIS_MODULE, + .open = cosm_force_reset_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +void cosm_create_debug_dir(struct cosm_device *cdev) +{ + char name[16]; + + if (!cosm_dbg) + return; + + scnprintf(name, sizeof(name), "mic%d", cdev->index); + cdev->dbg_dir = debugfs_create_dir(name, cosm_dbg); + if (!cdev->dbg_dir) + return; + + debugfs_create_file("log_buf", 0444, cdev->dbg_dir, cdev, &log_buf_ops); + debugfs_create_file("force_reset", 0444, cdev->dbg_dir, cdev, + &force_reset_ops); +} + +void cosm_delete_debug_dir(struct cosm_device *cdev) +{ + if (!cdev->dbg_dir) + return; + + debugfs_remove_recursive(cdev->dbg_dir); +} + +void cosm_init_debugfs(void) +{ + cosm_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (!cosm_dbg) + pr_err("can't create debugfs dir\n"); +} + +void cosm_exit_debugfs(void) +{ + debugfs_remove(cosm_dbg); +} diff --git a/kernel/drivers/misc/mic/cosm/cosm_main.c b/kernel/drivers/misc/mic/cosm/cosm_main.c new file mode 100644 index 000000000..4b4b356c7 --- /dev/null +++ b/kernel/drivers/misc/mic/cosm/cosm_main.c @@ -0,0 +1,388 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Intel MIC Coprocessor State Management (COSM) Driver + * + */ + +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/idr.h> +#include <linux/slab.h> +#include <linux/cred.h> +#include "cosm_main.h" + +static const char cosm_driver_name[] = "mic"; + +/* COSM ID allocator */ +static struct ida g_cosm_ida; +/* Class of MIC devices for sysfs accessibility. */ +static struct class *g_cosm_class; +/* Number of MIC devices */ +static atomic_t g_num_dev; + +/** + * cosm_hw_reset - Issue a HW reset for the MIC device + * @cdev: pointer to cosm_device instance + */ +static void cosm_hw_reset(struct cosm_device *cdev, bool force) +{ + int i; + +#define MIC_RESET_TO (45) + if (force && cdev->hw_ops->force_reset) + cdev->hw_ops->force_reset(cdev); + else + cdev->hw_ops->reset(cdev); + + for (i = 0; i < MIC_RESET_TO; i++) { + if (cdev->hw_ops->ready(cdev)) { + cosm_set_state(cdev, MIC_READY); + return; + } + /* + * Resets typically take 10s of seconds to complete. + * Since an MMIO read is required to check if the + * firmware is ready or not, a 1 second delay works nicely. + */ + msleep(1000); + } + cosm_set_state(cdev, MIC_RESET_FAILED); +} + +/** + * cosm_start - Start the MIC + * @cdev: pointer to cosm_device instance + * + * This function prepares an MIC for boot and initiates boot. + * RETURNS: An appropriate -ERRNO error value on error, or 0 for success. 
+ */ +int cosm_start(struct cosm_device *cdev) +{ + const struct cred *orig_cred; + struct cred *override_cred; + int rc; + + mutex_lock(&cdev->cosm_mutex); + if (!cdev->bootmode) { + dev_err(&cdev->dev, "%s %d bootmode not set\n", + __func__, __LINE__); + rc = -EINVAL; + goto unlock_ret; + } +retry: + if (cdev->state != MIC_READY) { + dev_err(&cdev->dev, "%s %d MIC state not READY\n", + __func__, __LINE__); + rc = -EINVAL; + goto unlock_ret; + } + if (!cdev->hw_ops->ready(cdev)) { + cosm_hw_reset(cdev, false); + /* + * The state will either be MIC_READY if the reset succeeded + * or MIC_RESET_FAILED if the firmware reset failed. + */ + goto retry; + } + + /* + * Set credentials to root to allow non-root user to download initramfs + * with 600 permissions + */ + override_cred = prepare_creds(); + if (!override_cred) { + dev_err(&cdev->dev, "%s %d prepare_creds failed\n", + __func__, __LINE__); + rc = -ENOMEM; + goto unlock_ret; + } + override_cred->fsuid = GLOBAL_ROOT_UID; + orig_cred = override_creds(override_cred); + + rc = cdev->hw_ops->start(cdev, cdev->index); + + revert_creds(orig_cred); + put_cred(override_cred); + if (rc) + goto unlock_ret; + + /* + * If linux is being booted, card is treated 'online' only + * when the scif interface in the card is up. If anything else + * is booted, we set card to 'online' immediately. + */ + if (!strcmp(cdev->bootmode, "linux")) + cosm_set_state(cdev, MIC_BOOTING); + else + cosm_set_state(cdev, MIC_ONLINE); +unlock_ret: + mutex_unlock(&cdev->cosm_mutex); + if (rc) + dev_err(&cdev->dev, "cosm_start failed rc %d\n", rc); + return rc; +} + +/** + * cosm_stop - Prepare the MIC for reset and trigger reset + * @cdev: pointer to cosm_device instance + * @force: force a MIC to reset even if it is already reset and ready.
+ * + * RETURNS: None + */ +void cosm_stop(struct cosm_device *cdev, bool force) +{ + mutex_lock(&cdev->cosm_mutex); + if (cdev->state != MIC_READY || force) { + /* + * Don't call hw_ops if they have been called previously. + * stop(..) calls device_unregister and will crash the system if + * called multiple times. + */ + bool call_hw_ops = cdev->state != MIC_RESET_FAILED && + cdev->state != MIC_READY; + + if (cdev->state != MIC_RESETTING) + cosm_set_state(cdev, MIC_RESETTING); + cdev->heartbeat_watchdog_enable = false; + if (call_hw_ops) + cdev->hw_ops->stop(cdev, force); + cosm_hw_reset(cdev, force); + cosm_set_shutdown_status(cdev, MIC_NOP); + if (call_hw_ops && cdev->hw_ops->post_reset) + cdev->hw_ops->post_reset(cdev, cdev->state); + } + mutex_unlock(&cdev->cosm_mutex); + flush_work(&cdev->scif_work); +} + +/** + * cosm_reset_trigger_work - Trigger MIC reset + * @work: The work structure + * + * This work is scheduled whenever the host wants to reset the MIC. + */ +static void cosm_reset_trigger_work(struct work_struct *work) +{ + struct cosm_device *cdev = container_of(work, struct cosm_device, + reset_trigger_work); + cosm_stop(cdev, false); +} + +/** + * cosm_reset - Schedule MIC reset + * @cdev: pointer to cosm_device instance + * + * RETURNS: An -EINVAL if the card is already READY or 0 for success. + */ +int cosm_reset(struct cosm_device *cdev) +{ + int rc = 0; + + mutex_lock(&cdev->cosm_mutex); + if (cdev->state != MIC_READY) { + cosm_set_state(cdev, MIC_RESETTING); + schedule_work(&cdev->reset_trigger_work); + } else { + dev_err(&cdev->dev, "%s %d MIC is READY\n", __func__, __LINE__); + rc = -EINVAL; + } + mutex_unlock(&cdev->cosm_mutex); + return rc; +} + +/** + * cosm_shutdown - Initiate MIC shutdown. 
+ * @cdev: pointer to cosm_device instance + * + * RETURNS: An appropriate -ERRNO error value on error, or 0 for success. + */ +int cosm_shutdown(struct cosm_device *cdev) +{ + struct cosm_msg msg = { .id = COSM_MSG_SHUTDOWN }; + int rc = 0; + + mutex_lock(&cdev->cosm_mutex); + if (cdev->state != MIC_ONLINE) { + rc = -EINVAL; + dev_err(&cdev->dev, "%s %d skipping shutdown in state: %s\n", + __func__, __LINE__, cosm_state_string[cdev->state]); + goto err; + } + + if (!cdev->epd) { + rc = -ENOTCONN; + dev_err(&cdev->dev, "%s %d scif endpoint not connected rc %d\n", + __func__, __LINE__, rc); + goto err; + } + + rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); + if (rc < 0) { + dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n", + __func__, __LINE__, rc); + goto err; + } + cdev->heartbeat_watchdog_enable = false; + cosm_set_state(cdev, MIC_SHUTTING_DOWN); + rc = 0; +err: + mutex_unlock(&cdev->cosm_mutex); + return rc; +} + +static int cosm_driver_probe(struct cosm_device *cdev) +{ + int rc; + + /* Initialize SCIF server at first probe */ + if (atomic_add_return(1, &g_num_dev) == 1) { + rc = cosm_scif_init(); + if (rc) + goto scif_exit; + } + mutex_init(&cdev->cosm_mutex); + INIT_WORK(&cdev->reset_trigger_work, cosm_reset_trigger_work); + INIT_WORK(&cdev->scif_work, cosm_scif_work); + cdev->sysfs_heartbeat_enable = true; + cosm_sysfs_init(cdev); + cdev->sdev = device_create_with_groups(g_cosm_class, cdev->dev.parent, + MKDEV(0, cdev->index), cdev, cdev->attr_group, + "mic%d", cdev->index); + if (IS_ERR(cdev->sdev)) { + rc = PTR_ERR(cdev->sdev); + dev_err(&cdev->dev, "device_create_with_groups failed rc %d\n", + rc); + goto scif_exit; + } + + cdev->state_sysfs = sysfs_get_dirent(cdev->sdev->kobj.sd, + "state"); + if (!cdev->state_sysfs) { + rc = -ENODEV; + dev_err(&cdev->dev, "sysfs_get_dirent failed rc %d\n", rc); + goto destroy_device; + } + cosm_create_debug_dir(cdev); + return 0; +destroy_device: + device_destroy(g_cosm_class, MKDEV(0, cdev->index)); +scif_exit: + if (atomic_dec_and_test(&g_num_dev)) +
cosm_scif_exit(); + return rc; +} + +static void cosm_driver_remove(struct cosm_device *cdev) +{ + cosm_delete_debug_dir(cdev); + sysfs_put(cdev->state_sysfs); + device_destroy(g_cosm_class, MKDEV(0, cdev->index)); + flush_work(&cdev->reset_trigger_work); + cosm_stop(cdev, false); + if (atomic_dec_and_test(&g_num_dev)) + cosm_scif_exit(); + + /* These sysfs entries might have been allocated */ + kfree(cdev->cmdline); + kfree(cdev->firmware); + kfree(cdev->ramdisk); + kfree(cdev->bootmode); +} + +static int cosm_suspend(struct device *dev) +{ + struct cosm_device *cdev = dev_to_cosm(dev); + + mutex_lock(&cdev->cosm_mutex); + switch (cdev->state) { + /** + * Suspend/freeze hooks in userspace have already shutdown the card. + * Card should be 'ready' in most cases. It is however possible that + * some userspace application initiated a boot. In those cases, we + * simply reset the card. + */ + case MIC_ONLINE: + case MIC_BOOTING: + case MIC_SHUTTING_DOWN: + mutex_unlock(&cdev->cosm_mutex); + cosm_stop(cdev, false); + break; + default: + mutex_unlock(&cdev->cosm_mutex); + break; + } + return 0; +} + +static const struct dev_pm_ops cosm_pm_ops = { + .suspend = cosm_suspend, + .freeze = cosm_suspend +}; + +static struct cosm_driver cosm_driver = { + .driver = { + .name = KBUILD_MODNAME, + .owner = THIS_MODULE, + .pm = &cosm_pm_ops, + }, + .probe = cosm_driver_probe, + .remove = cosm_driver_remove +}; + +static int __init cosm_init(void) +{ + int ret; + + cosm_init_debugfs(); + + g_cosm_class = class_create(THIS_MODULE, cosm_driver_name); + if (IS_ERR(g_cosm_class)) { + ret = PTR_ERR(g_cosm_class); + pr_err("class_create failed ret %d\n", ret); + goto cleanup_debugfs; + } + + ida_init(&g_cosm_ida); + ret = cosm_register_driver(&cosm_driver); + if (ret) { + pr_err("cosm_register_driver failed ret %d\n", ret); + goto ida_destroy; + } + return 0; +ida_destroy: + ida_destroy(&g_cosm_ida); + class_destroy(g_cosm_class); +cleanup_debugfs: + cosm_exit_debugfs(); + return ret; +} +
+static void __exit cosm_exit(void) +{ + cosm_unregister_driver(&cosm_driver); + ida_destroy(&g_cosm_ida); + class_destroy(g_cosm_class); + cosm_exit_debugfs(); +} + +module_init(cosm_init); +module_exit(cosm_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) MIC Coprocessor State Management (COSM) Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/misc/mic/cosm/cosm_main.h b/kernel/drivers/misc/mic/cosm/cosm_main.h new file mode 100644 index 000000000..f01156fca --- /dev/null +++ b/kernel/drivers/misc/mic/cosm/cosm_main.h @@ -0,0 +1,70 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + * Intel MIC Coprocessor State Management (COSM) Driver + * + */ +#ifndef _COSM_COSM_H_ +#define _COSM_COSM_H_ + +#include <linux/scif.h> +#include "../bus/cosm_bus.h" + +#define COSM_HEARTBEAT_SEND_SEC 30 +#define SCIF_COSM_LISTEN_PORT 201 + +/** + * enum COSM msg id's + * @COSM_MSG_SHUTDOWN: host->card trigger shutdown + * @COSM_MSG_SYNC_TIME: host->card send host time to card to sync time + * @COSM_MSG_HEARTBEAT: card->host heartbeat + * @COSM_MSG_SHUTDOWN_STATUS: card->host with shutdown status as payload + */ +enum cosm_msg_id { + COSM_MSG_SHUTDOWN, + COSM_MSG_SYNC_TIME, + COSM_MSG_HEARTBEAT, + COSM_MSG_SHUTDOWN_STATUS, +}; + +struct cosm_msg { + u64 id; + union { + u64 shutdown_status; + struct timespec64 timespec; + }; +}; + +extern const char * const cosm_state_string[]; +extern const char * const cosm_shutdown_status_string[]; + +void cosm_sysfs_init(struct cosm_device *cdev); +int cosm_start(struct cosm_device *cdev); +void cosm_stop(struct cosm_device *cdev, bool force); +int cosm_reset(struct cosm_device *cdev); +int cosm_shutdown(struct cosm_device *cdev); +void cosm_set_state(struct cosm_device *cdev, u8 state); +void cosm_set_shutdown_status(struct cosm_device *cdev, u8 status); +void cosm_init_debugfs(void); +void cosm_exit_debugfs(void); +void cosm_create_debug_dir(struct cosm_device *cdev); +void cosm_delete_debug_dir(struct cosm_device *cdev); +int cosm_scif_init(void); +void cosm_scif_exit(void); +void cosm_scif_work(struct work_struct *work); + +#endif diff --git a/kernel/drivers/misc/mic/cosm/cosm_scif_server.c b/kernel/drivers/misc/mic/cosm/cosm_scif_server.c new file mode 100644 index 000000000..5696df432 --- /dev/null +++ b/kernel/drivers/misc/mic/cosm/cosm_scif_server.c @@ -0,0 +1,405 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Intel MIC Coprocessor State Management (COSM) Driver + * + */ +#include <linux/kthread.h> +#include "cosm_main.h" + +/* + * The COSM driver uses SCIF to communicate between the management node and the + * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b) + * receive a shutdown status back from the card upon completion of shutdown and + * (c) receive periodic heartbeat messages from the card used to deduce if the + * card has crashed. + * + * A COSM server consisting of a SCIF listening endpoint waits for incoming + * connections from the card. Upon acceptance of the connection, a separate + * work-item is scheduled to handle SCIF message processing for that card. The + * life-time of this work-item is therefore the time from which the connection + * from a card is accepted to the time at which the connection is closed. A new + * work-item starts each time the card boots and is alive till the card (a) + * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is + * unloaded. + * + * From the point of view of COSM interactions with SCIF during card + * shutdown, reset and crash are as follows: + * + * Card shutdown + * ------------- + * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN + * message from the host. + * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting + * in scif_remove(..) 
getting called on the card + * 3. scif_remove -> scif_stop -> scif_handle_remove_node -> + * scif_peer_unregister_device -> device_unregister for the host peer device + * 4. During device_unregister remove(..) method of cosm_client is invoked which + * closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT + * message being sent to host SCIF. SCIF_DISCNCT message processing on the + * host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes + * up the host COSM thread blocked in scif_poll(..) resulting in + * scif_poll(..) returning POLLHUP. + * 5. On the card, scif_peer_release_dev is next called which results in an + * SCIF_EXIT message being sent to the host and after receiving the + * SCIF_EXIT_ACK from the host the peer device teardown on the card is + * complete. + * 6. As part of the SCIF_EXIT message processing on the host, host sends a + * SCIF_REMOVE_NODE to itself corresponding to the card being removed. This + * starts a similar SCIF peer device teardown sequence on the host + * corresponding to the card being shut down. + * + * Card reset + * ---------- + * The case of interest here is when the card has not been previously shut down + * since most of the steps below are skipped in that case: + + * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver + * which unregisters the SCIF HW device resulting in scif_remove(..) being + * called on the host. + * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a + * SCIF_EXIT message being sent to the card. + * 3. The card executes scif_stop() as part of SCIF_EXIT message + * processing. This results in the COSM endpoint on the card being closed and + * the SCIF host peer device on the card getting unregistered similar to + * steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the + * host returns POLLHUP as a result. + * 4. On the host, card peer device unregister and SCIF HW remove(..) also + * subsequently complete. 
+ * + * Card crash + * ---------- + * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT + * message from the card which would result in scif_poll(..) returning + * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE + * message to itself resulting in the card SCIF peer device being unregistered, + * this results in a scif_peer_release_dev -> scif_cleanup_scifdev -> + * scif_invalidate_ep call sequence which sets the endpoint state to + * DISCONNECTED and results in scif_poll(..) returning POLLHUP. + */ + +#define COSM_SCIF_BACKLOG 16 +#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10 +#define COSM_HEARTBEAT_TIMEOUT_SEC \ + (COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC) +#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC) + +static struct task_struct *server_thread; +static scif_epd_t listen_epd; + +/* Publish MIC card's shutdown status to user space MIC daemon */ +static void cosm_update_mic_status(struct cosm_device *cdev) +{ + if (cdev->shutdown_status_int != MIC_NOP) { + cosm_set_shutdown_status(cdev, cdev->shutdown_status_int); + cdev->shutdown_status_int = MIC_NOP; + } +} + +/* Store MIC card's shutdown status internally when it is received */ +static void cosm_shutdown_status_int(struct cosm_device *cdev, + enum mic_status shutdown_status) +{ + switch (shutdown_status) { + case MIC_HALTED: + case MIC_POWER_OFF: + case MIC_RESTART: + case MIC_CRASHED: + break; + default: + dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n", + __func__, __LINE__, shutdown_status); + return; + }; + cdev->shutdown_status_int = shutdown_status; + cdev->heartbeat_watchdog_enable = false; + + if (cdev->state != MIC_SHUTTING_DOWN) + cosm_set_state(cdev, MIC_SHUTTING_DOWN); +} + +/* Non-blocking recv.
Read and process all available messages */ +static void cosm_scif_recv(struct cosm_device *cdev) +{ + struct cosm_msg msg; + int rc; + + while (1) { + rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0); + if (!rc) { + break; + } else if (rc < 0) { + dev_dbg(&cdev->dev, "%s: %d rc %d\n", + __func__, __LINE__, rc); + break; + } + dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n", + __func__, __LINE__, rc, msg.id); + + switch (msg.id) { + case COSM_MSG_SHUTDOWN_STATUS: + cosm_shutdown_status_int(cdev, msg.shutdown_status); + break; + case COSM_MSG_HEARTBEAT: + /* Nothing to do, heartbeat only unblocks scif_poll */ + break; + default: + dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n", + __func__, __LINE__, msg.id); + break; + } + } +} + +/* Publish crashed status for this MIC card */ +static void cosm_set_crashed(struct cosm_device *cdev) +{ + dev_err(&cdev->dev, "node alive timeout\n"); + cosm_shutdown_status_int(cdev, MIC_CRASHED); + cosm_update_mic_status(cdev); +} + +/* Send host time to the MIC card to sync system time between host and MIC */ +static void cosm_send_time(struct cosm_device *cdev) +{ + struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME }; + int rc; + + getnstimeofday64(&msg.timespec); + rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); + if (rc < 0) + dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n", + __func__, __LINE__, rc); +} + +/* + * Close this cosm_device's endpoint after its peer endpoint on the card has + * been closed. In all cases except MIC card crash POLLHUP on the host is + * triggered by the client's endpoint being closed. + */ +static void cosm_scif_close(struct cosm_device *cdev) +{ + /* + * Because SHUTDOWN_STATUS message is sent by the MIC cards in the + * reboot notifier when shutdown is still not complete, we notify mpssd + * to reset the card when SCIF endpoint is closed. 
+ */ + cosm_update_mic_status(cdev); + scif_close(cdev->epd); + cdev->epd = NULL; + dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); +} + +/* + * Set card state to ONLINE when a new SCIF connection from a MIC card is + * received. Normally the state is BOOTING when the connection comes in, but can + * be ONLINE if cosm_client driver on the card was unloaded and then reloaded. + */ +static int cosm_set_online(struct cosm_device *cdev) +{ + int rc = 0; + + if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) { + cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable; + cdev->epd = cdev->newepd; + if (cdev->state == MIC_BOOTING) + cosm_set_state(cdev, MIC_ONLINE); + cosm_send_time(cdev); + dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__); + } else { + dev_warn(&cdev->dev, "%s %d not going online in state: %s\n", + __func__, __LINE__, cosm_state_string[cdev->state]); + rc = -EINVAL; + } + /* Drop reference acquired by bus_find_device in the server thread */ + put_device(&cdev->dev); + return rc; +} + +/* + * Work function for handling work for a SCIF connection from a particular MIC + * card. It first sets the card state to ONLINE and then calls scif_poll to + * block on activity such as incoming messages on the SCIF endpoint. When the + * endpoint is closed, the work function exits, completing its life cycle, from + * MIC card boot to card shutdown/reset/crash. + */ +void cosm_scif_work(struct work_struct *work) +{ + struct cosm_device *cdev = container_of(work, struct cosm_device, + scif_work); + struct scif_pollepd pollepd; + int rc; + + mutex_lock(&cdev->cosm_mutex); + if (cosm_set_online(cdev)) + goto exit; + + while (1) { + pollepd.epd = cdev->epd; + pollepd.events = POLLIN; + + /* Drop the mutex before blocking in scif_poll(..) */ + mutex_unlock(&cdev->cosm_mutex); + /* poll(..) 
with timeout on our endpoint */ + rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC); + mutex_lock(&cdev->cosm_mutex); + if (rc < 0) { + dev_err(&cdev->dev, "%s %d scif_poll rc %d\n", + __func__, __LINE__, rc); + continue; + } + + /* There is a message from the card */ + if (pollepd.revents & POLLIN) + cosm_scif_recv(cdev); + + /* The peer endpoint is closed or this endpoint disconnected */ + if (pollepd.revents & POLLHUP) { + cosm_scif_close(cdev); + break; + } + + /* Did we timeout from poll? */ + if (!rc && cdev->heartbeat_watchdog_enable) + cosm_set_crashed(cdev); + } +exit: + dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__); + mutex_unlock(&cdev->cosm_mutex); +} + +/* + * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC + * cards, finds the correct cosm_device to associate that connection with and + * schedules individual work items for each MIC card. + */ +static int cosm_scif_server(void *unused) +{ + struct cosm_device *cdev; + scif_epd_t newepd; + struct scif_port_id port_id; + int rc; + + allow_signal(SIGKILL); + + while (!kthread_should_stop()) { + rc = scif_accept(listen_epd, &port_id, &newepd, + SCIF_ACCEPT_SYNC); + if (rc < 0) { + if (-ERESTARTSYS != rc) + pr_err("%s %d rc %d\n", __func__, __LINE__, rc); + continue; + } + + /* + * Associate the incoming connection with a particular + * cosm_device, COSM device ID == SCIF node ID - 1 + */ + cdev = cosm_find_cdev_by_id(port_id.node - 1); + if (!cdev) + continue; + cdev->newepd = newepd; + schedule_work(&cdev->scif_work); + } + + pr_debug("%s %d Server thread stopped\n", __func__, __LINE__); + return 0; +} + +static int cosm_scif_listen(void) +{ + int rc; + + listen_epd = scif_open(); + if (!listen_epd) { + pr_err("%s %d scif_open failed\n", __func__, __LINE__); + return -ENOMEM; + } + + rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT); + if (rc < 0) { + pr_err("%s %d scif_bind failed rc %d\n", + __func__, __LINE__, rc); + goto err; + } + + rc = 
scif_listen(listen_epd, COSM_SCIF_BACKLOG); + if (rc < 0) { + pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc); + goto err; + } + pr_debug("%s %d listen_epd set up\n", __func__, __LINE__); + return 0; +err: + scif_close(listen_epd); + listen_epd = NULL; + return rc; +} + +static void cosm_scif_listen_exit(void) +{ + pr_debug("%s %d closing listen_epd\n", __func__, __LINE__); + if (listen_epd) { + scif_close(listen_epd); + listen_epd = NULL; + } +} + +/* + * Create a listening SCIF endpoint and a server kthread which accepts incoming + * SCIF connections from MIC cards + */ +int cosm_scif_init(void) +{ + int rc = cosm_scif_listen(); + + if (rc) { + pr_err("%s %d cosm_scif_listen rc %d\n", + __func__, __LINE__, rc); + goto err; + } + + server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server"); + if (IS_ERR(server_thread)) { + rc = PTR_ERR(server_thread); + pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc); + goto listen_exit; + } + return 0; +listen_exit: + cosm_scif_listen_exit(); +err: + return rc; +} + +/* Stop the running server thread and close the listening SCIF endpoint */ +void cosm_scif_exit(void) +{ + int rc; + + if (!IS_ERR_OR_NULL(server_thread)) { + rc = send_sig(SIGKILL, server_thread, 0); + if (rc) { + pr_err("%s %d send_sig rc %d\n", + __func__, __LINE__, rc); + return; + } + kthread_stop(server_thread); + } + + cosm_scif_listen_exit(); +} diff --git a/kernel/drivers/misc/mic/cosm/cosm_sysfs.c b/kernel/drivers/misc/mic/cosm/cosm_sysfs.c new file mode 100644 index 000000000..29d6863b6 --- /dev/null +++ b/kernel/drivers/misc/mic/cosm/cosm_sysfs.c @@ -0,0 +1,461 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + * Intel MIC Coprocessor State Management (COSM) Driver + * + */ +#include <linux/slab.h> +#include "cosm_main.h" + +/* + * A state-to-string lookup table, for exposing a human readable state + * via sysfs. Always keep in sync with enum cosm_states + */ +const char * const cosm_state_string[] = { + [MIC_READY] = "ready", + [MIC_BOOTING] = "booting", + [MIC_ONLINE] = "online", + [MIC_SHUTTING_DOWN] = "shutting_down", + [MIC_RESETTING] = "resetting", + [MIC_RESET_FAILED] = "reset_failed", +}; + +/* + * A shutdown-status-to-string lookup table, for exposing a human + * readable state via sysfs. Always keep in sync with enum cosm_shutdown_status + */ +const char * const cosm_shutdown_status_string[] = { + [MIC_NOP] = "nop", + [MIC_CRASHED] = "crashed", + [MIC_HALTED] = "halted", + [MIC_POWER_OFF] = "poweroff", + [MIC_RESTART] = "restart", +}; + +void cosm_set_shutdown_status(struct cosm_device *cdev, u8 shutdown_status) +{ + dev_dbg(&cdev->dev, "Shutdown Status %s -> %s\n", + cosm_shutdown_status_string[cdev->shutdown_status], + cosm_shutdown_status_string[shutdown_status]); + cdev->shutdown_status = shutdown_status; +} + +void cosm_set_state(struct cosm_device *cdev, u8 state) +{ + dev_dbg(&cdev->dev, "State %s -> %s\n", + cosm_state_string[cdev->state], + cosm_state_string[state]); + cdev->state = state; + sysfs_notify_dirent(cdev->state_sysfs); +} + +static ssize_t +family_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + return cdev->hw_ops->family(cdev, buf); +} +static DEVICE_ATTR_RO(family); + 
+static ssize_t +stepping_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + return cdev->hw_ops->stepping(cdev, buf); +} +static DEVICE_ATTR_RO(stepping); + +static ssize_t +state_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev || cdev->state >= MIC_LAST) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%s\n", + cosm_state_string[cdev->state]); +} + +static ssize_t +state_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + int rc; + + if (!cdev) + return -EINVAL; + + if (sysfs_streq(buf, "boot")) { + rc = cosm_start(cdev); + goto done; + } + if (sysfs_streq(buf, "reset")) { + rc = cosm_reset(cdev); + goto done; + } + + if (sysfs_streq(buf, "shutdown")) { + rc = cosm_shutdown(cdev); + goto done; + } + rc = -EINVAL; +done: + if (rc) + count = rc; + return count; +} +static DEVICE_ATTR_RW(state); + +static ssize_t shutdown_status_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev || cdev->shutdown_status >= MIC_STATUS_LAST) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%s\n", + cosm_shutdown_status_string[cdev->shutdown_status]); +} +static DEVICE_ATTR_RO(shutdown_status); + +static ssize_t +heartbeat_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%d\n", cdev->sysfs_heartbeat_enable); +} + +static ssize_t +heartbeat_enable_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + int enable; + int ret; + + if (!cdev) + return -EINVAL; + + 
mutex_lock(&cdev->cosm_mutex); + ret = kstrtoint(buf, 10, &enable); + if (ret) + goto unlock; + + cdev->sysfs_heartbeat_enable = enable; + /* if state is not online, cdev->heartbeat_watchdog_enable is 0 */ + if (cdev->state == MIC_ONLINE) + cdev->heartbeat_watchdog_enable = enable; + ret = count; +unlock: + mutex_unlock(&cdev->cosm_mutex); + return ret; +} +static DEVICE_ATTR_RW(heartbeat_enable); + +static ssize_t +cmdline_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + char *cmdline; + + if (!cdev) + return -EINVAL; + + cmdline = cdev->cmdline; + + if (cmdline) + return scnprintf(buf, PAGE_SIZE, "%s\n", cmdline); + return 0; +} + +static ssize_t +cmdline_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + mutex_lock(&cdev->cosm_mutex); + kfree(cdev->cmdline); + + cdev->cmdline = kmalloc(count + 1, GFP_KERNEL); + if (!cdev->cmdline) { + count = -ENOMEM; + goto unlock; + } + + strncpy(cdev->cmdline, buf, count); + + if (cdev->cmdline[count - 1] == '\n') + cdev->cmdline[count - 1] = '\0'; + else + cdev->cmdline[count] = '\0'; +unlock: + mutex_unlock(&cdev->cosm_mutex); + return count; +} +static DEVICE_ATTR_RW(cmdline); + +static ssize_t +firmware_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + char *firmware; + + if (!cdev) + return -EINVAL; + + firmware = cdev->firmware; + + if (firmware) + return scnprintf(buf, PAGE_SIZE, "%s\n", firmware); + return 0; +} + +static ssize_t +firmware_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + mutex_lock(&cdev->cosm_mutex); + kfree(cdev->firmware); + + cdev->firmware = kmalloc(count + 1, GFP_KERNEL); + if (!cdev->firmware) 
{ + count = -ENOMEM; + goto unlock; + } + strncpy(cdev->firmware, buf, count); + + if (cdev->firmware[count - 1] == '\n') + cdev->firmware[count - 1] = '\0'; + else + cdev->firmware[count] = '\0'; +unlock: + mutex_unlock(&cdev->cosm_mutex); + return count; +} +static DEVICE_ATTR_RW(firmware); + +static ssize_t +ramdisk_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + char *ramdisk; + + if (!cdev) + return -EINVAL; + + ramdisk = cdev->ramdisk; + + if (ramdisk) + return scnprintf(buf, PAGE_SIZE, "%s\n", ramdisk); + return 0; +} + +static ssize_t +ramdisk_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + mutex_lock(&cdev->cosm_mutex); + kfree(cdev->ramdisk); + + cdev->ramdisk = kmalloc(count + 1, GFP_KERNEL); + if (!cdev->ramdisk) { + count = -ENOMEM; + goto unlock; + } + + strncpy(cdev->ramdisk, buf, count); + + if (cdev->ramdisk[count - 1] == '\n') + cdev->ramdisk[count - 1] = '\0'; + else + cdev->ramdisk[count] = '\0'; +unlock: + mutex_unlock(&cdev->cosm_mutex); + return count; +} +static DEVICE_ATTR_RW(ramdisk); + +static ssize_t +bootmode_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + char *bootmode; + + if (!cdev) + return -EINVAL; + + bootmode = cdev->bootmode; + + if (bootmode) + return scnprintf(buf, PAGE_SIZE, "%s\n", bootmode); + return 0; +} + +static ssize_t +bootmode_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + if (!sysfs_streq(buf, "linux") && !sysfs_streq(buf, "flash")) + return -EINVAL; + + mutex_lock(&cdev->cosm_mutex); + kfree(cdev->bootmode); + + cdev->bootmode = kmalloc(count + 1, GFP_KERNEL); + if (!cdev->bootmode) { + count = -ENOMEM; + 
goto unlock; + } + + strncpy(cdev->bootmode, buf, count); + + if (cdev->bootmode[count - 1] == '\n') + cdev->bootmode[count - 1] = '\0'; + else + cdev->bootmode[count] = '\0'; +unlock: + mutex_unlock(&cdev->cosm_mutex); + return count; +} +static DEVICE_ATTR_RW(bootmode); + +static ssize_t +log_buf_addr_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%p\n", cdev->log_buf_addr); +} + +static ssize_t +log_buf_addr_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + int ret; + unsigned long addr; + + if (!cdev) + return -EINVAL; + + ret = kstrtoul(buf, 16, &addr); + if (ret) + goto exit; + + cdev->log_buf_addr = (void *)addr; + ret = count; +exit: + return ret; +} +static DEVICE_ATTR_RW(log_buf_addr); + +static ssize_t +log_buf_len_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + + if (!cdev) + return -EINVAL; + + return scnprintf(buf, PAGE_SIZE, "%p\n", cdev->log_buf_len); +} + +static ssize_t +log_buf_len_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cosm_device *cdev = dev_get_drvdata(dev); + int ret; + unsigned long addr; + + if (!cdev) + return -EINVAL; + + ret = kstrtoul(buf, 16, &addr); + if (ret) + goto exit; + + cdev->log_buf_len = (int *)addr; + ret = count; +exit: + return ret; +} +static DEVICE_ATTR_RW(log_buf_len); + +static struct attribute *cosm_default_attrs[] = { + &dev_attr_family.attr, + &dev_attr_stepping.attr, + &dev_attr_state.attr, + &dev_attr_shutdown_status.attr, + &dev_attr_heartbeat_enable.attr, + &dev_attr_cmdline.attr, + &dev_attr_firmware.attr, + &dev_attr_ramdisk.attr, + &dev_attr_bootmode.attr, + &dev_attr_log_buf_addr.attr, + &dev_attr_log_buf_len.attr, + + NULL +}; + 
+ATTRIBUTE_GROUPS(cosm_default); + +void cosm_sysfs_init(struct cosm_device *cdev) +{ + cdev->attr_group = cosm_default_groups; +} diff --git a/kernel/drivers/misc/mic/cosm_client/Makefile b/kernel/drivers/misc/mic/cosm_client/Makefile new file mode 100644 index 000000000..6f751a519 --- /dev/null +++ b/kernel/drivers/misc/mic/cosm_client/Makefile @@ -0,0 +1,7 @@ +# +# Makefile - Intel MIC COSM Client Driver +# Copyright(c) 2015, Intel Corporation. +# +obj-$(CONFIG_MIC_COSM) += cosm_client.o + +cosm_client-objs += cosm_scif_client.o diff --git a/kernel/drivers/misc/mic/cosm_client/cosm_scif_client.c b/kernel/drivers/misc/mic/cosm_client/cosm_scif_client.c new file mode 100644 index 000000000..03e98bf1a --- /dev/null +++ b/kernel/drivers/misc/mic/cosm_client/cosm_scif_client.c @@ -0,0 +1,275 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". 
+ * + * Intel MIC COSM Client Driver + * + */ +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/reboot.h> +#include <linux/kthread.h> +#include "../cosm/cosm_main.h" + +#define COSM_SCIF_MAX_RETRIES 10 +#define COSM_HEARTBEAT_SEND_MSEC (COSM_HEARTBEAT_SEND_SEC * MSEC_PER_SEC) + +static struct task_struct *client_thread; +static scif_epd_t client_epd; +static struct scif_peer_dev *client_spdev; + +/* + * Reboot notifier: receives shutdown status from the OS and communicates it + * back to the COSM process on the host + */ +static int cosm_reboot_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct cosm_msg msg = { .id = COSM_MSG_SHUTDOWN_STATUS }; + int rc; + + event = (event == SYS_RESTART) ? SYSTEM_RESTART : event; + dev_info(&client_spdev->dev, "%s %d received event %ld\n", + __func__, __LINE__, event); + + msg.shutdown_status = event; + rc = scif_send(client_epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); + if (rc < 0) + dev_err(&client_spdev->dev, "%s %d scif_send rc %d\n", + __func__, __LINE__, rc); + + return NOTIFY_DONE; +} + +static struct notifier_block cosm_reboot = { + .notifier_call = cosm_reboot_event, +}; + +/* Set system time from timespec value received from the host */ +static void cosm_set_time(struct cosm_msg *msg) +{ + int rc = do_settimeofday64(&msg->timespec); + + if (rc) + dev_err(&client_spdev->dev, "%s: %d settimeofday rc %d\n", + __func__, __LINE__, rc); +} + +/* COSM client receive message processing */ +static void cosm_client_recv(void) +{ + struct cosm_msg msg; + int rc; + + while (1) { + rc = scif_recv(client_epd, &msg, sizeof(msg), 0); + if (!rc) { + return; + } else if (rc < 0) { + dev_err(&client_spdev->dev, "%s: %d rc %d\n", + __func__, __LINE__, rc); + return; + } + + dev_dbg(&client_spdev->dev, "%s: %d rc %d id 0x%llx\n", + __func__, __LINE__, rc, msg.id); + + switch (msg.id) { + case COSM_MSG_SYNC_TIME: + cosm_set_time(&msg); + break; + case COSM_MSG_SHUTDOWN: + orderly_poweroff(true); 
+ break; + default: + dev_err(&client_spdev->dev, "%s: %d unknown id %lld\n", + __func__, __LINE__, msg.id); + break; + } + } +} + +/* Initiate connection to the COSM server on the host */ +static int cosm_scif_connect(void) +{ + struct scif_port_id port_id; + int i, rc; + + client_epd = scif_open(); + if (!client_epd) { + dev_err(&client_spdev->dev, "%s %d scif_open failed\n", + __func__, __LINE__); + return -ENOMEM; + } + + port_id.node = 0; + port_id.port = SCIF_COSM_LISTEN_PORT; + + for (i = 0; i < COSM_SCIF_MAX_RETRIES; i++) { + rc = scif_connect(client_epd, &port_id); + if (rc < 0) + msleep(1000); + else + break; + } + + if (rc < 0) { + dev_err(&client_spdev->dev, "%s %d scif_connect rc %d\n", + __func__, __LINE__, rc); + scif_close(client_epd); + client_epd = NULL; + } + return rc < 0 ? rc : 0; +} + +/* Close host SCIF connection */ +static void cosm_scif_connect_exit(void) +{ + if (client_epd) { + scif_close(client_epd); + client_epd = NULL; + } +} + +/* + * COSM SCIF client thread function: waits for messages from the host and sends + * a heartbeat to the host + */ +static int cosm_scif_client(void *unused) +{ + struct cosm_msg msg = { .id = COSM_MSG_HEARTBEAT }; + struct scif_pollepd pollepd; + int rc; + + allow_signal(SIGKILL); + + while (!kthread_should_stop()) { + pollepd.epd = client_epd; + pollepd.events = POLLIN; + + rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_SEND_MSEC); + if (rc < 0) { + if (-EINTR != rc) + dev_err(&client_spdev->dev, + "%s %d scif_poll rc %d\n", + __func__, __LINE__, rc); + continue; + } + + if (pollepd.revents & POLLIN) + cosm_client_recv(); + + msg.id = COSM_MSG_HEARTBEAT; + rc = scif_send(client_epd, &msg, sizeof(msg), SCIF_SEND_BLOCK); + if (rc < 0) + dev_err(&client_spdev->dev, "%s %d scif_send rc %d\n", + __func__, __LINE__, rc); + } + + dev_dbg(&client_spdev->dev, "%s %d Client thread stopped\n", + __func__, __LINE__); + return 0; +} + +static void cosm_scif_probe(struct scif_peer_dev *spdev) +{ + int rc; + + 
dev_dbg(&spdev->dev, "%s %d: dnode %d\n", + __func__, __LINE__, spdev->dnode); + + /* We are only interested in the host with spdev->dnode == 0 */ + if (spdev->dnode) + return; + + client_spdev = spdev; + rc = cosm_scif_connect(); + if (rc) + goto exit; + + rc = register_reboot_notifier(&cosm_reboot); + if (rc) { + dev_err(&spdev->dev, + "reboot notifier registration failed rc %d\n", rc); + goto connect_exit; + } + + client_thread = kthread_run(cosm_scif_client, NULL, "cosm_client"); + if (IS_ERR(client_thread)) { + rc = PTR_ERR(client_thread); + dev_err(&spdev->dev, "%s %d kthread_run rc %d\n", + __func__, __LINE__, rc); + goto unreg_reboot; + } + return; +unreg_reboot: + unregister_reboot_notifier(&cosm_reboot); +connect_exit: + cosm_scif_connect_exit(); +exit: + client_spdev = NULL; +} + +static void cosm_scif_remove(struct scif_peer_dev *spdev) +{ + int rc; + + dev_dbg(&spdev->dev, "%s %d: dnode %d\n", + __func__, __LINE__, spdev->dnode); + + if (spdev->dnode) + return; + + if (!IS_ERR_OR_NULL(client_thread)) { + rc = send_sig(SIGKILL, client_thread, 0); + if (rc) { + pr_err("%s %d send_sig rc %d\n", + __func__, __LINE__, rc); + return; + } + kthread_stop(client_thread); + } + unregister_reboot_notifier(&cosm_reboot); + cosm_scif_connect_exit(); + client_spdev = NULL; +} + +static struct scif_client scif_client_cosm = { + .name = KBUILD_MODNAME, + .probe = cosm_scif_probe, + .remove = cosm_scif_remove, +}; + +static int __init cosm_client_init(void) +{ + int rc = scif_client_register(&scif_client_cosm); + + if (rc) + pr_err("scif_client_register failed rc %d\n", rc); + return rc; +} + +static void __exit cosm_client_exit(void) +{ + scif_client_unregister(&scif_client_cosm); +} + +module_init(cosm_client_init); +module_exit(cosm_client_exit); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) MIC card OS state management client driver"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/misc/mic/host/Makefile 
b/kernel/drivers/misc/mic/host/Makefile index c2197f999..004d3db0f 100644 --- a/kernel/drivers/misc/mic/host/Makefile +++ b/kernel/drivers/misc/mic/host/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_INTEL_MIC_HOST) += mic_host.o mic_host-objs := mic_main.o mic_host-objs += mic_x100.o -mic_host-objs += mic_sysfs.o mic_host-objs += mic_smpt.o mic_host-objs += mic_intr.o mic_host-objs += mic_boot.o diff --git a/kernel/drivers/misc/mic/host/mic_boot.c b/kernel/drivers/misc/mic/host/mic_boot.c index d9fa609da..7845564df 100644 --- a/kernel/drivers/misc/mic/host/mic_boot.c +++ b/kernel/drivers/misc/mic/host/mic_boot.c @@ -21,14 +21,197 @@ #include <linux/delay.h> #include <linux/firmware.h> #include <linux/pci.h> - +#include <linux/kmod.h> #include <linux/mic_common.h> #include <linux/mic_bus.h> +#include "../bus/scif_bus.h" #include "../common/mic_dev.h" #include "mic_device.h" #include "mic_smpt.h" #include "mic_virtio.h" +static inline struct mic_device *scdev_to_mdev(struct scif_hw_dev *scdev) +{ + return dev_get_drvdata(scdev->dev.parent); +} + +static void *__mic_dma_alloc(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp, + struct dma_attrs *attrs) +{ + struct scif_hw_dev *scdev = dev_get_drvdata(dev); + struct mic_device *mdev = scdev_to_mdev(scdev); + dma_addr_t tmp; + void *va = kmalloc(size, gfp); + + if (va) { + tmp = mic_map_single(mdev, va, size); + if (dma_mapping_error(dev, tmp)) { + kfree(va); + va = NULL; + } else { + *dma_handle = tmp; + } + } + return va; +} + +static void __mic_dma_free(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, struct dma_attrs *attrs) +{ + struct scif_hw_dev *scdev = dev_get_drvdata(dev); + struct mic_device *mdev = scdev_to_mdev(scdev); + + mic_unmap_single(mdev, dma_handle, size); + kfree(vaddr); +} + +static dma_addr_t +__mic_dma_map_page(struct device *dev, struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + void *va = 
phys_to_virt(page_to_phys(page)) + offset; + struct scif_hw_dev *scdev = dev_get_drvdata(dev); + struct mic_device *mdev = scdev_to_mdev(scdev); + + return mic_map_single(mdev, va, size); +} + +static void +__mic_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scif_hw_dev *scdev = dev_get_drvdata(dev); + struct mic_device *mdev = scdev_to_mdev(scdev); + + mic_unmap_single(mdev, dma_addr, size); +} + +static int __mic_dma_map_sg(struct device *dev, struct scatterlist *sg, + int nents, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scif_hw_dev *scdev = dev_get_drvdata(dev); + struct mic_device *mdev = scdev_to_mdev(scdev); + struct scatterlist *s; + int i, j, ret; + dma_addr_t da; + + ret = dma_map_sg(&mdev->pdev->dev, sg, nents, dir); + if (ret <= 0) + return 0; + + for_each_sg(sg, s, nents, i) { + da = mic_map(mdev, sg_dma_address(s) + s->offset, s->length); + if (!da) + goto err; + sg_dma_address(s) = da; + } + return nents; +err: + for_each_sg(sg, s, i, j) { + mic_unmap(mdev, sg_dma_address(s), s->length); + sg_dma_address(s) = mic_to_dma_addr(mdev, sg_dma_address(s)); + } + dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir); + return 0; +} + +static void __mic_dma_unmap_sg(struct device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scif_hw_dev *scdev = dev_get_drvdata(dev); + struct mic_device *mdev = scdev_to_mdev(scdev); + struct scatterlist *s; + dma_addr_t da; + int i; + + for_each_sg(sg, s, nents, i) { + da = mic_to_dma_addr(mdev, sg_dma_address(s)); + mic_unmap(mdev, sg_dma_address(s), s->length); + sg_dma_address(s) = da; + } + dma_unmap_sg(&mdev->pdev->dev, sg, nents, dir); +} + +static struct dma_map_ops __mic_dma_ops = { + .alloc = __mic_dma_alloc, + .free = __mic_dma_free, + .map_page = __mic_dma_map_page, + .unmap_page = __mic_dma_unmap_page, + .map_sg = __mic_dma_map_sg, + 
.unmap_sg = __mic_dma_unmap_sg, +}; + +static struct mic_irq * +___mic_request_irq(struct scif_hw_dev *scdev, + irqreturn_t (*func)(int irq, void *data), + const char *name, + void *data, int db) +{ + struct mic_device *mdev = scdev_to_mdev(scdev); + + return mic_request_threaded_irq(mdev, func, NULL, name, data, + db, MIC_INTR_DB); +} + +static void +___mic_free_irq(struct scif_hw_dev *scdev, + struct mic_irq *cookie, void *data) +{ + struct mic_device *mdev = scdev_to_mdev(scdev); + + return mic_free_irq(mdev, cookie, data); +} + +static void ___mic_ack_interrupt(struct scif_hw_dev *scdev, int num) +{ + struct mic_device *mdev = scdev_to_mdev(scdev); + + mdev->ops->intr_workarounds(mdev); +} + +static int ___mic_next_db(struct scif_hw_dev *scdev) +{ + struct mic_device *mdev = scdev_to_mdev(scdev); + + return mic_next_db(mdev); +} + +static void ___mic_send_intr(struct scif_hw_dev *scdev, int db) +{ + struct mic_device *mdev = scdev_to_mdev(scdev); + + mdev->ops->send_intr(mdev, db); +} + +static void __iomem *___mic_ioremap(struct scif_hw_dev *scdev, + phys_addr_t pa, size_t len) +{ + struct mic_device *mdev = scdev_to_mdev(scdev); + + return mdev->aper.va + pa; +} + +static void ___mic_iounmap(struct scif_hw_dev *scdev, void __iomem *va) +{ + /* nothing to do */ +} + +static struct scif_hw_ops scif_hw_ops = { + .request_irq = ___mic_request_irq, + .free_irq = ___mic_free_irq, + .ack_interrupt = ___mic_ack_interrupt, + .next_db = ___mic_next_db, + .send_intr = ___mic_send_intr, + .ioremap = ___mic_ioremap, + .iounmap = ___mic_iounmap, +}; + static inline struct mic_device *mbdev_to_mdev(struct mbus_device *mbdev) { return dev_get_drvdata(mbdev->dev.parent); @@ -87,295 +270,213 @@ static struct mbus_hw_ops mbus_hw_ops = { .ack_interrupt = _mic_ack_interrupt, }; -/** - * mic_reset - Reset the MIC device. 
- * @mdev: pointer to mic_device instance - */ -static void mic_reset(struct mic_device *mdev) +/* Initialize the MIC bootparams */ +void mic_bootparam_init(struct mic_device *mdev) { - int i; + struct mic_bootparam *bootparam = mdev->dp; + + bootparam->magic = cpu_to_le32(MIC_MAGIC); + bootparam->h2c_config_db = -1; + bootparam->node_id = mdev->id + 1; + bootparam->scif_host_dma_addr = 0x0; + bootparam->scif_card_dma_addr = 0x0; + bootparam->c2h_scif_db = -1; + bootparam->h2c_scif_db = -1; +} + +static inline struct mic_device *cosmdev_to_mdev(struct cosm_device *cdev) +{ + return dev_get_drvdata(cdev->dev.parent); +} -#define MIC_RESET_TO (45) +static void _mic_reset(struct cosm_device *cdev) +{ + struct mic_device *mdev = cosmdev_to_mdev(cdev); - reinit_completion(&mdev->reset_wait); mdev->ops->reset_fw_ready(mdev); mdev->ops->reset(mdev); - - for (i = 0; i < MIC_RESET_TO; i++) { - if (mdev->ops->is_fw_ready(mdev)) - goto done; - /* - * Resets typically take 10s of seconds to complete. - * Since an MMIO read is required to check if the - * firmware is ready or not, a 1 second delay works nicely. 
- */ - msleep(1000); - } - mic_set_state(mdev, MIC_RESET_FAILED); -done: - complete_all(&mdev->reset_wait); } -/* Initialize the MIC bootparams */ -void mic_bootparam_init(struct mic_device *mdev) +static bool _mic_ready(struct cosm_device *cdev) { - struct mic_bootparam *bootparam = mdev->dp; + struct mic_device *mdev = cosmdev_to_mdev(cdev); - bootparam->magic = cpu_to_le32(MIC_MAGIC); - bootparam->c2h_shutdown_db = mdev->shutdown_db; - bootparam->h2c_shutdown_db = -1; - bootparam->h2c_config_db = -1; - bootparam->shutdown_status = 0; - bootparam->shutdown_card = 0; + return mdev->ops->is_fw_ready(mdev); +} + +/** + * mic_request_dma_chans - Request DMA channels + * @mdev: pointer to mic_device instance + * + * returns number of DMA channels acquired + */ +static int mic_request_dma_chans(struct mic_device *mdev) +{ + dma_cap_mask_t mask; + struct dma_chan *chan; + + request_module("mic_x100_dma"); + dma_cap_zero(mask); + dma_cap_set(DMA_MEMCPY, mask); + + do { + chan = dma_request_channel(mask, mdev->ops->dma_filter, + &mdev->pdev->dev); + if (chan) { + mdev->dma_ch[mdev->num_dma_ch++] = chan; + if (mdev->num_dma_ch >= MIC_MAX_DMA_CHAN) + break; + } + } while (chan); + dev_info(&mdev->pdev->dev, "DMA channels # %d\n", mdev->num_dma_ch); + return mdev->num_dma_ch; } /** - * mic_start - Start the MIC. + * mic_free_dma_chans - release DMA channels * @mdev: pointer to mic_device instance - * @buf: buffer containing boot string including firmware/ramdisk path. + * + * returns none + */ +static void mic_free_dma_chans(struct mic_device *mdev) +{ + int i = 0; + + for (i = 0; i < mdev->num_dma_ch; i++) { + dma_release_channel(mdev->dma_ch[i]); + mdev->dma_ch[i] = NULL; + } + mdev->num_dma_ch = 0; +} + +/** + * _mic_start - Start the MIC. + * @cdev: pointer to cosm_device instance + * @id: MIC device id/index provided by COSM used in other drivers like SCIF * * This function prepares an MIC for boot and initiates boot. 
* RETURNS: An appropriate -ERRNO error value on error, or zero for success. + * + * For all cosm_hw_ops the caller holds a mutex to ensure serialization. */ -int mic_start(struct mic_device *mdev, const char *buf) +static int _mic_start(struct cosm_device *cdev, int id) { + struct mic_device *mdev = cosmdev_to_mdev(cdev); int rc; - mutex_lock(&mdev->mic_mutex); -retry: - if (MIC_OFFLINE != mdev->state) { - rc = -EINVAL; - goto unlock_ret; - } - if (!mdev->ops->is_fw_ready(mdev)) { - mic_reset(mdev); - /* - * The state will either be MIC_OFFLINE if the reset succeeded - * or MIC_RESET_FAILED if the firmware reset failed. - */ - goto retry; - } - mdev->dma_mbdev = mbus_register_device(mdev->sdev->parent, + + mic_bootparam_init(mdev); + mdev->dma_mbdev = mbus_register_device(&mdev->pdev->dev, MBUS_DEV_DMA_HOST, &mic_dma_ops, - &mbus_hw_ops, mdev->mmio.va); + &mbus_hw_ops, id, mdev->mmio.va); if (IS_ERR(mdev->dma_mbdev)) { rc = PTR_ERR(mdev->dma_mbdev); goto unlock_ret; } - mdev->dma_ch = mic_request_dma_chan(mdev); - if (!mdev->dma_ch) { - rc = -ENXIO; + if (!mic_request_dma_chans(mdev)) { + rc = -ENODEV; goto dma_remove; } - rc = mdev->ops->load_mic_fw(mdev, buf); + mdev->scdev = scif_register_device(&mdev->pdev->dev, MIC_SCIF_DEV, + &__mic_dma_ops, &scif_hw_ops, + id + 1, 0, &mdev->mmio, + &mdev->aper, mdev->dp, NULL, + mdev->dma_ch, mdev->num_dma_ch, + true); + if (IS_ERR(mdev->scdev)) { + rc = PTR_ERR(mdev->scdev); + goto dma_free; + } + + rc = mdev->ops->load_mic_fw(mdev, NULL); if (rc) - goto dma_release; + goto scif_remove; mic_smpt_restore(mdev); mic_intr_restore(mdev); mdev->intr_ops->enable_interrupts(mdev); mdev->ops->write_spad(mdev, MIC_DPLO_SPAD, mdev->dp_dma_addr); mdev->ops->write_spad(mdev, MIC_DPHI_SPAD, mdev->dp_dma_addr >> 32); mdev->ops->send_firmware_intr(mdev); - mic_set_state(mdev, MIC_ONLINE); goto unlock_ret; -dma_release: - dma_release_channel(mdev->dma_ch); +scif_remove: + scif_unregister_device(mdev->scdev); +dma_free: + 
mic_free_dma_chans(mdev); dma_remove: mbus_unregister_device(mdev->dma_mbdev); unlock_ret: - mutex_unlock(&mdev->mic_mutex); return rc; } /** - * mic_stop - Prepare the MIC for reset and trigger reset. - * @mdev: pointer to mic_device instance + * _mic_stop - Prepare the MIC for reset and trigger reset. + * @cdev: pointer to cosm_device instance * @force: force a MIC to reset even if it is already offline. * * RETURNS: None. */ -void mic_stop(struct mic_device *mdev, bool force) +static void _mic_stop(struct cosm_device *cdev, bool force) { - mutex_lock(&mdev->mic_mutex); - if (MIC_OFFLINE != mdev->state || force) { - mic_virtio_reset_devices(mdev); - if (mdev->dma_ch) { - dma_release_channel(mdev->dma_ch); - mdev->dma_ch = NULL; - } - mbus_unregister_device(mdev->dma_mbdev); - mic_bootparam_init(mdev); - mic_reset(mdev); - if (MIC_RESET_FAILED == mdev->state) - goto unlock; - mic_set_shutdown_status(mdev, MIC_NOP); - if (MIC_SUSPENDED != mdev->state) - mic_set_state(mdev, MIC_OFFLINE); - } -unlock: - mutex_unlock(&mdev->mic_mutex); -} - -/** - * mic_shutdown - Initiate MIC shutdown. - * @mdev: pointer to mic_device instance - * - * RETURNS: None. - */ -void mic_shutdown(struct mic_device *mdev) -{ - struct mic_bootparam *bootparam = mdev->dp; - s8 db = bootparam->h2c_shutdown_db; - - mutex_lock(&mdev->mic_mutex); - if (MIC_ONLINE == mdev->state && db != -1) { - bootparam->shutdown_card = 1; - mdev->ops->send_intr(mdev, db); - mic_set_state(mdev, MIC_SHUTTING_DOWN); - } - mutex_unlock(&mdev->mic_mutex); -} - -/** - * mic_shutdown_work - Handle shutdown interrupt from MIC. - * @work: The work structure. - * - * This work is scheduled whenever the host has received a shutdown - * interrupt from the MIC. 
- */ -void mic_shutdown_work(struct work_struct *work) -{ - struct mic_device *mdev = container_of(work, struct mic_device, - shutdown_work); - struct mic_bootparam *bootparam = mdev->dp; - - mutex_lock(&mdev->mic_mutex); - mic_set_shutdown_status(mdev, bootparam->shutdown_status); - bootparam->shutdown_status = 0; + struct mic_device *mdev = cosmdev_to_mdev(cdev); /* - * if state is MIC_SUSPENDED, OSPM suspend is in progress. We do not - * change the state here so as to prevent users from booting the card - * during and after the suspend operation. + * Since SCIF handles card shutdown and reset (using COSM), it will + * will be the first to be registered and the last to be + * unregistered. */ - if (MIC_SHUTTING_DOWN != mdev->state && - MIC_SUSPENDED != mdev->state) - mic_set_state(mdev, MIC_SHUTTING_DOWN); - mutex_unlock(&mdev->mic_mutex); + mic_virtio_reset_devices(mdev); + scif_unregister_device(mdev->scdev); + mic_free_dma_chans(mdev); + mbus_unregister_device(mdev->dma_mbdev); + mic_bootparam_init(mdev); } -/** - * mic_reset_trigger_work - Trigger MIC reset. - * @work: The work structure. - * - * This work is scheduled whenever the host wants to reset the MIC. - */ -void mic_reset_trigger_work(struct work_struct *work) +static ssize_t _mic_family(struct cosm_device *cdev, char *buf) { - struct mic_device *mdev = container_of(work, struct mic_device, - reset_trigger_work); + struct mic_device *mdev = cosmdev_to_mdev(cdev); + static const char *family[MIC_FAMILY_LAST] = { "x100", "Unknown" }; - mic_stop(mdev, false); + return scnprintf(buf, PAGE_SIZE, "%s\n", family[mdev->family]); } -/** - * mic_complete_resume - Complete MIC Resume after an OSPM suspend/hibernate - * event. - * @mdev: pointer to mic_device instance - * - * RETURNS: None. 
- */ -void mic_complete_resume(struct mic_device *mdev) +static ssize_t _mic_stepping(struct cosm_device *cdev, char *buf) { - if (mdev->state != MIC_SUSPENDED) { - dev_warn(mdev->sdev->parent, "state %d should be %d\n", - mdev->state, MIC_SUSPENDED); - return; - } - - /* Make sure firmware is ready */ - if (!mdev->ops->is_fw_ready(mdev)) - mic_stop(mdev, true); - - mutex_lock(&mdev->mic_mutex); - mic_set_state(mdev, MIC_OFFLINE); - mutex_unlock(&mdev->mic_mutex); -} + struct mic_device *mdev = cosmdev_to_mdev(cdev); + const char *string = "??"; -/** - * mic_prepare_suspend - Handle suspend notification for the MIC device. - * @mdev: pointer to mic_device instance - * - * RETURNS: None. - */ -void mic_prepare_suspend(struct mic_device *mdev) -{ - unsigned long timeout; - -#define MIC_SUSPEND_TIMEOUT (60 * HZ) - - mutex_lock(&mdev->mic_mutex); - switch (mdev->state) { - case MIC_OFFLINE: - /* - * Card is already offline. Set state to MIC_SUSPENDED - * to prevent users from booting the card. - */ - mic_set_state(mdev, MIC_SUSPENDED); - mutex_unlock(&mdev->mic_mutex); + switch (mdev->stepping) { + case MIC_A0_STEP: + string = "A0"; break; - case MIC_ONLINE: - /* - * Card is online. Set state to MIC_SUSPENDING and notify - * MIC user space daemon which will issue card - * shutdown and reset. - */ - mic_set_state(mdev, MIC_SUSPENDING); - mutex_unlock(&mdev->mic_mutex); - timeout = wait_for_completion_timeout(&mdev->reset_wait, - MIC_SUSPEND_TIMEOUT); - /* Force reset the card if the shutdown completion timed out */ - if (!timeout) { - mutex_lock(&mdev->mic_mutex); - mic_set_state(mdev, MIC_SUSPENDED); - mutex_unlock(&mdev->mic_mutex); - mic_stop(mdev, true); - } + case MIC_B0_STEP: + string = "B0"; break; - case MIC_SHUTTING_DOWN: - /* - * Card is shutting down. Set state to MIC_SUSPENDED - * to prevent further boot of the card. 
- */ - mic_set_state(mdev, MIC_SUSPENDED); - mutex_unlock(&mdev->mic_mutex); - timeout = wait_for_completion_timeout(&mdev->reset_wait, - MIC_SUSPEND_TIMEOUT); - /* Force reset the card if the shutdown completion timed out */ - if (!timeout) - mic_stop(mdev, true); + case MIC_B1_STEP: + string = "B1"; + break; + case MIC_C0_STEP: + string = "C0"; break; default: - mutex_unlock(&mdev->mic_mutex); break; } + return scnprintf(buf, PAGE_SIZE, "%s\n", string); } -/** - * mic_suspend - Initiate MIC suspend. Suspend merely issues card shutdown. - * @mdev: pointer to mic_device instance - * - * RETURNS: None. - */ -void mic_suspend(struct mic_device *mdev) +static struct mic_mw *_mic_aper(struct cosm_device *cdev) { - struct mic_bootparam *bootparam = mdev->dp; - s8 db = bootparam->h2c_shutdown_db; + struct mic_device *mdev = cosmdev_to_mdev(cdev); - mutex_lock(&mdev->mic_mutex); - if (MIC_SUSPENDING == mdev->state && db != -1) { - bootparam->shutdown_card = 1; - mdev->ops->send_intr(mdev, db); - mic_set_state(mdev, MIC_SUSPENDED); - } - mutex_unlock(&mdev->mic_mutex); + return &mdev->aper; } + +struct cosm_hw_ops cosm_hw_ops = { + .reset = _mic_reset, + .force_reset = _mic_reset, + .post_reset = NULL, + .ready = _mic_ready, + .start = _mic_start, + .stop = _mic_stop, + .family = _mic_family, + .stepping = _mic_stepping, + .aper = _mic_aper, +}; diff --git a/kernel/drivers/misc/mic/host/mic_debugfs.c b/kernel/drivers/misc/mic/host/mic_debugfs.c index 687e9aacf..105816007 100644 --- a/kernel/drivers/misc/mic/host/mic_debugfs.c +++ b/kernel/drivers/misc/mic/host/mic_debugfs.c @@ -31,71 +31,6 @@ /* Debugfs parent dir */ static struct dentry *mic_dbg; -/** - * mic_log_buf_show - Display MIC kernel log buffer. - * - * log_buf addr/len is read from System.map by user space - * and populated in sysfs entries. 
- */ -static int mic_log_buf_show(struct seq_file *s, void *unused) -{ - void __iomem *log_buf_va; - int __iomem *log_buf_len_va; - struct mic_device *mdev = s->private; - void *kva; - int size; - unsigned long aper_offset; - - if (!mdev || !mdev->log_buf_addr || !mdev->log_buf_len) - goto done; - /* - * Card kernel will never be relocated and any kernel text/data mapping - * can be translated to phys address by subtracting __START_KERNEL_map. - */ - aper_offset = (unsigned long)mdev->log_buf_len - __START_KERNEL_map; - log_buf_len_va = mdev->aper.va + aper_offset; - aper_offset = (unsigned long)mdev->log_buf_addr - __START_KERNEL_map; - log_buf_va = mdev->aper.va + aper_offset; - size = ioread32(log_buf_len_va); - - kva = kmalloc(size, GFP_KERNEL); - if (!kva) - goto done; - mutex_lock(&mdev->mic_mutex); - memcpy_fromio(kva, log_buf_va, size); - switch (mdev->state) { - case MIC_ONLINE: - /* Fall through */ - case MIC_SHUTTING_DOWN: - seq_write(s, kva, size); - break; - default: - break; - } - mutex_unlock(&mdev->mic_mutex); - kfree(kva); -done: - return 0; -} - -static int mic_log_buf_open(struct inode *inode, struct file *file) -{ - return single_open(file, mic_log_buf_show, inode->i_private); -} - -static int mic_log_buf_release(struct inode *inode, struct file *file) -{ - return single_release(inode, file); -} - -static const struct file_operations log_buf_ops = { - .owner = THIS_MODULE, - .open = mic_log_buf_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mic_log_buf_release -}; - static int mic_smpt_show(struct seq_file *s, void *pos) { int i; @@ -138,32 +73,6 @@ static const struct file_operations smpt_file_ops = { .release = mic_smpt_debug_release }; -static int mic_soft_reset_show(struct seq_file *s, void *pos) -{ - struct mic_device *mdev = s->private; - - mic_stop(mdev, true); - return 0; -} - -static int mic_soft_reset_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, mic_soft_reset_show, inode->i_private); 
-} - -static int mic_soft_reset_debug_release(struct inode *inode, struct file *file) -{ - return single_release(inode, file); -} - -static const struct file_operations soft_reset_ops = { - .owner = THIS_MODULE, - .open = mic_soft_reset_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = mic_soft_reset_debug_release -}; - static int mic_post_code_show(struct seq_file *s, void *pos) { struct mic_device *mdev = s->private; @@ -204,16 +113,19 @@ static int mic_dp_show(struct seq_file *s, void *pos) seq_printf(s, "Bootparam: magic 0x%x\n", bootparam->magic); - seq_printf(s, "Bootparam: h2c_shutdown_db %d\n", - bootparam->h2c_shutdown_db); seq_printf(s, "Bootparam: h2c_config_db %d\n", bootparam->h2c_config_db); - seq_printf(s, "Bootparam: c2h_shutdown_db %d\n", - bootparam->c2h_shutdown_db); - seq_printf(s, "Bootparam: shutdown_status %d\n", - bootparam->shutdown_status); - seq_printf(s, "Bootparam: shutdown_card %d\n", - bootparam->shutdown_card); + seq_printf(s, "Bootparam: node_id %d\n", + bootparam->node_id); + seq_printf(s, "Bootparam: c2h_scif_db %d\n", + bootparam->c2h_scif_db); + seq_printf(s, "Bootparam: h2c_scif_db %d\n", + bootparam->h2c_scif_db); + seq_printf(s, "Bootparam: scif_host_dma_addr 0x%llx\n", + bootparam->scif_host_dma_addr); + seq_printf(s, "Bootparam: scif_card_dma_addr 0x%llx\n", + bootparam->scif_card_dma_addr); + for (i = sizeof(*bootparam); i < MIC_DP_SIZE; i += mic_total_desc_size(d)) { @@ -379,8 +291,7 @@ static int mic_msi_irq_info_show(struct seq_file *s, void *pos) int i, j; u16 entry; u16 vector; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; if (pci_dev_msi_enabled(pdev)) { for (i = 0; i < mdev->irq_info.num_vectors; i++) { @@ -441,20 +352,18 @@ static const struct file_operations msi_irq_info_ops = { */ void mic_create_debug_dir(struct mic_device *mdev) { + char name[16]; + if (!mic_dbg) return; - mdev->dbg_dir = 
debugfs_create_dir(dev_name(mdev->sdev), mic_dbg); + scnprintf(name, sizeof(name), "mic%d", mdev->id); + mdev->dbg_dir = debugfs_create_dir(name, mic_dbg); if (!mdev->dbg_dir) return; - debugfs_create_file("log_buf", 0444, mdev->dbg_dir, mdev, &log_buf_ops); - debugfs_create_file("smpt", 0444, mdev->dbg_dir, mdev, &smpt_file_ops); - debugfs_create_file("soft_reset", 0444, mdev->dbg_dir, mdev, - &soft_reset_ops); - debugfs_create_file("post_code", 0444, mdev->dbg_dir, mdev, &post_code_ops); diff --git a/kernel/drivers/misc/mic/host/mic_device.h b/kernel/drivers/misc/mic/host/mic_device.h index 016bd15a7..461184a12 100644 --- a/kernel/drivers/misc/mic/host/mic_device.h +++ b/kernel/drivers/misc/mic/host/mic_device.h @@ -26,21 +26,12 @@ #include <linux/notifier.h> #include <linux/irqreturn.h> #include <linux/dmaengine.h> +#include <linux/miscdevice.h> #include <linux/mic_bus.h> - +#include "../bus/scif_bus.h" +#include "../bus/cosm_bus.h" #include "mic_intr.h" -/* The maximum number of MIC devices supported in a single host system. */ -#define MIC_MAX_NUM_DEVS 256 - -/** - * enum mic_hw_family - The hardware family to which a device belongs. - */ -enum mic_hw_family { - MIC_FAMILY_X100 = 0, - MIC_FAMILY_UNKNOWN -}; - /** * enum mic_stepping - MIC stepping ids. */ @@ -51,6 +42,8 @@ enum mic_stepping { MIC_C0_STEP = 0x20, }; +extern struct cosm_hw_ops cosm_hw_ops; + /** * struct mic_device - MIC device information for each card. * @@ -60,8 +53,7 @@ enum mic_stepping { * @ops: MIC HW specific operations. * @id: The unique device id for this MIC device. * @stepping: Stepping ID. - * @attr_group: Pointer to list of sysfs attribute groups. - * @sdev: Device for sysfs entries. + * @pdev: Underlying PCI device. * @mic_mutex: Mutex for synchronizing access to mic_device. * @intr_ops: HW specific interrupt operations. * @smpt_ops: Hardware specific SMPT operations. @@ -69,28 +61,17 @@ enum mic_stepping { * @intr_info: H/W specific interrupt information. 
* @irq_info: The OS specific irq information * @dbg_dir: debugfs directory of this MIC device. - * @cmdline: Kernel command line. - * @firmware: Firmware file name. - * @ramdisk: Ramdisk file name. - * @bootmode: Boot mode i.e. "linux" or "elf" for flash updates. * @bootaddr: MIC boot address. - * @reset_trigger_work: Work for triggering reset requests. - * @shutdown_work: Work for handling shutdown interrupts. - * @state: MIC state. - * @shutdown_status: MIC status reported by card for shutdown/crashes. - * @state_sysfs: Sysfs dirent for notifying ring 3 about MIC state changes. - * @reset_wait: Waitqueue for sleeping while reset completes. - * @log_buf_addr: Log buffer address for MIC. - * @log_buf_len: Log buffer length address for MIC. * @dp: virtio device page * @dp_dma_addr: virtio device page DMA address. - * @shutdown_db: shutdown doorbell. - * @shutdown_cookie: shutdown cookie. - * @cdev: Character device for MIC. + * @name: name for the misc char device + * @miscdev: registered misc char device * @vdev_list: list of virtio devices. - * @pm_notifier: Handles PM notifications from the OS. * @dma_mbdev: MIC BUS DMA device. - * @dma_ch: DMA channel reserved by this driver for use by virtio devices. + * @dma_ch: Array of DMA channels + * @num_dma_ch: Number of DMA channels available + * @scdev: SCIF device on the SCIF virtual bus. 
+ * @cosm_dev: COSM device */ struct mic_device { struct mic_mw mmio; @@ -99,8 +80,7 @@ struct mic_device { struct mic_hw_ops *ops; int id; enum mic_stepping stepping; - const struct attribute_group **attr_group; - struct device *sdev; + struct pci_dev *pdev; struct mutex mic_mutex; struct mic_hw_intr_ops *intr_ops; struct mic_smpt_ops *smpt_ops; @@ -108,28 +88,17 @@ struct mic_device { struct mic_intr_info *intr_info; struct mic_irq_info irq_info; struct dentry *dbg_dir; - char *cmdline; - char *firmware; - char *ramdisk; - char *bootmode; u32 bootaddr; - struct work_struct reset_trigger_work; - struct work_struct shutdown_work; - u8 state; - u8 shutdown_status; - struct kernfs_node *state_sysfs; - struct completion reset_wait; - void *log_buf_addr; - int *log_buf_len; void *dp; dma_addr_t dp_dma_addr; - int shutdown_db; - struct mic_irq *shutdown_cookie; - struct cdev cdev; + char name[16]; + struct miscdevice miscdev; struct list_head vdev_list; - struct notifier_block pm_notifier; struct mbus_device *dma_mbdev; - struct dma_chan *dma_ch; + struct dma_chan *dma_ch[MIC_MAX_DMA_CHAN]; + int num_dma_ch; + struct scif_hw_dev *scdev; + struct cosm_device *cosm_dev; }; /** @@ -195,37 +164,9 @@ mic_mmio_write(struct mic_mw *mw, u32 val, u32 offset) iowrite32(val, mw->va + offset); } -static inline struct dma_chan *mic_request_dma_chan(struct mic_device *mdev) -{ - dma_cap_mask_t mask; - struct dma_chan *chan; - - dma_cap_zero(mask); - dma_cap_set(DMA_MEMCPY, mask); - chan = dma_request_channel(mask, mdev->ops->dma_filter, - mdev->sdev->parent); - if (chan) - return chan; - dev_err(mdev->sdev->parent, "%s %d unable to acquire channel\n", - __func__, __LINE__); - return NULL; -} - -void mic_sysfs_init(struct mic_device *mdev); -int mic_start(struct mic_device *mdev, const char *buf); -void mic_stop(struct mic_device *mdev, bool force); -void mic_shutdown(struct mic_device *mdev); -void mic_reset_delayed_work(struct work_struct *work); -void mic_reset_trigger_work(struct 
work_struct *work); -void mic_shutdown_work(struct work_struct *work); void mic_bootparam_init(struct mic_device *mdev); -void mic_set_state(struct mic_device *mdev, u8 state); -void mic_set_shutdown_status(struct mic_device *mdev, u8 status); void mic_create_debug_dir(struct mic_device *dev); void mic_delete_debug_dir(struct mic_device *dev); void __init mic_init_debugfs(void); void mic_exit_debugfs(void); -void mic_prepare_suspend(struct mic_device *mdev); -void mic_complete_resume(struct mic_device *mdev); -void mic_suspend(struct mic_device *mdev); #endif diff --git a/kernel/drivers/misc/mic/host/mic_fops.c b/kernel/drivers/misc/mic/host/mic_fops.c index 85776d732..8cc1d90cd 100644 --- a/kernel/drivers/misc/mic/host/mic_fops.c +++ b/kernel/drivers/misc/mic/host/mic_fops.c @@ -30,8 +30,8 @@ int mic_open(struct inode *inode, struct file *f) { struct mic_vdev *mvdev; - struct mic_device *mdev = container_of(inode->i_cdev, - struct mic_device, cdev); + struct mic_device *mdev = container_of(f->private_data, + struct mic_device, miscdev); mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL); if (!mvdev) diff --git a/kernel/drivers/misc/mic/host/mic_intr.c b/kernel/drivers/misc/mic/host/mic_intr.c index b4ca6c884..08ca3e372 100644 --- a/kernel/drivers/misc/mic/host/mic_intr.c +++ b/kernel/drivers/misc/mic/host/mic_intr.c @@ -30,8 +30,7 @@ static irqreturn_t mic_thread_fn(int irq, void *dev) struct mic_intr_info *intr_info = mdev->intr_info; struct mic_irq_info *irq_info = &mdev->irq_info; struct mic_intr_cb *intr_cb; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; int i; spin_lock(&irq_info->mic_thread_lock); @@ -57,8 +56,7 @@ static irqreturn_t mic_interrupt(int irq, void *dev) struct mic_intr_info *intr_info = mdev->intr_info; struct mic_irq_info *irq_info = &mdev->irq_info; struct mic_intr_cb *intr_cb; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev 
*pdev = mdev->pdev; u32 mask; int i; @@ -83,7 +81,7 @@ static irqreturn_t mic_interrupt(int irq, void *dev) /* Return the interrupt offset from the index. Index is 0 based. */ static u16 mic_map_src_to_offset(struct mic_device *mdev, - int intr_src, enum mic_intr_type type) + int intr_src, enum mic_intr_type type) { if (type >= MIC_NUM_INTR_TYPES) return MIC_NUM_OFFSETS; @@ -214,7 +212,7 @@ static int mic_setup_msix(struct mic_device *mdev, struct pci_dev *pdev) mdev->irq_info.msix_entries[i].entry = i; rc = pci_enable_msix_exact(pdev, mdev->irq_info.msix_entries, - MIC_MIN_MSIX); + MIC_MIN_MSIX); if (rc) { dev_dbg(&pdev->dev, "Error enabling MSIx. rc = %d\n", rc); goto err_enable_msix; @@ -229,7 +227,7 @@ static int mic_setup_msix(struct mic_device *mdev, struct pci_dev *pdev) goto err_nomem2; } - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "%d MSIx irqs setup\n", mdev->irq_info.num_vectors); return 0; err_nomem2: @@ -281,7 +279,6 @@ static void mic_release_callbacks(struct mic_device *mdev) spin_lock(&mdev->irq_info.mic_thread_lock); spin_lock_irqsave(&mdev->irq_info.mic_intr_lock, flags); for (i = 0; i < MIC_NUM_OFFSETS; i++) { - if (list_empty(&mdev->irq_info.cb_list[i])) break; @@ -443,12 +440,11 @@ mic_request_threaded_irq(struct mic_device *mdev, unsigned long cookie = 0; u16 entry; struct mic_intr_cb *intr_cb; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; offset = mic_map_src_to_offset(mdev, intr_src, type); if (offset >= MIC_NUM_OFFSETS) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "Error mapping index %d to a valid source id.\n", intr_src); rc = -EINVAL; @@ -458,7 +454,7 @@ mic_request_threaded_irq(struct mic_device *mdev, if (mdev->irq_info.num_vectors > 1) { msix = mic_get_available_vector(mdev); if (!msix) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "No MSIx vectors available for use.\n"); rc = -ENOSPC; goto err; @@ -467,7 +463,7 @@ 
mic_request_threaded_irq(struct mic_device *mdev, rc = request_threaded_irq(msix->vector, handler, thread_fn, 0, name, data); if (rc) { - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "request irq failed rc = %d\n", rc); goto err; } @@ -476,13 +472,13 @@ mic_request_threaded_irq(struct mic_device *mdev, mdev->intr_ops->program_msi_to_src_map(mdev, entry, offset, true); cookie = MK_COOKIE(entry, offset); - dev_dbg(mdev->sdev->parent, "irq: %d assigned for src: %d\n", + dev_dbg(&mdev->pdev->dev, "irq: %d assigned for src: %d\n", msix->vector, intr_src); } else { intr_cb = mic_register_intr_callback(mdev, offset, handler, thread_fn, data); if (IS_ERR(intr_cb)) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "No available callback entries for use\n"); rc = PTR_ERR(intr_cb); goto err; @@ -495,7 +491,7 @@ mic_request_threaded_irq(struct mic_device *mdev, entry, offset, true); } cookie = MK_COOKIE(entry, intr_cb->cb_id); - dev_dbg(mdev->sdev->parent, "callback %d registered for src: %d\n", + dev_dbg(&mdev->pdev->dev, "callback %d registered for src: %d\n", intr_cb->cb_id, intr_src); } return (struct mic_irq *)cookie; @@ -515,20 +511,19 @@ err: * returns: none. 
*/ void mic_free_irq(struct mic_device *mdev, - struct mic_irq *cookie, void *data) + struct mic_irq *cookie, void *data) { u32 offset; u32 entry; u8 src_id; unsigned int irq; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; entry = GET_ENTRY((unsigned long)cookie); offset = GET_OFFSET((unsigned long)cookie); if (mdev->irq_info.num_vectors > 1) { if (entry >= mdev->irq_info.num_vectors) { - dev_warn(mdev->sdev->parent, + dev_warn(&mdev->pdev->dev, "entry %d should be < num_irq %d\n", entry, mdev->irq_info.num_vectors); return; @@ -539,12 +534,12 @@ void mic_free_irq(struct mic_device *mdev, mdev->intr_ops->program_msi_to_src_map(mdev, entry, offset, false); - dev_dbg(mdev->sdev->parent, "irq: %d freed\n", irq); + dev_dbg(&mdev->pdev->dev, "irq: %d freed\n", irq); } else { irq = pdev->irq; src_id = mic_unregister_intr_callback(mdev, offset); if (src_id >= MIC_NUM_OFFSETS) { - dev_warn(mdev->sdev->parent, "Error unregistering callback\n"); + dev_warn(&mdev->pdev->dev, "Error unregistering callback\n"); return; } if (pci_dev_msi_enabled(pdev)) { @@ -552,7 +547,7 @@ void mic_free_irq(struct mic_device *mdev, mdev->intr_ops->program_msi_to_src_map(mdev, entry, src_id, false); } - dev_dbg(mdev->sdev->parent, "callback %d unregistered for src: %d\n", + dev_dbg(&mdev->pdev->dev, "callback %d unregistered for src: %d\n", offset, src_id); } } @@ -579,7 +574,7 @@ int mic_setup_interrupts(struct mic_device *mdev, struct pci_dev *pdev) rc = mic_setup_intx(mdev, pdev); if (rc) { - dev_err(mdev->sdev->parent, "no usable interrupts\n"); + dev_err(&mdev->pdev->dev, "no usable interrupts\n"); return rc; } done: @@ -635,8 +630,7 @@ void mic_free_interrupts(struct mic_device *mdev, struct pci_dev *pdev) void mic_intr_restore(struct mic_device *mdev) { int entry, offset; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; if 
(!pci_dev_msi_enabled(pdev)) return; diff --git a/kernel/drivers/misc/mic/host/mic_intr.h b/kernel/drivers/misc/mic/host/mic_intr.h index 9f783d4ad..cce28824d 100644 --- a/kernel/drivers/misc/mic/host/mic_intr.h +++ b/kernel/drivers/misc/mic/host/mic_intr.h @@ -28,8 +28,9 @@ * 3 for virtio network, console and block devices. * 1 for card shutdown notifications. * 4 for host owned DMA channels. + * 1 for SCIF */ -#define MIC_MIN_MSIX 8 +#define MIC_MIN_MSIX 9 #define MIC_NUM_OFFSETS 32 /** diff --git a/kernel/drivers/misc/mic/host/mic_main.c b/kernel/drivers/misc/mic/host/mic_main.c index ab37a3117..153894e7e 100644 --- a/kernel/drivers/misc/mic/host/mic_main.c +++ b/kernel/drivers/misc/mic/host/mic_main.c @@ -16,17 +16,11 @@ * the file called "COPYING". * * Intel MIC Host driver. - * - * Global TODO's across the driver to be added after initial base - * patches are accepted upstream: - * 1) Enable DMA support. - * 2) Enable per vring interrupt support. */ #include <linux/fs.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/poll.h> -#include <linux/suspend.h> #include <linux/mic_common.h> #include "../common/mic_dev.h" @@ -63,8 +57,6 @@ MODULE_DEVICE_TABLE(pci, mic_pci_tbl); /* ID allocator for MIC devices */ static struct ida g_mic_ida; -/* Class of MIC devices for sysfs accessibility. 
*/ -static struct class *g_mic_class; /* Base device node number for MIC devices */ static dev_t g_mic_devno; @@ -81,17 +73,14 @@ static const struct file_operations mic_fops = { static int mic_dp_init(struct mic_device *mdev) { mdev->dp = kzalloc(MIC_DP_SIZE, GFP_KERNEL); - if (!mdev->dp) { - dev_err(mdev->sdev->parent, "%s %d err %d\n", - __func__, __LINE__, -ENOMEM); + if (!mdev->dp) return -ENOMEM; - } mdev->dp_dma_addr = mic_map_single(mdev, mdev->dp, MIC_DP_SIZE); if (mic_map_error(mdev->dp_dma_addr)) { kfree(mdev->dp); - dev_err(mdev->sdev->parent, "%s %d err %d\n", + dev_err(&mdev->pdev->dev, "%s %d err %d\n", __func__, __LINE__, -ENOMEM); return -ENOMEM; } @@ -108,30 +97,6 @@ static void mic_dp_uninit(struct mic_device *mdev) } /** - * mic_shutdown_db - Shutdown doorbell interrupt handler. - */ -static irqreturn_t mic_shutdown_db(int irq, void *data) -{ - struct mic_device *mdev = data; - struct mic_bootparam *bootparam = mdev->dp; - - mdev->ops->intr_workarounds(mdev); - - switch (bootparam->shutdown_status) { - case MIC_HALTED: - case MIC_POWER_OFF: - case MIC_RESTART: - /* Fall through */ - case MIC_CRASHED: - schedule_work(&mdev->shutdown_work); - break; - default: - break; - }; - return IRQ_HANDLED; -} - -/** * mic_ops_init: Initialize HW specific operation tables. * * @mdev: pointer to mic_device instance @@ -188,43 +153,6 @@ static enum mic_hw_family mic_get_family(struct pci_dev *pdev) } /** -* mic_pm_notifier: Notifier callback function that handles -* PM notifications. -* -* @notifier_block: The notifier structure. -* @pm_event: The event for which the driver was notified. -* @unused: Meaningless. Always NULL. 
-* -* returns NOTIFY_DONE -*/ -static int mic_pm_notifier(struct notifier_block *notifier, - unsigned long pm_event, void *unused) -{ - struct mic_device *mdev = container_of(notifier, - struct mic_device, pm_notifier); - - switch (pm_event) { - case PM_HIBERNATION_PREPARE: - /* Fall through */ - case PM_SUSPEND_PREPARE: - mic_prepare_suspend(mdev); - break; - case PM_POST_HIBERNATION: - /* Fall through */ - case PM_POST_SUSPEND: - /* Fall through */ - case PM_POST_RESTORE: - mic_complete_resume(mdev); - break; - case PM_RESTORE_PREPARE: - break; - default: - break; - } - return NOTIFY_DONE; -} - -/** * mic_device_init - Allocates and initializes the MIC device structure * * @mdev: pointer to mic_device instance @@ -232,52 +160,16 @@ static int mic_pm_notifier(struct notifier_block *notifier, * * returns none. */ -static int +static void mic_device_init(struct mic_device *mdev, struct pci_dev *pdev) { - int rc; - + mdev->pdev = pdev; mdev->family = mic_get_family(pdev); mdev->stepping = pdev->revision; mic_ops_init(mdev); - mic_sysfs_init(mdev); mutex_init(&mdev->mic_mutex); mdev->irq_info.next_avail_src = 0; - INIT_WORK(&mdev->reset_trigger_work, mic_reset_trigger_work); - INIT_WORK(&mdev->shutdown_work, mic_shutdown_work); - init_completion(&mdev->reset_wait); INIT_LIST_HEAD(&mdev->vdev_list); - mdev->pm_notifier.notifier_call = mic_pm_notifier; - rc = register_pm_notifier(&mdev->pm_notifier); - if (rc) { - dev_err(&pdev->dev, "register_pm_notifier failed rc %d\n", - rc); - goto register_pm_notifier_fail; - } - return 0; -register_pm_notifier_fail: - flush_work(&mdev->shutdown_work); - flush_work(&mdev->reset_trigger_work); - return rc; -} - -/** - * mic_device_uninit - Frees resources allocated during mic_device_init(..) 
- * - * @mdev: pointer to mic_device instance - * - * returns none - */ -static void mic_device_uninit(struct mic_device *mdev) -{ - /* The cmdline sysfs entry might have allocated cmdline */ - kfree(mdev->cmdline); - kfree(mdev->firmware); - kfree(mdev->ramdisk); - kfree(mdev->bootmode); - flush_work(&mdev->reset_trigger_work); - flush_work(&mdev->shutdown_work); - unregister_pm_notifier(&mdev->pm_notifier); } /** @@ -289,7 +181,7 @@ static void mic_device_uninit(struct mic_device *mdev) * returns 0 on success, < 0 on failure. */ static int mic_probe(struct pci_dev *pdev, - const struct pci_device_id *ent) + const struct pci_device_id *ent) { int rc; struct mic_device *mdev; @@ -307,16 +199,12 @@ static int mic_probe(struct pci_dev *pdev, goto ida_fail; } - rc = mic_device_init(mdev, pdev); - if (rc) { - dev_err(&pdev->dev, "mic_device_init failed rc %d\n", rc); - goto device_init_fail; - } + mic_device_init(mdev, pdev); rc = pci_enable_device(pdev); if (rc) { dev_err(&pdev->dev, "failed to enable pci device.\n"); - goto uninit_device; + goto ida_remove; } pci_set_master(pdev); @@ -365,61 +253,39 @@ static int mic_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, mdev); - mdev->sdev = device_create_with_groups(g_mic_class, &pdev->dev, - MKDEV(MAJOR(g_mic_devno), mdev->id), NULL, - mdev->attr_group, "mic%d", mdev->id); - if (IS_ERR(mdev->sdev)) { - rc = PTR_ERR(mdev->sdev); - dev_err(&pdev->dev, - "device_create_with_groups failed rc %d\n", rc); - goto smpt_uninit; - } - mdev->state_sysfs = sysfs_get_dirent(mdev->sdev->kobj.sd, "state"); - if (!mdev->state_sysfs) { - rc = -ENODEV; - dev_err(&pdev->dev, "sysfs_get_dirent failed rc %d\n", rc); - goto destroy_device; - } - rc = mic_dp_init(mdev); if (rc) { dev_err(&pdev->dev, "mic_dp_init failed rc %d\n", rc); - goto sysfs_put; - } - mutex_lock(&mdev->mic_mutex); - - mdev->shutdown_db = mic_next_db(mdev); - mdev->shutdown_cookie = mic_request_threaded_irq(mdev, mic_shutdown_db, - NULL, "shutdown-interrupt", mdev, - 
mdev->shutdown_db, MIC_INTR_DB); - if (IS_ERR(mdev->shutdown_cookie)) { - rc = PTR_ERR(mdev->shutdown_cookie); - mutex_unlock(&mdev->mic_mutex); - goto dp_uninit; + goto smpt_uninit; } - mutex_unlock(&mdev->mic_mutex); mic_bootparam_init(mdev); mic_create_debug_dir(mdev); - cdev_init(&mdev->cdev, &mic_fops); - mdev->cdev.owner = THIS_MODULE; - rc = cdev_add(&mdev->cdev, MKDEV(MAJOR(g_mic_devno), mdev->id), 1); + + mdev->miscdev.minor = MISC_DYNAMIC_MINOR; + snprintf(mdev->name, sizeof(mdev->name), "mic%d", mdev->id); + mdev->miscdev.name = mdev->name; + mdev->miscdev.fops = &mic_fops; + mdev->miscdev.parent = &mdev->pdev->dev; + rc = misc_register(&mdev->miscdev); if (rc) { - dev_err(&pdev->dev, "cdev_add err id %d rc %d\n", mdev->id, rc); + dev_err(&pdev->dev, "misc_register err id %d rc %d\n", + mdev->id, rc); goto cleanup_debug_dir; } + + mdev->cosm_dev = cosm_register_device(&mdev->pdev->dev, &cosm_hw_ops); + if (IS_ERR(mdev->cosm_dev)) { + rc = PTR_ERR(mdev->cosm_dev); + dev_err(&pdev->dev, "cosm_add_device failed rc %d\n", rc); + goto misc_dereg; + } return 0; +misc_dereg: + misc_deregister(&mdev->miscdev); cleanup_debug_dir: mic_delete_debug_dir(mdev); - mutex_lock(&mdev->mic_mutex); - mic_free_irq(mdev, mdev->shutdown_cookie, mdev); - mutex_unlock(&mdev->mic_mutex); -dp_uninit: mic_dp_uninit(mdev); -sysfs_put: - sysfs_put(mdev->state_sysfs); -destroy_device: - device_destroy(g_mic_class, MKDEV(MAJOR(g_mic_devno), mdev->id)); smpt_uninit: mic_smpt_uninit(mdev); free_interrupts: @@ -432,9 +298,7 @@ release_regions: pci_release_regions(pdev); disable_device: pci_disable_device(pdev); -uninit_device: - mic_device_uninit(mdev); -device_init_fail: +ida_remove: ida_simple_remove(&g_mic_ida, mdev->id); ida_fail: kfree(mdev); @@ -458,26 +322,20 @@ static void mic_remove(struct pci_dev *pdev) if (!mdev) return; - mic_stop(mdev, false); - cdev_del(&mdev->cdev); + cosm_unregister_device(mdev->cosm_dev); + misc_deregister(&mdev->miscdev); mic_delete_debug_dir(mdev); - 
mutex_lock(&mdev->mic_mutex); - mic_free_irq(mdev, mdev->shutdown_cookie, mdev); - mutex_unlock(&mdev->mic_mutex); - flush_work(&mdev->shutdown_work); mic_dp_uninit(mdev); - sysfs_put(mdev->state_sysfs); - device_destroy(g_mic_class, MKDEV(MAJOR(g_mic_devno), mdev->id)); mic_smpt_uninit(mdev); mic_free_interrupts(mdev, pdev); - iounmap(mdev->mmio.va); iounmap(mdev->aper.va); - mic_device_uninit(mdev); + iounmap(mdev->mmio.va); pci_release_regions(pdev); pci_disable_device(pdev); ida_simple_remove(&g_mic_ida, mdev->id); kfree(mdev); } + static struct pci_driver mic_driver = { .name = mic_driver_name, .id_table = mic_pci_tbl, @@ -490,31 +348,23 @@ static int __init mic_init(void) int ret; ret = alloc_chrdev_region(&g_mic_devno, 0, - MIC_MAX_NUM_DEVS, mic_driver_name); + MIC_MAX_NUM_DEVS, mic_driver_name); if (ret) { pr_err("alloc_chrdev_region failed ret %d\n", ret); goto error; } - g_mic_class = class_create(THIS_MODULE, mic_driver_name); - if (IS_ERR(g_mic_class)) { - ret = PTR_ERR(g_mic_class); - pr_err("class_create failed ret %d\n", ret); - goto cleanup_chrdev; - } - mic_init_debugfs(); ida_init(&g_mic_ida); ret = pci_register_driver(&mic_driver); if (ret) { pr_err("pci_register_driver failed ret %d\n", ret); - goto cleanup_debugfs; + goto cleanup_chrdev; } return ret; -cleanup_debugfs: - mic_exit_debugfs(); - class_destroy(g_mic_class); cleanup_chrdev: + ida_destroy(&g_mic_ida); + mic_exit_debugfs(); unregister_chrdev_region(g_mic_devno, MIC_MAX_NUM_DEVS); error: return ret; @@ -525,7 +375,6 @@ static void __exit mic_exit(void) pci_unregister_driver(&mic_driver); ida_destroy(&g_mic_ida); mic_exit_debugfs(); - class_destroy(g_mic_class); unregister_chrdev_region(g_mic_devno, MIC_MAX_NUM_DEVS); } diff --git a/kernel/drivers/misc/mic/host/mic_smpt.c b/kernel/drivers/misc/mic/host/mic_smpt.c index fae474c48..c3f958580 100644 --- a/kernel/drivers/misc/mic/host/mic_smpt.c +++ b/kernel/drivers/misc/mic/host/mic_smpt.c @@ -76,7 +76,7 @@ mic_is_system_addr(struct 
mic_device *mdev, dma_addr_t pa) /* Populate an SMPT entry and update the reference counts. */ static void mic_add_smpt_entry(int spt, s64 *ref, u64 addr, - int entries, struct mic_device *mdev) + int entries, struct mic_device *mdev) { struct mic_smpt_info *smpt_info = mdev->smpt; int i; @@ -97,7 +97,7 @@ static void mic_add_smpt_entry(int spt, s64 *ref, u64 addr, * for a given DMA address and size. */ static dma_addr_t mic_smpt_op(struct mic_device *mdev, u64 dma_addr, - int entries, s64 *ref, size_t size) + int entries, s64 *ref, size_t size) { int spt; int ae = 0; @@ -148,7 +148,7 @@ found: * and the starting smpt address */ static int mic_get_smpt_ref_count(struct mic_device *mdev, dma_addr_t dma_addr, - size_t size, s64 *ref, u64 *smpt_start) + size_t size, s64 *ref, u64 *smpt_start) { u64 start = dma_addr; u64 end = dma_addr + size; @@ -174,15 +174,14 @@ static int mic_get_smpt_ref_count(struct mic_device *mdev, dma_addr_t dma_addr, * * returns a DMA address. */ -static dma_addr_t -mic_to_dma_addr(struct mic_device *mdev, dma_addr_t mic_addr) +dma_addr_t mic_to_dma_addr(struct mic_device *mdev, dma_addr_t mic_addr) { struct mic_smpt_info *smpt_info = mdev->smpt; int spt; dma_addr_t dma_addr; if (!mic_is_system_addr(mdev, mic_addr)) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "mic_addr is invalid. 
mic_addr = 0x%llx\n", mic_addr); return -EINVAL; } @@ -214,12 +213,12 @@ dma_addr_t mic_map(struct mic_device *mdev, dma_addr_t dma_addr, size_t size) if (!size || size > mic_max_system_memory(mdev)) return mic_addr; - ref = kmalloc(mdev->smpt->info.num_reg * sizeof(s64), GFP_KERNEL); + ref = kmalloc_array(mdev->smpt->info.num_reg, sizeof(s64), GFP_ATOMIC); if (!ref) return mic_addr; num_entries = mic_get_smpt_ref_count(mdev, dma_addr, size, - ref, &smpt_start); + ref, &smpt_start); /* Set the smpt table appropriately and get 16G aligned mic address */ mic_addr = mic_smpt_op(mdev, smpt_start, num_entries, ref, size); @@ -232,7 +231,7 @@ dma_addr_t mic_map(struct mic_device *mdev, dma_addr_t dma_addr, size_t size) * else generate mic_addr by adding the 16G offset in dma_addr */ if (!mic_addr && MIC_FAMILY_X100 == mdev->family) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "mic_map failed dma_addr 0x%llx size 0x%lx\n", dma_addr, size); return mic_addr; @@ -265,13 +264,13 @@ void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size) return; if (!mic_is_system_addr(mdev, mic_addr)) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "invalid address: 0x%llx\n", mic_addr); return; } spt = mic_sys_addr_to_smpt(mdev, mic_addr); - ref = kmalloc(mdev->smpt->info.num_reg * sizeof(s64), GFP_KERNEL); + ref = kmalloc_array(mdev->smpt->info.num_reg, sizeof(s64), GFP_ATOMIC); if (!ref) return; @@ -285,7 +284,7 @@ void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size) for (i = spt; i < spt + num_smpt; i++) { smpt_info->entry[i].ref_count -= ref[i - spt]; if (smpt_info->entry[i].ref_count < 0) - dev_warn(mdev->sdev->parent, + dev_warn(&mdev->pdev->dev, "ref count for entry %d is negative\n", i); } spin_unlock_irqrestore(&smpt_info->smpt_lock, flags); @@ -308,15 +307,14 @@ void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size) dma_addr_t mic_map_single(struct mic_device *mdev, void *va, size_t size) { 
dma_addr_t mic_addr = 0; - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; dma_addr_t dma_addr = pci_map_single(pdev, va, size, PCI_DMA_BIDIRECTIONAL); if (!pci_dma_mapping_error(pdev, dma_addr)) { mic_addr = mic_map(mdev, dma_addr, size); if (!mic_addr) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "mic_map failed dma_addr 0x%llx size 0x%lx\n", dma_addr, size); pci_unmap_single(pdev, dma_addr, @@ -340,8 +338,7 @@ dma_addr_t mic_map_single(struct mic_device *mdev, void *va, size_t size) void mic_unmap_single(struct mic_device *mdev, dma_addr_t mic_addr, size_t size) { - struct pci_dev *pdev = container_of(mdev->sdev->parent, - struct pci_dev, dev); + struct pci_dev *pdev = mdev->pdev; dma_addr_t dma_addr = mic_to_dma_addr(mdev, mic_addr); mic_unmap(mdev, mic_addr, size); pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); @@ -400,18 +397,18 @@ void mic_smpt_uninit(struct mic_device *mdev) struct mic_smpt_info *smpt_info = mdev->smpt; int i; - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "nodeid %d SMPT ref count %lld map %lld unmap %lld\n", mdev->id, smpt_info->ref_count, smpt_info->map_count, smpt_info->unmap_count); for (i = 0; i < smpt_info->info.num_reg; i++) { - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "SMPT entry[%d] dma_addr = 0x%llx ref_count = %lld\n", i, smpt_info->entry[i].dma_addr, smpt_info->entry[i].ref_count); if (smpt_info->entry[i].ref_count) - dev_warn(mdev->sdev->parent, + dev_warn(&mdev->pdev->dev, "ref count for entry %d is not zero\n", i); } kfree(smpt_info->entry); diff --git a/kernel/drivers/misc/mic/host/mic_smpt.h b/kernel/drivers/misc/mic/host/mic_smpt.h index 51970abfe..68721c6e7 100644 --- a/kernel/drivers/misc/mic/host/mic_smpt.h +++ b/kernel/drivers/misc/mic/host/mic_smpt.h @@ -78,6 +78,7 @@ void mic_unmap_single(struct mic_device *mdev, dma_addr_t mic_map(struct mic_device *mdev, dma_addr_t dma_addr, size_t size); 
void mic_unmap(struct mic_device *mdev, dma_addr_t mic_addr, size_t size); +dma_addr_t mic_to_dma_addr(struct mic_device *mdev, dma_addr_t mic_addr); /** * mic_map_error - Check a MIC address for errors. diff --git a/kernel/drivers/misc/mic/host/mic_sysfs.c b/kernel/drivers/misc/mic/host/mic_sysfs.c deleted file mode 100644 index 6dd864e4a..000000000 --- a/kernel/drivers/misc/mic/host/mic_sysfs.c +++ /dev/null @@ -1,459 +0,0 @@ -/* - * Intel MIC Platform Software Stack (MPSS) - * - * Copyright(c) 2013 Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * The full GNU General Public License is included in this distribution in - * the file called "COPYING". - * - * Intel MIC Host driver. - * - */ -#include <linux/pci.h> - -#include <linux/mic_common.h> -#include "../common/mic_dev.h" -#include "mic_device.h" - -/* - * A state-to-string lookup table, for exposing a human readable state - * via sysfs. Always keep in sync with enum mic_states - */ -static const char * const mic_state_string[] = { - [MIC_OFFLINE] = "offline", - [MIC_ONLINE] = "online", - [MIC_SHUTTING_DOWN] = "shutting_down", - [MIC_RESET_FAILED] = "reset_failed", - [MIC_SUSPENDING] = "suspending", - [MIC_SUSPENDED] = "suspended", -}; - -/* - * A shutdown-status-to-string lookup table, for exposing a human - * readable state via sysfs. 
Always keep in sync with enum mic_shutdown_status - */ -static const char * const mic_shutdown_status_string[] = { - [MIC_NOP] = "nop", - [MIC_CRASHED] = "crashed", - [MIC_HALTED] = "halted", - [MIC_POWER_OFF] = "poweroff", - [MIC_RESTART] = "restart", -}; - -void mic_set_shutdown_status(struct mic_device *mdev, u8 shutdown_status) -{ - dev_dbg(mdev->sdev->parent, "Shutdown Status %s -> %s\n", - mic_shutdown_status_string[mdev->shutdown_status], - mic_shutdown_status_string[shutdown_status]); - mdev->shutdown_status = shutdown_status; -} - -void mic_set_state(struct mic_device *mdev, u8 state) -{ - dev_dbg(mdev->sdev->parent, "State %s -> %s\n", - mic_state_string[mdev->state], - mic_state_string[state]); - mdev->state = state; - sysfs_notify_dirent(mdev->state_sysfs); -} - -static ssize_t -family_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - static const char x100[] = "x100"; - static const char unknown[] = "Unknown"; - const char *card = NULL; - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - switch (mdev->family) { - case MIC_FAMILY_X100: - card = x100; - break; - default: - card = unknown; - break; - } - return scnprintf(buf, PAGE_SIZE, "%s\n", card); -} -static DEVICE_ATTR_RO(family); - -static ssize_t -stepping_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - char *string = "??"; - - if (!mdev) - return -EINVAL; - - switch (mdev->stepping) { - case MIC_A0_STEP: - string = "A0"; - break; - case MIC_B0_STEP: - string = "B0"; - break; - case MIC_B1_STEP: - string = "B1"; - break; - case MIC_C0_STEP: - string = "C0"; - break; - default: - break; - } - return scnprintf(buf, PAGE_SIZE, "%s\n", string); -} -static DEVICE_ATTR_RO(stepping); - -static ssize_t -state_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev || 
mdev->state >= MIC_LAST) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%s\n", - mic_state_string[mdev->state]); -} - -static ssize_t -state_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - int rc = 0; - struct mic_device *mdev = dev_get_drvdata(dev->parent); - if (!mdev) - return -EINVAL; - if (sysfs_streq(buf, "boot")) { - rc = mic_start(mdev, buf); - if (rc) { - dev_err(mdev->sdev->parent, - "mic_boot failed rc %d\n", rc); - count = rc; - } - goto done; - } - - if (sysfs_streq(buf, "reset")) { - schedule_work(&mdev->reset_trigger_work); - goto done; - } - - if (sysfs_streq(buf, "shutdown")) { - mic_shutdown(mdev); - goto done; - } - - if (sysfs_streq(buf, "suspend")) { - mic_suspend(mdev); - goto done; - } - - count = -EINVAL; -done: - return count; -} -static DEVICE_ATTR_RW(state); - -static ssize_t shutdown_status_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev || mdev->shutdown_status >= MIC_STATUS_LAST) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%s\n", - mic_shutdown_status_string[mdev->shutdown_status]); -} -static DEVICE_ATTR_RO(shutdown_status); - -static ssize_t -cmdline_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - char *cmdline; - - if (!mdev) - return -EINVAL; - - cmdline = mdev->cmdline; - - if (cmdline) - return scnprintf(buf, PAGE_SIZE, "%s\n", cmdline); - return 0; -} - -static ssize_t -cmdline_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - mutex_lock(&mdev->mic_mutex); - kfree(mdev->cmdline); - - mdev->cmdline = kmalloc(count + 1, GFP_KERNEL); - if (!mdev->cmdline) { - count = -ENOMEM; - goto unlock; - } - - strncpy(mdev->cmdline, buf, count); - - if 
(mdev->cmdline[count - 1] == '\n') - mdev->cmdline[count - 1] = '\0'; - else - mdev->cmdline[count] = '\0'; -unlock: - mutex_unlock(&mdev->mic_mutex); - return count; -} -static DEVICE_ATTR_RW(cmdline); - -static ssize_t -firmware_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - char *firmware; - - if (!mdev) - return -EINVAL; - - firmware = mdev->firmware; - - if (firmware) - return scnprintf(buf, PAGE_SIZE, "%s\n", firmware); - return 0; -} - -static ssize_t -firmware_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - mutex_lock(&mdev->mic_mutex); - kfree(mdev->firmware); - - mdev->firmware = kmalloc(count + 1, GFP_KERNEL); - if (!mdev->firmware) { - count = -ENOMEM; - goto unlock; - } - strncpy(mdev->firmware, buf, count); - - if (mdev->firmware[count - 1] == '\n') - mdev->firmware[count - 1] = '\0'; - else - mdev->firmware[count] = '\0'; -unlock: - mutex_unlock(&mdev->mic_mutex); - return count; -} -static DEVICE_ATTR_RW(firmware); - -static ssize_t -ramdisk_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - char *ramdisk; - - if (!mdev) - return -EINVAL; - - ramdisk = mdev->ramdisk; - - if (ramdisk) - return scnprintf(buf, PAGE_SIZE, "%s\n", ramdisk); - return 0; -} - -static ssize_t -ramdisk_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - mutex_lock(&mdev->mic_mutex); - kfree(mdev->ramdisk); - - mdev->ramdisk = kmalloc(count + 1, GFP_KERNEL); - if (!mdev->ramdisk) { - count = -ENOMEM; - goto unlock; - } - - strncpy(mdev->ramdisk, buf, count); - - if (mdev->ramdisk[count - 1] == '\n') - mdev->ramdisk[count - 1] = '\0'; - else - 
mdev->ramdisk[count] = '\0'; -unlock: - mutex_unlock(&mdev->mic_mutex); - return count; -} -static DEVICE_ATTR_RW(ramdisk); - -static ssize_t -bootmode_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - char *bootmode; - - if (!mdev) - return -EINVAL; - - bootmode = mdev->bootmode; - - if (bootmode) - return scnprintf(buf, PAGE_SIZE, "%s\n", bootmode); - return 0; -} - -static ssize_t -bootmode_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - if (!sysfs_streq(buf, "linux") && !sysfs_streq(buf, "elf")) - return -EINVAL; - - mutex_lock(&mdev->mic_mutex); - kfree(mdev->bootmode); - - mdev->bootmode = kmalloc(count + 1, GFP_KERNEL); - if (!mdev->bootmode) { - count = -ENOMEM; - goto unlock; - } - - strncpy(mdev->bootmode, buf, count); - - if (mdev->bootmode[count - 1] == '\n') - mdev->bootmode[count - 1] = '\0'; - else - mdev->bootmode[count] = '\0'; -unlock: - mutex_unlock(&mdev->mic_mutex); - return count; -} -static DEVICE_ATTR_RW(bootmode); - -static ssize_t -log_buf_addr_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%p\n", mdev->log_buf_addr); -} - -static ssize_t -log_buf_addr_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - int ret; - unsigned long addr; - - if (!mdev) - return -EINVAL; - - ret = kstrtoul(buf, 16, &addr); - if (ret) - goto exit; - - mdev->log_buf_addr = (void *)addr; - ret = count; -exit: - return ret; -} -static DEVICE_ATTR_RW(log_buf_addr); - -static ssize_t -log_buf_len_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct mic_device *mdev = 
dev_get_drvdata(dev->parent); - - if (!mdev) - return -EINVAL; - - return scnprintf(buf, PAGE_SIZE, "%p\n", mdev->log_buf_len); -} - -static ssize_t -log_buf_len_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct mic_device *mdev = dev_get_drvdata(dev->parent); - int ret; - unsigned long addr; - - if (!mdev) - return -EINVAL; - - ret = kstrtoul(buf, 16, &addr); - if (ret) - goto exit; - - mdev->log_buf_len = (int *)addr; - ret = count; -exit: - return ret; -} -static DEVICE_ATTR_RW(log_buf_len); - -static struct attribute *mic_default_attrs[] = { - &dev_attr_family.attr, - &dev_attr_stepping.attr, - &dev_attr_state.attr, - &dev_attr_shutdown_status.attr, - &dev_attr_cmdline.attr, - &dev_attr_firmware.attr, - &dev_attr_ramdisk.attr, - &dev_attr_bootmode.attr, - &dev_attr_log_buf_addr.attr, - &dev_attr_log_buf_len.attr, - - NULL -}; - -ATTRIBUTE_GROUPS(mic_default); - -void mic_sysfs_init(struct mic_device *mdev) -{ - mdev->attr_group = mic_default_groups; -} diff --git a/kernel/drivers/misc/mic/host/mic_virtio.c b/kernel/drivers/misc/mic/host/mic_virtio.c index a020e4eb4..58b107a24 100644 --- a/kernel/drivers/misc/mic/host/mic_virtio.c +++ b/kernel/drivers/misc/mic/host/mic_virtio.c @@ -23,7 +23,6 @@ #include <linux/uaccess.h> #include <linux/dmaengine.h> #include <linux/mic_common.h> - #include "../common/mic_dev.h" #include "mic_device.h" #include "mic_smpt.h" @@ -40,7 +39,7 @@ static int mic_sync_dma(struct mic_device *mdev, dma_addr_t dst, { int err = 0; struct dma_async_tx_descriptor *tx; - struct dma_chan *mic_ch = mdev->dma_ch; + struct dma_chan *mic_ch = mdev->dma_ch[0]; if (!mic_ch) { err = -EBUSY; @@ -62,7 +61,7 @@ static int mic_sync_dma(struct mic_device *mdev, dma_addr_t dst, } error: if (err) - dev_err(mdev->sdev->parent, "%s %d err %d\n", + dev_err(&mdev->pdev->dev, "%s %d err %d\n", __func__, __LINE__, err); return err; } @@ -80,7 +79,7 @@ static int mic_virtio_copy_to_user(struct mic_vdev *mvdev, 
void __user *ubuf, struct mic_device *mdev = mvdev->mdev; void __iomem *dbuf = mdev->aper.va + daddr; struct mic_vringh *mvr = &mvdev->mvr[vr_idx]; - size_t dma_alignment = 1 << mdev->dma_ch->device->copy_align; + size_t dma_alignment = 1 << mdev->dma_ch[0]->device->copy_align; size_t dma_offset; size_t partlen; int err; @@ -129,7 +128,7 @@ static int mic_virtio_copy_from_user(struct mic_vdev *mvdev, void __user *ubuf, struct mic_device *mdev = mvdev->mdev; void __iomem *dbuf = mdev->aper.va + daddr; struct mic_vringh *mvr = &mvdev->mvr[vr_idx]; - size_t dma_alignment = 1 << mdev->dma_ch->device->copy_align; + size_t dma_alignment = 1 << mdev->dma_ch[0]->device->copy_align; size_t partlen; int err; @@ -440,7 +439,7 @@ void mic_virtio_reset_devices(struct mic_device *mdev) struct list_head *pos, *tmp; struct mic_vdev *mvdev; - dev_dbg(mdev->sdev->parent, "%s\n", __func__); + dev_dbg(&mdev->pdev->dev, "%s\n", __func__); list_for_each_safe(pos, tmp, &mdev->vdev_list) { mvdev = list_entry(pos, struct mic_vdev, list); @@ -686,7 +685,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev, mvr->head = USHRT_MAX; mvr->mvdev = mvdev; mvr->vrh.notify = mic_notify; - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "%s %d index %d va %p info %p vr_size 0x%x\n", __func__, __LINE__, i, vr->va, vr->info, vr_size); mvr->buf = (void *)__get_free_pages(GFP_KERNEL, @@ -704,7 +703,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev, mvdev->virtio_db, MIC_INTR_DB); if (IS_ERR(mvdev->virtio_cookie)) { ret = PTR_ERR(mvdev->virtio_cookie); - dev_dbg(mdev->sdev->parent, "request irq failed\n"); + dev_dbg(&mdev->pdev->dev, "request irq failed\n"); goto err; } @@ -720,7 +719,7 @@ int mic_virtio_add_device(struct mic_vdev *mvdev, smp_wmb(); dd->type = type; - dev_dbg(mdev->sdev->parent, "Added virtio device id %d\n", dd->type); + dev_dbg(&mdev->pdev->dev, "Added virtio device id %d\n", dd->type); db = bootparam->h2c_config_db; if (db != -1) @@ -755,7 +754,7 @@ void 
mic_virtio_del_device(struct mic_vdev *mvdev) db = bootparam->h2c_config_db; if (db == -1) goto skip_hot_remove; - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "Requesting hot remove id %d\n", mvdev->virtio_id); mvdev->dc->config_change = MIC_VIRTIO_PARAM_DEV_REMOVE; mdev->ops->send_intr(mdev, db); @@ -765,7 +764,7 @@ void mic_virtio_del_device(struct mic_vdev *mvdev) if (ret) break; } - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "Device id %d config_change %d guest_ack %d retry %d\n", mvdev->virtio_id, mvdev->dc->config_change, mvdev->dc->guest_ack, retry); @@ -794,7 +793,7 @@ skip_hot_remove: tmp_mvdev = list_entry(pos, struct mic_vdev, list); if (tmp_mvdev == mvdev) { list_del(pos); - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "Removing virtio device id %d\n", mvdev->virtio_id); break; diff --git a/kernel/drivers/misc/mic/host/mic_virtio.h b/kernel/drivers/misc/mic/host/mic_virtio.h index d574efb85..a80631f27 100644 --- a/kernel/drivers/misc/mic/host/mic_virtio.h +++ b/kernel/drivers/misc/mic/host/mic_virtio.h @@ -124,7 +124,7 @@ void mic_bh_handler(struct work_struct *work); /* Helper API to obtain the MIC PCIe device */ static inline struct device *mic_dev(struct mic_vdev *mvdev) { - return mvdev->mdev->sdev->parent; + return &mvdev->mdev->pdev->dev; } /* Helper API to check if a virtio device is initialized */ diff --git a/kernel/drivers/misc/mic/host/mic_x100.c b/kernel/drivers/misc/mic/host/mic_x100.c index b7a21e11d..8118ac48c 100644 --- a/kernel/drivers/misc/mic/host/mic_x100.c +++ b/kernel/drivers/misc/mic/host/mic_x100.c @@ -43,7 +43,7 @@ static void mic_x100_write_spad(struct mic_device *mdev, unsigned int idx, u32 val) { - dev_dbg(mdev->sdev->parent, "Writing 0x%x to scratch pad index %d\n", + dev_dbg(&mdev->pdev->dev, "Writing 0x%x to scratch pad index %d\n", val, idx); mic_mmio_write(&mdev->mmio, val, MIC_X100_SBOX_BASE_ADDRESS + @@ -66,7 +66,7 @@ mic_x100_read_spad(struct mic_device *mdev, unsigned int idx) 
MIC_X100_SBOX_BASE_ADDRESS + MIC_X100_SBOX_SPAD0 + idx * 4); - dev_dbg(mdev->sdev->parent, + dev_dbg(&mdev->pdev->dev, "Reading 0x%x from scratch pad index %d\n", val, idx); return val; } @@ -126,7 +126,7 @@ static void mic_x100_disable_interrupts(struct mic_device *mdev) * @mdev: pointer to mic_device instance */ static void mic_x100_send_sbox_intr(struct mic_device *mdev, - int doorbell) + int doorbell) { struct mic_mw *mw = &mdev->mmio; u64 apic_icr_offset = MIC_X100_SBOX_APICICR0 + doorbell * 8; @@ -147,7 +147,7 @@ static void mic_x100_send_sbox_intr(struct mic_device *mdev, * @mdev: pointer to mic_device instance */ static void mic_x100_send_rdmasr_intr(struct mic_device *mdev, - int doorbell) + int doorbell) { int rdmasr_offset = MIC_X100_SBOX_RDMASR0 + (doorbell << 2); /* Ensure that the interrupt is ordered w.r.t. previous stores. */ @@ -167,8 +167,7 @@ static void mic_x100_send_intr(struct mic_device *mdev, int doorbell) if (doorbell < MIC_X100_NUM_SBOX_IRQ) { mic_x100_send_sbox_intr(mdev, doorbell); } else { - rdmasr_db = doorbell - MIC_X100_NUM_SBOX_IRQ + - MIC_X100_RDMASR_IRQ_BASE; + rdmasr_db = doorbell - MIC_X100_NUM_SBOX_IRQ; mic_x100_send_rdmasr_intr(mdev, rdmasr_db); } } @@ -360,15 +359,14 @@ mic_x100_load_command_line(struct mic_device *mdev, const struct firmware *fw) boot_mem = mdev->aper.len >> 20; buf = kzalloc(CMDLINE_SIZE, GFP_KERNEL); - if (!buf) { - dev_err(mdev->sdev->parent, - "%s %d allocation failed\n", __func__, __LINE__); + if (!buf) return -ENOMEM; - } + len += snprintf(buf, CMDLINE_SIZE - len, " mem=%dM", boot_mem); - if (mdev->cmdline) - snprintf(buf + len, CMDLINE_SIZE - len, " %s", mdev->cmdline); + if (mdev->cosm_dev->cmdline) + snprintf(buf + len, CMDLINE_SIZE - len, " %s", + mdev->cosm_dev->cmdline); memcpy_toio(cmd_line_va, buf, strlen(buf) + 1); kfree(buf); return 0; @@ -387,12 +385,11 @@ mic_x100_load_ramdisk(struct mic_device *mdev) int rc; struct boot_params __iomem *bp = mdev->aper.va + mdev->bootaddr; - rc = 
request_firmware(&fw, - mdev->ramdisk, mdev->sdev->parent); + rc = request_firmware(&fw, mdev->cosm_dev->ramdisk, &mdev->pdev->dev); if (rc < 0) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "ramdisk request_firmware failed: %d %s\n", - rc, mdev->ramdisk); + rc, mdev->cosm_dev->ramdisk); goto error; } /* @@ -424,10 +421,10 @@ mic_x100_get_boot_addr(struct mic_device *mdev) scratch2 = mdev->ops->read_spad(mdev, MIC_X100_DOWNLOAD_INFO); boot_addr = MIC_X100_SPAD2_DOWNLOAD_ADDR(scratch2); - dev_dbg(mdev->sdev->parent, "%s %d boot_addr 0x%x\n", + dev_dbg(&mdev->pdev->dev, "%s %d boot_addr 0x%x\n", __func__, __LINE__, boot_addr); if (boot_addr > (1 << 31)) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "incorrect bootaddr 0x%x\n", boot_addr); rc = -EINVAL; @@ -455,37 +452,37 @@ mic_x100_load_firmware(struct mic_device *mdev, const char *buf) if (rc) goto error; /* load OS */ - rc = request_firmware(&fw, mdev->firmware, mdev->sdev->parent); + rc = request_firmware(&fw, mdev->cosm_dev->firmware, &mdev->pdev->dev); if (rc < 0) { - dev_err(mdev->sdev->parent, + dev_err(&mdev->pdev->dev, "ramdisk request_firmware failed: %d %s\n", - rc, mdev->firmware); + rc, mdev->cosm_dev->firmware); goto error; } if (mdev->bootaddr > mdev->aper.len - fw->size) { rc = -EINVAL; - dev_err(mdev->sdev->parent, "%s %d rc %d bootaddr 0x%x\n", + dev_err(&mdev->pdev->dev, "%s %d rc %d bootaddr 0x%x\n", __func__, __LINE__, rc, mdev->bootaddr); release_firmware(fw); goto error; } memcpy_toio(mdev->aper.va + mdev->bootaddr, fw->data, fw->size); mdev->ops->write_spad(mdev, MIC_X100_FW_SIZE, fw->size); - if (!strcmp(mdev->bootmode, "elf")) + if (!strcmp(mdev->cosm_dev->bootmode, "flash")) goto done; /* load command line */ rc = mic_x100_load_command_line(mdev, fw); if (rc) { - dev_err(mdev->sdev->parent, "%s %d rc %d\n", + dev_err(&mdev->pdev->dev, "%s %d rc %d\n", __func__, __LINE__, rc); goto error; } release_firmware(fw); /* load ramdisk */ - if (mdev->ramdisk) + if 
(mdev->cosm_dev->ramdisk) rc = mic_x100_load_ramdisk(mdev); error: - dev_dbg(mdev->sdev->parent, "%s %d rc %d\n", __func__, __LINE__, rc); + dev_dbg(&mdev->pdev->dev, "%s %d rc %d\n", __func__, __LINE__, rc); done: return rc; } diff --git a/kernel/drivers/misc/mic/scif/Makefile b/kernel/drivers/misc/mic/scif/Makefile new file mode 100644 index 000000000..29cfc3e51 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/Makefile @@ -0,0 +1,20 @@ +# +# Makefile - SCIF driver. +# Copyright(c) 2014, Intel Corporation. +# +obj-$(CONFIG_SCIF) += scif.o +scif-objs := scif_main.o +scif-objs += scif_peer_bus.o +scif-objs += scif_ports.o +scif-objs += scif_debugfs.o +scif-objs += scif_fd.o +scif-objs += scif_api.o +scif-objs += scif_epd.o +scif-objs += scif_rb.o +scif-objs += scif_nodeqp.o +scif-objs += scif_nm.o +scif-objs += scif_dma.o +scif-objs += scif_fence.o +scif-objs += scif_mmap.o +scif-objs += scif_rma.o +scif-objs += scif_rma_list.o diff --git a/kernel/drivers/misc/mic/scif/scif_api.c b/kernel/drivers/misc/mic/scif/scif_api.c new file mode 100644 index 000000000..ddc9e4b08 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_api.c @@ -0,0 +1,1496 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include <linux/scif.h> +#include "scif_main.h" +#include "scif_map.h" + +static const char * const scif_ep_states[] = { + "Unbound", + "Bound", + "Listening", + "Connected", + "Connecting", + "Mapping", + "Closing", + "Close Listening", + "Disconnected", + "Zombie"}; + +enum conn_async_state { + ASYNC_CONN_IDLE = 1, /* ep setup for async connect */ + ASYNC_CONN_INPROGRESS, /* async connect in progress */ + ASYNC_CONN_FLUSH_WORK /* async work flush in progress */ +}; + +/* + * File operations for anonymous inode file associated with a SCIF endpoint, + * used in kernel mode SCIF poll. Kernel mode SCIF poll calls portions of the + * poll API in the kernel and these take in a struct file *. Since a struct + * file is not available to kernel mode SCIF, it uses an anonymous file for + * this purpose. + */ +const struct file_operations scif_anon_fops = { + .owner = THIS_MODULE, +}; + +scif_epd_t scif_open(void) +{ + struct scif_endpt *ep; + int err; + + might_sleep(); + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (!ep) + goto err_ep_alloc; + + ep->qp_info.qp = kzalloc(sizeof(*ep->qp_info.qp), GFP_KERNEL); + if (!ep->qp_info.qp) + goto err_qp_alloc; + + err = scif_anon_inode_getfile(ep); + if (err) + goto err_anon_inode; + + spin_lock_init(&ep->lock); + mutex_init(&ep->sendlock); + mutex_init(&ep->recvlock); + + scif_rma_ep_init(ep); + ep->state = SCIFEP_UNBOUND; + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI open: ep %p success\n", ep); + return ep; + +err_anon_inode: + kfree(ep->qp_info.qp); +err_qp_alloc: + kfree(ep); +err_ep_alloc: + return NULL; +} +EXPORT_SYMBOL_GPL(scif_open); + +/* + * scif_disconnect_ep - Disconnects the endpoint if found + * @epd: The end point returned from scif_open() + */ +static struct scif_endpt *scif_disconnect_ep(struct scif_endpt *ep) +{ + struct scifmsg msg; + struct scif_endpt *fep = NULL; + struct scif_endpt *tmpep; + struct list_head *pos, *tmpq; + int err; + + /* + * Wake up any threads blocked in send()/recv() before 
closing + * out the connection. Grabbing and releasing the send/recv lock + * will ensure that any blocked senders/receivers have exited for + * Ring 0 endpoints. It is a Ring 0 bug to call send/recv after + * close. Ring 3 endpoints are not affected since close will not + * be called while there are IOCTLs executing. + */ + wake_up_interruptible(&ep->sendwq); + wake_up_interruptible(&ep->recvwq); + mutex_lock(&ep->sendlock); + mutex_unlock(&ep->sendlock); + mutex_lock(&ep->recvlock); + mutex_unlock(&ep->recvlock); + + /* Remove from the connected list */ + mutex_lock(&scif_info.connlock); + list_for_each_safe(pos, tmpq, &scif_info.connected) { + tmpep = list_entry(pos, struct scif_endpt, list); + if (tmpep == ep) { + list_del(pos); + fep = tmpep; + spin_lock(&ep->lock); + break; + } + } + + if (!fep) { + /* + * The other side has completed the disconnect before + * the end point can be removed from the list. Therefore + * the ep lock is not locked, traverse the disconnected + * list to find the endpoint and release the conn lock. 
+ */ + list_for_each_safe(pos, tmpq, &scif_info.disconnected) { + tmpep = list_entry(pos, struct scif_endpt, list); + if (tmpep == ep) { + list_del(pos); + break; + } + } + mutex_unlock(&scif_info.connlock); + return NULL; + } + + init_completion(&ep->discon); + msg.uop = SCIF_DISCNCT; + msg.src = ep->port; + msg.dst = ep->peer; + msg.payload[0] = (u64)ep; + msg.payload[1] = ep->remote_ep; + + err = scif_nodeqp_send(ep->remote_dev, &msg); + spin_unlock(&ep->lock); + mutex_unlock(&scif_info.connlock); + + if (!err) + /* Wait for the remote node to respond with SCIF_DISCNT_ACK */ + wait_for_completion_timeout(&ep->discon, + SCIF_NODE_ALIVE_TIMEOUT); + return ep; +} + +int scif_close(scif_epd_t epd) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_endpt *tmpep; + struct list_head *pos, *tmpq; + enum scif_epd_state oldstate; + bool flush_conn; + + dev_dbg(scif_info.mdev.this_device, "SCIFAPI close: ep %p %s\n", + ep, scif_ep_states[ep->state]); + might_sleep(); + spin_lock(&ep->lock); + flush_conn = (ep->conn_async_state == ASYNC_CONN_INPROGRESS); + spin_unlock(&ep->lock); + + if (flush_conn) + flush_work(&scif_info.conn_work); + + spin_lock(&ep->lock); + oldstate = ep->state; + + ep->state = SCIFEP_CLOSING; + + switch (oldstate) { + case SCIFEP_ZOMBIE: + dev_err(scif_info.mdev.this_device, + "SCIFAPI close: zombie state unexpected\n"); + case SCIFEP_DISCONNECTED: + spin_unlock(&ep->lock); + scif_unregister_all_windows(epd); + /* Remove from the disconnected list */ + mutex_lock(&scif_info.connlock); + list_for_each_safe(pos, tmpq, &scif_info.disconnected) { + tmpep = list_entry(pos, struct scif_endpt, list); + if (tmpep == ep) { + list_del(pos); + break; + } + } + mutex_unlock(&scif_info.connlock); + break; + case SCIFEP_UNBOUND: + case SCIFEP_BOUND: + case SCIFEP_CONNECTING: + spin_unlock(&ep->lock); + break; + case SCIFEP_MAPPING: + case SCIFEP_CONNECTED: + case SCIFEP_CLOSING: + { + spin_unlock(&ep->lock); + scif_unregister_all_windows(epd); + 
scif_disconnect_ep(ep); + break; + } + case SCIFEP_LISTENING: + case SCIFEP_CLLISTEN: + { + struct scif_conreq *conreq; + struct scifmsg msg; + struct scif_endpt *aep; + + spin_unlock(&ep->lock); + mutex_lock(&scif_info.eplock); + + /* remove from listen list */ + list_for_each_safe(pos, tmpq, &scif_info.listen) { + tmpep = list_entry(pos, struct scif_endpt, list); + if (tmpep == ep) + list_del(pos); + } + /* Remove any dangling accepts */ + while (ep->acceptcnt) { + aep = list_first_entry(&ep->li_accept, + struct scif_endpt, liacceptlist); + list_del(&aep->liacceptlist); + scif_put_port(aep->port.port); + list_for_each_safe(pos, tmpq, &scif_info.uaccept) { + tmpep = list_entry(pos, struct scif_endpt, + miacceptlist); + if (tmpep == aep) { + list_del(pos); + break; + } + } + mutex_unlock(&scif_info.eplock); + mutex_lock(&scif_info.connlock); + list_for_each_safe(pos, tmpq, &scif_info.connected) { + tmpep = list_entry(pos, + struct scif_endpt, list); + if (tmpep == aep) { + list_del(pos); + break; + } + } + list_for_each_safe(pos, tmpq, &scif_info.disconnected) { + tmpep = list_entry(pos, + struct scif_endpt, list); + if (tmpep == aep) { + list_del(pos); + break; + } + } + mutex_unlock(&scif_info.connlock); + scif_teardown_ep(aep); + mutex_lock(&scif_info.eplock); + scif_add_epd_to_zombie_list(aep, SCIF_EPLOCK_HELD); + ep->acceptcnt--; + } + + spin_lock(&ep->lock); + mutex_unlock(&scif_info.eplock); + + /* Remove and reject any pending connection requests. */ + while (ep->conreqcnt) { + conreq = list_first_entry(&ep->conlist, + struct scif_conreq, list); + list_del(&conreq->list); + + msg.uop = SCIF_CNCT_REJ; + msg.dst.node = conreq->msg.src.node; + msg.dst.port = conreq->msg.src.port; + msg.payload[0] = conreq->msg.payload[0]; + msg.payload[1] = conreq->msg.payload[1]; + /* + * No Error Handling on purpose for scif_nodeqp_send(). + * If the remote node is lost we still want free the + * connection requests on the self node. 
+ */ + scif_nodeqp_send(&scif_dev[conreq->msg.src.node], + &msg); + ep->conreqcnt--; + kfree(conreq); + } + + spin_unlock(&ep->lock); + /* If a kSCIF accept is waiting wake it up */ + wake_up_interruptible(&ep->conwq); + break; + } + } + scif_put_port(ep->port.port); + scif_anon_inode_fput(ep); + scif_teardown_ep(ep); + scif_add_epd_to_zombie_list(ep, !SCIF_EPLOCK_HELD); + return 0; +} +EXPORT_SYMBOL_GPL(scif_close); + +/** + * scif_flush() - Wakes up any blocking accepts. The endpoint will no longer + * accept new connections. + * @epd: The end point returned from scif_open() + */ +int __scif_flush(scif_epd_t epd) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + + switch (ep->state) { + case SCIFEP_LISTENING: + { + ep->state = SCIFEP_CLLISTEN; + + /* If an accept is waiting wake it up */ + wake_up_interruptible(&ep->conwq); + break; + } + default: + break; + } + return 0; +} + +int scif_bind(scif_epd_t epd, u16 pn) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int ret = 0; + int tmp; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI bind: ep %p %s requested port number %d\n", + ep, scif_ep_states[ep->state], pn); + if (pn) { + /* + * Similar to IETF RFC 1700, SCIF ports below + * SCIF_ADMIN_PORT_END can only be bound by system (or root) + * processes or by processes executed by privileged users. 
+ */ + if (pn < SCIF_ADMIN_PORT_END && !capable(CAP_SYS_ADMIN)) { + ret = -EACCES; + goto scif_bind_admin_exit; + } + } + + spin_lock(&ep->lock); + if (ep->state == SCIFEP_BOUND) { + ret = -EINVAL; + goto scif_bind_exit; + } else if (ep->state != SCIFEP_UNBOUND) { + ret = -EISCONN; + goto scif_bind_exit; + } + + if (pn) { + tmp = scif_rsrv_port(pn); + if (tmp != pn) { + ret = -EINVAL; + goto scif_bind_exit; + } + } else { + pn = scif_get_new_port(); + if (!pn) { + ret = -ENOSPC; + goto scif_bind_exit; + } + } + + ep->state = SCIFEP_BOUND; + ep->port.node = scif_info.nodeid; + ep->port.port = pn; + ep->conn_async_state = ASYNC_CONN_IDLE; + ret = pn; + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI bind: bound to port number %d\n", pn); +scif_bind_exit: + spin_unlock(&ep->lock); +scif_bind_admin_exit: + return ret; +} +EXPORT_SYMBOL_GPL(scif_bind); + +int scif_listen(scif_epd_t epd, int backlog) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI listen: ep %p %s\n", ep, scif_ep_states[ep->state]); + spin_lock(&ep->lock); + switch (ep->state) { + case SCIFEP_ZOMBIE: + case SCIFEP_CLOSING: + case SCIFEP_CLLISTEN: + case SCIFEP_UNBOUND: + case SCIFEP_DISCONNECTED: + spin_unlock(&ep->lock); + return -EINVAL; + case SCIFEP_LISTENING: + case SCIFEP_CONNECTED: + case SCIFEP_CONNECTING: + case SCIFEP_MAPPING: + spin_unlock(&ep->lock); + return -EISCONN; + case SCIFEP_BOUND: + break; + } + + ep->state = SCIFEP_LISTENING; + ep->backlog = backlog; + + ep->conreqcnt = 0; + ep->acceptcnt = 0; + INIT_LIST_HEAD(&ep->conlist); + init_waitqueue_head(&ep->conwq); + INIT_LIST_HEAD(&ep->li_accept); + spin_unlock(&ep->lock); + + /* + * Listen status is complete so delete the qp information not needed + * on a listen before placing on the list of listening ep's + */ + scif_teardown_ep(ep); + ep->qp_info.qp = NULL; + + mutex_lock(&scif_info.eplock); + list_add_tail(&ep->list, &scif_info.listen); + mutex_unlock(&scif_info.eplock); + 
return 0; +} +EXPORT_SYMBOL_GPL(scif_listen); + +/* + ************************************************************************ + * SCIF connection flow: + * + * 1) A SCIF listening endpoint can call scif_accept(..) to wait for SCIF + * connections via a SCIF_CNCT_REQ message + * 2) A SCIF endpoint can initiate a SCIF connection by calling + * scif_connect(..) which calls scif_setup_qp_connect(..) which + * allocates the local qp for the endpoint ring buffer and then sends + * a SCIF_CNCT_REQ to the remote node and waits for a SCIF_CNCT_GNT or + * a SCIF_CNCT_REJ message + * 3) The peer node handles a SCIF_CNCT_REQ via scif_cnctreq_resp(..) which + * wakes up any threads blocked in step 1 or sends a SCIF_CNCT_REJ + * message otherwise + * 4) A thread blocked waiting for incoming connections allocates its local + * endpoint QP and ring buffer following which it sends a SCIF_CNCT_GNT + * and waits for a SCIF_CNCT_GNT(N)ACK. If the allocation fails then + * the node sends a SCIF_CNCT_REJ message + * 5) Upon receipt of a SCIF_CNCT_GNT or a SCIF_CNCT_REJ message the + * connecting endpoint is woken up as part of handling + * scif_cnctgnt_resp(..) following which it maps the remote endpoints' + * QP, updates its outbound QP and sends a SCIF_CNCT_GNTACK message on + * success or a SCIF_CNCT_GNTNACK message on failure and completes + * the scif_connect(..) API + * 6) Upon receipt of a SCIF_CNCT_GNT(N)ACK the accepting endpoint blocked + * in step 4 is woken up and completes the scif_accept(..) API + * 7) The SCIF connection is now established between the two SCIF endpoints. 
+ */ +static int scif_conn_func(struct scif_endpt *ep) +{ + int err = 0; + struct scifmsg msg; + struct device *spdev; + + err = scif_reserve_dma_chan(ep); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + ep->state = SCIFEP_BOUND; + goto connect_error_simple; + } + /* Initiate the first part of the endpoint QP setup */ + err = scif_setup_qp_connect(ep->qp_info.qp, &ep->qp_info.qp_offset, + SCIF_ENDPT_QP_SIZE, ep->remote_dev); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s err %d qp_offset 0x%llx\n", + __func__, err, ep->qp_info.qp_offset); + ep->state = SCIFEP_BOUND; + goto connect_error_simple; + } + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + goto cleanup_qp; + } + /* Format connect message and send it */ + msg.src = ep->port; + msg.dst = ep->conn_port; + msg.uop = SCIF_CNCT_REQ; + msg.payload[0] = (u64)ep; + msg.payload[1] = ep->qp_info.qp_offset; + err = _scif_nodeqp_send(ep->remote_dev, &msg); + if (err) + goto connect_error_dec; + scif_put_peer_dev(spdev); + /* + * Wait for the remote node to respond with SCIF_CNCT_GNT or + * SCIF_CNCT_REJ message. 
+ */ + err = wait_event_timeout(ep->conwq, ep->state != SCIFEP_CONNECTING, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d timeout\n", __func__, __LINE__); + ep->state = SCIFEP_BOUND; + } + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + goto cleanup_qp; + } + if (ep->state == SCIFEP_MAPPING) { + err = scif_setup_qp_connect_response(ep->remote_dev, + ep->qp_info.qp, + ep->qp_info.gnt_pld); + /* + * If the resources to map the queue are not available then + * we need to tell the other side to terminate the accept + */ + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + msg.uop = SCIF_CNCT_GNTNACK; + msg.payload[0] = ep->remote_ep; + _scif_nodeqp_send(ep->remote_dev, &msg); + ep->state = SCIFEP_BOUND; + goto connect_error_dec; + } + + msg.uop = SCIF_CNCT_GNTACK; + msg.payload[0] = ep->remote_ep; + err = _scif_nodeqp_send(ep->remote_dev, &msg); + if (err) { + ep->state = SCIFEP_BOUND; + goto connect_error_dec; + } + ep->state = SCIFEP_CONNECTED; + mutex_lock(&scif_info.connlock); + list_add_tail(&ep->list, &scif_info.connected); + mutex_unlock(&scif_info.connlock); + dev_dbg(&ep->remote_dev->sdev->dev, + "SCIFAPI connect: ep %p connected\n", ep); + } else if (ep->state == SCIFEP_BOUND) { + dev_dbg(&ep->remote_dev->sdev->dev, + "SCIFAPI connect: ep %p connection refused\n", ep); + err = -ECONNREFUSED; + goto connect_error_dec; + } + scif_put_peer_dev(spdev); + return err; +connect_error_dec: + scif_put_peer_dev(spdev); +cleanup_qp: + scif_cleanup_ep_qp(ep); +connect_error_simple: + return err; +} + +/* + * scif_conn_handler: + * + * Workqueue handler for servicing non-blocking SCIF connects. It drains + * scif_info.nb_connect_list, runs scif_conn_func() on each queued + * endpoint, records the result in ep->conn_err and wakes up any waiter + * on ep->conn_pend_wq. + */ +void scif_conn_handler(struct work_struct *work) +{ + struct scif_endpt *ep; + + do { + ep = NULL; + spin_lock(&scif_info.nb_connect_lock); + if (!list_empty(&scif_info.nb_connect_list)) { + ep = list_first_entry(&scif_info.nb_connect_list, + 
struct scif_endpt, conn_list); + list_del(&ep->conn_list); + } + spin_unlock(&scif_info.nb_connect_lock); + if (ep) { + ep->conn_err = scif_conn_func(ep); + wake_up_interruptible(&ep->conn_pend_wq); + } + } while (ep); +} + +int __scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + struct scif_dev *remote_dev; + struct device *spdev; + + dev_dbg(scif_info.mdev.this_device, "SCIFAPI connect: ep %p %s\n", ep, + scif_ep_states[ep->state]); + + if (!scif_dev || dst->node > scif_info.maxid) + return -ENODEV; + + might_sleep(); + + remote_dev = &scif_dev[dst->node]; + spdev = scif_get_peer_dev(remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + return err; + } + + spin_lock(&ep->lock); + switch (ep->state) { + case SCIFEP_ZOMBIE: + case SCIFEP_CLOSING: + err = -EINVAL; + break; + case SCIFEP_DISCONNECTED: + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + ep->conn_async_state = ASYNC_CONN_FLUSH_WORK; + else + err = -EINVAL; + break; + case SCIFEP_LISTENING: + case SCIFEP_CLLISTEN: + err = -EOPNOTSUPP; + break; + case SCIFEP_CONNECTING: + case SCIFEP_MAPPING: + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + err = -EINPROGRESS; + else + err = -EISCONN; + break; + case SCIFEP_CONNECTED: + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) + ep->conn_async_state = ASYNC_CONN_FLUSH_WORK; + else + err = -EISCONN; + break; + case SCIFEP_UNBOUND: + ep->port.port = scif_get_new_port(); + if (!ep->port.port) { + err = -ENOSPC; + } else { + ep->port.node = scif_info.nodeid; + ep->conn_async_state = ASYNC_CONN_IDLE; + } + /* Fall through */ + case SCIFEP_BOUND: + /* + * If a non-blocking connect has been already initiated + * (conn_async_state is either ASYNC_CONN_INPROGRESS or + * ASYNC_CONN_FLUSH_WORK), the end point could end up in + * SCIFEP_BOUND due to an error in the connection process + * (e.g., connection refused). If conn_async_state is + * ASYNC_CONN_INPROGRESS - 
transition to ASYNC_CONN_FLUSH_WORK + * so that the error status can be collected. If the state is + * already ASYNC_CONN_FLUSH_WORK - then set the error to + * EINPROGRESS since some other thread is waiting to collect + * error status. + */ + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + ep->conn_async_state = ASYNC_CONN_FLUSH_WORK; + } else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) { + err = -EINPROGRESS; + } else { + ep->conn_port = *dst; + init_waitqueue_head(&ep->sendwq); + init_waitqueue_head(&ep->recvwq); + init_waitqueue_head(&ep->conwq); + ep->conn_async_state = 0; + + if (unlikely(non_block)) + ep->conn_async_state = ASYNC_CONN_INPROGRESS; + } + break; + } + + if (err || ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) + goto connect_simple_unlock1; + + ep->state = SCIFEP_CONNECTING; + ep->remote_dev = &scif_dev[dst->node]; + ep->qp_info.qp->magic = SCIFEP_MAGIC; + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + init_waitqueue_head(&ep->conn_pend_wq); + spin_lock(&scif_info.nb_connect_lock); + list_add_tail(&ep->conn_list, &scif_info.nb_connect_list); + spin_unlock(&scif_info.nb_connect_lock); + err = -EINPROGRESS; + schedule_work(&scif_info.conn_work); + } +connect_simple_unlock1: + spin_unlock(&ep->lock); + scif_put_peer_dev(spdev); + if (err) { + return err; + } else if (ep->conn_async_state == ASYNC_CONN_FLUSH_WORK) { + flush_work(&scif_info.conn_work); + err = ep->conn_err; + spin_lock(&ep->lock); + ep->conn_async_state = ASYNC_CONN_IDLE; + spin_unlock(&ep->lock); + } else { + err = scif_conn_func(ep); + } + return err; +} + +int scif_connect(scif_epd_t epd, struct scif_port_id *dst) +{ + return __scif_connect(epd, dst, false); +} +EXPORT_SYMBOL_GPL(scif_connect); + +/** + * scif_accept() - Accept a connection request from the remote node + * + * The function accepts a connection request from the remote node. 
Successful + * complete is indicate by a new end point being created and passed back + * to the caller for future reference. + * + * Upon successful complete a zero will be returned and the peer information + * will be filled in. + * + * If the end point is not in the listening state -EINVAL will be returned. + * + * If during the connection sequence resource allocation fails the -ENOMEM + * will be returned. + * + * If the function is called with the ASYNC flag set and no connection requests + * are pending it will return -EAGAIN. + * + * If the remote side is not sending any connection requests the caller may + * terminate this function with a signal. If so a -EINTR will be returned. + */ +int scif_accept(scif_epd_t epd, struct scif_port_id *peer, + scif_epd_t *newepd, int flags) +{ + struct scif_endpt *lep = (struct scif_endpt *)epd; + struct scif_endpt *cep; + struct scif_conreq *conreq; + struct scifmsg msg; + int err; + struct device *spdev; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI accept: ep %p %s\n", lep, scif_ep_states[lep->state]); + + if (flags & ~SCIF_ACCEPT_SYNC) + return -EINVAL; + + if (!peer || !newepd) + return -EINVAL; + + might_sleep(); + spin_lock(&lep->lock); + if (lep->state != SCIFEP_LISTENING) { + spin_unlock(&lep->lock); + return -EINVAL; + } + + if (!lep->conreqcnt && !(flags & SCIF_ACCEPT_SYNC)) { + /* No connection request present and we do not want to wait */ + spin_unlock(&lep->lock); + return -EAGAIN; + } + + lep->files = current->files; +retry_connection: + spin_unlock(&lep->lock); + /* Wait for the remote node to send us a SCIF_CNCT_REQ */ + err = wait_event_interruptible(lep->conwq, + (lep->conreqcnt || + (lep->state != SCIFEP_LISTENING))); + if (err) + return err; + + if (lep->state != SCIFEP_LISTENING) + return -EINTR; + + spin_lock(&lep->lock); + + if (!lep->conreqcnt) + goto retry_connection; + + /* Get the first connect request off the list */ + conreq = list_first_entry(&lep->conlist, struct scif_conreq, list); + 
list_del(&conreq->list); + lep->conreqcnt--; + spin_unlock(&lep->lock); + + /* Fill in the peer information */ + peer->node = conreq->msg.src.node; + peer->port = conreq->msg.src.port; + + cep = kzalloc(sizeof(*cep), GFP_KERNEL); + if (!cep) { + err = -ENOMEM; + goto scif_accept_error_epalloc; + } + spin_lock_init(&cep->lock); + mutex_init(&cep->sendlock); + mutex_init(&cep->recvlock); + cep->state = SCIFEP_CONNECTING; + cep->remote_dev = &scif_dev[peer->node]; + cep->remote_ep = conreq->msg.payload[0]; + + scif_rma_ep_init(cep); + + err = scif_reserve_dma_chan(cep); + if (err) { + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + goto scif_accept_error_qpalloc; + } + + cep->qp_info.qp = kzalloc(sizeof(*cep->qp_info.qp), GFP_KERNEL); + if (!cep->qp_info.qp) { + err = -ENOMEM; + goto scif_accept_error_qpalloc; + } + + err = scif_anon_inode_getfile(cep); + if (err) + goto scif_accept_error_anon_inode; + + cep->qp_info.qp->magic = SCIFEP_MAGIC; + spdev = scif_get_peer_dev(cep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + goto scif_accept_error_map; + } + err = scif_setup_qp_accept(cep->qp_info.qp, &cep->qp_info.qp_offset, + conreq->msg.payload[1], SCIF_ENDPT_QP_SIZE, + cep->remote_dev); + if (err) { + dev_dbg(&cep->remote_dev->sdev->dev, + "SCIFAPI accept: ep %p new %p scif_setup_qp_accept %d qp_offset 0x%llx\n", + lep, cep, err, cep->qp_info.qp_offset); + scif_put_peer_dev(spdev); + goto scif_accept_error_map; + } + + cep->port.node = lep->port.node; + cep->port.port = lep->port.port; + cep->peer.node = peer->node; + cep->peer.port = peer->port; + init_waitqueue_head(&cep->sendwq); + init_waitqueue_head(&cep->recvwq); + init_waitqueue_head(&cep->conwq); + + msg.uop = SCIF_CNCT_GNT; + msg.src = cep->port; + msg.payload[0] = cep->remote_ep; + msg.payload[1] = cep->qp_info.qp_offset; + msg.payload[2] = (u64)cep; + + err = _scif_nodeqp_send(cep->remote_dev, &msg); + scif_put_peer_dev(spdev); + if (err) + goto 
scif_accept_error_map; +retry: + /* + * Wait for the remote node to respond with SCIF_CNCT_GNT(N)ACK. On a + * timeout keep waiting for as long as scifdev_alive() still reports + * the peer as alive. + */ + err = wait_event_timeout(cep->conwq, cep->state != SCIFEP_CONNECTING, + SCIF_NODE_ACCEPT_TIMEOUT); + if (!err && scifdev_alive(cep)) + goto retry; + err = !err ? -ENODEV : 0; + if (err) + goto scif_accept_error_map; + kfree(conreq); + + spin_lock(&cep->lock); + + if (cep->state == SCIFEP_CLOSING) { + /* + * Remote failed to allocate resources and NAKed the grant. + * There is at this point nothing referencing the new end point. + */ + spin_unlock(&cep->lock); + scif_teardown_ep(cep); + kfree(cep); + + /* If called with the sync flag then go back and wait. */ + if (flags & SCIF_ACCEPT_SYNC) { + spin_lock(&lep->lock); + goto retry_connection; + } + return -EAGAIN; + } + + scif_get_port(cep->port.port); + *newepd = (scif_epd_t)cep; + spin_unlock(&cep->lock); + return 0; +scif_accept_error_map: + scif_anon_inode_fput(cep); +scif_accept_error_anon_inode: + scif_teardown_ep(cep); +scif_accept_error_qpalloc: + kfree(cep); +scif_accept_error_epalloc: + msg.uop = SCIF_CNCT_REJ; + msg.dst.node = conreq->msg.src.node; + msg.dst.port = conreq->msg.src.port; + msg.payload[0] = conreq->msg.payload[0]; + msg.payload[1] = conreq->msg.payload[1]; + scif_nodeqp_send(&scif_dev[conreq->msg.src.node], &msg); + kfree(conreq); + return err; +} +EXPORT_SYMBOL_GPL(scif_accept); + +/* + * scif_msg_param_check: + * @epd: The end point returned from scif_open() + * @len: Length to receive + * @flags: blocking or non blocking + * + * Validate parameters for messaging APIs scif_send(..)/scif_recv(..). 
+ */ +static inline int scif_msg_param_check(scif_epd_t epd, int len, int flags) +{ + int ret = -EINVAL; + + if (len < 0) + goto err_ret; + if (flags && (!(flags & SCIF_RECV_BLOCK))) + goto err_ret; + ret = 0; +err_ret: + return ret; +} + +static int _scif_send(scif_epd_t epd, void *msg, int len, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scifmsg notif_msg; + int curr_xfer_len = 0, sent_len = 0, write_count; + int ret = 0; + struct scif_qp *qp = ep->qp_info.qp; + + if (flags & SCIF_SEND_BLOCK) + might_sleep(); + + spin_lock(&ep->lock); + while (sent_len != len && SCIFEP_CONNECTED == ep->state) { + write_count = scif_rb_space(&qp->outbound_q); + if (write_count) { + /* Best effort to send as much data as possible */ + curr_xfer_len = min(len - sent_len, write_count); + ret = scif_rb_write(&qp->outbound_q, msg, + curr_xfer_len); + if (ret < 0) + break; + /* Success. Update write pointer */ + scif_rb_commit(&qp->outbound_q); + /* + * Send a notification to the peer about the + * produced data message. + */ + notif_msg.src = ep->port; + notif_msg.uop = SCIF_CLIENT_SENT; + notif_msg.payload[0] = ep->remote_ep; + ret = _scif_nodeqp_send(ep->remote_dev, &notif_msg); + if (ret) + break; + sent_len += curr_xfer_len; + msg = msg + curr_xfer_len; + continue; + } + curr_xfer_len = min(len - sent_len, SCIF_ENDPT_QP_SIZE - 1); + /* Not enough RB space. return for the Non Blocking case */ + if (!(flags & SCIF_SEND_BLOCK)) + break; + + spin_unlock(&ep->lock); + /* Wait for a SCIF_CLIENT_RCVD message in the Blocking case */ + ret = + wait_event_interruptible(ep->sendwq, + (SCIFEP_CONNECTED != ep->state) || + (scif_rb_space(&qp->outbound_q) >= + curr_xfer_len)); + spin_lock(&ep->lock); + if (ret) + break; + } + if (sent_len) + ret = sent_len; + else if (!ret && SCIFEP_CONNECTED != ep->state) + ret = SCIFEP_DISCONNECTED == ep->state ? 
+ -ECONNRESET : -ENOTCONN; + spin_unlock(&ep->lock); + return ret; +} + +static int _scif_recv(scif_epd_t epd, void *msg, int len, int flags) +{ + int read_size; + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scifmsg notif_msg; + int curr_recv_len = 0, remaining_len = len, read_count; + int ret = 0; + struct scif_qp *qp = ep->qp_info.qp; + + if (flags & SCIF_RECV_BLOCK) + might_sleep(); + spin_lock(&ep->lock); + while (remaining_len && (SCIFEP_CONNECTED == ep->state || + SCIFEP_DISCONNECTED == ep->state)) { + read_count = scif_rb_count(&qp->inbound_q, remaining_len); + if (read_count) { + /* + * Best effort to recv as much data as there + * are bytes to read in the RB particularly + * important for the Non Blocking case. + */ + curr_recv_len = min(remaining_len, read_count); + read_size = scif_rb_get_next(&qp->inbound_q, + msg, curr_recv_len); + if (ep->state == SCIFEP_CONNECTED) { + /* + * Update the read pointer only if the endpoint + * is still connected else the read pointer + * might no longer exist since the peer has + * freed resources! + */ + scif_rb_update_read_ptr(&qp->inbound_q); + /* + * Send a notification to the peer about the + * consumed data message only if the EP is in + * SCIFEP_CONNECTED state. + */ + notif_msg.src = ep->port; + notif_msg.uop = SCIF_CLIENT_RCVD; + notif_msg.payload[0] = ep->remote_ep; + ret = _scif_nodeqp_send(ep->remote_dev, + &notif_msg); + if (ret) + break; + } + remaining_len -= curr_recv_len; + msg = msg + curr_recv_len; + continue; + } + /* + * Bail out now if the EP is in SCIFEP_DISCONNECTED state else + * we will keep looping forever. + */ + if (ep->state == SCIFEP_DISCONNECTED) + break; + /* + * Return in the Non Blocking case if there is no data + * to read in this iteration. 
+ */ + if (!(flags & SCIF_RECV_BLOCK)) + break; + curr_recv_len = min(remaining_len, SCIF_ENDPT_QP_SIZE - 1); + spin_unlock(&ep->lock); + /* + * Wait for a SCIF_CLIENT_SENT message in the blocking case + * or until other side disconnects. + */ + ret = + wait_event_interruptible(ep->recvwq, + SCIFEP_CONNECTED != ep->state || + scif_rb_count(&qp->inbound_q, + curr_recv_len) + >= curr_recv_len); + spin_lock(&ep->lock); + if (ret) + break; + } + if (len - remaining_len) + ret = len - remaining_len; + else if (!ret && ep->state != SCIFEP_CONNECTED) + ret = ep->state == SCIFEP_DISCONNECTED ? + -ECONNRESET : -ENOTCONN; + spin_unlock(&ep->lock); + return ret; +} + +/** + * scif_user_send() - Send data to connection queue + * @epd: The end point returned from scif_open() + * @msg: User space address of the data to send + * @len: Length to send + * @flags: blocking or non blocking + * + * This function is called from the driver IOCTL entry point + * only and is a wrapper for _scif_send(). 
+ */ +int scif_user_send(scif_epd_t epd, void __user *msg, int len, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + int sent_len = 0; + char *tmp; + int loop_len; + int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1))); + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI send (U): ep %p %s\n", ep, scif_ep_states[ep->state]); + if (!len) + return 0; + + err = scif_msg_param_check(epd, len, flags); + if (err) + goto send_err; + + tmp = kmalloc(chunk_len, GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto send_err; + } + /* + * Grabbing the lock before breaking up the transfer in + * multiple chunks is required to ensure that messages do + * not get fragmented and reordered. 
+ */ + mutex_lock(&ep->sendlock); + while (sent_len != len) { + loop_len = len - sent_len; + loop_len = min(chunk_len, loop_len); + if (copy_from_user(tmp, msg, loop_len)) { + err = -EFAULT; + goto send_free_err; + } + err = _scif_send(epd, tmp, loop_len, flags); + if (err < 0) + goto send_free_err; + sent_len += err; + msg += err; + if (err != loop_len) + goto send_free_err; + } +send_free_err: + mutex_unlock(&ep->sendlock); + kfree(tmp); +send_err: + return err < 0 ? err : sent_len; +} + +/** + * scif_user_recv() - Receive data from connection queue + * @epd: The end point returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: blocking or non blocking + * + * This function is called from the driver IOCTL entry point + * only and is a wrapper for _scif_recv(). + */ +int scif_user_recv(scif_epd_t epd, void __user *msg, int len, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + int recv_len = 0; + char *tmp; + int loop_len; + int chunk_len = min(len, (1 << (MAX_ORDER + PAGE_SHIFT - 1))); + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI recv (U): ep %p %s\n", ep, scif_ep_states[ep->state]); + if (!len) + return 0; + + err = scif_msg_param_check(epd, len, flags); + if (err) + goto recv_err; + + tmp = kmalloc(chunk_len, GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto recv_err; + } + /* + * Grabbing the lock before breaking up the transfer in + * multiple chunks is required to ensure that messages do + * not get fragmented and reordered. 
+ */ + mutex_lock(&ep->recvlock); + while (recv_len != len) { + loop_len = len - recv_len; + loop_len = min(chunk_len, loop_len); + err = _scif_recv(epd, tmp, loop_len, flags); + if (err < 0) + goto recv_free_err; + if (copy_to_user(msg, tmp, err)) { + err = -EFAULT; + goto recv_free_err; + } + recv_len += err; + msg += err; + if (err != loop_len) + goto recv_free_err; + } +recv_free_err: + mutex_unlock(&ep->recvlock); + kfree(tmp); +recv_err: + return err < 0 ? err : recv_len; +} + +/** + * scif_send() - Send data to connection queue + * @epd: The end point returned from scif_open() + * @msg: Address of the data to send + * @len: Length to send + * @flags: blocking or non blocking + * + * This function is called from the kernel mode only and is + * a wrapper for _scif_send(). + */ +int scif_send(scif_epd_t epd, void *msg, int len, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int ret; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI send (K): ep %p %s\n", ep, scif_ep_states[ep->state]); + if (!len) + return 0; + + ret = scif_msg_param_check(epd, len, flags); + if (ret) + return ret; + if (!ep->remote_dev) + return -ENOTCONN; + /* + * Grab the mutex lock in the blocking case only + * to ensure messages do not get fragmented/reordered. + * The non blocking mode is protected using spin locks + * in _scif_send(). + */ + if (flags & SCIF_SEND_BLOCK) + mutex_lock(&ep->sendlock); + + ret = _scif_send(epd, msg, len, flags); + + if (flags & SCIF_SEND_BLOCK) + mutex_unlock(&ep->sendlock); + return ret; +} +EXPORT_SYMBOL_GPL(scif_send); + +/** + * scif_recv() - Receive data from connection queue + * @epd: The end point returned from scif_open() + * @msg: Address to place data + * @len: Length to receive + * @flags: blocking or non blocking + * + * This function is called from the kernel mode only and is + * a wrapper for _scif_recv(). 
+ */ +int scif_recv(scif_epd_t epd, void *msg, int len, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int ret; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI recv (K): ep %p %s\n", ep, scif_ep_states[ep->state]); + if (!len) + return 0; + + ret = scif_msg_param_check(epd, len, flags); + if (ret) + return ret; + /* + * Grab the mutex lock in the blocking case only + * to ensure messages do not get fragmented/reordered. + * The non blocking mode is protected using spin locks + * in _scif_recv(). + */ + if (flags & SCIF_RECV_BLOCK) + mutex_lock(&ep->recvlock); + + ret = _scif_recv(epd, msg, len, flags); + + if (flags & SCIF_RECV_BLOCK) + mutex_unlock(&ep->recvlock); + + return ret; +} +EXPORT_SYMBOL_GPL(scif_recv); + +static inline void _scif_poll_wait(struct file *f, wait_queue_head_t *wq, + poll_table *p, struct scif_endpt *ep) +{ + /* + * Because poll_wait makes a GFP_KERNEL allocation, give up the lock + * and regrab it afterwards. Because the endpoint state might have + * changed while the lock was given up, the state must be checked + * again after re-acquiring the lock. The code in __scif_pollfd(..) + * does this. 
+ */ + spin_unlock(&ep->lock); + poll_wait(f, wq, p); + spin_lock(&ep->lock); +} + +unsigned int +__scif_pollfd(struct file *f, poll_table *wait, struct scif_endpt *ep) +{ + unsigned int mask = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI pollfd: ep %p %s\n", ep, scif_ep_states[ep->state]); + + spin_lock(&ep->lock); + + /* Endpoint is waiting for a non-blocking connect to complete */ + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + _scif_poll_wait(f, &ep->conn_pend_wq, wait, ep); + if (ep->conn_async_state == ASYNC_CONN_INPROGRESS) { + if (ep->state == SCIFEP_CONNECTED || + ep->state == SCIFEP_DISCONNECTED || + ep->conn_err) + mask |= POLLOUT; + goto exit; + } + } + + /* Endpoint is listening for incoming connection requests */ + if (ep->state == SCIFEP_LISTENING) { + _scif_poll_wait(f, &ep->conwq, wait, ep); + if (ep->state == SCIFEP_LISTENING) { + if (ep->conreqcnt) + mask |= POLLIN; + goto exit; + } + } + + /* Endpoint is connected or disconnected */ + if (ep->state == SCIFEP_CONNECTED || ep->state == SCIFEP_DISCONNECTED) { + if (poll_requested_events(wait) & POLLIN) + _scif_poll_wait(f, &ep->recvwq, wait, ep); + if (poll_requested_events(wait) & POLLOUT) + _scif_poll_wait(f, &ep->sendwq, wait, ep); + if (ep->state == SCIFEP_CONNECTED || + ep->state == SCIFEP_DISCONNECTED) { + /* Data can be read without blocking */ + if (scif_rb_count(&ep->qp_info.qp->inbound_q, 1)) + mask |= POLLIN; + /* Data can be written without blocking */ + if (scif_rb_space(&ep->qp_info.qp->outbound_q)) + mask |= POLLOUT; + /* Return POLLHUP if endpoint is disconnected */ + if (ep->state == SCIFEP_DISCONNECTED) + mask |= POLLHUP; + goto exit; + } + } + + /* Return POLLERR if the endpoint is in none of the above states */ + mask |= POLLERR; +exit: + spin_unlock(&ep->lock); + return mask; +} + +/** + * scif_poll() - Kernel mode SCIF poll + * @ufds: Array of scif_pollepd structures containing the end points + * and events to poll on + * @nfds: Size of the ufds array + * 
@timeout_msecs: Timeout in msecs, -ve implies infinite timeout + * + * The code flow in this function is based on do_poll(..) in select.c + * + * Returns the number of endpoints which have pending events or 0 in + * the event of a timeout. If a signal is used for wake up, -EINTR is + * returned. + */ +int +scif_poll(struct scif_pollepd *ufds, unsigned int nfds, long timeout_msecs) +{ + struct poll_wqueues table; + poll_table *pt; + int i, mask, count = 0, timed_out = timeout_msecs == 0; + u64 timeout = timeout_msecs < 0 ? MAX_SCHEDULE_TIMEOUT + : msecs_to_jiffies(timeout_msecs); /* negative timeout_msecs selects MAX_SCHEDULE_TIMEOUT */ + + poll_initwait(&table); + pt = &table.pt; + while (1) { + for (i = 0; i < nfds; i++) { + pt->_key = ufds[i].events | POLLERR | POLLHUP; + mask = __scif_pollfd(ufds[i].epd->anon, + pt, ufds[i].epd); + mask &= ufds[i].events | POLLERR | POLLHUP; /* POLLERR and POLLHUP are always reported, requested or not */ + if (mask) { + count++; + pt->_qproc = NULL; + } + ufds[i].revents = mask; + } + pt->_qproc = NULL; + if (!count) { + count = table.error; + if (signal_pending(current)) + count = -EINTR; + } + if (count || timed_out) + break; + + if (!schedule_timeout_interruptible(timeout)) + timed_out = 1; + } + poll_freewait(&table); + return count; +} +EXPORT_SYMBOL_GPL(scif_poll); + +int scif_get_node_ids(u16 *nodes, int len, u16 *self) +{ + int online = 0; + int offset = 0; + int node; + + if (!scif_is_mgmt_node()) + scif_get_node_info(); + + *self = scif_info.nodeid; + mutex_lock(&scif_info.conflock); + len = min_t(int, len, scif_info.total); + for (node = 0; node <= scif_info.maxid; node++) { + if (_scifdev_alive(&scif_dev[node])) { + online++; + if (offset < len) + nodes[offset++] = node; + } + } + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI get_node_ids total %d online %d filled in %d nodes\n", + scif_info.total, online, offset); + mutex_unlock(&scif_info.conflock); + + return online; /* count of alive nodes; may exceed the number of entries stored in nodes[] */ +} +EXPORT_SYMBOL_GPL(scif_get_node_ids); + +static int scif_add_client_dev(struct device *dev, struct subsys_interface *si) +{ + struct scif_client *client = + 
container_of(si, struct scif_client, si); + struct scif_peer_dev *spdev = + container_of(dev, struct scif_peer_dev, dev); + + if (client->probe) + client->probe(spdev); + return 0; /* success is reported whether or not a probe() callback ran */ +} + +static void scif_remove_client_dev(struct device *dev, + struct subsys_interface *si) +{ + struct scif_client *client = + container_of(si, struct scif_client, si); + struct scif_peer_dev *spdev = + container_of(dev, struct scif_peer_dev, dev); + + if (client->remove) + client->remove(spdev); +} + +void scif_client_unregister(struct scif_client *client) +{ + subsys_interface_unregister(&client->si); +} +EXPORT_SYMBOL_GPL(scif_client_unregister); + +int scif_client_register(struct scif_client *client) +{ + struct subsys_interface *si = &client->si; + + si->name = client->name; + si->subsys = &scif_peer_bus; + si->add_dev = scif_add_client_dev; + si->remove_dev = scif_remove_client_dev; + + return subsys_interface_register(&client->si); +} +EXPORT_SYMBOL_GPL(scif_client_register); diff --git a/kernel/drivers/misc/mic/scif/scif_debugfs.c b/kernel/drivers/misc/mic/scif/scif_debugfs.c new file mode 100644 index 000000000..6884dad97 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_debugfs.c @@ -0,0 +1,162 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "../common/mic_dev.h" +#include "scif_main.h" + +/* Parent debugfs directory under which all SCIF debug entries are created */ +static struct dentry *scif_dbg; + +static int scif_dev_test(struct seq_file *s, void *unused) +{ + int node; + + seq_printf(s, "Total Nodes %d Self Node Id %d Maxid %d\n", + scif_info.total, scif_info.nodeid, + scif_info.maxid); + + if (!scif_dev) + return 0; + + seq_printf(s, "%-16s\t%-16s\n", "node_id", "state"); + + for (node = 0; node <= scif_info.maxid; node++) + seq_printf(s, "%-16d\t%-16s\n", scif_dev[node].node, + _scifdev_alive(&scif_dev[node]) ? + "Running" : "Offline"); + return 0; +} + +static int scif_dev_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_dev_test, inode->i_private); +} + +static int scif_dev_test_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} + +static const struct file_operations scif_dev_ops = { + .owner = THIS_MODULE, + .open = scif_dev_test_open, + .read = seq_read, + .llseek = seq_lseek, + .release = scif_dev_test_release +}; + +static void scif_display_window(struct scif_window *window, struct seq_file *s) +{ + int j; + struct scatterlist *sg; + scif_pinned_pages_t pin = window->pinned_pages; + + seq_printf(s, "window %p type %d temp %d offset 0x%llx ", + window, window->type, window->temp, window->offset); + seq_printf(s, "nr_pages 0x%llx nr_contig_chunks 0x%x prot %d ", + window->nr_pages, window->nr_contig_chunks, window->prot); + seq_printf(s, "ref_count %d magic 0x%llx peer_window 0x%llx ", + window->ref_count, window->magic, window->peer_window); + seq_printf(s, "unreg_state 0x%x va_for_temp 0x%lx\n", + window->unreg_state, window->va_for_temp); + + for (j = 0; j < window->nr_contig_chunks; j++) + seq_printf(s, "page[%d] dma_addr 0x%llx num_pages 0x%llx\n", j, + window->dma_addr[j], window->num_pages[j]); + + if (window->type == SCIF_WINDOW_SELF && pin) + for (j = 0; j < window->nr_pages; j++) + 
seq_printf(s, "page[%d] = pinned_pages %p address %p\n", + j, pin->pages[j], + page_address(pin->pages[j])); + + if (window->st) + for_each_sg(window->st->sgl, sg, window->st->nents, j) + seq_printf(s, "sg[%d] dma addr 0x%llx length 0x%x\n", + j, sg_dma_address(sg), sg_dma_len(sg)); +} + +static void scif_display_all_windows(struct list_head *head, struct seq_file *s) +{ + struct list_head *item; + struct scif_window *window; + + list_for_each(item, head) { + window = list_entry(item, struct scif_window, list); + scif_display_window(window, s); + } +} + +static int scif_rma_test(struct seq_file *s, void *unused) +{ + struct scif_endpt *ep; + struct list_head *pos; + + mutex_lock(&scif_info.connlock); /* held for the whole walk of scif_info.connected */ + list_for_each(pos, &scif_info.connected) { + ep = list_entry(pos, struct scif_endpt, list); + seq_printf(s, "ep %p self windows\n", ep); + mutex_lock(&ep->rma_info.rma_lock); + scif_display_all_windows(&ep->rma_info.reg_list, s); + seq_printf(s, "ep %p remote windows\n", ep); + scif_display_all_windows(&ep->rma_info.remote_reg_list, s); + mutex_unlock(&ep->rma_info.rma_lock); + } + mutex_unlock(&scif_info.connlock); + return 0; +} + +static int scif_rma_test_open(struct inode *inode, struct file *file) +{ + return single_open(file, scif_rma_test, inode->i_private); +} + +static int scif_rma_test_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} + +static const struct file_operations scif_rma_ops = { + .owner = THIS_MODULE, + .open = scif_rma_test_open, + .read = seq_read, + .llseek = seq_lseek, + .release = scif_rma_test_release +}; + +void __init scif_init_debugfs(void) +{ + scif_dbg = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (!scif_dbg) { + dev_err(scif_info.mdev.this_device, + "can't create debugfs dir scif\n"); + return; + } + + debugfs_create_file("scif_dev", 0444, scif_dbg, NULL, &scif_dev_ops); + debugfs_create_file("scif_rma", 0444, scif_dbg, NULL, &scif_rma_ops); + debugfs_create_u8("en_msg_log", 0666, 
scif_dbg, &scif_info.en_msg_log); + debugfs_create_u8("p2p_enable", 0666, scif_dbg, &scif_info.p2p_enable); +} + +void scif_exit_debugfs(void) +{ + debugfs_remove_recursive(scif_dbg); +} diff --git a/kernel/drivers/misc/mic/scif/scif_dma.c b/kernel/drivers/misc/mic/scif/scif_dma.c new file mode 100644 index 000000000..95a13c629 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_dma.c @@ -0,0 +1,1979 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "scif_main.h" +#include "scif_map.h" + +/* + * struct scif_dma_comp_cb - SCIF DMA completion callback + * + * @dma_completion_func: DMA completion callback + * @cb_cookie: DMA completion callback cookie + * @temp_buf: Temporary buffer + * @temp_buf_to_free: Temporary buffer to be freed + * @is_cache: Is a kmem_cache allocated buffer + * @dst_offset: Destination registration offset + * @dst_window: Destination registration window + * @len: Length of the temp buffer + * @temp_phys: DMA address of the temp buffer + * @sdev: The SCIF device + * @header_padding: padding for cache line alignment + */ +struct scif_dma_comp_cb { + void (*dma_completion_func)(void *cookie); + void *cb_cookie; + u8 *temp_buf; + u8 *temp_buf_to_free; + bool is_cache; + s64 dst_offset; + struct scif_window *dst_window; + size_t len; + dma_addr_t temp_phys; + struct scif_dev *sdev; + int header_padding; +}; + +/** + * struct scif_copy_work - Work for DMA copy + * + * @src_offset: Starting source offset + * 
@dst_offset: Starting destination offset + * @src_window: Starting src registered window + * @dst_window: Starting dst registered window + * @loopback: true if this is a loopback DMA transfer + * @len: Length of the transfer + * @comp_cb: DMA copy completion callback + * @remote_dev: The remote SCIF peer device + * @fence_type: polling or interrupt based + * @ordered: is this a tail byte ordered DMA transfer + */ +struct scif_copy_work { + s64 src_offset; + s64 dst_offset; + struct scif_window *src_window; + struct scif_window *dst_window; + int loopback; + size_t len; + struct scif_dma_comp_cb *comp_cb; + struct scif_dev *remote_dev; + int fence_type; + bool ordered; +}; + +#ifndef list_entry_next +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) +#endif + +/** + * scif_reserve_dma_chan: + * @ep: Endpoint Descriptor. + * + * This routine reserves a DMA channel for a particular + * endpoint. All DMA transfers for an endpoint are always + * programmed on the same DMA channel. 
+ */ +int scif_reserve_dma_chan(struct scif_endpt *ep) +{ + int err = 0; + struct scif_dev *scifdev; + struct scif_hw_dev *sdev; + struct dma_chan *chan; + + /* Loopback DMAs are not supported on the management node */ + if (!scif_info.nodeid && scifdev_self(ep->remote_dev)) + return 0; + if (scif_info.nodeid) + scifdev = &scif_dev[0]; + else + scifdev = ep->remote_dev; + sdev = scifdev->sdev; + if (!sdev->num_dma_ch) + return -ENODEV; + chan = sdev->dma_ch[scifdev->dma_ch_idx]; + scifdev->dma_ch_idx = (scifdev->dma_ch_idx + 1) % sdev->num_dma_ch; + mutex_lock(&ep->rma_info.rma_lock); + ep->rma_info.dma_chan = chan; + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +#ifdef CONFIG_MMU_NOTIFIER +/** + * scif_rma_destroy_tcw: + * + * This routine destroys temporary cached windows + */ +static +void __scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, + struct scif_endpt *ep, + u64 start, u64 len) +{ + struct list_head *item, *tmp; + struct scif_window *window; + u64 start_va, end_va; + u64 end = start + len; + + if (end <= start) + return; + + list_for_each_safe(item, tmp, &mmn->tc_reg_list) { + window = list_entry(item, struct scif_window, list); + ep = (struct scif_endpt *)window->ep; + if (!len) + break; + start_va = window->va_for_temp; + end_va = start_va + (window->nr_pages << PAGE_SHIFT); + if (start < start_va && end <= start_va) + break; + if (start >= end_va) + continue; + __scif_rma_destroy_tcw_helper(window); + } +} + +static void scif_rma_destroy_tcw(struct scif_mmu_notif *mmn, u64 start, u64 len) +{ + struct scif_endpt *ep = mmn->ep; + + spin_lock(&ep->rma_info.tc_lock); + __scif_rma_destroy_tcw(mmn, ep, start, len); + spin_unlock(&ep->rma_info.tc_lock); +} + +static void scif_rma_destroy_tcw_ep(struct scif_endpt *ep) +{ + struct list_head *item, *tmp; + struct scif_mmu_notif *mmn; + + list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); + } +} + 
+static void __scif_rma_destroy_tcw_ep(struct scif_endpt *ep) +{ + struct list_head *item, *tmp; + struct scif_mmu_notif *mmn; + + spin_lock(&ep->rma_info.tc_lock); + list_for_each_safe(item, tmp, &ep->rma_info.mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + __scif_rma_destroy_tcw(mmn, ep, 0, ULONG_MAX); + } + spin_unlock(&ep->rma_info.tc_lock); +} + +static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes) +{ + if ((cur_bytes >> PAGE_SHIFT) > scif_info.rma_tc_limit) + return false; + if ((atomic_read(&ep->rma_info.tcw_total_pages) + + (cur_bytes >> PAGE_SHIFT)) > + scif_info.rma_tc_limit) { + dev_info(scif_info.mdev.this_device, + "%s %d total=%d, current=%zu reached max\n", + __func__, __LINE__, + atomic_read(&ep->rma_info.tcw_total_pages), + (1 + (cur_bytes >> PAGE_SHIFT))); + scif_rma_destroy_tcw_invalid(); + __scif_rma_destroy_tcw_ep(ep); + } + return true; +} + +static void scif_mmu_notifier_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct scif_mmu_notif *mmn; + + mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); + scif_rma_destroy_tcw(mmn, 0, ULONG_MAX); + schedule_work(&scif_info.misc_work); +} + +static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct scif_mmu_notif *mmn; + + mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); + scif_rma_destroy_tcw(mmn, address, PAGE_SIZE); +} + +static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct scif_mmu_notif *mmn; + + mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); + scif_rma_destroy_tcw(mmn, start, end - start); +} + +static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + /* + * Nothing to do here, everything needed was done in + * 
invalidate_range_start. + */ +} + +static const struct mmu_notifier_ops scif_mmu_notifier_ops = { + .release = scif_mmu_notifier_release, + .clear_flush_young = NULL, + .invalidate_page = scif_mmu_notifier_invalidate_page, + .invalidate_range_start = scif_mmu_notifier_invalidate_range_start, + .invalidate_range_end = scif_mmu_notifier_invalidate_range_end}; + +static void scif_ep_unregister_mmu_notifier(struct scif_endpt *ep) +{ + struct scif_endpt_rma_info *rma = &ep->rma_info; + struct scif_mmu_notif *mmn = NULL; + struct list_head *item, *tmp; + + mutex_lock(&ep->rma_info.mmn_lock); + list_for_each_safe(item, tmp, &rma->mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + mmu_notifier_unregister(&mmn->ep_mmu_notifier, mmn->mm); + list_del(item); + kfree(mmn); + } + mutex_unlock(&ep->rma_info.mmn_lock); +} + +static void scif_init_mmu_notifier(struct scif_mmu_notif *mmn, + struct mm_struct *mm, struct scif_endpt *ep) +{ + mmn->ep = ep; + mmn->mm = mm; + mmn->ep_mmu_notifier.ops = &scif_mmu_notifier_ops; + INIT_LIST_HEAD(&mmn->list); + INIT_LIST_HEAD(&mmn->tc_reg_list); +} + +static struct scif_mmu_notif * +scif_find_mmu_notifier(struct mm_struct *mm, struct scif_endpt_rma_info *rma) +{ + struct scif_mmu_notif *mmn; + struct list_head *item; + + list_for_each(item, &rma->mmn_list) { + mmn = list_entry(item, struct scif_mmu_notif, list); + if (mmn->mm == mm) + return mmn; + } + return NULL; +} + +static struct scif_mmu_notif * +scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep) +{ + struct scif_mmu_notif *mmn + = kzalloc(sizeof(*mmn), GFP_KERNEL); + + if (!mmn) + return ERR_PTR(ENOMEM); + + scif_init_mmu_notifier(mmn, current->mm, ep); + if (mmu_notifier_register(&mmn->ep_mmu_notifier, + current->mm)) { + kfree(mmn); + return ERR_PTR(EBUSY); + } + list_add(&mmn->list, &ep->rma_info.mmn_list); + return mmn; +} + +/* + * Called from the misc thread to destroy temporary cached windows and + * unregister the MMU notifier for the SCIF 
endpoint. + */ +void scif_mmu_notif_handler(struct work_struct *work) +{ + struct list_head *pos, *tmpq; + struct scif_endpt *ep; +restart: + scif_rma_destroy_tcw_invalid(); + spin_lock(&scif_info.rmalock); + list_for_each_safe(pos, tmpq, &scif_info.mmu_notif_cleanup) { + ep = list_entry(pos, struct scif_endpt, mmu_list); + list_del(&ep->mmu_list); + spin_unlock(&scif_info.rmalock); + scif_rma_destroy_tcw_ep(ep); + scif_ep_unregister_mmu_notifier(ep); + goto restart; + } + spin_unlock(&scif_info.rmalock); +} + +static bool scif_is_set_reg_cache(int flags) +{ + return !!(flags & SCIF_RMA_USECACHE); +} +#else +static struct scif_mmu_notif * +scif_find_mmu_notifier(struct mm_struct *mm, + struct scif_endpt_rma_info *rma) +{ + return NULL; +} + +static struct scif_mmu_notif * +scif_add_mmu_notifier(struct mm_struct *mm, struct scif_endpt *ep) +{ + return NULL; +} + +void scif_mmu_notif_handler(struct work_struct *work) +{ +} + +static bool scif_is_set_reg_cache(int flags) +{ + return false; +} + +static bool scif_rma_tc_can_cache(struct scif_endpt *ep, size_t cur_bytes) +{ + return false; +} +#endif + +/** + * scif_register_temp: + * @epd: End Point Descriptor. + * @addr: virtual address to/from which to copy + * @len: length of range to copy + * @out_offset: computed offset returned by reference. + * @out_window: allocated registered window returned by reference. + * + * Create a temporary registered window. The peer will not know about this + * window. This API is used for scif_vreadfrom()/scif_vwriteto() API's. 
+ */ +static int +scif_register_temp(scif_epd_t epd, unsigned long addr, size_t len, int prot, + off_t *out_offset, struct scif_window **out_window) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err; + scif_pinned_pages_t pinned_pages; + size_t aligned_len; + + aligned_len = ALIGN(len, PAGE_SIZE); + + err = __scif_pin_pages((void *)(addr & PAGE_MASK), + aligned_len, &prot, 0, &pinned_pages); + if (err) + return err; + + pinned_pages->prot = prot; + + /* Compute the offset for this registration */ + err = scif_get_window_offset(ep, 0, 0, + aligned_len >> PAGE_SHIFT, + (s64 *)out_offset); + if (err) + goto error_unpin; + + /* Allocate and prepare self registration window */ + *out_window = scif_create_window(ep, aligned_len >> PAGE_SHIFT, + *out_offset, true); + if (!*out_window) { + scif_free_window_offset(ep, NULL, *out_offset); + err = -ENOMEM; + goto error_unpin; + } + + (*out_window)->pinned_pages = pinned_pages; + (*out_window)->nr_pages = pinned_pages->nr_pages; + (*out_window)->prot = pinned_pages->prot; + + (*out_window)->va_for_temp = addr & PAGE_MASK; + err = scif_map_window(ep->remote_dev, *out_window); + if (err) { + /* Something went wrong! Rollback */ + scif_destroy_window(ep, *out_window); + *out_window = NULL; + } else { + *out_offset |= (addr - (*out_window)->va_for_temp); + } + return err; +error_unpin: + if (err) + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + scif_unpin_pages(pinned_pages); + return err; +} + +#define SCIF_DMA_TO (3 * HZ) + +/* + * scif_sync_dma - Program a DMA without an interrupt descriptor + * + * @dev - The address of the pointer to the device instance used + * for DMA registration. + * @chan - DMA channel to be used. + * @sync_wait: Wait for DMA to complete? + * + * Return 0 on success and -errno on error. 
+ */ +static int scif_sync_dma(struct scif_hw_dev *sdev, struct dma_chan *chan, + bool sync_wait) +{ + int err = 0; + struct dma_async_tx_descriptor *tx = NULL; + enum dma_ctrl_flags flags = DMA_PREP_FENCE; + dma_cookie_t cookie; + struct dma_device *ddev; + + if (!chan) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + ddev = chan->device; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags); + if (!tx) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + cookie = tx->tx_submit(tx); + + if (dma_submit_error(cookie)) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + if (!sync_wait) { + dma_async_issue_pending(chan); + } else { + if (dma_sync_wait(chan, cookie) == DMA_COMPLETE) { + err = 0; + } else { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + } + } +release: + return err; +} + +static void scif_dma_callback(void *arg) +{ + struct completion *done = (struct completion *)arg; + + complete(done); +} + +#define SCIF_DMA_SYNC_WAIT true +#define SCIF_DMA_POLL BIT(0) +#define SCIF_DMA_INTR BIT(1) + +/* + * scif_async_dma - Program a DMA with an interrupt descriptor + * + * @dev - The address of the pointer to the device instance used + * for DMA registration. + * @chan - DMA channel to be used. + * Return 0 on success and -errno on error. 
+ */ +static int scif_async_dma(struct scif_hw_dev *sdev, struct dma_chan *chan) +{ + int err = 0; + struct dma_device *ddev; + struct dma_async_tx_descriptor *tx = NULL; + enum dma_ctrl_flags flags = DMA_PREP_INTERRUPT | DMA_PREP_FENCE; + DECLARE_COMPLETION_ONSTACK(done_wait); + dma_cookie_t cookie; + enum dma_status status; + + if (!chan) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + ddev = chan->device; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, flags); + if (!tx) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + reinit_completion(&done_wait); + tx->callback = scif_dma_callback; + tx->callback_param = &done_wait; + cookie = tx->tx_submit(tx); + + if (dma_submit_error(cookie)) { + err = -ENOMEM; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + dma_async_issue_pending(chan); + + err = wait_for_completion_timeout(&done_wait, SCIF_DMA_TO); + if (!err) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } + err = 0; + status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); + if (status != DMA_COMPLETE) { + err = -EIO; + dev_err(&sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto release; + } +release: + return err; +} + +/* + * scif_drain_dma_poll - Drain all outstanding DMA operations for a particular + * DMA channel via polling. + * + * @sdev - The SCIF device + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +static int scif_drain_dma_poll(struct scif_hw_dev *sdev, struct dma_chan *chan) +{ + if (!chan) + return -EINVAL; + return scif_sync_dma(sdev, chan, SCIF_DMA_SYNC_WAIT); +} + +/* + * scif_drain_dma_intr - Drain all outstanding DMA operations for a particular + * DMA channel via interrupt based blocking wait. 
+ * + * @sdev - The SCIF device + * @chan - DMA channel + * Return 0 on success and -errno on error. + */ +int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan) +{ + if (!chan) + return -EINVAL; + return scif_async_dma(sdev, chan); +} + +/** + * scif_rma_destroy_windows: + * + * This routine destroys all windows queued for cleanup + */ +void scif_rma_destroy_windows(void) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep; + struct dma_chan *chan; + + might_sleep(); +restart: + spin_lock(&scif_info.rmalock); + list_for_each_safe(item, tmp, &scif_info.rma) { + window = list_entry(item, struct scif_window, + list); + ep = (struct scif_endpt *)window->ep; + chan = ep->rma_info.dma_chan; + + list_del_init(&window->list); + spin_unlock(&scif_info.rmalock); + if (!chan || !scifdev_alive(ep) || + !scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan)) + /* Remove window from global list */ + window->unreg_state = OP_COMPLETED; + else + dev_warn(&ep->remote_dev->sdev->dev, + "DMA engine hung?\n"); + if (window->unreg_state == OP_COMPLETED) { + if (window->type == SCIF_WINDOW_SELF) + scif_destroy_window(ep, window); + else + scif_destroy_remote_window(window); + atomic_dec(&ep->rma_info.tw_refcount); + } + goto restart; + } + spin_unlock(&scif_info.rmalock); +} + +/** + * scif_rma_destroy_tcw: + * + * This routine destroys temporary cached registered windows + * which have been queued for cleanup. 
+ */ +void scif_rma_destroy_tcw_invalid(void) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep; + struct dma_chan *chan; + + might_sleep(); +restart: + spin_lock(&scif_info.rmalock); + list_for_each_safe(item, tmp, &scif_info.rma_tc) { + window = list_entry(item, struct scif_window, list); + ep = (struct scif_endpt *)window->ep; + chan = ep->rma_info.dma_chan; + list_del_init(&window->list); + spin_unlock(&scif_info.rmalock); + mutex_lock(&ep->rma_info.rma_lock); + if (!chan || !scifdev_alive(ep) || + !scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan)) { + atomic_sub(window->nr_pages, + &ep->rma_info.tcw_total_pages); + scif_destroy_window(ep, window); + atomic_dec(&ep->rma_info.tcw_refcount); + } else { + dev_warn(&ep->remote_dev->sdev->dev, + "DMA engine hung?\n"); + } + mutex_unlock(&ep->rma_info.rma_lock); + goto restart; + } + spin_unlock(&scif_info.rmalock); +} + +static inline +void *_get_local_va(off_t off, struct scif_window *window, size_t len) +{ + int page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + void *va = NULL; + + if (window->type == SCIF_WINDOW_SELF) { + struct page **pages = window->pinned_pages->pages; + + va = page_address(pages[page_nr]) + page_off; + } + return va; +} + +static inline +void *ioremap_remote(off_t off, struct scif_window *window, + size_t len, struct scif_dev *dev, + struct scif_window_iter *iter) +{ + dma_addr_t phys = scif_off_to_dma_addr(window, off, NULL, iter); + + /* + * If the DMA address is not card relative then we need the DMA + * addresses to be an offset into the bar. 
The aperture base was already + * added so subtract it here since scif_ioremap is going to add it again + */ + if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER && + dev->sdev->aper && !dev->sdev->card_rel_da) + phys = phys - dev->sdev->aper->pa; + return scif_ioremap(phys, len, dev); +} + +static inline void +iounmap_remote(void *virt, size_t size, struct scif_copy_work *work) +{ + scif_iounmap(virt, size, work->remote_dev); +} + +/* + * Takes care of ordering issue caused by + * 1. Hardware: Only in the case of cpu copy from mgmt node to card + * because of WC memory. + * 2. Software: If memcpy reorders copy instructions for optimization. + * This could happen at both mgmt node and card. + */ +static inline void +scif_ordered_memcpy_toio(char *dst, const char *src, size_t count) +{ + if (!count) + return; + + memcpy_toio((void __iomem __force *)dst, src, --count); + /* Order the last byte with the previous stores */ + wmb(); + *(dst + count) = *(src + count); +} + +static inline void scif_unaligned_cpy_toio(char *dst, const char *src, + size_t count, bool ordered) +{ + if (ordered) + scif_ordered_memcpy_toio(dst, src, count); + else + memcpy_toio((void __iomem __force *)dst, src, count); +} + +static inline +void scif_ordered_memcpy_fromio(char *dst, const char *src, size_t count) +{ + if (!count) + return; + + memcpy_fromio(dst, (void __iomem __force *)src, --count); + /* Order the last byte with the previous loads */ + rmb(); + *(dst + count) = *(src + count); +} + +static inline void scif_unaligned_cpy_fromio(char *dst, const char *src, + size_t count, bool ordered) +{ + if (ordered) + scif_ordered_memcpy_fromio(dst, src, count); + else + memcpy_fromio(dst, (void __iomem __force *)src, count); +} + +#define SCIF_RMA_ERROR_CODE (~(dma_addr_t)0x0) + +/* + * scif_off_to_dma_addr: + * Obtain the dma_addr given the window and the offset. + * @window: Registered window. + * @off: Window offset. 
+ * @nr_bytes: Return the number of contiguous bytes till next DMA addr index. + * @index: Return the index of the dma_addr array found. + * @start_off: start offset of index of the dma addr array found. + * The nr_bytes provides the callee an estimate of the maximum possible + * DMA xfer possible while the index/start_off provide faster lookups + * for the next iteration. + */ +dma_addr_t scif_off_to_dma_addr(struct scif_window *window, s64 off, + size_t *nr_bytes, struct scif_window_iter *iter) +{ + int i, page_nr; + s64 start, end; + off_t page_off; + + if (window->nr_pages == window->nr_contig_chunks) { + page_nr = (off - window->offset) >> PAGE_SHIFT; + page_off = off & ~PAGE_MASK; + + if (nr_bytes) + *nr_bytes = PAGE_SIZE - page_off; + return window->dma_addr[page_nr] | page_off; + } + if (iter) { + i = iter->index; + start = iter->offset; + } else { + i = 0; + start = window->offset; + } + for (; i < window->nr_contig_chunks; i++) { + end = start + (window->num_pages[i] << PAGE_SHIFT); + if (off >= start && off < end) { + if (iter) { + iter->index = i; + iter->offset = start; + } + if (nr_bytes) + *nr_bytes = end - off; + return (window->dma_addr[i] + (off - start)); + } + start += (window->num_pages[i] << PAGE_SHIFT); + } + dev_err(scif_info.mdev.this_device, + "%s %d BUG. Addr not found? 
window %p off 0x%llx\n", + __func__, __LINE__, window, off); + return SCIF_RMA_ERROR_CODE; +} + +/* + * Copy between rma window and temporary buffer + */ +static void scif_rma_local_cpu_copy(s64 offset, struct scif_window *window, + u8 *temp, size_t rem_len, bool to_temp) +{ + void *window_virt; + size_t loop_len; + int offset_in_page; + s64 end_offset; + + offset_in_page = offset & ~PAGE_MASK; + loop_len = PAGE_SIZE - offset_in_page; + + if (rem_len < loop_len) + loop_len = rem_len; + + window_virt = _get_local_va(offset, window, loop_len); + if (!window_virt) + return; + if (to_temp) + memcpy(temp, window_virt, loop_len); + else + memcpy(window_virt, temp, loop_len); + + offset += loop_len; + temp += loop_len; + rem_len -= loop_len; + + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + while (rem_len) { + if (offset == end_offset) { + window = list_entry_next(window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + loop_len = min(PAGE_SIZE, rem_len); + window_virt = _get_local_va(offset, window, loop_len); + if (!window_virt) + return; + if (to_temp) + memcpy(temp, window_virt, loop_len); + else + memcpy(window_virt, temp, loop_len); + offset += loop_len; + temp += loop_len; + rem_len -= loop_len; + } +} + +/** + * scif_rma_completion_cb: + * @data: RMA cookie + * + * RMA interrupt completion callback. + */ +static void scif_rma_completion_cb(void *data) +{ + struct scif_dma_comp_cb *comp_cb = data; + + /* Free DMA Completion CB. 
*/ + if (comp_cb->dst_window) + scif_rma_local_cpu_copy(comp_cb->dst_offset, + comp_cb->dst_window, + comp_cb->temp_buf + + comp_cb->header_padding, + comp_cb->len, false); + scif_unmap_single(comp_cb->temp_phys, comp_cb->sdev, + SCIF_KMEM_UNALIGNED_BUF_SIZE); + if (comp_cb->is_cache) + kmem_cache_free(unaligned_cache, + comp_cb->temp_buf_to_free); + else + kfree(comp_cb->temp_buf_to_free); +} + +/* Copies between temporary buffer and offsets provided in work */ +static int +scif_rma_list_dma_copy_unaligned(struct scif_copy_work *work, + u8 *temp, struct dma_chan *chan, + bool src_local) +{ + struct scif_dma_comp_cb *comp_cb = work->comp_cb; + dma_addr_t window_dma_addr, temp_dma_addr; + dma_addr_t temp_phys = comp_cb->temp_phys; + size_t loop_len, nr_contig_bytes = 0, remaining_len = work->len; + int offset_in_ca, ret = 0; + s64 end_offset, offset; + struct scif_window *window; + void *window_virt_addr; + size_t tail_len; + struct dma_async_tx_descriptor *tx; + struct dma_device *dev = chan->device; + dma_cookie_t cookie; + + if (src_local) { + offset = work->dst_offset; + window = work->dst_window; + } else { + offset = work->src_offset; + window = work->src_window; + } + + offset_in_ca = offset & (L1_CACHE_BYTES - 1); + if (offset_in_ca) { + loop_len = L1_CACHE_BYTES - offset_in_ca; + loop_len = min(loop_len, remaining_len); + window_virt_addr = ioremap_remote(offset, window, + loop_len, + work->remote_dev, + NULL); + if (!window_virt_addr) + return -ENOMEM; + if (src_local) + scif_unaligned_cpy_toio(window_virt_addr, temp, + loop_len, + work->ordered && + !(remaining_len - loop_len)); + else + scif_unaligned_cpy_fromio(temp, window_virt_addr, + loop_len, work->ordered && + !(remaining_len - loop_len)); + iounmap_remote(window_virt_addr, loop_len, work); + + offset += loop_len; + temp += loop_len; + temp_phys += loop_len; + remaining_len -= loop_len; + } + + offset_in_ca = offset & ~PAGE_MASK; + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + 
+ tail_len = remaining_len & (L1_CACHE_BYTES - 1); + remaining_len -= tail_len; + while (remaining_len) { + if (offset == end_offset) { + window = list_entry_next(window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + if (scif_is_mgmt_node()) + temp_dma_addr = temp_phys; + else + /* Fix if we ever enable IOMMU on the card */ + temp_dma_addr = (dma_addr_t)virt_to_phys(temp); + window_dma_addr = scif_off_to_dma_addr(window, offset, + &nr_contig_bytes, + NULL); + loop_len = min(nr_contig_bytes, remaining_len); + if (src_local) { + if (work->ordered && !tail_len && + !(remaining_len - loop_len) && + loop_len != L1_CACHE_BYTES) { + /* + * Break up the last chunk of the transfer into + * two steps. if there is no tail to guarantee + * DMA ordering. SCIF_DMA_POLLING inserts + * a status update descriptor in step 1 which + * acts as a double sided synchronization fence + * for the DMA engine to ensure that the last + * cache line in step 2 is updated last. + */ + /* Step 1) DMA: Body Length - L1_CACHE_BYTES. 
*/ + tx = + dev->device_prep_dma_memcpy(chan, + window_dma_addr, + temp_dma_addr, + loop_len - + L1_CACHE_BYTES, + DMA_PREP_FENCE); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + offset += (loop_len - L1_CACHE_BYTES); + temp_dma_addr += (loop_len - L1_CACHE_BYTES); + window_dma_addr += (loop_len - L1_CACHE_BYTES); + remaining_len -= (loop_len - L1_CACHE_BYTES); + loop_len = remaining_len; + + /* Step 2) DMA: L1_CACHE_BYTES */ + tx = + dev->device_prep_dma_memcpy(chan, + window_dma_addr, + temp_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } else { + tx = + dev->device_prep_dma_memcpy(chan, + window_dma_addr, + temp_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } + } else { + tx = dev->device_prep_dma_memcpy(chan, temp_dma_addr, + window_dma_addr, loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } + if (ret < 0) + goto err; + offset += loop_len; + temp += loop_len; + temp_phys += loop_len; + remaining_len -= loop_len; + offset_in_ca = 0; + } + if (tail_len) { + if (offset == end_offset) { + window = list_entry_next(window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + } + window_virt_addr = ioremap_remote(offset, window, tail_len, + work->remote_dev, + NULL); + if (!window_virt_addr) + return -ENOMEM; + /* + * The CPU copy for the tail bytes must be initiated only once + * previous DMA transfers for this endpoint have completed + * to guarantee ordering. 
+ */ + if (work->ordered) { + struct scif_dev *rdev = work->remote_dev; + + ret = scif_drain_dma_intr(rdev->sdev, chan); + if (ret) + return ret; + } + if (src_local) + scif_unaligned_cpy_toio(window_virt_addr, temp, + tail_len, work->ordered); + else + scif_unaligned_cpy_fromio(temp, window_virt_addr, + tail_len, work->ordered); + iounmap_remote(window_virt_addr, tail_len, work); + } + tx = dev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_INTERRUPT); + if (!tx) { + ret = -ENOMEM; + return ret; + } + tx->callback = &scif_rma_completion_cb; + tx->callback_param = comp_cb; + cookie = tx->tx_submit(tx); + + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + return ret; + } + dma_async_issue_pending(chan); + return 0; +err: + dev_err(scif_info.mdev.this_device, + "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; +} + +/* + * _scif_rma_list_dma_copy_aligned: + * + * Traverse all the windows and perform DMA copy. + */ +static int _scif_rma_list_dma_copy_aligned(struct scif_copy_work *work, + struct dma_chan *chan) +{ + dma_addr_t src_dma_addr, dst_dma_addr; + size_t loop_len, remaining_len, src_contig_bytes = 0; + size_t dst_contig_bytes = 0; + struct scif_window_iter src_win_iter; + struct scif_window_iter dst_win_iter; + s64 end_src_offset, end_dst_offset; + struct scif_window *src_window = work->src_window; + struct scif_window *dst_window = work->dst_window; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + int ret = 0; + struct dma_async_tx_descriptor *tx; + struct dma_device *dev = chan->device; + dma_cookie_t cookie; + + remaining_len = work->len; + + scif_init_window_iter(src_window, &src_win_iter); + scif_init_window_iter(dst_window, &dst_win_iter); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + while (remaining_len) { + if (src_offset == end_src_offset) { + src_window = list_entry_next(src_window, 
list); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(src_window, &src_win_iter); + } + if (dst_offset == end_dst_offset) { + dst_window = list_entry_next(dst_window, list); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(dst_window, &dst_win_iter); + } + + /* compute dma addresses for transfer */ + src_dma_addr = scif_off_to_dma_addr(src_window, src_offset, + &src_contig_bytes, + &src_win_iter); + dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset, + &dst_contig_bytes, + &dst_win_iter); + loop_len = min(src_contig_bytes, dst_contig_bytes); + loop_len = min(loop_len, remaining_len); + if (work->ordered && !(remaining_len - loop_len)) { + /* + * Break up the last chunk of the transfer into two + * steps to ensure that the last byte in step 2 is + * updated last. + */ + /* Step 1) DMA: Body Length - 1 */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len - 1, + DMA_PREP_FENCE); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + src_offset += (loop_len - 1); + dst_offset += (loop_len - 1); + src_dma_addr += (loop_len - 1); + dst_dma_addr += (loop_len - 1); + remaining_len -= (loop_len - 1); + loop_len = remaining_len; + + /* Step 2) DMA: 1 BYTES */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } else { + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + } + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } 
+ return ret; +err: + dev_err(scif_info.mdev.this_device, + "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; +} + +/* + * scif_rma_list_dma_copy_aligned: + * + * Traverse all the windows and perform DMA copy. + */ +static int scif_rma_list_dma_copy_aligned(struct scif_copy_work *work, + struct dma_chan *chan) +{ + dma_addr_t src_dma_addr, dst_dma_addr; + size_t loop_len, remaining_len, tail_len, src_contig_bytes = 0; + size_t dst_contig_bytes = 0; + int src_cache_off; + s64 end_src_offset, end_dst_offset; + struct scif_window_iter src_win_iter; + struct scif_window_iter dst_win_iter; + void *src_virt, *dst_virt; + struct scif_window *src_window = work->src_window; + struct scif_window *dst_window = work->dst_window; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + int ret = 0; + struct dma_async_tx_descriptor *tx; + struct dma_device *dev = chan->device; + dma_cookie_t cookie; + + remaining_len = work->len; + scif_init_window_iter(src_window, &src_win_iter); + scif_init_window_iter(dst_window, &dst_win_iter); + + src_cache_off = src_offset & (L1_CACHE_BYTES - 1); + if (src_cache_off != 0) { + /* Head */ + loop_len = L1_CACHE_BYTES - src_cache_off; + loop_len = min(loop_len, remaining_len); + src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset); + dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset); + if (src_window->type == SCIF_WINDOW_SELF) + src_virt = _get_local_va(src_offset, src_window, + loop_len); + else + src_virt = ioremap_remote(src_offset, src_window, + loop_len, + work->remote_dev, NULL); + if (!src_virt) + return -ENOMEM; + if (dst_window->type == SCIF_WINDOW_SELF) + dst_virt = _get_local_va(dst_offset, dst_window, + loop_len); + else + dst_virt = ioremap_remote(dst_offset, dst_window, + loop_len, + work->remote_dev, NULL); + if (!dst_virt) { + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + return -ENOMEM; + } + if (src_window->type == 
SCIF_WINDOW_SELF) + scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len, + remaining_len == loop_len ? + work->ordered : false); + else + scif_unaligned_cpy_fromio(dst_virt, src_virt, loop_len, + remaining_len == loop_len ? + work->ordered : false); + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + if (dst_window->type != SCIF_WINDOW_SELF) + iounmap_remote(dst_virt, loop_len, work); + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } + + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + tail_len = remaining_len & (L1_CACHE_BYTES - 1); + remaining_len -= tail_len; + while (remaining_len) { + if (src_offset == end_src_offset) { + src_window = list_entry_next(src_window, list); + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(src_window, &src_win_iter); + } + if (dst_offset == end_dst_offset) { + dst_window = list_entry_next(dst_window, list); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + scif_init_window_iter(dst_window, &dst_win_iter); + } + + /* compute dma addresses for transfer */ + src_dma_addr = scif_off_to_dma_addr(src_window, src_offset, + &src_contig_bytes, + &src_win_iter); + dst_dma_addr = scif_off_to_dma_addr(dst_window, dst_offset, + &dst_contig_bytes, + &dst_win_iter); + loop_len = min(src_contig_bytes, dst_contig_bytes); + loop_len = min(loop_len, remaining_len); + if (work->ordered && !tail_len && + !(remaining_len - loop_len)) { + /* + * Break up the last chunk of the transfer into two + * steps if there is no tail, to guarantee DMA ordering. + * Step 1 is submitted with DMA_PREP_FENCE, which is + * expected to act as a double sided synchronization + * fence for the DMA engine to ensure that the last + * cache line in step 2 is updated last. 
+ */ + /* Step 1) DMA: Body Length - L1_CACHE_BYTES. */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len - + L1_CACHE_BYTES, + DMA_PREP_FENCE); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + src_offset += (loop_len - L1_CACHE_BYTES); + dst_offset += (loop_len - L1_CACHE_BYTES); + src_dma_addr += (loop_len - L1_CACHE_BYTES); + dst_dma_addr += (loop_len - L1_CACHE_BYTES); + remaining_len -= (loop_len - L1_CACHE_BYTES); + loop_len = remaining_len; + + /* Step 2) DMA: L1_CACHE_BYTES */ + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } else { + tx = dev->device_prep_dma_memcpy(chan, dst_dma_addr, + src_dma_addr, + loop_len, 0); + if (!tx) { + ret = -ENOMEM; + goto err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + ret = -ENOMEM; + goto err; + } + dma_async_issue_pending(chan); + } + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + } + remaining_len = tail_len; + if (remaining_len) { + loop_len = remaining_len; + if (src_offset == end_src_offset) + src_window = list_entry_next(src_window, list); + if (dst_offset == end_dst_offset) + dst_window = list_entry_next(dst_window, list); + + src_dma_addr = __scif_off_to_dma_addr(src_window, src_offset); + dst_dma_addr = __scif_off_to_dma_addr(dst_window, dst_offset); + /* + * The CPU copy for the tail bytes must be initiated only once + * previous DMA transfers for this endpoint have completed to + * guarantee ordering. 
+ */ + if (work->ordered) { + struct scif_dev *rdev = work->remote_dev; + + ret = scif_drain_dma_poll(rdev->sdev, chan); + if (ret) + return ret; + } + if (src_window->type == SCIF_WINDOW_SELF) + src_virt = _get_local_va(src_offset, src_window, + loop_len); + else + src_virt = ioremap_remote(src_offset, src_window, + loop_len, + work->remote_dev, NULL); + if (!src_virt) + return -ENOMEM; + + if (dst_window->type == SCIF_WINDOW_SELF) + dst_virt = _get_local_va(dst_offset, dst_window, + loop_len); + else + dst_virt = ioremap_remote(dst_offset, dst_window, + loop_len, + work->remote_dev, NULL); + if (!dst_virt) { + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + return -ENOMEM; + } + + if (src_window->type == SCIF_WINDOW_SELF) + scif_unaligned_cpy_toio(dst_virt, src_virt, loop_len, + work->ordered); + else + scif_unaligned_cpy_fromio(dst_virt, src_virt, + loop_len, work->ordered); + if (src_window->type != SCIF_WINDOW_SELF) + iounmap_remote(src_virt, loop_len, work); + + if (dst_window->type != SCIF_WINDOW_SELF) + iounmap_remote(dst_virt, loop_len, work); + remaining_len -= loop_len; + } + return ret; +err: + dev_err(scif_info.mdev.this_device, + "%s %d Desc Prog Failed ret %d\n", + __func__, __LINE__, ret); + return ret; +} + +/* + * scif_rma_list_cpu_copy: + * + * Traverse all the windows and perform CPU copy. 
+ */ +static int scif_rma_list_cpu_copy(struct scif_copy_work *work) +{ + void *src_virt, *dst_virt; + size_t loop_len, remaining_len; + int src_page_off, dst_page_off; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + struct scif_window *src_window = work->src_window; + struct scif_window *dst_window = work->dst_window; + s64 end_src_offset, end_dst_offset; + int ret = 0; + struct scif_window_iter src_win_iter; + struct scif_window_iter dst_win_iter; + + remaining_len = work->len; + + scif_init_window_iter(src_window, &src_win_iter); + scif_init_window_iter(dst_window, &dst_win_iter); + while (remaining_len) { + src_page_off = src_offset & ~PAGE_MASK; + dst_page_off = dst_offset & ~PAGE_MASK; + loop_len = min(PAGE_SIZE - + max(src_page_off, dst_page_off), + remaining_len); + + if (src_window->type == SCIF_WINDOW_SELF) + src_virt = _get_local_va(src_offset, src_window, + loop_len); + else + src_virt = ioremap_remote(src_offset, src_window, + loop_len, + work->remote_dev, + &src_win_iter); + if (!src_virt) { + ret = -ENOMEM; + goto error; + } + + if (dst_window->type == SCIF_WINDOW_SELF) + dst_virt = _get_local_va(dst_offset, dst_window, + loop_len); + else + dst_virt = ioremap_remote(dst_offset, dst_window, + loop_len, + work->remote_dev, + &dst_win_iter); + if (!dst_virt) { + if (src_window->type == SCIF_WINDOW_PEER) + iounmap_remote(src_virt, loop_len, work); + ret = -ENOMEM; + goto error; + } + + if (work->loopback) { + memcpy(dst_virt, src_virt, loop_len); + } else { + if (src_window->type == SCIF_WINDOW_SELF) + memcpy_toio((void __iomem __force *)dst_virt, + src_virt, loop_len); + else + memcpy_fromio(dst_virt, + (void __iomem __force *)src_virt, + loop_len); + } + if (src_window->type == SCIF_WINDOW_PEER) + iounmap_remote(src_virt, loop_len, work); + + if (dst_window->type == SCIF_WINDOW_PEER) + iounmap_remote(dst_virt, loop_len, work); + + src_offset += loop_len; + dst_offset += loop_len; + remaining_len -= loop_len; + if (remaining_len) 
{ + end_src_offset = src_window->offset + + (src_window->nr_pages << PAGE_SHIFT); + end_dst_offset = dst_window->offset + + (dst_window->nr_pages << PAGE_SHIFT); + if (src_offset == end_src_offset) { + src_window = list_entry_next(src_window, list); + scif_init_window_iter(src_window, + &src_win_iter); + } + if (dst_offset == end_dst_offset) { + dst_window = list_entry_next(dst_window, list); + scif_init_window_iter(dst_window, + &dst_win_iter); + } + } + } +error: + return ret; +} + +static int scif_rma_list_dma_copy_wrapper(struct scif_endpt *epd, + struct scif_copy_work *work, + struct dma_chan *chan, off_t loffset) +{ + int src_cache_off, dst_cache_off; + s64 src_offset = work->src_offset, dst_offset = work->dst_offset; + u8 *temp = NULL; + bool src_local = true, dst_local = false; + struct scif_dma_comp_cb *comp_cb; + dma_addr_t src_dma_addr, dst_dma_addr; + int err; + + if (is_dma_copy_aligned(chan->device, 1, 1, 1)) + return _scif_rma_list_dma_copy_aligned(work, chan); + + src_cache_off = src_offset & (L1_CACHE_BYTES - 1); + dst_cache_off = dst_offset & (L1_CACHE_BYTES - 1); + + if (dst_cache_off == src_cache_off) + return scif_rma_list_dma_copy_aligned(work, chan); + + if (work->loopback) + return scif_rma_list_cpu_copy(work); + src_dma_addr = __scif_off_to_dma_addr(work->src_window, src_offset); + dst_dma_addr = __scif_off_to_dma_addr(work->dst_window, dst_offset); + src_local = work->src_window->type == SCIF_WINDOW_SELF; + dst_local = work->dst_window->type == SCIF_WINDOW_SELF; + + dst_local = dst_local; + /* Allocate dma_completion cb */ + comp_cb = kzalloc(sizeof(*comp_cb), GFP_KERNEL); + if (!comp_cb) + goto error; + + work->comp_cb = comp_cb; + comp_cb->cb_cookie = comp_cb; + comp_cb->dma_completion_func = &scif_rma_completion_cb; + + if (work->len + (L1_CACHE_BYTES << 1) < SCIF_KMEM_UNALIGNED_BUF_SIZE) { + comp_cb->is_cache = false; + /* Allocate padding bytes to align to a cache line */ + temp = kmalloc(work->len + (L1_CACHE_BYTES << 1), + 
GFP_KERNEL); + if (!temp) + goto free_comp_cb; + comp_cb->temp_buf_to_free = temp; + /* kmalloc(..) does not guarantee cache line alignment */ + if (!IS_ALIGNED((u64)temp, L1_CACHE_BYTES)) + temp = PTR_ALIGN(temp, L1_CACHE_BYTES); + } else { + comp_cb->is_cache = true; + temp = kmem_cache_alloc(unaligned_cache, GFP_KERNEL); + if (!temp) + goto free_comp_cb; + comp_cb->temp_buf_to_free = temp; + } + + if (src_local) { + temp += dst_cache_off; + scif_rma_local_cpu_copy(work->src_offset, work->src_window, + temp, work->len, true); + } else { + comp_cb->dst_window = work->dst_window; + comp_cb->dst_offset = work->dst_offset; + work->src_offset = work->src_offset - src_cache_off; + comp_cb->len = work->len; + work->len = ALIGN(work->len + src_cache_off, L1_CACHE_BYTES); + comp_cb->header_padding = src_cache_off; + } + comp_cb->temp_buf = temp; + + err = scif_map_single(&comp_cb->temp_phys, temp, + work->remote_dev, SCIF_KMEM_UNALIGNED_BUF_SIZE); + if (err) + goto free_temp_buf; + comp_cb->sdev = work->remote_dev; + if (scif_rma_list_dma_copy_unaligned(work, temp, chan, src_local) < 0) + goto free_temp_buf; + if (!src_local) + work->fence_type = SCIF_DMA_INTR; + return 0; +free_temp_buf: + if (comp_cb->is_cache) + kmem_cache_free(unaligned_cache, comp_cb->temp_buf_to_free); + else + kfree(comp_cb->temp_buf_to_free); +free_comp_cb: + kfree(comp_cb); +error: + return -ENOMEM; +} + +/** + * scif_rma_copy: + * @epd: end point descriptor. + * @loffset: offset in local registered address space to/from which to copy + * @addr: user virtual address to/from which to copy + * @len: length of range to copy + * @roffset: offset in remote registered address space to/from which to copy + * @flags: flags + * @dir: LOCAL->REMOTE or vice versa. + * @last_chunk: true if this is the last chunk of a larger transfer + * + * Validate parameters, check if src/dst registered ranges requested for copy + * are valid and initiate either CPU or DMA copy. 
+ */ +static int scif_rma_copy(scif_epd_t epd, off_t loffset, unsigned long addr, + size_t len, off_t roffset, int flags, + enum scif_rma_dir dir, bool last_chunk) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_rma_req remote_req; + struct scif_rma_req req; + struct scif_window *local_window = NULL; + struct scif_window *remote_window = NULL; + struct scif_copy_work copy_work; + bool loopback; + int err = 0; + struct dma_chan *chan; + struct scif_mmu_notif *mmn = NULL; + bool cache = false; + struct device *spdev; + + err = scif_verify_epd(ep); + if (err) + return err; + + if (flags && !(flags & (SCIF_RMA_USECPU | SCIF_RMA_USECACHE | + SCIF_RMA_SYNC | SCIF_RMA_ORDERED))) + return -EINVAL; + + loopback = scifdev_self(ep->remote_dev) ? true : false; + copy_work.fence_type = ((flags & SCIF_RMA_SYNC) && last_chunk) ? + SCIF_DMA_POLL : 0; + copy_work.ordered = !!((flags & SCIF_RMA_ORDERED) && last_chunk); + + /* Use CPU for Mgmt node <-> Mgmt node copies */ + if (loopback && scif_is_mgmt_node()) { + flags |= SCIF_RMA_USECPU; + copy_work.fence_type = 0x0; + } + + cache = scif_is_set_reg_cache(flags); + + remote_req.out_window = &remote_window; + remote_req.offset = roffset; + remote_req.nr_bytes = len; + /* + * If transfer is from local to remote then the remote window + * must be writeable and vice versa. + */ + remote_req.prot = dir == SCIF_LOCAL_TO_REMOTE ? 
VM_WRITE : VM_READ; + remote_req.type = SCIF_WINDOW_PARTIAL; + remote_req.head = &ep->rma_info.remote_reg_list; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + return err; + } + + if (addr && cache) { + mutex_lock(&ep->rma_info.mmn_lock); + mmn = scif_find_mmu_notifier(current->mm, &ep->rma_info); + if (!mmn) + scif_add_mmu_notifier(current->mm, ep); + mutex_unlock(&ep->rma_info.mmn_lock); + if (IS_ERR(mmn)) { + scif_put_peer_dev(spdev); + return PTR_ERR(mmn); + } + cache = cache && !scif_rma_tc_can_cache(ep, len); + } + mutex_lock(&ep->rma_info.rma_lock); + if (addr) { + req.out_window = &local_window; + req.nr_bytes = ALIGN(len + (addr & ~PAGE_MASK), + PAGE_SIZE); + req.va_for_temp = addr & PAGE_MASK; + req.prot = (dir == SCIF_LOCAL_TO_REMOTE ? + VM_READ : VM_WRITE | VM_READ); + /* Does a valid local window exist? */ + if (mmn) { + spin_lock(&ep->rma_info.tc_lock); + req.head = &mmn->tc_reg_list; + err = scif_query_tcw(ep, &req); + spin_unlock(&ep->rma_info.tc_lock); + } + if (!mmn || err) { + err = scif_register_temp(epd, req.va_for_temp, + req.nr_bytes, req.prot, + &loffset, &local_window); + if (err) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + if (!cache) + goto skip_cache; + atomic_inc(&ep->rma_info.tcw_refcount); + atomic_add_return(local_window->nr_pages, + &ep->rma_info.tcw_total_pages); + if (mmn) { + spin_lock(&ep->rma_info.tc_lock); + scif_insert_tcw(local_window, + &mmn->tc_reg_list); + spin_unlock(&ep->rma_info.tc_lock); + } + } +skip_cache: + loffset = local_window->offset + + (addr - local_window->va_for_temp); + } else { + req.out_window = &local_window; + req.offset = loffset; + /* + * If transfer is from local to remote then the self window + * must be readable and vice versa. + */ + req.prot = dir == SCIF_LOCAL_TO_REMOTE ? VM_READ : VM_WRITE; + req.nr_bytes = len; + req.type = SCIF_WINDOW_PARTIAL; + req.head = &ep->rma_info.reg_list; + /* Does a valid local window exist? 
*/ + err = scif_query_window(&req); + if (err) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + } + + /* Does a valid remote window exist? */ + err = scif_query_window(&remote_req); + if (err) { + mutex_unlock(&ep->rma_info.rma_lock); + goto error; + } + + /* + * Prepare copy_work for submitting work to the DMA kernel thread + * or CPU copy routine. + */ + copy_work.len = len; + copy_work.loopback = loopback; + copy_work.remote_dev = ep->remote_dev; + if (dir == SCIF_LOCAL_TO_REMOTE) { + copy_work.src_offset = loffset; + copy_work.src_window = local_window; + copy_work.dst_offset = roffset; + copy_work.dst_window = remote_window; + } else { + copy_work.src_offset = roffset; + copy_work.src_window = remote_window; + copy_work.dst_offset = loffset; + copy_work.dst_window = local_window; + } + + if (flags & SCIF_RMA_USECPU) { + scif_rma_list_cpu_copy(©_work); + } else { + chan = ep->rma_info.dma_chan; + err = scif_rma_list_dma_copy_wrapper(epd, ©_work, + chan, loffset); + } + if (addr && !cache) + atomic_inc(&ep->rma_info.tw_refcount); + + mutex_unlock(&ep->rma_info.rma_lock); + + if (last_chunk) { + struct scif_dev *rdev = ep->remote_dev; + + if (copy_work.fence_type == SCIF_DMA_POLL) + err = scif_drain_dma_poll(rdev->sdev, + ep->rma_info.dma_chan); + else if (copy_work.fence_type == SCIF_DMA_INTR) + err = scif_drain_dma_intr(rdev->sdev, + ep->rma_info.dma_chan); + } + + if (addr && !cache) + scif_queue_for_cleanup(local_window, &scif_info.rma); + scif_put_peer_dev(spdev); + return err; +error: + if (err) { + if (addr && local_window && !cache) + scif_destroy_window(ep, local_window); + dev_err(scif_info.mdev.this_device, + "%s %d err %d len 0x%lx\n", + __func__, __LINE__, err, len); + } + scif_put_peer_dev(spdev); + return err; +} + +int scif_readfrom(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI readfrom: ep %p loffset 0x%lx len 0x%lx offset 0x%lx flags 0x%x\n", + 
epd, loffset, len, roffset, flags); + if (scif_unaligned(loffset, roffset)) { + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, loffset, 0x0, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_REMOTE_TO_LOCAL, false); + if (err) + goto readfrom_err; + loffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, loffset, 0x0, len, + roffset, flags, SCIF_REMOTE_TO_LOCAL, true); +readfrom_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_readfrom); + +int scif_writeto(scif_epd_t epd, off_t loffset, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI writeto: ep %p loffset 0x%lx len 0x%lx roffset 0x%lx flags 0x%x\n", + epd, loffset, len, roffset, flags); + if (scif_unaligned(loffset, roffset)) { + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, loffset, 0x0, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_LOCAL_TO_REMOTE, false); + if (err) + goto writeto_err; + loffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, loffset, 0x0, len, + roffset, flags, SCIF_LOCAL_TO_REMOTE, true); +writeto_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_writeto); + +int scif_vreadfrom(scif_epd_t epd, void *addr, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI vreadfrom: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n", + epd, addr, len, roffset, flags); + if (scif_unaligned((off_t __force)addr, roffset)) { + if (len > SCIF_MAX_UNALIGNED_BUF_SIZE) + flags &= ~SCIF_RMA_USECACHE; + + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, 0, (u64)addr, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_REMOTE_TO_LOCAL, false); + if (err) + goto vreadfrom_err; + addr += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += 
SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, 0, (u64)addr, len, + roffset, flags, SCIF_REMOTE_TO_LOCAL, true); +vreadfrom_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_vreadfrom); + +int scif_vwriteto(scif_epd_t epd, void *addr, size_t len, + off_t roffset, int flags) +{ + int err; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI vwriteto: ep %p addr %p len 0x%lx roffset 0x%lx flags 0x%x\n", + epd, addr, len, roffset, flags); + if (scif_unaligned((off_t __force)addr, roffset)) { + if (len > SCIF_MAX_UNALIGNED_BUF_SIZE) + flags &= ~SCIF_RMA_USECACHE; + + while (len > SCIF_MAX_UNALIGNED_BUF_SIZE) { + err = scif_rma_copy(epd, 0, (u64)addr, + SCIF_MAX_UNALIGNED_BUF_SIZE, + roffset, flags, + SCIF_LOCAL_TO_REMOTE, false); + if (err) + goto vwriteto_err; + addr += SCIF_MAX_UNALIGNED_BUF_SIZE; + roffset += SCIF_MAX_UNALIGNED_BUF_SIZE; + len -= SCIF_MAX_UNALIGNED_BUF_SIZE; + } + } + err = scif_rma_copy(epd, 0, (u64)addr, len, + roffset, flags, SCIF_LOCAL_TO_REMOTE, true); +vwriteto_err: + return err; +} +EXPORT_SYMBOL_GPL(scif_vwriteto); diff --git a/kernel/drivers/misc/mic/scif/scif_epd.c b/kernel/drivers/misc/mic/scif/scif_epd.c new file mode 100644 index 000000000..00e5d6d66 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_epd.c @@ -0,0 +1,357 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include "scif_main.h" +#include "scif_map.h" + +void scif_cleanup_ep_qp(struct scif_endpt *ep) +{ + struct scif_qp *qp = ep->qp_info.qp; + + if (qp->outbound_q.rb_base) { + scif_iounmap((void *)qp->outbound_q.rb_base, + qp->outbound_q.size, ep->remote_dev); + qp->outbound_q.rb_base = NULL; + } + if (qp->remote_qp) { + scif_iounmap((void *)qp->remote_qp, + sizeof(struct scif_qp), ep->remote_dev); + qp->remote_qp = NULL; + } + if (qp->local_qp) { + scif_unmap_single(qp->local_qp, ep->remote_dev, + sizeof(struct scif_qp)); + qp->local_qp = 0x0; + } + if (qp->local_buf) { + scif_unmap_single(qp->local_buf, ep->remote_dev, + SCIF_ENDPT_QP_SIZE); + qp->local_buf = 0; + } +} + +void scif_teardown_ep(void *endpt) +{ + struct scif_endpt *ep = endpt; + struct scif_qp *qp = ep->qp_info.qp; + + if (qp) { + spin_lock(&ep->lock); + scif_cleanup_ep_qp(ep); + spin_unlock(&ep->lock); + kfree(qp->inbound_q.rb_base); + kfree(qp); + } +} + +/* + * Enqueue the endpoint to the zombie list for cleanup. + * The endpoint should not be accessed once this API returns. 
+ */ +void scif_add_epd_to_zombie_list(struct scif_endpt *ep, bool eplock_held) +{ + if (!eplock_held) + mutex_lock(&scif_info.eplock); + spin_lock(&ep->lock); + ep->state = SCIFEP_ZOMBIE; + spin_unlock(&ep->lock); + list_add_tail(&ep->list, &scif_info.zombie); + scif_info.nr_zombies++; + if (!eplock_held) + mutex_unlock(&scif_info.eplock); + schedule_work(&scif_info.misc_work); +} + +static struct scif_endpt *scif_find_listen_ep(u16 port) +{ + struct scif_endpt *ep = NULL; + struct list_head *pos, *tmpq; + + mutex_lock(&scif_info.eplock); + list_for_each_safe(pos, tmpq, &scif_info.listen) { + ep = list_entry(pos, struct scif_endpt, list); + if (ep->port.port == port) { + mutex_unlock(&scif_info.eplock); + return ep; + } + } + mutex_unlock(&scif_info.eplock); + return NULL; +} + +void scif_cleanup_zombie_epd(void) +{ + struct list_head *pos, *tmpq; + struct scif_endpt *ep; + + mutex_lock(&scif_info.eplock); + list_for_each_safe(pos, tmpq, &scif_info.zombie) { + ep = list_entry(pos, struct scif_endpt, list); + if (scif_rma_ep_can_uninit(ep)) { + list_del(pos); + scif_info.nr_zombies--; + put_iova_domain(&ep->rma_info.iovad); + kfree(ep); + } + } + mutex_unlock(&scif_info.eplock); +} + +/** + * scif_cnctreq() - Respond to SCIF_CNCT_REQ interrupt message + * @msg: Interrupt message + * + * This message is initiated by the remote node to request a connection + * to the local node. This function looks for an end point in the + * listen state on the requested port id. + * + * If it finds a listening port it places the connect request on the + * listening end points queue and wakes up any pending accept calls. + * + * If it does not find a listening end point it sends a connection + * reject message to the remote node. + */ +void scif_cnctreq(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = NULL; + struct scif_conreq *conreq; + + conreq = kmalloc(sizeof(*conreq), GFP_KERNEL); + if (!conreq) + /* Lack of resources so reject the request. 
*/ + goto conreq_sendrej; + + ep = scif_find_listen_ep(msg->dst.port); + if (!ep) + /* Send reject due to no listening ports */ + goto conreq_sendrej_free; + else + spin_lock(&ep->lock); + + if (ep->backlog <= ep->conreqcnt) { + /* Send reject due to too many pending requests */ + spin_unlock(&ep->lock); + goto conreq_sendrej_free; + } + + conreq->msg = *msg; + list_add_tail(&conreq->list, &ep->conlist); + ep->conreqcnt++; + wake_up_interruptible(&ep->conwq); + spin_unlock(&ep->lock); + return; + +conreq_sendrej_free: + kfree(conreq); +conreq_sendrej: + msg->uop = SCIF_CNCT_REJ; + scif_nodeqp_send(&scif_dev[msg->src.node], msg); +} + +/** + * scif_cnctgnt() - Respond to SCIF_CNCT_GNT interrupt message + * @msg: Interrupt message + * + * An accept() on the remote node has occurred and sent this message + * to indicate success. Place the end point in the MAPPING state and + * save the remote nodes memory information. Then wake up the connect + * request so it can finish. + */ +void scif_cnctgnt(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + spin_lock(&ep->lock); + if (SCIFEP_CONNECTING == ep->state) { + ep->peer.node = msg->src.node; + ep->peer.port = msg->src.port; + ep->qp_info.gnt_pld = msg->payload[1]; + ep->remote_ep = msg->payload[2]; + ep->state = SCIFEP_MAPPING; + + wake_up(&ep->conwq); + } + spin_unlock(&ep->lock); +} + +/** + * scif_cnctgnt_ack() - Respond to SCIF_CNCT_GNTACK interrupt message + * @msg: Interrupt message + * + * The remote connection request has finished mapping the local memory. + * Place the connection in the connected state and wake up the pending + * accept() call. + */ +void scif_cnctgnt_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + mutex_lock(&scif_info.connlock); + spin_lock(&ep->lock); + /* New ep is now connected with all resources set. 
*/ + ep->state = SCIFEP_CONNECTED; + list_add_tail(&ep->list, &scif_info.connected); + wake_up(&ep->conwq); + spin_unlock(&ep->lock); + mutex_unlock(&scif_info.connlock); +} + +/** + * scif_cnctgnt_nack() - Respond to SCIF_CNCT_GNTNACK interrupt message + * @msg: Interrupt message + * + * The remote connection request failed to map the local memory it was sent. + * Place the end point in the CLOSING state to indicate it and wake up + * the pending accept(); + */ +void scif_cnctgnt_nack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + spin_lock(&ep->lock); + ep->state = SCIFEP_CLOSING; + wake_up(&ep->conwq); + spin_unlock(&ep->lock); +} + +/** + * scif_cnctrej() - Respond to SCIF_CNCT_REJ interrupt message + * @msg: Interrupt message + * + * The remote end has rejected the connection request. Set the end + * point back to the bound state and wake up the pending connect(). + */ +void scif_cnctrej(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + spin_lock(&ep->lock); + if (SCIFEP_CONNECTING == ep->state) { + ep->state = SCIFEP_BOUND; + wake_up(&ep->conwq); + } + spin_unlock(&ep->lock); +} + +/** + * scif_discnct() - Respond to SCIF_DISCNCT interrupt message + * @msg: Interrupt message + * + * The remote node has indicated close() has been called on its end + * point. Remove the local end point from the connected list, set its + * state to disconnected and ensure accesses to the remote node are + * shutdown. + * + * When all accesses to the remote end have completed then send a + * DISCNT_ACK to indicate it can remove its resources and complete + * the close routine. 
+ */ +void scif_discnct(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = NULL; + struct scif_endpt *tmpep; + struct list_head *pos, *tmpq; + + mutex_lock(&scif_info.connlock); + list_for_each_safe(pos, tmpq, &scif_info.connected) { + tmpep = list_entry(pos, struct scif_endpt, list); + /* + * The local ep may have sent a disconnect and been closed + * due to a message response time out. It may have been + * allocated again and formed a new connection so we want to + * check if the remote ep matches + */ + if (((u64)tmpep == msg->payload[1]) && + ((u64)tmpep->remote_ep == msg->payload[0])) { + list_del(pos); + ep = tmpep; + spin_lock(&ep->lock); + break; + } + } + + /* + * If the terminated end is not found then this side started closing + * before the other side sent the disconnect. If so the ep will no + * longer be on the connected list. Regardless the other side + * needs to be acked to let it know close is complete. + */ + if (!ep) { + mutex_unlock(&scif_info.connlock); + goto discnct_ack; + } + + ep->state = SCIFEP_DISCONNECTED; + list_add_tail(&ep->list, &scif_info.disconnected); + + wake_up_interruptible(&ep->sendwq); + wake_up_interruptible(&ep->recvwq); + spin_unlock(&ep->lock); + mutex_unlock(&scif_info.connlock); + +discnct_ack: + msg->uop = SCIF_DISCNT_ACK; + scif_nodeqp_send(&scif_dev[msg->src.node], msg); +} + +/** + * scif_discnt_ack() - Respond to SCIF_DISCNT_ACK interrupt message + * @msg: Interrupt message + * + * Remote side has indicated it has no more references to local resources + */ +void scif_discnt_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + spin_lock(&ep->lock); + ep->state = SCIFEP_DISCONNECTED; + spin_unlock(&ep->lock); + complete(&ep->discon); +} + +/** + * scif_clientsend() - Respond to SCIF_CLIENT_SEND interrupt message + * @msg: Interrupt message + * + * Remote side is confirming send or receive interrupt handling is 
complete. + */ +void scif_clientsend(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + spin_lock(&ep->lock); + if (SCIFEP_CONNECTED == ep->state) + wake_up_interruptible(&ep->recvwq); + spin_unlock(&ep->lock); +} + +/** + * scif_clientrcvd() - Respond to SCIF_CLIENT_RCVD interrupt message + * @msg: Interrupt message + * + * Remote side is confirming send or receive interrupt handling is complete. + */ +void scif_clientrcvd(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + + spin_lock(&ep->lock); + if (SCIFEP_CONNECTED == ep->state) + wake_up_interruptible(&ep->sendwq); + spin_unlock(&ep->lock); +} diff --git a/kernel/drivers/misc/mic/scif/scif_epd.h b/kernel/drivers/misc/mic/scif/scif_epd.h new file mode 100644 index 000000000..1771d7a9b --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_epd.h @@ -0,0 +1,210 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#ifndef SCIF_EPD_H +#define SCIF_EPD_H + +#include <linux/delay.h> +#include <linux/scif.h> +#include <linux/scif_ioctl.h> + +#define SCIF_EPLOCK_HELD true + +enum scif_epd_state { + SCIFEP_UNBOUND, + SCIFEP_BOUND, + SCIFEP_LISTENING, + SCIFEP_CONNECTED, + SCIFEP_CONNECTING, + SCIFEP_MAPPING, + SCIFEP_CLOSING, + SCIFEP_CLLISTEN, + SCIFEP_DISCONNECTED, + SCIFEP_ZOMBIE +}; + +/* + * struct scif_conreq - Data structure added to the connection list. 
+ * + * @msg: connection request message received + * @list: link to list of connection requests + */ +struct scif_conreq { + struct scifmsg msg; + struct list_head list; +}; + +/* Size of the RB for the Endpoint QP */ +#define SCIF_ENDPT_QP_SIZE 0x1000 + +/* + * scif_endpt_qp_info - SCIF endpoint queue pair + * + * @qp - Qpair for this endpoint + * @qp_offset - DMA address of the QP + * @gnt_pld - Payload in a SCIF_CNCT_GNT message containing the + * physical address of the remote_qp. + */ +struct scif_endpt_qp_info { + struct scif_qp *qp; + dma_addr_t qp_offset; + dma_addr_t gnt_pld; +}; + +/* + * struct scif_endpt - The SCIF endpoint data structure + * + * @state: end point state + * @lock: lock synchronizing access to endpoint fields like state etc + * @port: self port information + * @peer: peer port information + * @backlog: maximum pending connection requests + * @qp_info: Endpoint QP information for SCIF messaging + * @remote_dev: scifdev used by this endpt to communicate with remote node. + * @remote_ep: remote endpoint + * @conreqcnt: Keep track of number of connection requests. + * @files: Open file information used to match the id passed in with + * the flush routine. 
+ * @conlist: list of connection requests + * @conwq: waitqueue for connection processing + * @discon: completion used during disconnection + * @sendwq: waitqueue used during sending messages + * @recvwq: waitqueue used during message receipt + * @sendlock: Synchronize ordering of messages sent + * @recvlock: Synchronize ordering of messages received + * @list: link to list of various endpoints like connected, listening etc + * @li_accept: pending ACCEPTREG + * @acceptcnt: pending ACCEPTREG cnt + * @liacceptlist: link to listen accept + * @miacceptlist: link to uaccept + * @listenep: associated listen ep + * @conn_work: Non blocking connect work + * @conn_port: Connection port + * @conn_err: Errors during connection + * @conn_async_state: Async connection + * @conn_pend_wq: Used by poll while waiting for incoming connections + * @conn_list: List of async connection requests + * @rma_info: Information for triggering SCIF RMA and DMA operations + * @mmu_list: link to list of MMU notifier cleanup work + * @anon: anonymous file for use in kernel mode scif poll + */ +struct scif_endpt { + enum scif_epd_state state; + spinlock_t lock; + struct scif_port_id port; + struct scif_port_id peer; + int backlog; + struct scif_endpt_qp_info qp_info; + struct scif_dev *remote_dev; + u64 remote_ep; + int conreqcnt; + struct files_struct *files; + struct list_head conlist; + wait_queue_head_t conwq; + struct completion discon; + wait_queue_head_t sendwq; + wait_queue_head_t recvwq; + struct mutex sendlock; + struct mutex recvlock; + struct list_head list; + struct list_head li_accept; + int acceptcnt; + struct list_head liacceptlist; + struct list_head miacceptlist; + struct scif_endpt *listenep; + struct scif_port_id conn_port; + int conn_err; + int conn_async_state; + wait_queue_head_t conn_pend_wq; + struct list_head conn_list; + struct scif_endpt_rma_info rma_info; + struct list_head mmu_list; + struct file *anon; +}; + +static inline int scifdev_alive(struct scif_endpt *ep) +{ 
+ return _scifdev_alive(ep->remote_dev); +} + +/* + * scif_verify_epd: + * ep: SCIF endpoint + * + * Checks several generic error conditions and returns the + * appropriate error. + */ +static inline int scif_verify_epd(struct scif_endpt *ep) +{ + if (ep->state == SCIFEP_DISCONNECTED) + return -ECONNRESET; + + if (ep->state != SCIFEP_CONNECTED) + return -ENOTCONN; + + if (!scifdev_alive(ep)) + return -ENODEV; + + return 0; +} + +static inline int scif_anon_inode_getfile(scif_epd_t epd) +{ + epd->anon = anon_inode_getfile("scif", &scif_anon_fops, NULL, 0); + if (IS_ERR(epd->anon)) + return PTR_ERR(epd->anon); + return 0; +} + +static inline void scif_anon_inode_fput(scif_epd_t epd) +{ + if (epd->anon) { + fput(epd->anon); + epd->anon = NULL; + } +} + +void scif_cleanup_zombie_epd(void); +void scif_teardown_ep(void *endpt); +void scif_cleanup_ep_qp(struct scif_endpt *ep); +void scif_add_epd_to_zombie_list(struct scif_endpt *ep, bool eplock_held); +void scif_get_node_info(void); +void scif_send_acks(struct scif_dev *dev); +void scif_conn_handler(struct work_struct *work); +int scif_rsrv_port(u16 port); +void scif_get_port(u16 port); +int scif_get_new_port(void); +void scif_put_port(u16 port); +int scif_user_send(scif_epd_t epd, void __user *msg, int len, int flags); +int scif_user_recv(scif_epd_t epd, void __user *msg, int len, int flags); +void scif_cnctreq(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_cnctgnt(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_cnctgnt_ack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_cnctgnt_nack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_cnctrej(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_discnct(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_discnt_ack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_clientsend(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_clientrcvd(struct scif_dev *scifdev, struct scifmsg *msg); +int 
__scif_connect(scif_epd_t epd, struct scif_port_id *dst, bool non_block); +int __scif_flush(scif_epd_t epd); +int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd); +unsigned int __scif_pollfd(struct file *f, poll_table *wait, + struct scif_endpt *ep); +int __scif_pin_pages(void *addr, size_t len, int *out_prot, + int map_flags, scif_pinned_pages_t *pages); +#endif /* SCIF_EPD_H */ diff --git a/kernel/drivers/misc/mic/scif/scif_fd.c b/kernel/drivers/misc/mic/scif/scif_fd.c new file mode 100644 index 000000000..f7e826142 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_fd.c @@ -0,0 +1,471 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include "scif_main.h" + +static int scif_fdopen(struct inode *inode, struct file *f) +{ + struct scif_endpt *priv = scif_open(); + + if (!priv) + return -ENOMEM; + f->private_data = priv; + return 0; +} + +static int scif_fdclose(struct inode *inode, struct file *f) +{ + struct scif_endpt *priv = f->private_data; + + return scif_close(priv); +} + +static int scif_fdmmap(struct file *f, struct vm_area_struct *vma) +{ + struct scif_endpt *priv = f->private_data; + + return scif_mmap(vma, priv); +} + +static unsigned int scif_fdpoll(struct file *f, poll_table *wait) +{ + struct scif_endpt *priv = f->private_data; + + return __scif_pollfd(f, wait, priv); +} + +static int scif_fdflush(struct file *f, fl_owner_t id) +{ + struct scif_endpt *ep = f->private_data; + + spin_lock(&ep->lock); + /* + * The listening endpoint stashes the open file information before + * waiting for incoming connections. The release callback would never be + * called if the application closed the endpoint, while waiting for + * incoming connections from a separate thread since the file descriptor + * reference count is bumped up in the accept IOCTL. Call the flush + * routine if the id matches the endpoint open file information so that + * the listening endpoint can be woken up and the fd released. + */ + if (ep->files == id) + __scif_flush(ep); + spin_unlock(&ep->lock); + return 0; +} + +static __always_inline void scif_err_debug(int err, const char *str) +{ + /* + * ENOTCONN is a common uninteresting error which is + * flooding debug messages to the console unnecessarily. 
+ */ + if (err < 0 && err != -ENOTCONN) + dev_dbg(scif_info.mdev.this_device, "%s err %d\n", str, err); +} + +static long scif_fdioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct scif_endpt *priv = f->private_data; + void __user *argp = (void __user *)arg; + int err = 0; + struct scifioctl_msg request; + bool non_block = false; + + non_block = !!(f->f_flags & O_NONBLOCK); + + switch (cmd) { + case SCIF_BIND: + { + int pn; + + if (copy_from_user(&pn, argp, sizeof(pn))) + return -EFAULT; + + pn = scif_bind(priv, pn); + if (pn < 0) + return pn; + + if (copy_to_user(argp, &pn, sizeof(pn))) + return -EFAULT; + + return 0; + } + case SCIF_LISTEN: + return scif_listen(priv, arg); + case SCIF_CONNECT: + { + struct scifioctl_connect req; + struct scif_endpt *ep = (struct scif_endpt *)priv; + + if (copy_from_user(&req, argp, sizeof(req))) + return -EFAULT; + + err = __scif_connect(priv, &req.peer, non_block); + if (err < 0) + return err; + + req.self.node = ep->port.node; + req.self.port = ep->port.port; + + if (copy_to_user(argp, &req, sizeof(req))) + return -EFAULT; + + return 0; + } + /* + * Accept is done in two halves. The request ioctl does the basic + * functionality of accepting the request and returning the information + * about it including the internal ID of the end point. The register + * is done with the internal ID on a new file descriptor opened by the + * requesting process. + */ + case SCIF_ACCEPTREQ: + { + struct scifioctl_accept request; + scif_epd_t *ep = (scif_epd_t *)&request.endpt; + + if (copy_from_user(&request, argp, sizeof(request))) + return -EFAULT; + + err = scif_accept(priv, &request.peer, ep, request.flags); + if (err < 0) + return err; + + if (copy_to_user(argp, &request, sizeof(request))) { + scif_close(*ep); + return -EFAULT; + } + /* + * Add to the list of user mode eps where the second half + * of the accept is not yet completed. 
+ */ + mutex_lock(&scif_info.eplock); + list_add_tail(&((*ep)->miacceptlist), &scif_info.uaccept); + list_add_tail(&((*ep)->liacceptlist), &priv->li_accept); + (*ep)->listenep = priv; + priv->acceptcnt++; + mutex_unlock(&scif_info.eplock); + + return 0; + } + case SCIF_ACCEPTREG: + { + struct scif_endpt *priv = f->private_data; + struct scif_endpt *newep; + struct scif_endpt *lisep; + struct scif_endpt *fep = NULL; + struct scif_endpt *tmpep; + struct list_head *pos, *tmpq; + + /* Finally replace the pointer to the accepted endpoint */ + if (copy_from_user(&newep, argp, sizeof(void *))) + return -EFAULT; + + /* Remove form the user accept queue */ + mutex_lock(&scif_info.eplock); + list_for_each_safe(pos, tmpq, &scif_info.uaccept) { + tmpep = list_entry(pos, + struct scif_endpt, miacceptlist); + if (tmpep == newep) { + list_del(pos); + fep = tmpep; + break; + } + } + + if (!fep) { + mutex_unlock(&scif_info.eplock); + return -ENOENT; + } + + lisep = newep->listenep; + list_for_each_safe(pos, tmpq, &lisep->li_accept) { + tmpep = list_entry(pos, + struct scif_endpt, liacceptlist); + if (tmpep == newep) { + list_del(pos); + lisep->acceptcnt--; + break; + } + } + + mutex_unlock(&scif_info.eplock); + + /* Free the resources automatically created from the open. 
*/ + scif_anon_inode_fput(priv); + scif_teardown_ep(priv); + scif_add_epd_to_zombie_list(priv, !SCIF_EPLOCK_HELD); + f->private_data = newep; + return 0; + } + case SCIF_SEND: + { + struct scif_endpt *priv = f->private_data; + + if (copy_from_user(&request, argp, + sizeof(struct scifioctl_msg))) { + err = -EFAULT; + goto send_err; + } + err = scif_user_send(priv, (void __user *)request.msg, + request.len, request.flags); + if (err < 0) + goto send_err; + if (copy_to_user(& + ((struct scifioctl_msg __user *)argp)->out_len, + &err, sizeof(err))) { + err = -EFAULT; + goto send_err; + } + err = 0; +send_err: + scif_err_debug(err, "scif_send"); + return err; + } + case SCIF_RECV: + { + struct scif_endpt *priv = f->private_data; + + if (copy_from_user(&request, argp, + sizeof(struct scifioctl_msg))) { + err = -EFAULT; + goto recv_err; + } + + err = scif_user_recv(priv, (void __user *)request.msg, + request.len, request.flags); + if (err < 0) + goto recv_err; + + if (copy_to_user(& + ((struct scifioctl_msg __user *)argp)->out_len, + &err, sizeof(err))) { + err = -EFAULT; + goto recv_err; + } + err = 0; +recv_err: + scif_err_debug(err, "scif_recv"); + return err; + } + case SCIF_GET_NODEIDS: + { + struct scifioctl_node_ids node_ids; + int entries; + u16 *nodes; + void __user *unodes, *uself; + u16 self; + + if (copy_from_user(&node_ids, argp, sizeof(node_ids))) { + err = -EFAULT; + goto getnodes_err2; + } + + entries = min_t(int, scif_info.maxid, node_ids.len); + nodes = kmalloc_array(entries, sizeof(u16), GFP_KERNEL); + if (entries && !nodes) { + err = -ENOMEM; + goto getnodes_err2; + } + node_ids.len = scif_get_node_ids(nodes, entries, &self); + + unodes = (void __user *)node_ids.nodes; + if (copy_to_user(unodes, nodes, sizeof(u16) * entries)) { + err = -EFAULT; + goto getnodes_err1; + } + + uself = (void __user *)node_ids.self; + if (copy_to_user(uself, &self, sizeof(u16))) { + err = -EFAULT; + goto getnodes_err1; + } + + if (copy_to_user(argp, &node_ids, 
sizeof(node_ids))) { + err = -EFAULT; + goto getnodes_err1; + } +getnodes_err1: + kfree(nodes); +getnodes_err2: + return err; + } + case SCIF_REG: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_reg reg; + off_t ret; + + if (copy_from_user(®, argp, sizeof(reg))) { + err = -EFAULT; + goto reg_err; + } + if (reg.flags & SCIF_MAP_KERNEL) { + err = -EINVAL; + goto reg_err; + } + ret = scif_register(priv, (void *)reg.addr, reg.len, + reg.offset, reg.prot, reg.flags); + if (ret < 0) { + err = (int)ret; + goto reg_err; + } + + if (copy_to_user(&((struct scifioctl_reg __user *)argp) + ->out_offset, &ret, sizeof(reg.out_offset))) { + err = -EFAULT; + goto reg_err; + } + err = 0; +reg_err: + scif_err_debug(err, "scif_register"); + return err; + } + case SCIF_UNREG: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_unreg unreg; + + if (copy_from_user(&unreg, argp, sizeof(unreg))) { + err = -EFAULT; + goto unreg_err; + } + err = scif_unregister(priv, unreg.offset, unreg.len); +unreg_err: + scif_err_debug(err, "scif_unregister"); + return err; + } + case SCIF_READFROM: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto readfrom_err; + } + err = scif_readfrom(priv, copy.loffset, copy.len, copy.roffset, + copy.flags); +readfrom_err: + scif_err_debug(err, "scif_readfrom"); + return err; + } + case SCIF_WRITETO: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto writeto_err; + } + err = scif_writeto(priv, copy.loffset, copy.len, copy.roffset, + copy.flags); +writeto_err: + scif_err_debug(err, "scif_writeto"); + return err; + } + case SCIF_VREADFROM: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto vreadfrom_err; + } + err = scif_vreadfrom(priv, 
(void __force *)copy.addr, copy.len, + copy.roffset, copy.flags); +vreadfrom_err: + scif_err_debug(err, "scif_vreadfrom"); + return err; + } + case SCIF_VWRITETO: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_copy copy; + + if (copy_from_user(©, argp, sizeof(copy))) { + err = -EFAULT; + goto vwriteto_err; + } + err = scif_vwriteto(priv, (void __force *)copy.addr, copy.len, + copy.roffset, copy.flags); +vwriteto_err: + scif_err_debug(err, "scif_vwriteto"); + return err; + } + case SCIF_FENCE_MARK: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_fence_mark mark; + int tmp_mark = 0; + + if (copy_from_user(&mark, argp, sizeof(mark))) { + err = -EFAULT; + goto fence_mark_err; + } + err = scif_fence_mark(priv, mark.flags, &tmp_mark); + if (err) + goto fence_mark_err; + if (copy_to_user((void __user *)mark.mark, &tmp_mark, + sizeof(tmp_mark))) { + err = -EFAULT; + goto fence_mark_err; + } +fence_mark_err: + scif_err_debug(err, "scif_fence_mark"); + return err; + } + case SCIF_FENCE_WAIT: + { + struct scif_endpt *priv = f->private_data; + + err = scif_fence_wait(priv, arg); + scif_err_debug(err, "scif_fence_wait"); + return err; + } + case SCIF_FENCE_SIGNAL: + { + struct scif_endpt *priv = f->private_data; + struct scifioctl_fence_signal signal; + + if (copy_from_user(&signal, argp, sizeof(signal))) { + err = -EFAULT; + goto fence_signal_err; + } + + err = scif_fence_signal(priv, signal.loff, signal.lval, + signal.roff, signal.rval, signal.flags); +fence_signal_err: + scif_err_debug(err, "scif_fence_signal"); + return err; + } + } + return -EINVAL; +} + +const struct file_operations scif_fops = { + .open = scif_fdopen, + .release = scif_fdclose, + .unlocked_ioctl = scif_fdioctl, + .mmap = scif_fdmmap, + .poll = scif_fdpoll, + .flush = scif_fdflush, + .owner = THIS_MODULE, +}; diff --git a/kernel/drivers/misc/mic/scif/scif_fence.c b/kernel/drivers/misc/mic/scif/scif_fence.c new file mode 100644 index 000000000..7f2c96f57 --- 
/dev/null +++ b/kernel/drivers/misc/mic/scif/scif_fence.c @@ -0,0 +1,771 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ + +#include "scif_main.h" + +/** + * scif_recv_mark: Handle SCIF_MARK request + * @msg: Interrupt message + * + * The peer has requested a mark. + */ +void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + int mark, err; + + err = _scif_fence_mark(ep, &mark); + if (err) + msg->uop = SCIF_MARK_NACK; + else + msg->uop = SCIF_MARK_ACK; + msg->payload[0] = ep->remote_ep; + msg->payload[2] = mark; + scif_nodeqp_send(ep->remote_dev, msg); +} + +/** + * scif_recv_mark_resp: Handle SCIF_MARK_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a SCIF_MARK message. + */ +void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_fence_info *fence_req = + (struct scif_fence_info *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + if (msg->uop == SCIF_MARK_ACK) { + fence_req->state = OP_COMPLETED; + fence_req->dma_mark = (int)msg->payload[2]; + } else { + fence_req->state = OP_FAILED; + } + mutex_unlock(&ep->rma_info.rma_lock); + complete(&fence_req->comp); +} + +/** + * scif_recv_wait: Handle SCIF_WAIT request + * @msg: Interrupt message + * + * The peer has requested waiting on a fence. 
+ */ +void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_remote_fence_info *fence; + + /* + * Allocate structure for remote fence information and + * send a NACK if the allocation failed. The peer will + * return ENOMEM upon receiving a NACK. + */ + fence = kmalloc(sizeof(*fence), GFP_KERNEL); + if (!fence) { + msg->payload[0] = ep->remote_ep; + msg->uop = SCIF_WAIT_NACK; + scif_nodeqp_send(ep->remote_dev, msg); + return; + } + + /* Prepare the fence request */ + memcpy(&fence->msg, msg, sizeof(struct scifmsg)); + INIT_LIST_HEAD(&fence->list); + + /* Insert to the global remote fence request list */ + mutex_lock(&scif_info.fencelock); + atomic_inc(&ep->rma_info.fence_refcount); + list_add_tail(&fence->list, &scif_info.fence); + mutex_unlock(&scif_info.fencelock); + + schedule_work(&scif_info.misc_work); +} + +/** + * scif_recv_wait_resp: Handle SCIF_WAIT_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a SCIF_WAIT message. + */ +void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_fence_info *fence_req = + (struct scif_fence_info *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + if (msg->uop == SCIF_WAIT_ACK) + fence_req->state = OP_COMPLETED; + else + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + complete(&fence_req->comp); +} + +/** + * scif_recv_sig_local: Handle SCIF_SIG_LOCAL request + * @msg: Interrupt message + * + * The peer has requested a signal on a local offset. 
+ */ +void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + int err; + + err = scif_prog_signal(ep, msg->payload[1], msg->payload[2], + SCIF_WINDOW_SELF); + if (err) + msg->uop = SCIF_SIG_NACK; + else + msg->uop = SCIF_SIG_ACK; + msg->payload[0] = ep->remote_ep; + scif_nodeqp_send(ep->remote_dev, msg); +} + +/** + * scif_recv_sig_remote: Handle SCIF_SIGNAL_REMOTE request + * @msg: Interrupt message + * + * The peer has requested a signal on a remote offset. + */ +void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + int err; + + err = scif_prog_signal(ep, msg->payload[1], msg->payload[2], + SCIF_WINDOW_PEER); + if (err) + msg->uop = SCIF_SIG_NACK; + else + msg->uop = SCIF_SIG_ACK; + msg->payload[0] = ep->remote_ep; + scif_nodeqp_send(ep->remote_dev, msg); +} + +/** + * scif_recv_sig_resp: Handle SCIF_SIG_(N)ACK messages. + * @msg: Interrupt message + * + * The peer has responded to a signal request. 
+ */ +void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_fence_info *fence_req = + (struct scif_fence_info *)msg->payload[3]; + + mutex_lock(&ep->rma_info.rma_lock); + if (msg->uop == SCIF_SIG_ACK) + fence_req->state = OP_COMPLETED; + else + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + complete(&fence_req->comp); +} + +static inline void *scif_get_local_va(off_t off, struct scif_window *window) +{ + struct page **pages = window->pinned_pages->pages; + int page_nr = (off - window->offset) >> PAGE_SHIFT; + off_t page_off = off & ~PAGE_MASK; + + return page_address(pages[page_nr]) + page_off; +} + +static void scif_prog_signal_cb(void *arg) +{ + struct scif_status *status = arg; + + dma_pool_free(status->ep->remote_dev->signal_pool, status, + status->src_dma_addr); +} + +static int _scif_prog_signal(scif_epd_t epd, dma_addr_t dst, u64 val) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct dma_chan *chan = ep->rma_info.dma_chan; + struct dma_device *ddev = chan->device; + bool x100 = !is_dma_copy_aligned(chan->device, 1, 1, 1); + struct dma_async_tx_descriptor *tx; + struct scif_status *status = NULL; + dma_addr_t src; + dma_cookie_t cookie; + int err; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE); + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto alloc_fail; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = (int)cookie; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto alloc_fail; + } + dma_async_issue_pending(chan); + if (x100) { + /* + * For X100 use the status descriptor to write the value to + * the destination. 
+ */ + tx = ddev->device_prep_dma_imm_data(chan, dst, val, 0); + } else { + status = dma_pool_alloc(ep->remote_dev->signal_pool, GFP_KERNEL, + &src); + if (!status) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto alloc_fail; + } + status->val = val; + status->src_dma_addr = src; + status->ep = ep; + src += offsetof(struct scif_status, val); + tx = ddev->device_prep_dma_memcpy(chan, dst, src, sizeof(val), + DMA_PREP_INTERRUPT); + } + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto dma_fail; + } + if (!x100) { + tx->callback = scif_prog_signal_cb; + tx->callback_param = status; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = -EIO; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + goto dma_fail; + } + dma_async_issue_pending(chan); + return 0; +dma_fail: + if (!x100) + dma_pool_free(ep->remote_dev->signal_pool, status, + status->src_dma_addr); +alloc_fail: + return err; +} + +/* + * scif_prog_signal: + * @epd - Endpoint Descriptor + * @offset - registered address to write @val to + * @val - Value to be written at @offset + * @type - Type of the window. + * + * Arrange to write a value to the registered offset after ensuring that the + * offset provided is indeed valid. + */ +int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val, + enum scif_window_type type) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_window *window = NULL; + struct scif_rma_req req; + dma_addr_t dst_dma_addr; + int err; + + mutex_lock(&ep->rma_info.rma_lock); + req.out_window = &window; + req.offset = offset; + req.nr_bytes = sizeof(u64); + req.prot = SCIF_PROT_WRITE; + req.type = SCIF_WINDOW_SINGLE; + if (type == SCIF_WINDOW_SELF) + req.head = &ep->rma_info.reg_list; + else + req.head = &ep->rma_info.remote_reg_list; + /* Does a valid window exist? 
*/ + err = scif_query_window(&req); + if (err) { + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + goto unlock_ret; + } + + if (scif_is_mgmt_node() && scifdev_self(ep->remote_dev)) { + u64 *dst_virt; + + if (type == SCIF_WINDOW_SELF) + dst_virt = scif_get_local_va(offset, window); + else + dst_virt = + scif_get_local_va(offset, (struct scif_window *) + window->peer_window); + *dst_virt = val; + } else { + dst_dma_addr = __scif_off_to_dma_addr(window, offset); + err = _scif_prog_signal(epd, dst_dma_addr, val); + } +unlock_ret: + mutex_unlock(&ep->rma_info.rma_lock); + return err; +} + +static int _scif_fence_wait(scif_epd_t epd, int mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + dma_cookie_t cookie = mark & ~SCIF_REMOTE_FENCE; + int err; + + /* Wait for DMA callback in scif_fence_mark_cb(..) */ + err = wait_event_interruptible_timeout(ep->rma_info.markwq, + dma_async_is_tx_complete( + ep->rma_info.dma_chan, + cookie, NULL, NULL) == + DMA_COMPLETE, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err) + err = -ETIMEDOUT; + else if (err > 0) + err = 0; + return err; +} + +/** + * scif_rma_handle_remote_fences: + * + * This routine services remote fence requests. 
+ */ +void scif_rma_handle_remote_fences(void) +{ + struct list_head *item, *tmp; + struct scif_remote_fence_info *fence; + struct scif_endpt *ep; + int mark, err; + + might_sleep(); + mutex_lock(&scif_info.fencelock); + list_for_each_safe(item, tmp, &scif_info.fence) { + fence = list_entry(item, struct scif_remote_fence_info, + list); + /* Remove fence from global list */ + list_del(&fence->list); + + /* Initiate the fence operation */ + ep = (struct scif_endpt *)fence->msg.payload[0]; + mark = fence->msg.payload[2]; + err = _scif_fence_wait(ep, mark); + if (err) + fence->msg.uop = SCIF_WAIT_NACK; + else + fence->msg.uop = SCIF_WAIT_ACK; + fence->msg.payload[0] = ep->remote_ep; + scif_nodeqp_send(ep->remote_dev, &fence->msg); + kfree(fence); + if (!atomic_sub_return(1, &ep->rma_info.fence_refcount)) + schedule_work(&scif_info.misc_work); + } + mutex_unlock(&scif_info.fencelock); +} + +static int _scif_send_fence(scif_epd_t epd, int uop, int mark, int *out_mark) +{ + int err; + struct scifmsg msg; + struct scif_fence_info *fence_req; + struct scif_endpt *ep = (struct scif_endpt *)epd; + + fence_req = kmalloc(sizeof(*fence_req), GFP_KERNEL); + if (!fence_req) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_completion(&fence_req->comp); + + msg.src = ep->port; + msg.uop = uop; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = (u64)fence_req; + if (uop == SCIF_WAIT) + msg.payload[2] = mark; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + if (err) + goto error_free; +retry: + /* Wait for a SCIF_WAIT_(N)ACK message */ + err = wait_for_completion_timeout(&fence_req->comp, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + mutex_lock(&ep->rma_info.rma_lock); + if (err < 0) { + if (fence_req->state == OP_IN_PROGRESS) + fence_req->state = 
OP_FAILED; + } + if (fence_req->state == OP_FAILED && !err) + err = -ENOMEM; + if (uop == SCIF_MARK && fence_req->state == OP_COMPLETED) + *out_mark = SCIF_REMOTE_FENCE | fence_req->dma_mark; + mutex_unlock(&ep->rma_info.rma_lock); +error_free: + kfree(fence_req); +error: + return err; +} + +/** + * scif_send_fence_mark: + * @epd: end point descriptor. + * @out_mark: Output DMA mark reported by peer. + * + * Send a remote fence mark request. + */ +static int scif_send_fence_mark(scif_epd_t epd, int *out_mark) +{ + return _scif_send_fence(epd, SCIF_MARK, 0, out_mark); +} + +/** + * scif_send_fence_wait: + * @epd: end point descriptor. + * @mark: DMA mark to wait for. + * + * Send a remote fence wait request. + */ +static int scif_send_fence_wait(scif_epd_t epd, int mark) +{ + return _scif_send_fence(epd, SCIF_WAIT, mark, NULL); +} + +static int _scif_send_fence_signal_wait(struct scif_endpt *ep, + struct scif_fence_info *fence_req) +{ + int err; + +retry: + /* Wait for a SCIF_SIG_(N)ACK message */ + err = wait_for_completion_timeout(&fence_req->comp, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) + err = -ENODEV; + if (err > 0) + err = 0; + if (err < 0) { + mutex_lock(&ep->rma_info.rma_lock); + if (fence_req->state == OP_IN_PROGRESS) + fence_req->state = OP_FAILED; + mutex_unlock(&ep->rma_info.rma_lock); + } + if (fence_req->state == OP_FAILED && !err) + err = -ENXIO; + return err; +} + +/** + * scif_send_fence_signal: + * @epd - endpoint descriptor + * @loff - local offset + * @lval - local value to write to loffset + * @roff - remote offset + * @rval - remote value to write to roffset + * @flags - flags + * + * Sends a remote fence signal request + */ +static int scif_send_fence_signal(scif_epd_t epd, off_t roff, u64 rval, + off_t loff, u64 lval, int flags) +{ + int err = 0; + struct scifmsg msg; + struct scif_fence_info *fence_req; + struct scif_endpt *ep = (struct scif_endpt *)epd; + + fence_req = 
kmalloc(sizeof(*fence_req), GFP_KERNEL); + if (!fence_req) { + err = -ENOMEM; + goto error; + } + + fence_req->state = OP_IN_PROGRESS; + init_completion(&fence_req->comp); + msg.src = ep->port; + if (flags & SCIF_SIGNAL_LOCAL) { + msg.uop = SCIF_SIG_LOCAL; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = roff; + msg.payload[2] = rval; + msg.payload[3] = (u64)fence_req; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + if (err) + goto error_free; + err = _scif_send_fence_signal_wait(ep, fence_req); + if (err) + goto error_free; + } + fence_req->state = OP_IN_PROGRESS; + + if (flags & SCIF_SIGNAL_REMOTE) { + msg.uop = SCIF_SIG_REMOTE; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = loff; + msg.payload[2] = lval; + msg.payload[3] = (u64)fence_req; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + if (err) + goto error_free; + err = _scif_send_fence_signal_wait(ep, fence_req); + } +error_free: + kfree(fence_req); +error: + return err; +} + +static void scif_fence_mark_cb(void *arg) +{ + struct scif_endpt *ep = (struct scif_endpt *)arg; + + wake_up_interruptible(&ep->rma_info.markwq); + atomic_dec(&ep->rma_info.fence_refcount); +} + +/* + * _scif_fence_mark: + * + * @epd - endpoint descriptor + * Set up a mark for this endpoint and return the value of the mark. 
+ */ +int _scif_fence_mark(scif_epd_t epd, int *mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct dma_chan *chan = ep->rma_info.dma_chan; + struct dma_device *ddev = chan->device; + struct dma_async_tx_descriptor *tx; + dma_cookie_t cookie; + int err; + + tx = ddev->device_prep_dma_memcpy(chan, 0, 0, 0, DMA_PREP_FENCE); + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = (int)cookie; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + dma_async_issue_pending(chan); + tx = ddev->device_prep_dma_interrupt(chan, DMA_PREP_INTERRUPT); + if (!tx) { + err = -ENOMEM; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + tx->callback = scif_fence_mark_cb; + tx->callback_param = ep; + *mark = cookie = tx->tx_submit(tx); + if (dma_submit_error(cookie)) { + err = (int)cookie; + dev_err(&ep->remote_dev->sdev->dev, "%s %d err %d\n", + __func__, __LINE__, err); + return err; + } + atomic_inc(&ep->rma_info.fence_refcount); + dma_async_issue_pending(chan); + return 0; +} + +#define SCIF_LOOPB_MAGIC_MARK 0xdead + +int scif_fence_mark(scif_epd_t epd, int flags, int *mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x\n", + ep, flags, *mark); + err = scif_verify_epd(ep); + if (err) + return err; + + /* Invalid flags? 
*/ + if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* At least one of init self or peer RMA should be set */ + if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) + return -EINVAL; + + /* Exactly one of init self or peer RMA should be set but not both */ + if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* + * Management node loopback does not need to use DMA. + * Return a valid mark to be symmetric. + */ + if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) { + *mark = SCIF_LOOPB_MAGIC_MARK; + return 0; + } + + if (flags & SCIF_FENCE_INIT_SELF) + err = _scif_fence_mark(epd, mark); + else + err = scif_send_fence_mark(ep, mark); + + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_mark: ep %p flags 0x%x mark 0x%x err %d\n", + ep, flags, *mark, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_fence_mark); + +int scif_fence_wait(scif_epd_t epd, int mark) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_wait: ep %p mark 0x%x\n", + ep, mark); + err = scif_verify_epd(ep); + if (err) + return err; + /* + * Management node loopback does not need to use DMA. + * The only valid mark provided is 0 so simply + * return success if the mark is valid. 
+ */ + if (scifdev_self(ep->remote_dev) && scif_is_mgmt_node()) { + if (mark == SCIF_LOOPB_MAGIC_MARK) + return 0; + else + return -EINVAL; + } + if (mark & SCIF_REMOTE_FENCE) + err = scif_send_fence_wait(epd, mark); + else + err = _scif_fence_wait(epd, mark); + if (err < 0) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_fence_wait); + +int scif_fence_signal(scif_epd_t epd, off_t loff, u64 lval, + off_t roff, u64 rval, int flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + int err = 0; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI fence_signal: ep %p loff 0x%lx lval 0x%llx roff 0x%lx rval 0x%llx flags 0x%x\n", + ep, loff, lval, roff, rval, flags); + err = scif_verify_epd(ep); + if (err) + return err; + + /* Invalid flags? */ + if (flags & ~(SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER | + SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE)) + return -EINVAL; + + /* At least one of init self or peer RMA should be set */ + if (!(flags & (SCIF_FENCE_INIT_SELF | SCIF_FENCE_INIT_PEER))) + return -EINVAL; + + /* Exactly one of init self or peer RMA should be set but not both */ + if ((flags & SCIF_FENCE_INIT_SELF) && (flags & SCIF_FENCE_INIT_PEER)) + return -EINVAL; + + /* At least one of SCIF_SIGNAL_LOCAL or SCIF_SIGNAL_REMOTE required */ + if (!(flags & (SCIF_SIGNAL_LOCAL | SCIF_SIGNAL_REMOTE))) + return -EINVAL; + + /* Only Dword offsets allowed */ + if ((flags & SCIF_SIGNAL_LOCAL) && (loff & (sizeof(u32) - 1))) + return -EINVAL; + + /* Only Dword aligned offsets allowed */ + if ((flags & SCIF_SIGNAL_REMOTE) && (roff & (sizeof(u32) - 1))) + return -EINVAL; + + if (flags & SCIF_FENCE_INIT_PEER) { + err = scif_send_fence_signal(epd, roff, rval, loff, + lval, flags); + } else { + /* Local Signal in Local RAS */ + if (flags & SCIF_SIGNAL_LOCAL) { + err = scif_prog_signal(epd, loff, lval, + SCIF_WINDOW_SELF); + if (err) + goto error_ret; + } + + /* Signal in Remote RAS */ + if (flags & 
SCIF_SIGNAL_REMOTE) + err = scif_prog_signal(epd, roff, + rval, SCIF_WINDOW_PEER); + } +error_ret: + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_fence_signal); diff --git a/kernel/drivers/misc/mic/scif/scif_main.c b/kernel/drivers/misc/mic/scif/scif_main.c new file mode 100644 index 000000000..36d847af1 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_main.c @@ -0,0 +1,359 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include <linux/module.h> +#include <linux/idr.h> + +#include <linux/mic_common.h> +#include "../common/mic_dev.h" +#include "../bus/scif_bus.h" +#include "scif_peer_bus.h" +#include "scif_main.h" +#include "scif_map.h" + +struct scif_info scif_info = { + .mdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "scif", + .fops = &scif_fops, + } +}; + +struct scif_dev *scif_dev; +struct kmem_cache *unaligned_cache; +static atomic_t g_loopb_cnt; + +/* Runs in the context of intr_wq */ +static void scif_intr_bh_handler(struct work_struct *work) +{ + struct scif_dev *scifdev = + container_of(work, struct scif_dev, intr_bh); + + if (scifdev_self(scifdev)) + scif_loopb_msg_handler(scifdev, scifdev->qpairs); + else + scif_nodeqp_intrhandler(scifdev, scifdev->qpairs); +} + +int scif_setup_intr_wq(struct scif_dev *scifdev) +{ + if (!scifdev->intr_wq) { + snprintf(scifdev->intr_wqname, sizeof(scifdev->intr_wqname), + "SCIF INTR %d", scifdev->node); + scifdev->intr_wq = + alloc_ordered_workqueue(scifdev->intr_wqname, 0); + if (!scifdev->intr_wq) + return -ENOMEM; + INIT_WORK(&scifdev->intr_bh, scif_intr_bh_handler); + } + return 0; +} + +void scif_destroy_intr_wq(struct scif_dev *scifdev) +{ + if (scifdev->intr_wq) { + destroy_workqueue(scifdev->intr_wq); + scifdev->intr_wq = NULL; + } +} + +irqreturn_t scif_intr_handler(int irq, void *data) +{ + struct scif_dev *scifdev = data; + struct scif_hw_dev *sdev = scifdev->sdev; + + sdev->hw_ops->ack_interrupt(sdev, scifdev->db); + queue_work(scifdev->intr_wq, &scifdev->intr_bh); + return IRQ_HANDLED; +} + +static void scif_qp_setup_handler(struct work_struct *work) +{ + struct scif_dev *scifdev = container_of(work, struct scif_dev, + qp_dwork.work); + struct scif_hw_dev *sdev = scifdev->sdev; + dma_addr_t da = 0; + int err; + + if (scif_is_mgmt_node()) { + struct mic_bootparam *bp = sdev->dp; + + da = bp->scif_card_dma_addr; + scifdev->rdb = bp->h2c_scif_db; + } else { + struct mic_bootparam __iomem *bp = sdev->rdp; + + da = 
readq(&bp->scif_host_dma_addr); + scifdev->rdb = ioread8(&bp->c2h_scif_db); + } + if (da) { + err = scif_qp_response(da, scifdev); + if (err) + dev_err(&scifdev->sdev->dev, + "scif_qp_response err %d\n", err); + } else { + schedule_delayed_work(&scifdev->qp_dwork, + msecs_to_jiffies(1000)); + } +} + +static int scif_setup_scifdev(void) +{ + /* We support a maximum of 129 SCIF nodes including the mgmt node */ +#define MAX_SCIF_NODES 129 + int i; + u8 num_nodes = MAX_SCIF_NODES; + + scif_dev = kcalloc(num_nodes, sizeof(*scif_dev), GFP_KERNEL); + if (!scif_dev) + return -ENOMEM; + for (i = 0; i < num_nodes; i++) { + struct scif_dev *scifdev = &scif_dev[i]; + + scifdev->node = i; + scifdev->exit = OP_IDLE; + init_waitqueue_head(&scifdev->disconn_wq); + mutex_init(&scifdev->lock); + INIT_WORK(&scifdev->peer_add_work, scif_add_peer_device); + INIT_DELAYED_WORK(&scifdev->p2p_dwork, + scif_poll_qp_state); + INIT_DELAYED_WORK(&scifdev->qp_dwork, + scif_qp_setup_handler); + INIT_LIST_HEAD(&scifdev->p2p); + RCU_INIT_POINTER(scifdev->spdev, NULL); + } + return 0; +} + +static void scif_destroy_scifdev(void) +{ + kfree(scif_dev); +} + +static int scif_probe(struct scif_hw_dev *sdev) +{ + struct scif_dev *scifdev = &scif_dev[sdev->dnode]; + int rc; + + dev_set_drvdata(&sdev->dev, sdev); + scifdev->sdev = sdev; + + if (1 == atomic_add_return(1, &g_loopb_cnt)) { + struct scif_dev *loopb_dev = &scif_dev[sdev->snode]; + + loopb_dev->sdev = sdev; + rc = scif_setup_loopback_qp(loopb_dev); + if (rc) + goto exit; + } + + rc = scif_setup_intr_wq(scifdev); + if (rc) + goto destroy_loopb; + rc = scif_setup_qp(scifdev); + if (rc) + goto destroy_intr; + scifdev->db = sdev->hw_ops->next_db(sdev); + scifdev->cookie = sdev->hw_ops->request_irq(sdev, scif_intr_handler, + "SCIF_INTR", scifdev, + scifdev->db); + if (IS_ERR(scifdev->cookie)) { + rc = PTR_ERR(scifdev->cookie); + goto free_qp; + } + if (scif_is_mgmt_node()) { + struct mic_bootparam *bp = sdev->dp; + + bp->c2h_scif_db = scifdev->db; + 
bp->scif_host_dma_addr = scifdev->qp_dma_addr; + } else { + struct mic_bootparam __iomem *bp = sdev->rdp; + + iowrite8(scifdev->db, &bp->h2c_scif_db); + writeq(scifdev->qp_dma_addr, &bp->scif_card_dma_addr); + } + schedule_delayed_work(&scifdev->qp_dwork, + msecs_to_jiffies(1000)); + return rc; +free_qp: + scif_free_qp(scifdev); +destroy_intr: + scif_destroy_intr_wq(scifdev); +destroy_loopb: + if (atomic_dec_and_test(&g_loopb_cnt)) + scif_destroy_loopback_qp(&scif_dev[sdev->snode]); +exit: + return rc; +} + +void scif_stop(struct scif_dev *scifdev) +{ + struct scif_dev *dev; + int i; + + for (i = scif_info.maxid; i >= 0; i--) { + dev = &scif_dev[i]; + if (scifdev_self(dev)) + continue; + scif_handle_remove_node(i); + } +} + +static void scif_remove(struct scif_hw_dev *sdev) +{ + struct scif_dev *scifdev = &scif_dev[sdev->dnode]; + + if (scif_is_mgmt_node()) { + struct mic_bootparam *bp = sdev->dp; + + bp->c2h_scif_db = -1; + bp->scif_host_dma_addr = 0x0; + } else { + struct mic_bootparam __iomem *bp = sdev->rdp; + + iowrite8(-1, &bp->h2c_scif_db); + writeq(0x0, &bp->scif_card_dma_addr); + } + if (scif_is_mgmt_node()) { + scif_disconnect_node(scifdev->node, true); + } else { + scif_info.card_initiated_exit = true; + scif_stop(scifdev); + } + if (atomic_dec_and_test(&g_loopb_cnt)) + scif_destroy_loopback_qp(&scif_dev[sdev->snode]); + if (scifdev->cookie) { + sdev->hw_ops->free_irq(sdev, scifdev->cookie, scifdev); + scifdev->cookie = NULL; + } + scif_destroy_intr_wq(scifdev); + cancel_delayed_work(&scifdev->qp_dwork); + scif_free_qp(scifdev); + scifdev->rdb = -1; + scifdev->sdev = NULL; +} + +static struct scif_hw_dev_id id_table[] = { + { MIC_SCIF_DEV, SCIF_DEV_ANY_ID }, + { 0 }, +}; + +static struct scif_driver scif_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = scif_probe, + .remove = scif_remove, +}; + +static int _scif_init(void) +{ + int rc; + + mutex_init(&scif_info.eplock); + 
spin_lock_init(&scif_info.rmalock); + spin_lock_init(&scif_info.nb_connect_lock); + spin_lock_init(&scif_info.port_lock); + mutex_init(&scif_info.conflock); + mutex_init(&scif_info.connlock); + mutex_init(&scif_info.fencelock); + INIT_LIST_HEAD(&scif_info.uaccept); + INIT_LIST_HEAD(&scif_info.listen); + INIT_LIST_HEAD(&scif_info.zombie); + INIT_LIST_HEAD(&scif_info.connected); + INIT_LIST_HEAD(&scif_info.disconnected); + INIT_LIST_HEAD(&scif_info.rma); + INIT_LIST_HEAD(&scif_info.rma_tc); + INIT_LIST_HEAD(&scif_info.mmu_notif_cleanup); + INIT_LIST_HEAD(&scif_info.fence); + INIT_LIST_HEAD(&scif_info.nb_connect_list); + init_waitqueue_head(&scif_info.exitwq); + scif_info.rma_tc_limit = SCIF_RMA_TEMP_CACHE_LIMIT; + scif_info.en_msg_log = 0; + scif_info.p2p_enable = 1; + rc = scif_setup_scifdev(); + if (rc) + goto error; + unaligned_cache = kmem_cache_create("Unaligned_DMA", + SCIF_KMEM_UNALIGNED_BUF_SIZE, + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!unaligned_cache) { + rc = -ENOMEM; + goto free_sdev; + } + INIT_WORK(&scif_info.misc_work, scif_misc_handler); + INIT_WORK(&scif_info.mmu_notif_work, scif_mmu_notif_handler); + INIT_WORK(&scif_info.conn_work, scif_conn_handler); + idr_init(&scif_ports); + return 0; +free_sdev: + scif_destroy_scifdev(); +error: + return rc; +} + +static void _scif_exit(void) +{ + idr_destroy(&scif_ports); + kmem_cache_destroy(unaligned_cache); + scif_destroy_scifdev(); +} + +static int __init scif_init(void) +{ + struct miscdevice *mdev = &scif_info.mdev; + int rc; + + _scif_init(); + iova_cache_get(); + rc = scif_peer_bus_init(); + if (rc) + goto exit; + rc = scif_register_driver(&scif_driver); + if (rc) + goto peer_bus_exit; + rc = misc_register(mdev); + if (rc) + goto unreg_scif; + scif_init_debugfs(); + return 0; +unreg_scif: + scif_unregister_driver(&scif_driver); +peer_bus_exit: + scif_peer_bus_exit(); +exit: + _scif_exit(); + return rc; +} + +static void __exit scif_exit(void) +{ + scif_exit_debugfs(); + misc_deregister(&scif_info.mdev); + 
scif_unregister_driver(&scif_driver); + scif_peer_bus_exit(); + iova_cache_put(); + _scif_exit(); +} + +module_init(scif_init); +module_exit(scif_exit); + +MODULE_DEVICE_TABLE(scif, id_table); +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) SCIF driver"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/misc/mic/scif/scif_main.h b/kernel/drivers/misc/mic/scif/scif_main.h new file mode 100644 index 000000000..a08f0b600 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_main.h @@ -0,0 +1,283 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#ifndef SCIF_MAIN_H +#define SCIF_MAIN_H + +#include <linux/sched.h> +#include <linux/pci.h> +#include <linux/miscdevice.h> +#include <linux/dmaengine.h> +#include <linux/iova.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/vmalloc.h> +#include <linux/scif.h> +#include "../common/mic_dev.h" + +#define SCIF_MGMT_NODE 0 +#define SCIF_DEFAULT_WATCHDOG_TO 30 +#define SCIF_NODE_ACCEPT_TIMEOUT (3 * HZ) +#define SCIF_NODE_ALIVE_TIMEOUT (SCIF_DEFAULT_WATCHDOG_TO * HZ) +#define SCIF_RMA_TEMP_CACHE_LIMIT 0x20000 + +/* + * Generic state used for certain node QP message exchanges + * like Unregister, Alloc etc. 
+ */ +enum scif_msg_state { + OP_IDLE = 1, + OP_IN_PROGRESS, + OP_COMPLETED, + OP_FAILED +}; + +/* + * struct scif_info - Global SCIF information + * + * @nodeid: Node ID this node is to others + * @maxid: Max known node ID + * @total: Total number of SCIF nodes + * @nr_zombies: number of zombie endpoints + * @eplock: Lock to synchronize listening, zombie endpoint lists + * @connlock: Lock to synchronize connected and disconnected lists + * @nb_connect_lock: Synchronize non blocking connect operations + * @port_lock: Synchronize access to SCIF ports + * @uaccept: List of user acceptreq waiting for acceptreg + * @listen: List of listening end points + * @zombie: List of zombie end points with pending RMA's + * @connected: List of end points in connected state + * @disconnected: List of end points in disconnected state + * @nb_connect_list: List for non blocking connections + * @misc_work: miscellaneous SCIF tasks + * @conflock: Lock to synchronize SCIF node configuration changes + * @en_msg_log: Enable debug message logging + * @p2p_enable: Enable P2P SCIF network + * @mdev: The MISC device + * @conn_work: Work for workqueue handling all connections + * @exitwq: Wait queue for waiting for an EXIT node QP message response + * @loopb_dev: Dummy SCIF device used for loopback + * @loopb_wq: Workqueue used for handling loopback messages + * @loopb_wqname[16]: Name of loopback workqueue + * @loopb_work: Used for submitting work to loopb_wq + * @loopb_recv_q: List of messages received on the loopb_wq + * @card_initiated_exit: set when the card has initiated the exit + * @rmalock: Synchronize access to RMA operations + * @fencelock: Synchronize access to list of remote fences requested. + * @rma: List of temporary registered windows to be destroyed. 
+ * @rma_tc: List of temporary registered & cached Windows to be destroyed + * @fence: List of remote fence requests + * @mmu_notif_work: Work for registration caching MMU notifier workqueue + * @mmu_notif_cleanup: List of temporary cached windows for reg cache + * @rma_tc_limit: RMA temporary cache limit + */ +struct scif_info { + u8 nodeid; + u8 maxid; + u8 total; + u32 nr_zombies; + struct mutex eplock; + struct mutex connlock; + spinlock_t nb_connect_lock; + spinlock_t port_lock; + struct list_head uaccept; + struct list_head listen; + struct list_head zombie; + struct list_head connected; + struct list_head disconnected; + struct list_head nb_connect_list; + struct work_struct misc_work; + struct mutex conflock; + u8 en_msg_log; + u8 p2p_enable; + struct miscdevice mdev; + struct work_struct conn_work; + wait_queue_head_t exitwq; + struct scif_dev *loopb_dev; + struct workqueue_struct *loopb_wq; + char loopb_wqname[16]; + struct work_struct loopb_work; + struct list_head loopb_recv_q; + bool card_initiated_exit; + spinlock_t rmalock; + struct mutex fencelock; + struct list_head rma; + struct list_head rma_tc; + struct list_head fence; + struct work_struct mmu_notif_work; + struct list_head mmu_notif_cleanup; + unsigned long rma_tc_limit; +}; + +/* + * struct scif_p2p_info - SCIF mapping information used for P2P + * + * @ppi_peer_id - SCIF peer node id + * @ppi_sg - Scatter list for bar information (One for mmio and one for aper) + * @sg_nentries - Number of entries in the scatterlist + * @ppi_da: DMA address for MMIO and APER bars + * @ppi_len: Length of MMIO and APER bars + * @ppi_list: Link in list of mapping information + */ +struct scif_p2p_info { + u8 ppi_peer_id; + struct scatterlist *ppi_sg[2]; + u64 sg_nentries[2]; + dma_addr_t ppi_da[2]; + u64 ppi_len[2]; +#define SCIF_PPI_MMIO 0 +#define SCIF_PPI_APER 1 + struct list_head ppi_list; +}; + +/* + * struct scif_dev - SCIF remote device specific fields + * + * @node: Node id + * @p2p: List of P2P mapping 
information + * @qpairs: The node queue pair for exchanging control messages + * @intr_wq: Workqueue for handling Node QP messages + * @intr_wqname: Name of node QP workqueue for handling interrupts + * @intr_bh: Used for submitting work to intr_wq + * @lock: Lock used for synchronizing access to the scif device + * @sdev: SCIF hardware device on the SCIF hardware bus + * @db: doorbell the peer will trigger to generate an interrupt on self + * @rdb: Doorbell to trigger on the peer to generate an interrupt on the peer + * @cookie: Cookie received while registering the interrupt handler + * @peer_add_work: Work for handling device_add for peer devices + * @p2p_dwork: Delayed work to enable polling for P2P state + * @qp_dwork: Delayed work for enabling polling for remote QP information + * @p2p_retry: Number of times to retry polling of P2P state + * @base_addr: P2P aperture bar base address + * @mic_mw mmio: The peer MMIO information used for P2P + * @spdev: SCIF peer device on the SCIF peer bus + * @node_remove_ack_pending: True if a node_remove_ack is pending + * @exit_ack_pending: true if an exit_ack is pending + * @disconn_wq: Used while waiting for a node remove response + * @disconn_rescnt: Keeps track of number of node remove requests sent + * @exit: Status of exit message + * @qp_dma_addr: Queue pair DMA address passed to the peer + * @dma_ch_idx: Round robin index for DMA channels + * @signal_pool: DMA pool used for scheduling scif_fence_signal DMA's +*/ +struct scif_dev { + u8 node; + struct list_head p2p; + struct scif_qp *qpairs; + struct workqueue_struct *intr_wq; + char intr_wqname[16]; + struct work_struct intr_bh; + struct mutex lock; + struct scif_hw_dev *sdev; + int db; + int rdb; + struct mic_irq *cookie; + struct work_struct peer_add_work; + struct delayed_work p2p_dwork; + struct delayed_work qp_dwork; + int p2p_retry; + dma_addr_t base_addr; + struct mic_mw mmio; + struct scif_peer_dev __rcu *spdev; + bool node_remove_ack_pending; + bool 
exit_ack_pending; + wait_queue_head_t disconn_wq; + atomic_t disconn_rescnt; + enum scif_msg_state exit; + dma_addr_t qp_dma_addr; + int dma_ch_idx; + struct dma_pool *signal_pool; +}; + +extern bool scif_reg_cache_enable; +extern bool scif_ulimit_check; +extern struct scif_info scif_info; +extern struct idr scif_ports; +extern struct bus_type scif_peer_bus; +extern struct scif_dev *scif_dev; +extern const struct file_operations scif_fops; +extern const struct file_operations scif_anon_fops; + +/* Size of the RB for the Node QP */ +#define SCIF_NODE_QP_SIZE 0x10000 + +#include "scif_nodeqp.h" +#include "scif_rma.h" +#include "scif_rma_list.h" + +/* + * scifdev_self: + * @dev: The remote SCIF Device + * + * Returns true if the SCIF Device passed is the self aka Loopback SCIF device. + */ +static inline int scifdev_self(struct scif_dev *dev) +{ + return dev->node == scif_info.nodeid; +} + +static inline bool scif_is_mgmt_node(void) +{ + return !scif_info.nodeid; +} + +/* + * scifdev_is_p2p: + * @dev: The remote SCIF Device + * + * Returns true if the SCIF Device is a MIC Peer to Peer SCIF device. + */ +static inline bool scifdev_is_p2p(struct scif_dev *dev) +{ + if (scif_is_mgmt_node()) + return false; + else + return dev != &scif_dev[SCIF_MGMT_NODE] && + !scifdev_self(dev); +} + +/* + * scifdev_alive: + * @scifdev: The remote SCIF Device + * + * Returns true if the remote SCIF Device is running or sleeping for + * this endpoint. 
+ */ +static inline int _scifdev_alive(struct scif_dev *scifdev) +{ + struct scif_peer_dev *spdev; + + rcu_read_lock(); + spdev = rcu_dereference(scifdev->spdev); + rcu_read_unlock(); + return !!spdev; +} + +#include "scif_epd.h" + +void __init scif_init_debugfs(void); +void scif_exit_debugfs(void); +int scif_setup_intr_wq(struct scif_dev *scifdev); +void scif_destroy_intr_wq(struct scif_dev *scifdev); +void scif_cleanup_scifdev(struct scif_dev *dev); +void scif_handle_remove_node(int node); +void scif_disconnect_node(u32 node_id, bool mgmt_initiated); +void scif_free_qp(struct scif_dev *dev); +void scif_misc_handler(struct work_struct *work); +void scif_stop(struct scif_dev *scifdev); +irqreturn_t scif_intr_handler(int irq, void *data); +#endif /* SCIF_MAIN_H */ diff --git a/kernel/drivers/misc/mic/scif/scif_map.h b/kernel/drivers/misc/mic/scif/scif_map.h new file mode 100644 index 000000000..3e86360ba --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_map.h @@ -0,0 +1,136 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#ifndef SCIF_MAP_H +#define SCIF_MAP_H + +#include "../bus/scif_bus.h" + +static __always_inline void * +scif_alloc_coherent(dma_addr_t *dma_handle, + struct scif_dev *scifdev, size_t size, + gfp_t gfp) +{ + void *va; + + if (scifdev_self(scifdev)) { + va = kmalloc(size, gfp); + if (va) + *dma_handle = virt_to_phys(va); + } else { + va = dma_alloc_coherent(&scifdev->sdev->dev, + size, dma_handle, gfp); + if (va && scifdev_is_p2p(scifdev)) + *dma_handle = *dma_handle + scifdev->base_addr; + } + return va; +} + +static __always_inline void +scif_free_coherent(void *va, dma_addr_t local, + struct scif_dev *scifdev, size_t size) +{ + if (scifdev_self(scifdev)) { + kfree(va); + } else { + if (scifdev_is_p2p(scifdev) && local > scifdev->base_addr) + local = local - scifdev->base_addr; + dma_free_coherent(&scifdev->sdev->dev, + size, va, local); + } +} + +static __always_inline int +scif_map_single(dma_addr_t *dma_handle, + void *local, struct scif_dev *scifdev, size_t size) +{ + int err = 0; + + if (scifdev_self(scifdev)) { + *dma_handle = virt_to_phys((local)); + } else { + *dma_handle = dma_map_single(&scifdev->sdev->dev, + local, size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&scifdev->sdev->dev, *dma_handle)) + err = -ENOMEM; + else if (scifdev_is_p2p(scifdev)) + *dma_handle = *dma_handle + scifdev->base_addr; + } + if (err) + *dma_handle = 0; + return err; +} + +static __always_inline void +scif_unmap_single(dma_addr_t local, struct scif_dev *scifdev, + size_t size) +{ + if (!scifdev_self(scifdev)) { + if (scifdev_is_p2p(scifdev)) + local = local - scifdev->base_addr; + dma_unmap_single(&scifdev->sdev->dev, local, + size, DMA_BIDIRECTIONAL); + } +} + +static __always_inline void * +scif_ioremap(dma_addr_t phys, size_t size, struct scif_dev *scifdev) +{ + void *out_virt; + struct scif_hw_dev *sdev = scifdev->sdev; + + if (scifdev_self(scifdev)) + out_virt = phys_to_virt(phys); + else + out_virt = (void __force *) + sdev->hw_ops->ioremap(sdev, phys, 
size); + return out_virt; +} + +static __always_inline void +scif_iounmap(void *virt, size_t len, struct scif_dev *scifdev) +{ + if (!scifdev_self(scifdev)) { + struct scif_hw_dev *sdev = scifdev->sdev; + + sdev->hw_ops->iounmap(sdev, (void __force __iomem *)virt); + } +} + +static __always_inline int +scif_map_page(dma_addr_t *dma_handle, struct page *page, + struct scif_dev *scifdev) +{ + int err = 0; + + if (scifdev_self(scifdev)) { + *dma_handle = page_to_phys(page); + } else { + struct scif_hw_dev *sdev = scifdev->sdev; + *dma_handle = dma_map_page(&sdev->dev, + page, 0x0, PAGE_SIZE, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(&sdev->dev, *dma_handle)) + err = -ENOMEM; + else if (scifdev_is_p2p(scifdev)) + *dma_handle = *dma_handle + scifdev->base_addr; + } + if (err) + *dma_handle = 0; + return err; +} +#endif /* SCIF_MAP_H */ diff --git a/kernel/drivers/misc/mic/scif/scif_mmap.c b/kernel/drivers/misc/mic/scif/scif_mmap.c new file mode 100644 index 000000000..49cb8f7b4 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_mmap.c @@ -0,0 +1,699 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "scif_main.h" + +/* + * struct scif_vma_info - Information about a remote memory mapping + * created via scif_mmap(..) 
+ * @vma: VM area struct + * @list: link to list of active vmas + */ +struct scif_vma_info { + struct vm_area_struct *vma; + struct list_head list; +}; + +void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_rma_req req; + struct scif_window *window = NULL; + struct scif_window *recv_window = + (struct scif_window *)msg->payload[0]; + struct scif_endpt *ep; + + ep = (struct scif_endpt *)recv_window->ep; + req.out_window = &window; + req.offset = recv_window->offset; + req.prot = recv_window->prot; + req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; + req.type = SCIF_WINDOW_FULL; + req.head = &ep->rma_info.reg_list; + msg->payload[0] = ep->remote_ep; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + if (scif_query_window(&req)) { + dev_err(&scifdev->sdev->dev, + "%s %d -ENXIO\n", __func__, __LINE__); + msg->uop = SCIF_UNREGISTER_ACK; + goto error; + } + + scif_put_window(window, window->nr_pages); + + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + ep->rma_info.async_list_del = 1; + list_del_init(&window->list); + scif_free_window_offset(ep, window, window->offset); + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (window && !window->ref_count) + scif_queue_for_cleanup(window, &scif_info.rma); +} + +/* + * Remove valid remote memory mappings created via scif_mmap(..) 
from the + * process address space since the remote node is lost + */ +static void __scif_zap_mmaps(struct scif_endpt *ep) +{ + struct list_head *item; + struct scif_vma_info *info; + struct vm_area_struct *vma; + unsigned long size; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.vma_list) { + info = list_entry(item, struct scif_vma_info, list); + vma = info->vma; + size = vma->vm_end - vma->vm_start; + zap_vma_ptes(vma, vma->vm_start, size); + dev_dbg(scif_info.mdev.this_device, + "%s ep %p zap vma %p size 0x%lx\n", + __func__, ep, info->vma, size); + } + spin_unlock(&ep->lock); +} + +/* + * Traverse the list of endpoints for a particular remote node and + * zap valid remote memory mappings since the remote node is lost + */ +static void _scif_zap_mmaps(int node, struct list_head *head) +{ + struct scif_endpt *ep; + struct list_head *item; + + mutex_lock(&scif_info.connlock); + list_for_each(item, head) { + ep = list_entry(item, struct scif_endpt, list); + if (ep->remote_dev->node == node) + __scif_zap_mmaps(ep); + } + mutex_unlock(&scif_info.connlock); +} + +/* + * Wrapper for removing remote memory mappings for a particular node. This API + * is called by peer nodes as part of handling a lost node. + */ +void scif_zap_mmaps(int node) +{ + _scif_zap_mmaps(node, &scif_info.connected); + _scif_zap_mmaps(node, &scif_info.disconnected); +} + +/* + * This API is only called while handling a lost node: + * a) Remote node is dead. + * b) Remote memory mappings have been zapped + * So we can traverse the remote_reg_list without any locks. Since + * the window has not yet been unregistered we can drop the ref count + * and queue it to the cleanup thread. 
+ */ +static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep) +{ + struct list_head *pos, *tmp; + struct scif_window *window; + + list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) { + window = list_entry(pos, struct scif_window, list); + if (window->ref_count) + scif_put_window(window, window->nr_pages); + else + dev_err(scif_info.mdev.this_device, + "%s %d unexpected\n", + __func__, __LINE__); + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + list_del_init(&window->list); + scif_queue_for_cleanup(window, &scif_info.rma); + } + } +} + +/* Cleanup remote registration lists for zombie endpoints */ +void scif_cleanup_rma_for_zombies(int node) +{ + struct scif_endpt *ep; + struct list_head *item; + + mutex_lock(&scif_info.eplock); + list_for_each(item, &scif_info.zombie) { + ep = list_entry(item, struct scif_endpt, list); + if (ep->remote_dev && ep->remote_dev->node == node) + __scif_cleanup_rma_for_zombies(ep); + } + mutex_unlock(&scif_info.eplock); + flush_work(&scif_info.misc_work); +} + +/* Insert the VMA into the per endpoint VMA list */ +static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma) +{ + struct scif_vma_info *info; + int err = 0; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + err = -ENOMEM; + goto done; + } + info->vma = vma; + spin_lock(&ep->lock); + list_add_tail(&info->list, &ep->rma_info.vma_list); + spin_unlock(&ep->lock); +done: + return err; +} + +/* Delete the VMA from the per endpoint VMA list */ +static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma) +{ + struct list_head *item; + struct scif_vma_info *info; + + spin_lock(&ep->lock); + list_for_each(item, &ep->rma_info.vma_list) { + info = list_entry(item, struct scif_vma_info, list); + if (info->vma == vma) { + list_del(&info->list); + kfree(info); + break; + } + } + spin_unlock(&ep->lock); +} + +static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep) +{ + struct scif_dev 
*scifdev = (struct scif_dev *)ep->remote_dev; + struct scif_hw_dev *sdev = scifdev->sdev; + phys_addr_t out_phys, apt_base = 0; + + /* + * If the DMA address is card relative then we need to add the + * aperture base for mmap to work correctly + */ + if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da) + apt_base = sdev->aper->pa; + out_phys = apt_base + phys; + return out_phys; +} + +int scif_get_pages(scif_epd_t epd, off_t offset, size_t len, + struct scif_range **pages) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_rma_req req; + struct scif_window *window = NULL; + int nr_pages, err, i; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n", + ep, offset, len); + err = scif_verify_epd(ep); + if (err) + return err; + + if (!len || (offset < 0) || + (offset + len < offset) || + (ALIGN(offset, PAGE_SIZE) != offset) || + (ALIGN(len, PAGE_SIZE) != len)) + return -EINVAL; + + nr_pages = len >> PAGE_SHIFT; + + req.out_window = &window; + req.offset = offset; + req.prot = 0; + req.nr_bytes = len; + req.type = SCIF_WINDOW_SINGLE; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? 
*/ + err = scif_query_window(&req); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + + /* Allocate scif_range */ + *pages = kzalloc(sizeof(**pages), GFP_KERNEL); + if (!*pages) { + err = -ENOMEM; + goto error; + } + + /* Allocate phys addr array */ + (*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t)); + if (!((*pages)->phys_addr)) { + err = -ENOMEM; + goto error; + } + + if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) { + /* Allocate virtual address array */ + ((*pages)->va = scif_zalloc(nr_pages * sizeof(void *))); + if (!(*pages)->va) { + err = -ENOMEM; + goto error; + } + } + /* Populate the values */ + (*pages)->cookie = window; + (*pages)->nr_pages = nr_pages; + (*pages)->prot_flags = window->prot; + + for (i = 0; i < nr_pages; i++) { + (*pages)->phys_addr[i] = + __scif_off_to_dma_addr(window, offset + + (i * PAGE_SIZE)); + (*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i], + ep); + if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) + (*pages)->va[i] = + ep->remote_dev->sdev->aper->va + + (*pages)->phys_addr[i] - + ep->remote_dev->sdev->aper->pa; + } + + scif_get_window(window, nr_pages); +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (err) { + if (*pages) { + scif_free((*pages)->phys_addr, + nr_pages * sizeof(dma_addr_t)); + scif_free((*pages)->va, + nr_pages * sizeof(void *)); + kfree(*pages); + *pages = NULL; + } + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + } + return err; +} +EXPORT_SYMBOL_GPL(scif_get_pages); + +int scif_put_pages(struct scif_range *pages) +{ + struct scif_endpt *ep; + struct scif_window *window; + struct scifmsg msg; + + if (!pages || !pages->cookie) + return -EINVAL; + + window = pages->cookie; + + if (!window || window->magic != SCIFEP_MAGIC) + return -EINVAL; + + ep = (struct scif_endpt *)window->ep; + /* + * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the + * 
callee should be allowed to release references to the pages, + * else the endpoint was not connected in the first place, + * hence the ENOTCONN. + */ + if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED) + return -ENOTCONN; + + mutex_lock(&ep->rma_info.rma_lock); + + scif_put_window(window, pages->nr_pages); + + /* Initiate window destruction if ref count is zero */ + if (!window->ref_count) { + list_del(&window->list); + mutex_unlock(&ep->rma_info.rma_lock); + scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan); + /* Inform the peer about this window being destroyed. */ + msg.uop = SCIF_MUNMAP; + msg.src = ep->port; + msg.payload[0] = window->peer_window; + /* No error handling for notification messages */ + scif_nodeqp_send(ep->remote_dev, &msg); + /* Destroy this window from the peer's registered AS */ + scif_destroy_remote_window(window); + } else { + mutex_unlock(&ep->rma_info.rma_lock); + } + + scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t)); + scif_free(pages->va, pages->nr_pages * sizeof(void *)); + kfree(pages); + return 0; +} +EXPORT_SYMBOL_GPL(scif_put_pages); + +/* + * scif_rma_list_mmap: + * + * Traverse the remote registration list starting from start_window: + * 1) Create VtoP mappings via remap_pfn_range(..) + * 2) Once step 1) and 2) complete successfully then traverse the range of + * windows again and bump the reference count. + * RMA lock must be held. 
+ */ +static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset, + int nr_pages, struct vm_area_struct *vma) +{ + s64 end_offset, loop_offset = offset; + struct scif_window *window = start_window; + int loop_nr_pages, nr_pages_left = nr_pages; + struct scif_endpt *ep = (struct scif_endpt *)start_window->ep; + struct list_head *head = &ep->rma_info.remote_reg_list; + int i, err = 0; + dma_addr_t phys_addr; + struct scif_window_iter src_win_iter; + size_t contig_bytes = 0; + + might_sleep(); + list_for_each_entry_from(window, head, list) { + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min_t(int, + (end_offset - loop_offset) >> PAGE_SHIFT, + nr_pages_left); + scif_init_window_iter(window, &src_win_iter); + for (i = 0; i < loop_nr_pages; i++) { + phys_addr = scif_off_to_dma_addr(window, loop_offset, + &contig_bytes, + &src_win_iter); + phys_addr = scif_get_phys(phys_addr, ep); + err = remap_pfn_range(vma, + vma->vm_start + + loop_offset - offset, + phys_addr >> PAGE_SHIFT, + PAGE_SIZE, + vma->vm_page_prot); + if (err) + goto error; + loop_offset += PAGE_SIZE; + } + nr_pages_left -= loop_nr_pages; + if (!nr_pages_left) + break; + } + /* + * No more failures expected. Bump up the ref count for all + * the windows. Another traversal from start_window required + * for handling errors encountered across windows during + * remap_pfn_range(..). 
+ */ + loop_offset = offset; + nr_pages_left = nr_pages; + window = start_window; + head = &ep->rma_info.remote_reg_list; + list_for_each_entry_from(window, head, list) { + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min_t(int, + (end_offset - loop_offset) >> PAGE_SHIFT, + nr_pages_left); + scif_get_window(window, loop_nr_pages); + nr_pages_left -= loop_nr_pages; + loop_offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages_left) + break; + } +error: + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} + +/* + * scif_rma_list_munmap: + * + * Traverse the remote registration list starting from window: + * 1) Decrement ref count. + * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer. + * RMA lock must be held. + */ +static void scif_rma_list_munmap(struct scif_window *start_window, + s64 offset, int nr_pages) +{ + struct scifmsg msg; + s64 loop_offset = offset, end_offset; + int loop_nr_pages, nr_pages_left = nr_pages; + struct scif_endpt *ep = (struct scif_endpt *)start_window->ep; + struct list_head *head = &ep->rma_info.remote_reg_list; + struct scif_window *window = start_window, *_window; + + msg.uop = SCIF_MUNMAP; + msg.src = ep->port; + loop_offset = offset; + nr_pages_left = nr_pages; + list_for_each_entry_safe_from(window, _window, head, list) { + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min_t(int, + (end_offset - loop_offset) >> PAGE_SHIFT, + nr_pages_left); + scif_put_window(window, loop_nr_pages); + if (!window->ref_count) { + struct scif_dev *rdev = ep->remote_dev; + + scif_drain_dma_intr(rdev->sdev, + ep->rma_info.dma_chan); + /* Inform the peer about this munmap */ + msg.payload[0] = window->peer_window; + /* No error handling for Notification messages. 
*/ + scif_nodeqp_send(ep->remote_dev, &msg); + list_del(&window->list); + /* Destroy this window from the peer's registered AS */ + scif_destroy_remote_window(window); + } + nr_pages_left -= loop_nr_pages; + loop_offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages_left) + break; + } +} + +/* + * The private data field of each VMA used to mmap a remote window + * points to an instance of struct vma_pvt + */ +struct vma_pvt { + struct scif_endpt *ep; /* End point for remote window */ + s64 offset; /* offset within remote window */ + bool valid_offset; /* offset is valid only if the original + * mmap request was for a single page + * else the offset within the vma is + * the correct offset + */ + struct kref ref; +}; + +static void vma_pvt_release(struct kref *ref) +{ + struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref); + + kfree(vmapvt); +} + +/** + * scif_vma_open - VMA open driver callback + * @vma: VMM memory area. + * The open method is called by the kernel to allow the subsystem implementing + * the VMA to initialize the area. This method is invoked any time a new + * reference to the VMA is made (when a process forks, for example). + * The one exception happens when the VMA is first created by mmap; + * in this case, the driver's mmap method is called instead. + * This function is also invoked when an existing VMA is split by the kernel + * due to a call to munmap on a subset of the VMA resulting in two VMAs. + * The kernel invokes this function only on one of the two VMAs. + */ +static void scif_vma_open(struct vm_area_struct *vma) +{ + struct vma_pvt *vmapvt = vma->vm_private_data; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n", + vma->vm_start, vma->vm_end); + scif_insert_vma(vmapvt->ep, vma); + kref_get(&vmapvt->ref); +} + +/** + * scif_munmap - VMA close driver callback. + * @vma: VMM memory area. + * When an area is destroyed, the kernel calls its close operation. 
+ * Note that there's no usage count associated with VMA's; the area + * is opened and closed exactly once by each process that uses it. + */ +static void scif_munmap(struct vm_area_struct *vma) +{ + struct scif_endpt *ep; + struct vma_pvt *vmapvt = vma->vm_private_data; + int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + s64 offset; + struct scif_rma_req req; + struct scif_window *window = NULL; + int err; + + might_sleep(); + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n", + vma->vm_start, vma->vm_end); + ep = vmapvt->ep; + offset = vmapvt->valid_offset ? vmapvt->offset : + (vma->vm_pgoff) << PAGE_SHIFT; + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n", + ep, nr_pages, offset); + req.out_window = &window; + req.offset = offset; + req.nr_bytes = vma->vm_end - vma->vm_start; + req.prot = vma->vm_flags & (VM_READ | VM_WRITE); + req.type = SCIF_WINDOW_PARTIAL; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + + err = scif_query_window(&req); + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + else + scif_rma_list_munmap(window, offset, nr_pages); + + mutex_unlock(&ep->rma_info.rma_lock); + /* + * The kernel probably zeroes these out but we still want + * to clean up our own mess just in case. + */ + vma->vm_ops = NULL; + vma->vm_private_data = NULL; + kref_put(&vmapvt->ref, vma_pvt_release); + scif_delete_vma(ep, vma); +} + +static const struct vm_operations_struct scif_vm_ops = { + .open = scif_vma_open, + .close = scif_munmap, +}; + +/** + * scif_mmap - Map pages in virtual address space to a remote window. + * @vma: VMM memory area. 
+ * @epd: endpoint descriptor + * + * Return: Upon successful completion, scif_mmap() returns zero + * else an apt error is returned as documented in scif.h + */ +int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd) +{ + struct scif_rma_req req; + struct scif_window *window = NULL; + struct scif_endpt *ep = (struct scif_endpt *)epd; + s64 start_offset = vma->vm_pgoff << PAGE_SHIFT; + int nr_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int err; + struct vma_pvt *vmapvt; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n", + ep, start_offset, nr_pages); + err = scif_verify_epd(ep); + if (err) + return err; + + might_sleep(); + + err = scif_insert_vma(ep, vma); + if (err) + return err; + + vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL); + if (!vmapvt) { + scif_delete_vma(ep, vma); + return -ENOMEM; + } + + vmapvt->ep = ep; + kref_init(&vmapvt->ref); + + req.out_window = &window; + req.offset = start_offset; + req.nr_bytes = vma->vm_end - vma->vm_start; + req.prot = vma->vm_flags & (VM_READ | VM_WRITE); + req.type = SCIF_WINDOW_PARTIAL; + req.head = &ep->rma_info.remote_reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? 
*/ + err = scif_query_window(&req); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unlock; + } + + /* Default prot for loopback */ + if (!scifdev_self(ep->remote_dev)) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + /* + * VM_DONTCOPY - Do not copy this vma on fork + * VM_DONTEXPAND - Cannot expand with mremap() + * VM_RESERVED - Count as reserved_vm like IO + * VM_PFNMAP - Page-ranges managed without "struct page" + * VM_IO - Memory mapped I/O or similar + * + * We do not want to copy this VMA automatically on a fork(), + * expand this VMA due to mremap() or swap out these pages since + * the VMA is actually backed by physical pages in the remote + * node's physical memory and not via a struct page. + */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; + + if (!scifdev_self(ep->remote_dev)) + vma->vm_flags |= VM_IO | VM_PFNMAP; + + /* Map this range of windows */ + err = scif_rma_list_mmap(window, start_offset, nr_pages, vma); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unlock; + } + /* Set up the driver call back */ + vma->vm_ops = &scif_vm_ops; + vma->vm_private_data = vmapvt; +error_unlock: + mutex_unlock(&ep->rma_info.rma_lock); + if (err) { + kfree(vmapvt); + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + scif_delete_vma(ep, vma); + } + return err; +} diff --git a/kernel/drivers/misc/mic/scif/scif_nm.c b/kernel/drivers/misc/mic/scif/scif_nm.c new file mode 100644 index 000000000..79f26a02a --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_nm.c @@ -0,0 +1,237 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "scif_peer_bus.h" + +#include "scif_main.h" +#include "scif_map.h" + +/** + * scif_invalidate_ep() - Set state for all connected endpoints + * to disconnected and wake up all send/recv waitqueues + */ +static void scif_invalidate_ep(int node) +{ + struct scif_endpt *ep; + struct list_head *pos, *tmpq; + + flush_work(&scif_info.conn_work); + mutex_lock(&scif_info.connlock); + list_for_each_safe(pos, tmpq, &scif_info.disconnected) { + ep = list_entry(pos, struct scif_endpt, list); + if (ep->remote_dev->node == node) { + scif_unmap_all_windows(ep); + spin_lock(&ep->lock); + scif_cleanup_ep_qp(ep); + spin_unlock(&ep->lock); + } + } + list_for_each_safe(pos, tmpq, &scif_info.connected) { + ep = list_entry(pos, struct scif_endpt, list); + if (ep->remote_dev->node == node) { + list_del(pos); + spin_lock(&ep->lock); + ep->state = SCIFEP_DISCONNECTED; + list_add_tail(&ep->list, &scif_info.disconnected); + scif_cleanup_ep_qp(ep); + wake_up_interruptible(&ep->sendwq); + wake_up_interruptible(&ep->recvwq); + spin_unlock(&ep->lock); + scif_unmap_all_windows(ep); + } + } + mutex_unlock(&scif_info.connlock); +} + +void scif_free_qp(struct scif_dev *scifdev) +{ + struct scif_qp *qp = scifdev->qpairs; + + if (!qp) + return; + scif_unmap_single(qp->local_buf, scifdev, qp->inbound_q.size); + kfree(qp->inbound_q.rb_base); + scif_unmap_single(qp->local_qp, scifdev, sizeof(struct scif_qp)); + kfree(scifdev->qpairs); + scifdev->qpairs = NULL; +} + +static void scif_cleanup_qp(struct scif_dev *dev) +{ + struct scif_qp *qp = &dev->qpairs[0]; + + if (!qp) + return; + scif_iounmap((void *)qp->remote_qp, sizeof(struct scif_qp), dev); + scif_iounmap((void *)qp->outbound_q.rb_base, + sizeof(struct 
scif_qp), dev); + qp->remote_qp = NULL; + qp->local_write = 0; + qp->inbound_q.current_write_offset = 0; + qp->inbound_q.current_read_offset = 0; + if (scifdev_is_p2p(dev)) + scif_free_qp(dev); +} + +void scif_send_acks(struct scif_dev *dev) +{ + struct scifmsg msg; + + if (dev->node_remove_ack_pending) { + msg.uop = SCIF_NODE_REMOVE_ACK; + msg.src.node = scif_info.nodeid; + msg.dst.node = SCIF_MGMT_NODE; + msg.payload[0] = dev->node; + scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], &msg); + dev->node_remove_ack_pending = false; + } + if (dev->exit_ack_pending) { + msg.uop = SCIF_EXIT_ACK; + msg.src.node = scif_info.nodeid; + msg.dst.node = dev->node; + scif_nodeqp_send(dev, &msg); + dev->exit_ack_pending = false; + } +} + +/* + * scif_cleanup_scifdev + * + * @dev: Remote SCIF device. + * Uninitialize SCIF data structures for remote SCIF device. + */ +void scif_cleanup_scifdev(struct scif_dev *dev) +{ + struct scif_hw_dev *sdev = dev->sdev; + + if (!dev->sdev) + return; + if (scifdev_is_p2p(dev)) { + if (dev->cookie) { + sdev->hw_ops->free_irq(sdev, dev->cookie, dev); + dev->cookie = NULL; + } + scif_destroy_intr_wq(dev); + } + flush_work(&scif_info.misc_work); + scif_destroy_p2p(dev); + scif_invalidate_ep(dev->node); + scif_zap_mmaps(dev->node); + scif_cleanup_rma_for_zombies(dev->node); + flush_work(&scif_info.misc_work); + scif_send_acks(dev); + if (!dev->node && scif_info.card_initiated_exit) { + /* + * Send an SCIF_EXIT message which is the last message from MIC + * to the Host and wait for a SCIF_EXIT_ACK + */ + scif_send_exit(dev); + scif_info.card_initiated_exit = false; + } + scif_cleanup_qp(dev); +} + +/* + * scif_remove_node: + * + * @node: Node to remove + */ +void scif_handle_remove_node(int node) +{ + struct scif_dev *scifdev = &scif_dev[node]; + + if (scif_peer_unregister_device(scifdev)) + scif_send_acks(scifdev); +} + +static int scif_send_rmnode_msg(int node, int remove_node) +{ + struct scifmsg notif_msg; + struct scif_dev *dev = &scif_dev[node]; + 
+ notif_msg.uop = SCIF_NODE_REMOVE; + notif_msg.src.node = scif_info.nodeid; + notif_msg.dst.node = node; + notif_msg.payload[0] = remove_node; + return scif_nodeqp_send(dev, ¬if_msg); +} + +/** + * scif_node_disconnect: + * + * @node_id[in]: source node id. + * @mgmt_initiated: Disconnection initiated from the mgmt node + * + * Disconnect a node from the scif network. + */ +void scif_disconnect_node(u32 node_id, bool mgmt_initiated) +{ + int ret; + int msg_cnt = 0; + u32 i = 0; + struct scif_dev *scifdev = &scif_dev[node_id]; + + if (!node_id) + return; + + atomic_set(&scifdev->disconn_rescnt, 0); + + /* Destroy p2p network */ + for (i = 1; i <= scif_info.maxid; i++) { + if (i == node_id) + continue; + ret = scif_send_rmnode_msg(i, node_id); + if (!ret) + msg_cnt++; + } + /* Wait for the remote nodes to respond with SCIF_NODE_REMOVE_ACK */ + ret = wait_event_timeout(scifdev->disconn_wq, + (atomic_read(&scifdev->disconn_rescnt) + == msg_cnt), SCIF_NODE_ALIVE_TIMEOUT); + /* Tell the card to clean up */ + if (mgmt_initiated && _scifdev_alive(scifdev)) + /* + * Send an SCIF_EXIT message which is the last message from Host + * to the MIC and wait for a SCIF_EXIT_ACK + */ + scif_send_exit(scifdev); + atomic_set(&scifdev->disconn_rescnt, 0); + /* Tell the mgmt node to clean up */ + ret = scif_send_rmnode_msg(SCIF_MGMT_NODE, node_id); + if (!ret) + /* Wait for mgmt node to respond with SCIF_NODE_REMOVE_ACK */ + wait_event_timeout(scifdev->disconn_wq, + (atomic_read(&scifdev->disconn_rescnt) == 1), + SCIF_NODE_ALIVE_TIMEOUT); +} + +void scif_get_node_info(void) +{ + struct scifmsg msg; + DECLARE_COMPLETION_ONSTACK(node_info); + + msg.uop = SCIF_GET_NODE_INFO; + msg.src.node = scif_info.nodeid; + msg.dst.node = SCIF_MGMT_NODE; + msg.payload[3] = (u64)&node_info; + + if ((scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], &msg))) + return; + + /* Wait for a response with SCIF_GET_NODE_INFO */ + wait_for_completion(&node_info); +} diff --git 
a/kernel/drivers/misc/mic/scif/scif_nodeqp.c b/kernel/drivers/misc/mic/scif/scif_nodeqp.c new file mode 100644 index 000000000..c66ca1a58 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_nodeqp.c @@ -0,0 +1,1354 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include "../bus/scif_bus.h" +#include "scif_peer_bus.h" +#include "scif_main.h" +#include "scif_nodeqp.h" +#include "scif_map.h" + +/* + ************************************************************************ + * SCIF node Queue Pair (QP) setup flow: + * + * 1) SCIF driver gets probed with a scif_hw_dev via the scif_hw_bus + * 2) scif_setup_qp(..) allocates the local qp and calls + * scif_setup_qp_connect(..) which allocates and maps the local + * buffer for the inbound QP + * 3) The local node updates the device page with the DMA address of the QP + * 4) A delayed work is scheduled (qp_dwork) which periodically reads if + * the peer node has updated its QP DMA address + * 5) Once a valid non zero address is found in the QP DMA address field + * in the device page, the local node maps the remote node's QP, + * updates its outbound QP and sends a SCIF_INIT message to the peer + * 6) The SCIF_INIT message is received by the peer node QP interrupt bottom + * half handler by calling scif_init(..) + * 7) scif_init(..) registers a new SCIF peer node by calling + * scif_peer_register_device(..) 
which signifies the addition of a new + * SCIF node + * 8) On the mgmt node, P2P network setup/teardown is initiated if all the + * remote nodes are online via scif_p2p_setup(..) + * 9) For P2P setup, the host maps the remote nodes' aperture and memory + * bars and sends a SCIF_NODE_ADD message to both nodes + * 10) As part of scif_nodeadd, both nodes set up their local inbound + * QPs and send a SCIF_NODE_ADD_ACK to the mgmt node + * 11) As part of scif_node_add_ack(..) the mgmt node forwards the + * SCIF_NODE_ADD_ACK to the remote nodes + * 12) As part of scif_node_add_ack(..) the remote nodes update their + * outbound QPs, make sure they can access memory on the remote node + * and then add a new SCIF peer node by calling + * scif_peer_register_device(..) which signifies the addition of a new + * SCIF node. + * 13) The SCIF network is now established across all nodes. + * + ************************************************************************ + * SCIF node QP teardown flow (initiated by non mgmt node): + * + * 1) SCIF driver gets a remove callback with a scif_hw_dev via the scif_hw_bus + * 2) The device page QP DMA address field is updated with 0x0 + * 3) A non mgmt node now cleans up all local data structures and sends a + * SCIF_EXIT message to the peer and waits for a SCIF_EXIT_ACK + * 4) As part of scif_exit(..) handling scif_disconnect_node(..) is called + * 5) scif_disconnect_node(..) sends a SCIF_NODE_REMOVE message to all the + * peers and waits for a SCIF_NODE_REMOVE_ACK + * 6) As part of scif_node_remove(..) 
a remote node unregisters the peer + * node from the SCIF network and sends a SCIF_NODE_REMOVE_ACK + * 7) When the mgmt node has received all the SCIF_NODE_REMOVE_ACKs + * it sends itself a node remove message whose handling cleans up local + * data structures and unregisters the peer node from the SCIF network + * 8) The mgmt node sends a SCIF_EXIT_ACK + * 9) Upon receipt of the SCIF_EXIT_ACK the node initiating the teardown + * completes the SCIF remove routine + * 10) The SCIF network is now torn down for the node initiating the + * teardown sequence + * + ************************************************************************ + * SCIF node QP teardown flow (initiated by mgmt node): + * + * 1) SCIF driver gets a remove callback with a scif_hw_dev via the scif_hw_bus + * 2) The device page QP DMA address field is updated with 0x0 + * 3) The mgmt node calls scif_disconnect_node(..) + * 4) scif_disconnect_node(..) sends a SCIF_NODE_REMOVE message to all the peers + * and waits for a SCIF_NODE_REMOVE_ACK + * 5) As part of scif_node_remove(..) a remote node unregisters the peer + * node from the SCIF network and sends a SCIF_NODE_REMOVE_ACK + * 6) When the mgmt node has received all the SCIF_NODE_REMOVE_ACKs + * it unregisters the peer node from the SCIF network + * 7) The mgmt node sends a SCIF_EXIT message and waits for a SCIF_EXIT_ACK. + * 8) A non mgmt node upon receipt of a SCIF_EXIT message calls scif_stop(..) + * which would clean up local data structures for all SCIF nodes and + * then send a SCIF_EXIT_ACK back to the mgmt node + * 9) Upon receipt of the SCIF_EXIT_ACK the mgmt node sends itself a node + * remove message whose handling cleans up local data structures and + * destroys any P2P mappings. + * 10) The SCIF hardware device for which a remove callback was received is now + * disconnected from the SCIF network. + */ +/* + * Initializes "local" data structures for the QP. Allocates the QP + * ring buffer (rb) and initializes the "in bound" queue. 
+ */ +int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset, + int local_size, struct scif_dev *scifdev) +{ + void *local_q = qp->inbound_q.rb_base; + int err = 0; + u32 tmp_rd = 0; + + spin_lock_init(&qp->send_lock); + spin_lock_init(&qp->recv_lock); + + /* Allocate rb only if not already allocated */ + if (!local_q) { + local_q = kzalloc(local_size, GFP_KERNEL); + if (!local_q) { + err = -ENOMEM; + return err; + } + } + + err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size); + if (err) + goto kfree; + /* + * To setup the inbound_q, the buffer lives locally, the read pointer + * is remote and the write pointer is local. + */ + scif_rb_init(&qp->inbound_q, + &tmp_rd, + &qp->local_write, + local_q, get_count_order(local_size)); + /* + * The read pointer is NULL initially and it is unsafe to use the ring + * buffer til this changes! + */ + qp->inbound_q.read_ptr = NULL; + err = scif_map_single(qp_offset, qp, + scifdev, sizeof(struct scif_qp)); + if (err) + goto unmap; + qp->local_qp = *qp_offset; + return err; +unmap: + scif_unmap_single(qp->local_buf, scifdev, local_size); + qp->local_buf = 0; +kfree: + kfree(local_q); + return err; +} + +/* When the other side has already done it's allocation, this is called */ +int scif_setup_qp_accept(struct scif_qp *qp, dma_addr_t *qp_offset, + dma_addr_t phys, int local_size, + struct scif_dev *scifdev) +{ + void *local_q; + void *remote_q; + struct scif_qp *remote_qp; + int remote_size; + int err = 0; + + spin_lock_init(&qp->send_lock); + spin_lock_init(&qp->recv_lock); + /* Start by figuring out where we need to point */ + remote_qp = scif_ioremap(phys, sizeof(struct scif_qp), scifdev); + if (!remote_qp) + return -EIO; + qp->remote_qp = remote_qp; + if (qp->remote_qp->magic != SCIFEP_MAGIC) { + err = -EIO; + goto iounmap; + } + qp->remote_buf = remote_qp->local_buf; + remote_size = qp->remote_qp->inbound_q.size; + remote_q = scif_ioremap(qp->remote_buf, remote_size, scifdev); + if (!remote_q) { + 
err = -EIO; + goto iounmap; + } + qp->remote_qp->local_write = 0; + /* + * To setup the outbound_q, the buffer lives in remote memory, + * the read pointer is local, the write pointer is remote + */ + scif_rb_init(&qp->outbound_q, + &qp->local_read, + &qp->remote_qp->local_write, + remote_q, + get_count_order(remote_size)); + local_q = kzalloc(local_size, GFP_KERNEL); + if (!local_q) { + err = -ENOMEM; + goto iounmap_1; + } + err = scif_map_single(&qp->local_buf, local_q, scifdev, local_size); + if (err) + goto kfree; + qp->remote_qp->local_read = 0; + /* + * To setup the inbound_q, the buffer lives locally, the read pointer + * is remote and the write pointer is local + */ + scif_rb_init(&qp->inbound_q, + &qp->remote_qp->local_read, + &qp->local_write, + local_q, get_count_order(local_size)); + err = scif_map_single(qp_offset, qp, scifdev, + sizeof(struct scif_qp)); + if (err) + goto unmap; + qp->local_qp = *qp_offset; + return err; +unmap: + scif_unmap_single(qp->local_buf, scifdev, local_size); + qp->local_buf = 0; +kfree: + kfree(local_q); +iounmap_1: + scif_iounmap(remote_q, remote_size, scifdev); + qp->outbound_q.rb_base = NULL; +iounmap: + scif_iounmap(qp->remote_qp, sizeof(struct scif_qp), scifdev); + qp->remote_qp = NULL; + return err; +}

/**
 * scif_setup_qp_connect_response() - Attach a local QP to the peer's QP
 * @scifdev: SCIF device the queue pair belongs to
 * @qp: Local queue pair to complete
 * @payload: Address of the peer's struct scif_qp, passed to scif_ioremap()
 *
 * Maps the peer queue pair, checks its magic, maps the peer's inbound ring
 * and initializes the local outbound ring on top of it. The inbound ring is
 * then re-initialized so that its read pointer lives in the peer's QP.
 *
 * Return: 0 on success, -ENOMEM if the peer QP cannot be mapped, -ENODEV on a
 * magic mismatch, -EIO if the peer ring buffer cannot be mapped.
 */
int scif_setup_qp_connect_response(struct scif_dev *scifdev,
				   struct scif_qp *qp, u64 payload)
{
	void *r_buf;
	int remote_size;
	phys_addr_t tmp_phys;

	qp->remote_qp = scif_ioremap(payload, sizeof(struct scif_qp), scifdev);
	if (!qp->remote_qp)
		return -ENOMEM;

	if (qp->remote_qp->magic != SCIFEP_MAGIC) {
		dev_err(&scifdev->sdev->dev,
			"SCIFEP_MAGIC mismatch between self %d remote %d\n",
			scif_dev[scif_info.nodeid].node, scifdev->node);
		return -ENODEV;
	}

	tmp_phys = qp->remote_qp->local_buf;
	remote_size = qp->remote_qp->inbound_q.size;
	r_buf = scif_ioremap(tmp_phys, remote_size, scifdev);
	if (!r_buf)
		return -EIO;

	qp->local_read = 0;
	scif_rb_init(&qp->outbound_q,
		     &qp->local_read,
		     &qp->remote_qp->local_write,
		     r_buf,
		     get_count_order(remote_size));
	/*
	 * Because the node QP may already be processing an INIT message, set
	 * the read pointer so the cached read offset isn't lost
	 */
	qp->remote_qp->local_read = qp->inbound_q.current_read_offset;
	/*
	 * Set up the inbound_q again now that we know where the
	 * inbound read pointer really is.
	 */
	scif_rb_init(&qp->inbound_q,
		     &qp->remote_qp->local_read,
		     &qp->local_write,
		     qp->inbound_q.rb_base,
		     get_count_order(qp->inbound_q.size));
	return 0;
}

/*
 * Ring the peer's doorbell (scifdev->rdb) through the hardware ops; p2p
 * devices use the p2p variant, which also takes the peer MMIO descriptor.
 */
static __always_inline void
scif_send_msg_intr(struct scif_dev *scifdev)
{
	struct scif_hw_dev *sdev = scifdev->sdev;

	if (scifdev_is_p2p(scifdev))
		sdev->hw_ops->send_p2p_intr(sdev, scifdev->rdb, &scifdev->mmio);
	else
		sdev->hw_ops->send_intr(sdev, scifdev->rdb);
}

/**
 * scif_qp_response() - Finish QP setup and announce it with SCIF_INIT
 * @phys: Address of the peer's queue pair
 * @scifdev: SCIF device whose queue pair is being completed
 *
 * Return: 0 on success, otherwise the error from
 * scif_setup_qp_connect_response() or scif_nodeqp_send().
 */
int scif_qp_response(phys_addr_t phys, struct scif_dev *scifdev)
{
	int err;
	struct scifmsg msg;

	err = scif_setup_qp_connect_response(scifdev, scifdev->qpairs, phys);
	if (err)
		return err;
	/*
	 * Now that everything is setup and mapped, we're ready
	 * to tell the peer about our queue's location
	 */
	msg.uop = SCIF_INIT;
	msg.dst.node = scifdev->node;
	return scif_nodeqp_send(scifdev, &msg);
}

/**
 * scif_send_exit() - Send SCIF_EXIT to a node and wait for its acknowledgment
 * @scifdev: SCIF device to notify
 *
 * Marks the exit as in progress, sends SCIF_EXIT and, if the send succeeded,
 * waits up to SCIF_NODE_ALIVE_TIMEOUT for scifdev->exit to become
 * OP_COMPLETED. The exit state is reset to OP_IDLE in every case.
 */
void scif_send_exit(struct scif_dev *scifdev)
{
	struct scifmsg msg;

	scifdev->exit = OP_IN_PROGRESS;
	msg.uop = SCIF_EXIT;
	msg.src.node = scif_info.nodeid;
	msg.dst.node = scifdev->node;
	if (!scif_nodeqp_send(scifdev, &msg))
		/* Wait for a SCIF_EXIT_ACK message */
		wait_event_timeout(scif_info.exitwq,
				   scifdev->exit == OP_COMPLETED,
				   SCIF_NODE_ALIVE_TIMEOUT);
	scifdev->exit = OP_IDLE;
}

/**
 * scif_setup_qp() - Allocate and set up the node queue pair of a device
 * @scifdev: SCIF device to set up
 *
 * Return: 0 on success, -ENOMEM if the queue pair cannot be allocated, or
 * the error from scif_setup_qp_connect(). On failure scifdev->qpairs is NULL.
 */
int scif_setup_qp(struct scif_dev *scifdev)
{
	int err;
	struct scif_qp *qp;

	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
	if (!qp)
		return -ENOMEM;

	qp->magic = SCIFEP_MAGIC;
	scifdev->qpairs = qp;
	err = scif_setup_qp_connect(qp, &scifdev->qp_dma_addr,
				    SCIF_NODE_QP_SIZE, scifdev);
	if (err) {
		kfree(scifdev->qpairs);
		scifdev->qpairs = NULL;
		return err;
	}
	/*
	 * We're as setup as we can be. The inbound_q is setup, w/o a usable
	 * outbound q. When we get a message, the read_ptr will be updated,
	 * and we will pull the message.
	 */
	return 0;
}

/* Free a scatterlist allocated by scif_p2p_setsg(); NULL is accepted. */
static void scif_p2p_freesg(struct scatterlist *sg)
{
	kfree(sg);
}

/*
 * Build a scatterlist of @page_cnt entries, each @page_size bytes long,
 * covering the contiguous range that starts at @pa.
 * Returns NULL if the table cannot be allocated.
 */
static struct scatterlist *
scif_p2p_setsg(phys_addr_t pa, int page_size, int page_cnt)
{
	struct scatterlist *sg;
	struct page *page;
	int i;

	sg = kcalloc(page_cnt, sizeof(struct scatterlist), GFP_KERNEL);
	if (!sg)
		return NULL;
	sg_init_table(sg, page_cnt);
	for (i = 0; i < page_cnt; i++) {
		page = pfn_to_page(pa >> PAGE_SHIFT);
		sg_set_page(&sg[i], page, page_size, 0);
		pa += page_size;
	}
	return sg;
}

/*
 * Init p2p mappings required to access peerdev from scifdev.
 *
 * Builds scatterlists for the peer's MMIO and aperture ranges and DMA maps
 * both for scifdev. Returns the new descriptor, or NULL if any allocation
 * or mapping fails (everything acquired so far is released).
 */
static struct scif_p2p_info *
scif_init_p2p_info(struct scif_dev *scifdev, struct scif_dev *peerdev)
{
	struct scif_p2p_info *p2p;
	int num_mmio_pages, num_aper_pages, sg_page_shift, err, num_aper_chunks;
	struct scif_hw_dev *psdev = peerdev->sdev;
	struct scif_hw_dev *sdev = scifdev->sdev;

	num_mmio_pages = psdev->mmio->len >> PAGE_SHIFT;
	num_aper_pages = psdev->aper->len >> PAGE_SHIFT;

	p2p = kzalloc(sizeof(*p2p), GFP_KERNEL);
	if (!p2p)
		return NULL;
	p2p->ppi_sg[SCIF_PPI_MMIO] = scif_p2p_setsg(psdev->mmio->pa,
						    PAGE_SIZE, num_mmio_pages);
	if (!p2p->ppi_sg[SCIF_PPI_MMIO])
		goto free_p2p;
	p2p->sg_nentries[SCIF_PPI_MMIO] = num_mmio_pages;
	/*
	 * NOTE(review): the get_order() result is used as a byte shift below;
	 * this assumes it is never smaller than PAGE_SHIFT - confirm for
	 * small apertures.
	 */
	sg_page_shift = get_order(min(psdev->aper->len, (u64)(1 << 30)));
	num_aper_chunks = num_aper_pages >> (sg_page_shift - PAGE_SHIFT);
	p2p->ppi_sg[SCIF_PPI_APER] = scif_p2p_setsg(psdev->aper->pa,
						    1 << sg_page_shift,
						    num_aper_chunks);
	/* Do not hand a NULL scatterlist to dma_map_sg() below */
	if (!p2p->ppi_sg[SCIF_PPI_APER])
		goto scif_p2p_free;
	p2p->sg_nentries[SCIF_PPI_APER] = num_aper_chunks;
	err = dma_map_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO],
			 num_mmio_pages, PCI_DMA_BIDIRECTIONAL);
	if (err != num_mmio_pages)
		goto scif_p2p_free;
	err = dma_map_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_APER],
			 num_aper_chunks, PCI_DMA_BIDIRECTIONAL);
	if (err != num_aper_chunks)
		goto dma_unmap;
	p2p->ppi_da[SCIF_PPI_MMIO] = sg_dma_address(p2p->ppi_sg[SCIF_PPI_MMIO]);
	p2p->ppi_da[SCIF_PPI_APER] = sg_dma_address(p2p->ppi_sg[SCIF_PPI_APER]);
	p2p->ppi_len[SCIF_PPI_MMIO] = num_mmio_pages;
	p2p->ppi_len[SCIF_PPI_APER] = num_aper_pages;
	p2p->ppi_peer_id = peerdev->node;
	return p2p;
dma_unmap:
	dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO],
		     p2p->sg_nentries[SCIF_PPI_MMIO], DMA_BIDIRECTIONAL);
scif_p2p_free:
	/* kfree(NULL) is a no-op, so the aperture table may still be unset */
	scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]);
	scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]);
free_p2p:
	kfree(p2p);
	return NULL;
}

/* Uninitialize and release resources from a p2p mapping */
static void scif_deinit_p2p_info(struct scif_dev *scifdev,
				 struct scif_p2p_info *p2p)
{
	struct scif_hw_dev *sdev = scifdev->sdev;

	dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO],
		     p2p->sg_nentries[SCIF_PPI_MMIO], DMA_BIDIRECTIONAL);
	dma_unmap_sg(&sdev->dev, p2p->ppi_sg[SCIF_PPI_APER],
		     p2p->sg_nentries[SCIF_PPI_APER], DMA_BIDIRECTIONAL);
	scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]);
	scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]);
	kfree(p2p);
}

+/** + * scif_node_connect: Respond to SCIF_NODE_CONNECT interrupt message + * @dst: Destination node + * + * Connect the src and dst node by setting up the p2p connection + * between them. Management node here acts like a proxy. 
+ */ +static void scif_node_connect(struct scif_dev *scifdev, int dst) +{ + struct scif_dev *dev_j = scifdev; + struct scif_dev *dev_i = NULL; + struct scif_p2p_info *p2p_ij = NULL; /* bus addr for j from i */ + struct scif_p2p_info *p2p_ji = NULL; /* bus addr for i from j */ + struct scif_p2p_info *p2p; + struct list_head *pos, *tmp; + struct scifmsg msg; + int err; + u64 tmppayload; + + if (dst < 1 || dst > scif_info.maxid) + return; + + dev_i = &scif_dev[dst]; + + if (!_scifdev_alive(dev_i)) + return; + /* + * If the p2p connection is already setup or in the process of setting + * up then just ignore this request. The requested node will get + * informed by SCIF_NODE_ADD_ACK or SCIF_NODE_ADD_NACK + */ + if (!list_empty(&dev_i->p2p)) { + list_for_each_safe(pos, tmp, &dev_i->p2p) { + p2p = list_entry(pos, struct scif_p2p_info, ppi_list); + if (p2p->ppi_peer_id == dev_j->node) + return; + } + } + p2p_ij = scif_init_p2p_info(dev_i, dev_j); + if (!p2p_ij) + return; + p2p_ji = scif_init_p2p_info(dev_j, dev_i); + if (!p2p_ji) { + scif_deinit_p2p_info(dev_i, p2p_ij); + return; + } + list_add_tail(&p2p_ij->ppi_list, &dev_i->p2p); + list_add_tail(&p2p_ji->ppi_list, &dev_j->p2p); + + /* + * Send a SCIF_NODE_ADD to dev_i, pass it its bus address + * as seen from dev_j + */ + msg.uop = SCIF_NODE_ADD; + msg.src.node = dev_j->node; + msg.dst.node = dev_i->node; + + msg.payload[0] = p2p_ji->ppi_da[SCIF_PPI_APER]; + msg.payload[1] = p2p_ij->ppi_da[SCIF_PPI_MMIO]; + msg.payload[2] = p2p_ij->ppi_da[SCIF_PPI_APER]; + msg.payload[3] = p2p_ij->ppi_len[SCIF_PPI_APER] << PAGE_SHIFT; + + err = scif_nodeqp_send(dev_i, &msg); + if (err) { + dev_err(&scifdev->sdev->dev, + "%s %d error %d\n", __func__, __LINE__, err); + return; + } + + /* Same as above but to dev_j */ + msg.uop = SCIF_NODE_ADD; + msg.src.node = dev_i->node; + msg.dst.node = dev_j->node; + + tmppayload = msg.payload[0]; + msg.payload[0] = msg.payload[2]; + msg.payload[2] = tmppayload; + msg.payload[1] = 
p2p_ji->ppi_da[SCIF_PPI_MMIO]; + msg.payload[3] = p2p_ji->ppi_len[SCIF_PPI_APER] << PAGE_SHIFT; + + scif_nodeqp_send(dev_j, &msg); +} + +static void scif_p2p_setup(void) +{ + int i, j; + + if (!scif_info.p2p_enable) + return; + + for (i = 1; i <= scif_info.maxid; i++) + if (!_scifdev_alive(&scif_dev[i])) + return; + + for (i = 1; i <= scif_info.maxid; i++) { + for (j = 1; j <= scif_info.maxid; j++) { + struct scif_dev *scifdev = &scif_dev[i]; + + if (i == j) + continue; + scif_node_connect(scifdev, j); + } + } +} + +static char *message_types[] = {"BAD", + "INIT", + "EXIT", + "SCIF_EXIT_ACK", + "SCIF_NODE_ADD", + "SCIF_NODE_ADD_ACK", + "SCIF_NODE_ADD_NACK", + "REMOVE_NODE", + "REMOVE_NODE_ACK", + "CNCT_REQ", + "CNCT_GNT", + "CNCT_GNTACK", + "CNCT_GNTNACK", + "CNCT_REJ", + "DISCNCT", + "DISCNT_ACK", + "CLIENT_SENT", + "CLIENT_RCVD", + "SCIF_GET_NODE_INFO", + "REGISTER", + "REGISTER_ACK", + "REGISTER_NACK", + "UNREGISTER", + "UNREGISTER_ACK", + "UNREGISTER_NACK", + "ALLOC_REQ", + "ALLOC_GNT", + "ALLOC_REJ", + "FREE_PHYS", + "FREE_VIRT", + "MUNMAP", + "MARK", + "MARK_ACK", + "MARK_NACK", + "WAIT", + "WAIT_ACK", + "WAIT_NACK", + "SIGNAL_LOCAL", + "SIGNAL_REMOTE", + "SIG_ACK", + "SIG_NACK"}; + +static void +scif_display_message(struct scif_dev *scifdev, struct scifmsg *msg, + const char *label) +{ + if (!scif_info.en_msg_log) + return; + if (msg->uop > SCIF_MAX_MSG) { + dev_err(&scifdev->sdev->dev, + "%s: unknown msg type %d\n", label, msg->uop); + return; + } + dev_info(&scifdev->sdev->dev, + "%s: msg type %s, src %d:%d, dest %d:%d payload 0x%llx:0x%llx:0x%llx:0x%llx\n", + label, message_types[msg->uop], msg->src.node, msg->src.port, + msg->dst.node, msg->dst.port, msg->payload[0], msg->payload[1], + msg->payload[2], msg->payload[3]); +} + +int _scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_qp *qp = scifdev->qpairs; + int err = -ENOMEM, loop_cnt = 0; + + scif_display_message(scifdev, msg, "Sent"); + if (!qp) { + err = -EINVAL; + goto 
error; + } + spin_lock(&qp->send_lock); + + while ((err = scif_rb_write(&qp->outbound_q, + msg, sizeof(struct scifmsg)))) { + mdelay(1); +#define SCIF_NODEQP_SEND_TO_MSEC (3 * 1000) + if (loop_cnt++ > (SCIF_NODEQP_SEND_TO_MSEC)) { + err = -ENODEV; + break; + } + } + if (!err) + scif_rb_commit(&qp->outbound_q); + spin_unlock(&qp->send_lock); + if (!err) { + if (scifdev_self(scifdev)) + /* + * For loopback we need to emulate an interrupt by + * queuing work for the queue handling real node + * Qp interrupts. + */ + queue_work(scifdev->intr_wq, &scifdev->intr_bh); + else + scif_send_msg_intr(scifdev); + } +error: + if (err) + dev_dbg(&scifdev->sdev->dev, + "%s %d error %d uop %d\n", + __func__, __LINE__, err, msg->uop); + return err; +} + +/** + * scif_nodeqp_send - Send a message on the node queue pair + * @scifdev: Scif Device. + * @msg: The message to be sent. + */ +int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg) +{ + int err; + struct device *spdev = NULL; + + if (msg->uop > SCIF_EXIT_ACK) { + /* Dont send messages once the exit flow has begun */ + if (OP_IDLE != scifdev->exit) + return -ENODEV; + spdev = scif_get_peer_dev(scifdev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + return err; + } + } + err = _scif_nodeqp_send(scifdev, msg); + if (msg->uop > SCIF_EXIT_ACK) + scif_put_peer_dev(spdev); + return err; +} + +/* + * scif_misc_handler: + * + * Work queue handler for servicing miscellaneous SCIF tasks. + * Examples include: + * 1) Remote fence requests. + * 2) Destruction of temporary registered windows + * created during scif_vreadfrom()/scif_vwriteto(). + * 3) Cleanup of zombie endpoints. 
+ */ +void scif_misc_handler(struct work_struct *work) +{ + scif_rma_handle_remote_fences(); + scif_rma_destroy_windows(); + scif_rma_destroy_tcw_invalid(); + scif_cleanup_zombie_epd(); +} + +/** + * scif_init() - Respond to SCIF_INIT interrupt message + * @scifdev: Remote SCIF device node + * @msg: Interrupt message + */ +static __always_inline void +scif_init(struct scif_dev *scifdev, struct scifmsg *msg) +{ + /* + * Allow the thread waiting for device page updates for the peer QP DMA + * address to complete initializing the inbound_q. + */ + flush_delayed_work(&scifdev->qp_dwork); + + scif_peer_register_device(scifdev); + + if (scif_is_mgmt_node()) { + mutex_lock(&scif_info.conflock); + scif_p2p_setup(); + mutex_unlock(&scif_info.conflock); + } +} + +/** + * scif_exit() - Respond to SCIF_EXIT interrupt message + * @scifdev: Remote SCIF device node + * @msg: Interrupt message + * + * This function stops the SCIF interface for the node which sent + * the SCIF_EXIT message and starts waiting for that node to + * resetup the queue pair again. + */ +static __always_inline void +scif_exit(struct scif_dev *scifdev, struct scifmsg *unused) +{ + scifdev->exit_ack_pending = true; + if (scif_is_mgmt_node()) + scif_disconnect_node(scifdev->node, false); + else + scif_stop(scifdev); + schedule_delayed_work(&scifdev->qp_dwork, + msecs_to_jiffies(1000)); +} + +/** + * scif_exitack() - Respond to SCIF_EXIT_ACK interrupt message + * @scifdev: Remote SCIF device node + * @msg: Interrupt message + * + */ +static __always_inline void +scif_exit_ack(struct scif_dev *scifdev, struct scifmsg *unused) +{ + scifdev->exit = OP_COMPLETED; + wake_up(&scif_info.exitwq); +} + +/** + * scif_node_add() - Respond to SCIF_NODE_ADD interrupt message + * @scifdev: Remote SCIF device node + * @msg: Interrupt message + * + * When the mgmt node driver has finished initializing a MIC node queue pair it + * marks the node as online. 
It then looks for all currently online MIC cards + * and send a SCIF_NODE_ADD message to identify the ID of the new card for + * peer to peer initialization + * + * The local node allocates its incoming queue and sends its address in the + * SCIF_NODE_ADD_ACK message back to the mgmt node, the mgmt node "reflects" + * this message to the new node + */ +static __always_inline void +scif_node_add(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_dev *newdev; + dma_addr_t qp_offset; + int qp_connect; + struct scif_hw_dev *sdev; + + dev_dbg(&scifdev->sdev->dev, + "Scifdev %d:%d received NODE_ADD msg for node %d\n", + scifdev->node, msg->dst.node, msg->src.node); + dev_dbg(&scifdev->sdev->dev, + "Remote address for this node's aperture %llx\n", + msg->payload[0]); + newdev = &scif_dev[msg->src.node]; + newdev->node = msg->src.node; + newdev->sdev = scif_dev[SCIF_MGMT_NODE].sdev; + sdev = newdev->sdev; + + if (scif_setup_intr_wq(newdev)) { + dev_err(&scifdev->sdev->dev, + "failed to setup interrupts for %d\n", msg->src.node); + goto interrupt_setup_error; + } + newdev->mmio.va = ioremap_nocache(msg->payload[1], sdev->mmio->len); + if (!newdev->mmio.va) { + dev_err(&scifdev->sdev->dev, + "failed to map mmio for %d\n", msg->src.node); + goto mmio_map_error; + } + newdev->qpairs = kzalloc(sizeof(*newdev->qpairs), GFP_KERNEL); + if (!newdev->qpairs) + goto qp_alloc_error; + /* + * Set the base address of the remote node's memory since it gets + * added to qp_offset + */ + newdev->base_addr = msg->payload[0]; + + qp_connect = scif_setup_qp_connect(newdev->qpairs, &qp_offset, + SCIF_NODE_QP_SIZE, newdev); + if (qp_connect) { + dev_err(&scifdev->sdev->dev, + "failed to setup qp_connect %d\n", qp_connect); + goto qp_connect_error; + } + + newdev->db = sdev->hw_ops->next_db(sdev); + newdev->cookie = sdev->hw_ops->request_irq(sdev, scif_intr_handler, + "SCIF_INTR", newdev, + newdev->db); + if (IS_ERR(newdev->cookie)) + goto qp_connect_error; + newdev->qpairs->magic = 
SCIFEP_MAGIC; + newdev->qpairs->qp_state = SCIF_QP_OFFLINE; + + msg->uop = SCIF_NODE_ADD_ACK; + msg->dst.node = msg->src.node; + msg->src.node = scif_info.nodeid; + msg->payload[0] = qp_offset; + msg->payload[2] = newdev->db; + scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], msg); + return; +qp_connect_error: + kfree(newdev->qpairs); + newdev->qpairs = NULL; +qp_alloc_error: + iounmap(newdev->mmio.va); + newdev->mmio.va = NULL; +mmio_map_error: +interrupt_setup_error: + dev_err(&scifdev->sdev->dev, + "node add failed for node %d\n", msg->src.node); + msg->uop = SCIF_NODE_ADD_NACK; + msg->dst.node = msg->src.node; + msg->src.node = scif_info.nodeid; + scif_nodeqp_send(&scif_dev[SCIF_MGMT_NODE], msg); +} + +void scif_poll_qp_state(struct work_struct *work) +{ +#define SCIF_NODE_QP_RETRY 100 +#define SCIF_NODE_QP_TIMEOUT 100 + struct scif_dev *peerdev = container_of(work, struct scif_dev, + p2p_dwork.work); + struct scif_qp *qp = &peerdev->qpairs[0]; + + if (qp->qp_state != SCIF_QP_ONLINE || + qp->remote_qp->qp_state != SCIF_QP_ONLINE) { + if (peerdev->p2p_retry++ == SCIF_NODE_QP_RETRY) { + dev_err(&peerdev->sdev->dev, + "Warning: QP check timeout with state %d\n", + qp->qp_state); + goto timeout; + } + schedule_delayed_work(&peerdev->p2p_dwork, + msecs_to_jiffies(SCIF_NODE_QP_TIMEOUT)); + return; + } + return; +timeout: + dev_err(&peerdev->sdev->dev, + "%s %d remote node %d offline, state = 0x%x\n", + __func__, __LINE__, peerdev->node, qp->qp_state); + qp->remote_qp->qp_state = SCIF_QP_OFFLINE; + scif_peer_unregister_device(peerdev); + scif_cleanup_scifdev(peerdev); +} + +/** + * scif_node_add_ack() - Respond to SCIF_NODE_ADD_ACK interrupt message + * @scifdev: Remote SCIF device node + * @msg: Interrupt message + * + * After a MIC node receives the SCIF_NODE_ADD_ACK message it send this + * message to the mgmt node to confirm the sequence is finished. 
+ * + */
static __always_inline void
scif_node_add_ack(struct scif_dev *scifdev, struct scifmsg *msg)
{
	struct scif_dev *peerdev;
	struct scif_qp *qp;

	dev_dbg(&scifdev->sdev->dev,
		"Scifdev %d received SCIF_NODE_ADD_ACK msg src %d dst %d\n",
		scifdev->node, msg->src.node, msg->dst.node);
	dev_dbg(&scifdev->sdev->dev,
		"payload %llx %llx %llx %llx\n", msg->payload[0],
		msg->payload[1], msg->payload[2], msg->payload[3]);

	if (scif_is_mgmt_node()) {
		struct scif_dev *dst_dev = &scif_dev[msg->dst.node];

		/*
		 * the lock serializes with scif_qp_response_ack. The mgmt node
		 * is forwarding the NODE_ADD_ACK message from src to dst we
		 * need to make sure that the dst has already received a
		 * NODE_ADD for src and setup its end of the qp to dst
		 */
		mutex_lock(&scif_info.conflock);
		msg->payload[1] = scif_info.maxid;
		scif_nodeqp_send(dst_dev, msg);
		mutex_unlock(&scif_info.conflock);
		return;
	}

	/* Not the mgmt node: the ACK describes the peer's end of the QP */
	peerdev = &scif_dev[msg->src.node];
	peerdev->sdev = scif_dev[SCIF_MGMT_NODE].sdev;
	peerdev->node = msg->src.node;

	qp = peerdev->qpairs;

	if (scif_setup_qp_connect_response(peerdev, qp, msg->payload[0])) {
		scif_cleanup_scifdev(peerdev);
		return;
	}

	peerdev->rdb = msg->payload[2];
	qp->remote_qp->qp_state = SCIF_QP_ONLINE;

	scif_peer_register_device(peerdev);

	/* Start polling right away until both ends of the QP are online */
	schedule_delayed_work(&peerdev->p2p_dwork, 0);
}

+/** + * scif_node_add_nack: Respond to SCIF_NODE_ADD_NACK interrupt message + * @msg: Interrupt message + * + * SCIF_NODE_ADD failed, so inform the waiting wq. 
+ */ +static __always_inline void +scif_node_add_nack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + if (scif_is_mgmt_node()) { + struct scif_dev *dst_dev = &scif_dev[msg->dst.node]; + + dev_dbg(&scifdev->sdev->dev, + "SCIF_NODE_ADD_NACK received from %d\n", scifdev->node); + scif_nodeqp_send(dst_dev, msg); + } +} + +/* + * scif_node_remove: Handle SCIF_NODE_REMOVE message + * @msg: Interrupt message + * + * Handle node removal. + */ +static __always_inline void +scif_node_remove(struct scif_dev *scifdev, struct scifmsg *msg) +{ + int node = msg->payload[0]; + struct scif_dev *scdev = &scif_dev[node]; + + scdev->node_remove_ack_pending = true; + scif_handle_remove_node(node); +} + +/* + * scif_node_remove_ack: Handle SCIF_NODE_REMOVE_ACK message + * @msg: Interrupt message + * + * The peer has acked a SCIF_NODE_REMOVE message. + */ +static __always_inline void +scif_node_remove_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_dev *sdev = &scif_dev[msg->payload[0]]; + + atomic_inc(&sdev->disconn_rescnt); + wake_up(&sdev->disconn_wq); +} + +/** + * scif_get_node_info: Respond to SCIF_GET_NODE_INFO interrupt message + * @msg: Interrupt message + * + * Retrieve node info i.e maxid and total from the mgmt node. + */ +static __always_inline void +scif_get_node_info_resp(struct scif_dev *scifdev, struct scifmsg *msg) +{ + if (scif_is_mgmt_node()) { + swap(msg->dst.node, msg->src.node); + mutex_lock(&scif_info.conflock); + msg->payload[1] = scif_info.maxid; + msg->payload[2] = scif_info.total; + mutex_unlock(&scif_info.conflock); + scif_nodeqp_send(scifdev, msg); + } else { + struct completion *node_info = + (struct completion *)msg->payload[3]; + + mutex_lock(&scif_info.conflock); + scif_info.maxid = msg->payload[1]; + scif_info.total = msg->payload[2]; + complete_all(node_info); + mutex_unlock(&scif_info.conflock); + } +} + +static void +scif_msg_unknown(struct scif_dev *scifdev, struct scifmsg *msg) +{ + /* Bogus Node Qp Message? 
*/ + dev_err(&scifdev->sdev->dev, + "Unknown message 0x%xn scifdev->node 0x%x\n", + msg->uop, scifdev->node); +} + +static void (*scif_intr_func[SCIF_MAX_MSG + 1]) + (struct scif_dev *, struct scifmsg *msg) = { + scif_msg_unknown, /* Error */ + scif_init, /* SCIF_INIT */ + scif_exit, /* SCIF_EXIT */ + scif_exit_ack, /* SCIF_EXIT_ACK */ + scif_node_add, /* SCIF_NODE_ADD */ + scif_node_add_ack, /* SCIF_NODE_ADD_ACK */ + scif_node_add_nack, /* SCIF_NODE_ADD_NACK */ + scif_node_remove, /* SCIF_NODE_REMOVE */ + scif_node_remove_ack, /* SCIF_NODE_REMOVE_ACK */ + scif_cnctreq, /* SCIF_CNCT_REQ */ + scif_cnctgnt, /* SCIF_CNCT_GNT */ + scif_cnctgnt_ack, /* SCIF_CNCT_GNTACK */ + scif_cnctgnt_nack, /* SCIF_CNCT_GNTNACK */ + scif_cnctrej, /* SCIF_CNCT_REJ */ + scif_discnct, /* SCIF_DISCNCT */ + scif_discnt_ack, /* SCIF_DISCNT_ACK */ + scif_clientsend, /* SCIF_CLIENT_SENT */ + scif_clientrcvd, /* SCIF_CLIENT_RCVD */ + scif_get_node_info_resp,/* SCIF_GET_NODE_INFO */ + scif_recv_reg, /* SCIF_REGISTER */ + scif_recv_reg_ack, /* SCIF_REGISTER_ACK */ + scif_recv_reg_nack, /* SCIF_REGISTER_NACK */ + scif_recv_unreg, /* SCIF_UNREGISTER */ + scif_recv_unreg_ack, /* SCIF_UNREGISTER_ACK */ + scif_recv_unreg_nack, /* SCIF_UNREGISTER_NACK */ + scif_alloc_req, /* SCIF_ALLOC_REQ */ + scif_alloc_gnt_rej, /* SCIF_ALLOC_GNT */ + scif_alloc_gnt_rej, /* SCIF_ALLOC_REJ */ + scif_free_virt, /* SCIF_FREE_VIRT */ + scif_recv_munmap, /* SCIF_MUNMAP */ + scif_recv_mark, /* SCIF_MARK */ + scif_recv_mark_resp, /* SCIF_MARK_ACK */ + scif_recv_mark_resp, /* SCIF_MARK_NACK */ + scif_recv_wait, /* SCIF_WAIT */ + scif_recv_wait_resp, /* SCIF_WAIT_ACK */ + scif_recv_wait_resp, /* SCIF_WAIT_NACK */ + scif_recv_sig_local, /* SCIF_SIG_LOCAL */ + scif_recv_sig_remote, /* SCIF_SIG_REMOTE */ + scif_recv_sig_resp, /* SCIF_SIG_ACK */ + scif_recv_sig_resp, /* SCIF_SIG_NACK */ +}; + +/** + * scif_nodeqp_msg_handler() - Common handler for node messages + * @scifdev: Remote device to respond to + * @qp: Remote memory 
pointer + * @msg: The message to be handled. + * + * This routine calls the appropriate routine to handle a Node Qp + * message receipt + */ +static int scif_max_msg_id = SCIF_MAX_MSG; + +static void +scif_nodeqp_msg_handler(struct scif_dev *scifdev, + struct scif_qp *qp, struct scifmsg *msg) +{ + scif_display_message(scifdev, msg, "Rcvd"); + + if (msg->uop > (u32)scif_max_msg_id) { + /* Bogus Node Qp Message? */ + dev_err(&scifdev->sdev->dev, + "Unknown message 0x%xn scifdev->node 0x%x\n", + msg->uop, scifdev->node); + return; + } + + scif_intr_func[msg->uop](scifdev, msg); +} + +/** + * scif_nodeqp_intrhandler() - Interrupt handler for node messages + * @scifdev: Remote device to respond to + * @qp: Remote memory pointer + * + * This routine is triggered by the interrupt mechanism. It reads + * messages from the node queue RB and calls the Node QP Message handling + * routine. + */ +void scif_nodeqp_intrhandler(struct scif_dev *scifdev, struct scif_qp *qp) +{ + struct scifmsg msg; + int read_size; + + do { + read_size = scif_rb_get_next(&qp->inbound_q, &msg, sizeof(msg)); + if (!read_size) + break; + scif_nodeqp_msg_handler(scifdev, qp, &msg); + /* + * The node queue pair is unmapped so skip the read pointer + * update after receipt of a SCIF_EXIT_ACK + */ + if (SCIF_EXIT_ACK == msg.uop) + break; + scif_rb_update_read_ptr(&qp->inbound_q); + } while (1); +} + +/** + * scif_loopb_wq_handler - Loopback Workqueue Handler. + * @work: loop back work + * + * This work queue routine is invoked by the loopback work queue handler. + * It grabs the recv lock, dequeues any available messages from the head + * of the loopback message list, calls the node QP message handler, + * waits for it to return, then frees up this message and dequeues more + * elements of the list if available. 
+ */
static void scif_loopb_wq_handler(struct work_struct *unused)
{
	struct scif_dev *scifdev = scif_info.loopb_dev;
	struct scif_qp *qp = scifdev->qpairs;

	for (;;) {
		struct scif_loopb_msg *entry = NULL;

		/* Detach the oldest queued message while holding the lock */
		spin_lock(&qp->recv_lock);
		if (!list_empty(&scif_info.loopb_recv_q)) {
			entry = list_first_entry(&scif_info.loopb_recv_q,
						 struct scif_loopb_msg,
						 list);
			list_del(&entry->list);
		}
		spin_unlock(&qp->recv_lock);

		/* List drained: done until the work is queued again */
		if (!entry)
			break;

		/* Handle the message outside the lock, then release it */
		scif_nodeqp_msg_handler(scifdev, qp, &entry->msg);
		kfree(entry);
	}
}

+/** + * scif_loopb_msg_handler() - Workqueue handler for loopback messages. + * @scifdev: SCIF device + * @qp: Queue pair. + * + * This work queue routine is triggered when a loopback message is received. + * + * We need special handling for receiving Node Qp messages on a loopback SCIF + * device via two workqueues for receiving messages. + * + * The reason we need the extra workqueue which is not required with *normal* + * non-loopback SCIF devices is the potential classic deadlock described below: + * + * Thread A tries to send a message on a loopback SCIF device and blocks since + * there is no space in the RB while it has the send_lock held or another + * lock called lock X for example. + * + * Thread B: The Loopback Node QP message receive workqueue receives the message + * and tries to send a message (eg an ACK) to the loopback SCIF device. It tries + * to grab the send lock again or lock X and deadlocks with Thread A. The RB + * cannot be drained any further due to this classic deadlock. + * + * In order to avoid deadlocks as mentioned above we have an extra level of + * indirection achieved by having two workqueues. + * 1) The first workqueue whose handler is scif_loopb_msg_handler reads + * messages from the Node QP RB, adds them to a list and queues work for the + * second workqueue. 
+ * + * 2) The second workqueue whose handler is scif_loopb_wq_handler dequeues + * messages from the list, handles them, frees up the memory and dequeues + * more elements from the list if possible. + */ +int +scif_loopb_msg_handler(struct scif_dev *scifdev, struct scif_qp *qp) +{ + int read_size; + struct scif_loopb_msg *msg; + + do { + msg = kmalloc(sizeof(*msg), GFP_KERNEL); + if (!msg) + return -ENOMEM; + read_size = scif_rb_get_next(&qp->inbound_q, &msg->msg, + sizeof(struct scifmsg)); + if (read_size != sizeof(struct scifmsg)) { + kfree(msg); + scif_rb_update_read_ptr(&qp->inbound_q); + break; + } + spin_lock(&qp->recv_lock); + list_add_tail(&msg->list, &scif_info.loopb_recv_q); + spin_unlock(&qp->recv_lock); + queue_work(scif_info.loopb_wq, &scif_info.loopb_work); + scif_rb_update_read_ptr(&qp->inbound_q); + } while (read_size == sizeof(struct scifmsg)); + return read_size; +} + +/** + * scif_setup_loopback_qp - One time setup work for Loopback Node Qp. + * @scifdev: SCIF device + * + * Sets up the required loopback workqueues, queue pairs and ring buffers + */ +int scif_setup_loopback_qp(struct scif_dev *scifdev) +{ + int err = 0; + void *local_q; + struct scif_qp *qp; + + err = scif_setup_intr_wq(scifdev); + if (err) + goto exit; + INIT_LIST_HEAD(&scif_info.loopb_recv_q); + snprintf(scif_info.loopb_wqname, sizeof(scif_info.loopb_wqname), + "SCIF LOOPB %d", scifdev->node); + scif_info.loopb_wq = + alloc_ordered_workqueue(scif_info.loopb_wqname, 0); + if (!scif_info.loopb_wq) { + err = -ENOMEM; + goto destroy_intr; + } + INIT_WORK(&scif_info.loopb_work, scif_loopb_wq_handler); + /* Allocate Self Qpair */ + scifdev->qpairs = kzalloc(sizeof(*scifdev->qpairs), GFP_KERNEL); + if (!scifdev->qpairs) { + err = -ENOMEM; + goto destroy_loopb_wq; + } + + qp = scifdev->qpairs; + qp->magic = SCIFEP_MAGIC; + spin_lock_init(&qp->send_lock); + spin_lock_init(&qp->recv_lock); + + local_q = kzalloc(SCIF_NODE_QP_SIZE, GFP_KERNEL); + if (!local_q) { + err = -ENOMEM; + goto 
free_qpairs; + } + /* + * For loopback the inbound_q and outbound_q are essentially the same + * since the Node sends a message on the loopback interface to the + * outbound_q which is then received on the inbound_q. + */ + scif_rb_init(&qp->outbound_q, + &qp->local_read, + &qp->local_write, + local_q, get_count_order(SCIF_NODE_QP_SIZE)); + + scif_rb_init(&qp->inbound_q, + &qp->local_read, + &qp->local_write, + local_q, get_count_order(SCIF_NODE_QP_SIZE)); + scif_info.nodeid = scifdev->node; + + scif_peer_register_device(scifdev); + + scif_info.loopb_dev = scifdev; + return err; +free_qpairs: + kfree(scifdev->qpairs); +destroy_loopb_wq: + destroy_workqueue(scif_info.loopb_wq); +destroy_intr: + scif_destroy_intr_wq(scifdev); +exit: + return err; +} + +/** + * scif_destroy_loopback_qp - One time uninit work for Loopback Node Qp + * @scifdev: SCIF device + * + * Destroys the workqueues and frees up the Ring Buffer and Queue Pair memory. + */ +int scif_destroy_loopback_qp(struct scif_dev *scifdev) +{ + scif_peer_unregister_device(scifdev); + destroy_workqueue(scif_info.loopb_wq); + scif_destroy_intr_wq(scifdev); + kfree(scifdev->qpairs->outbound_q.rb_base); + kfree(scifdev->qpairs); + scifdev->sdev = NULL; + scif_info.loopb_dev = NULL; + return 0; +} + +void scif_destroy_p2p(struct scif_dev *scifdev) +{ + struct scif_dev *peer_dev; + struct scif_p2p_info *p2p; + struct list_head *pos, *tmp; + int bd; + + mutex_lock(&scif_info.conflock); + /* Free P2P mappings in the given node for all its peer nodes */ + list_for_each_safe(pos, tmp, &scifdev->p2p) { + p2p = list_entry(pos, struct scif_p2p_info, ppi_list); + dma_unmap_sg(&scifdev->sdev->dev, p2p->ppi_sg[SCIF_PPI_MMIO], + p2p->sg_nentries[SCIF_PPI_MMIO], + DMA_BIDIRECTIONAL); + dma_unmap_sg(&scifdev->sdev->dev, p2p->ppi_sg[SCIF_PPI_APER], + p2p->sg_nentries[SCIF_PPI_APER], + DMA_BIDIRECTIONAL); + scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); + scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); + list_del(pos); + kfree(p2p); + } 
+ + /* Free P2P mapping created in the peer nodes for the given node */ + for (bd = SCIF_MGMT_NODE + 1; bd <= scif_info.maxid; bd++) { + peer_dev = &scif_dev[bd]; + list_for_each_safe(pos, tmp, &peer_dev->p2p) { + p2p = list_entry(pos, struct scif_p2p_info, ppi_list); + if (p2p->ppi_peer_id == scifdev->node) { + dma_unmap_sg(&peer_dev->sdev->dev, + p2p->ppi_sg[SCIF_PPI_MMIO], + p2p->sg_nentries[SCIF_PPI_MMIO], + DMA_BIDIRECTIONAL); + dma_unmap_sg(&peer_dev->sdev->dev, + p2p->ppi_sg[SCIF_PPI_APER], + p2p->sg_nentries[SCIF_PPI_APER], + DMA_BIDIRECTIONAL); + scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_MMIO]); + scif_p2p_freesg(p2p->ppi_sg[SCIF_PPI_APER]); + list_del(pos); + kfree(p2p); + } + } + } + mutex_unlock(&scif_info.conflock); +} diff --git a/kernel/drivers/misc/mic/scif/scif_nodeqp.h b/kernel/drivers/misc/mic/scif/scif_nodeqp.h new file mode 100644 index 000000000..958962731 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_nodeqp.h @@ -0,0 +1,221 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel SCIF driver. 
+ * + */ +#ifndef SCIF_NODEQP +#define SCIF_NODEQP + +#include "scif_rb.h" +#include "scif_peer_bus.h" + +#define SCIF_INIT 1 /* First message sent to the peer node for discovery */ +#define SCIF_EXIT 2 /* Last message from the peer informing intent to exit */ +#define SCIF_EXIT_ACK 3 /* Response to SCIF_EXIT message */ +#define SCIF_NODE_ADD 4 /* Tell Online nodes a new node exists */ +#define SCIF_NODE_ADD_ACK 5 /* Confirm to mgmt node sequence is finished */ +#define SCIF_NODE_ADD_NACK 6 /* SCIF_NODE_ADD failed */ +#define SCIF_NODE_REMOVE 7 /* Request to deactivate a SCIF node */ +#define SCIF_NODE_REMOVE_ACK 8 /* Response to a SCIF_NODE_REMOVE message */ +#define SCIF_CNCT_REQ 9 /* Phys addr of Request connection to a port */ +#define SCIF_CNCT_GNT 10 /* Phys addr of new Grant connection request */ +#define SCIF_CNCT_GNTACK 11 /* Error type Reject a connection request */ +#define SCIF_CNCT_GNTNACK 12 /* Error type Reject a connection request */ +#define SCIF_CNCT_REJ 13 /* Error type Reject a connection request */ +#define SCIF_DISCNCT 14 /* Notify peer that connection is being terminated */ +#define SCIF_DISCNT_ACK 15 /* Notify peer that connection is being terminated */ +#define SCIF_CLIENT_SENT 16 /* Notify the peer that data has been written */ +#define SCIF_CLIENT_RCVD 17 /* Notify the peer that data has been read */ +#define SCIF_GET_NODE_INFO 18 /* Get current node mask from the mgmt node */ +#define SCIF_REGISTER 19 /* Tell peer about a new registered window */ +#define SCIF_REGISTER_ACK 20 /* Notify peer about registration success */ +#define SCIF_REGISTER_NACK 21 /* Notify peer about registration failure */ +#define SCIF_UNREGISTER 22 /* Tell peer about unregistering a window */ +#define SCIF_UNREGISTER_ACK 23 /* Notify peer about unregistration success */ +#define SCIF_UNREGISTER_NACK 24 /* Notify peer about unregistration failure */ +#define SCIF_ALLOC_REQ 25 /* Request a mapped buffer */ +#define SCIF_ALLOC_GNT 26 /* Notify peer about allocation 
success */ +#define SCIF_ALLOC_REJ 27 /* Notify peer about allocation failure */ +#define SCIF_FREE_VIRT 28 /* Free previously allocated virtual memory */ +#define SCIF_MUNMAP 29 /* Acknowledgment for a SCIF_MMAP request */ +#define SCIF_MARK 30 /* SCIF Remote Fence Mark Request */ +#define SCIF_MARK_ACK 31 /* SCIF Remote Fence Mark Success */ +#define SCIF_MARK_NACK 32 /* SCIF Remote Fence Mark Failure */ +#define SCIF_WAIT 33 /* SCIF Remote Fence Wait Request */ +#define SCIF_WAIT_ACK 34 /* SCIF Remote Fence Wait Success */ +#define SCIF_WAIT_NACK 35 /* SCIF Remote Fence Wait Failure */ +#define SCIF_SIG_LOCAL 36 /* SCIF Remote Fence Local Signal Request */ +#define SCIF_SIG_REMOTE 37 /* SCIF Remote Fence Remote Signal Request */ +#define SCIF_SIG_ACK 38 /* SCIF Remote Fence Remote Signal Success */ +#define SCIF_SIG_NACK 39 /* SCIF Remote Fence Remote Signal Failure */ +#define SCIF_MAX_MSG SCIF_SIG_NACK + +/* + * struct scifmsg - Node QP message format + * + * @src: Source information + * @dst: Destination information + * @uop: The message opcode + * @payload: Unique payload format for each message + */ +struct scifmsg { + struct scif_port_id src; + struct scif_port_id dst; + u32 uop; + u64 payload[4]; +} __packed; + +/* + * struct scif_allocmsg - Used with SCIF_ALLOC_REQ to request + * the remote note to allocate memory + * + * phys_addr: Physical address of the buffer + * vaddr: Virtual address of the buffer + * size: Size of the buffer + * state: Current state + * allocwq: wait queue for status + */ +struct scif_allocmsg { + dma_addr_t phys_addr; + unsigned long vaddr; + size_t size; + enum scif_msg_state state; + wait_queue_head_t allocwq; +}; + +/* + * struct scif_qp - Node Queue Pair + * + * Interesting structure -- a little difficult because we can only + * write across the PCIe, so any r/w pointer we need to read is + * local. 
We only need to read the read pointer on the inbound_q + * and read the write pointer in the outbound_q + * + * @magic: Magic value to ensure the peer sees the QP correctly + * @outbound_q: The outbound ring buffer for sending messages + * @inbound_q: The inbound ring buffer for receiving messages + * @local_write: Local write index + * @local_read: Local read index + * @remote_qp: The remote queue pair + * @local_buf: DMA address of local ring buffer + * @local_qp: DMA address of the local queue pair data structure + * @remote_buf: DMA address of remote ring buffer + * @qp_state: QP state i.e. online or offline used for P2P + * @send_lock: synchronize access to outbound queue + * @recv_lock: Synchronize access to inbound queue + */ +struct scif_qp { + u64 magic; +#define SCIFEP_MAGIC 0x5c1f000000005c1fULL + struct scif_rb outbound_q; + struct scif_rb inbound_q; + + u32 local_write __aligned(64); + u32 local_read __aligned(64); + struct scif_qp *remote_qp; + dma_addr_t local_buf; + dma_addr_t local_qp; + dma_addr_t remote_buf; + u32 qp_state; +#define SCIF_QP_OFFLINE 0xdead +#define SCIF_QP_ONLINE 0xc0de + spinlock_t send_lock; + spinlock_t recv_lock; +}; + +/* + * struct scif_loopb_msg - An element in the loopback Node QP message list. 
+ * + * @msg - The SCIF node QP message + * @list - link in the list of messages + */ +struct scif_loopb_msg { + struct scifmsg msg; + struct list_head list; +}; + +int scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg); +int _scif_nodeqp_send(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_nodeqp_intrhandler(struct scif_dev *scifdev, struct scif_qp *qp); +int scif_loopb_msg_handler(struct scif_dev *scifdev, struct scif_qp *qp); +int scif_setup_qp(struct scif_dev *scifdev); +int scif_qp_response(phys_addr_t phys, struct scif_dev *dev); +int scif_setup_qp_connect(struct scif_qp *qp, dma_addr_t *qp_offset, + int local_size, struct scif_dev *scifdev); +int scif_setup_qp_accept(struct scif_qp *qp, dma_addr_t *qp_offset, + dma_addr_t phys, int local_size, + struct scif_dev *scifdev); +int scif_setup_qp_connect_response(struct scif_dev *scifdev, + struct scif_qp *qp, u64 payload); +int scif_setup_loopback_qp(struct scif_dev *scifdev); +int scif_destroy_loopback_qp(struct scif_dev *scifdev); +void scif_poll_qp_state(struct work_struct *work); +void scif_destroy_p2p(struct scif_dev *scifdev); +void scif_send_exit(struct scif_dev *scifdev); +static inline struct device *scif_get_peer_dev(struct scif_dev *scifdev) +{ + struct scif_peer_dev *spdev; + struct device *spdev_ret; + + rcu_read_lock(); + spdev = rcu_dereference(scifdev->spdev); + if (spdev) + spdev_ret = get_device(&spdev->dev); + else + spdev_ret = ERR_PTR(-ENODEV); + rcu_read_unlock(); + return spdev_ret; +} + +static inline void scif_put_peer_dev(struct device *dev) +{ + put_device(dev); +} +#endif /* SCIF_NODEQP */ diff --git a/kernel/drivers/misc/mic/scif/scif_peer_bus.c b/kernel/drivers/misc/mic/scif/scif_peer_bus.c new file mode 100644 index 000000000..6ffa3bdbd --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_peer_bus.c @@ -0,0 +1,183 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + */ +#include "scif_main.h" +#include "../bus/scif_bus.h" +#include "scif_peer_bus.h" + +static inline struct scif_peer_dev * +dev_to_scif_peer(struct device *dev) +{ + return container_of(dev, struct scif_peer_dev, dev); +} + +struct bus_type scif_peer_bus = { + .name = "scif_peer_bus", +}; + +static void scif_peer_release_dev(struct device *d) +{ + struct scif_peer_dev *sdev = dev_to_scif_peer(d); + struct scif_dev *scifdev = &scif_dev[sdev->dnode]; + + scif_cleanup_scifdev(scifdev); + kfree(sdev); +} + +static int scif_peer_initialize_device(struct scif_dev *scifdev) +{ + struct scif_peer_dev *spdev; + int ret; + + spdev = kzalloc(sizeof(*spdev), GFP_KERNEL); + if (!spdev) { + ret = -ENOMEM; + goto err; + } + + spdev->dev.parent = scifdev->sdev->dev.parent; + spdev->dev.release = scif_peer_release_dev; + spdev->dnode = scifdev->node; + spdev->dev.bus = &scif_peer_bus; + dev_set_name(&spdev->dev, "scif_peer-dev%u", spdev->dnode); + + device_initialize(&spdev->dev); + get_device(&spdev->dev); + rcu_assign_pointer(scifdev->spdev, spdev); + + mutex_lock(&scif_info.conflock); + scif_info.total++; + scif_info.maxid = max_t(u32, spdev->dnode, scif_info.maxid); + mutex_unlock(&scif_info.conflock); + return 0; +err: + dev_err(&scifdev->sdev->dev, + "dnode %d: initialize_device rc %d\n", scifdev->node, ret); + return ret; +} + +static int scif_peer_add_device(struct scif_dev *scifdev) +{ + struct scif_peer_dev *spdev = rcu_dereference(scifdev->spdev); + char pool_name[16]; + int ret; + + ret = 
device_add(&spdev->dev); + put_device(&spdev->dev); + if (ret) { + dev_err(&scifdev->sdev->dev, + "dnode %d: peer device_add failed\n", scifdev->node); + goto put_spdev; + } + + scnprintf(pool_name, sizeof(pool_name), "scif-%d", spdev->dnode); + scifdev->signal_pool = dmam_pool_create(pool_name, &scifdev->sdev->dev, + sizeof(struct scif_status), 1, + 0); + if (!scifdev->signal_pool) { + dev_err(&scifdev->sdev->dev, + "dnode %d: dmam_pool_create failed\n", scifdev->node); + ret = -ENOMEM; + goto del_spdev; + } + dev_dbg(&spdev->dev, "Added peer dnode %d\n", spdev->dnode); + return 0; +del_spdev: + device_del(&spdev->dev); +put_spdev: + RCU_INIT_POINTER(scifdev->spdev, NULL); + synchronize_rcu(); + put_device(&spdev->dev); + + mutex_lock(&scif_info.conflock); + scif_info.total--; + mutex_unlock(&scif_info.conflock); + return ret; +} + +void scif_add_peer_device(struct work_struct *work) +{ + struct scif_dev *scifdev = container_of(work, struct scif_dev, + peer_add_work); + + scif_peer_add_device(scifdev); +} + +/* + * Peer device registration is split into a device_initialize and a device_add. + * The reason for doing this is as follows: First, peer device registration + * itself cannot be done in the message processing thread and must be delegated + * to another workqueue, otherwise if SCIF client probe, called during peer + * device registration, calls scif_connect(..), it will block the message + * processing thread causing a deadlock. Next, device_initialize is done in the + * "top-half" message processing thread and device_add in the "bottom-half" + * workqueue. If this is not done, SCIF_CNCT_REQ message processing executing + * concurrently with SCIF_INIT message processing is unable to get a reference + * on the peer device, thereby failing the connect request. 
+ */ +void scif_peer_register_device(struct scif_dev *scifdev) +{ + int ret; + + mutex_lock(&scifdev->lock); + ret = scif_peer_initialize_device(scifdev); + if (ret) + goto exit; + schedule_work(&scifdev->peer_add_work); +exit: + mutex_unlock(&scifdev->lock); +} + +int scif_peer_unregister_device(struct scif_dev *scifdev) +{ + struct scif_peer_dev *spdev; + + mutex_lock(&scifdev->lock); + /* Flush work to ensure device register is complete */ + flush_work(&scifdev->peer_add_work); + + /* + * Continue holding scifdev->lock since theoretically unregister_device + * can be called simultaneously from multiple threads + */ + spdev = rcu_dereference(scifdev->spdev); + if (!spdev) { + mutex_unlock(&scifdev->lock); + return -ENODEV; + } + + RCU_INIT_POINTER(scifdev->spdev, NULL); + synchronize_rcu(); + mutex_unlock(&scifdev->lock); + + dev_dbg(&spdev->dev, "Removing peer dnode %d\n", spdev->dnode); + device_unregister(&spdev->dev); + + mutex_lock(&scif_info.conflock); + scif_info.total--; + mutex_unlock(&scif_info.conflock); + return 0; +} + +int scif_peer_bus_init(void) +{ + return bus_register(&scif_peer_bus); +} + +void scif_peer_bus_exit(void) +{ + bus_unregister(&scif_peer_bus); +} diff --git a/kernel/drivers/misc/mic/scif/scif_peer_bus.h b/kernel/drivers/misc/mic/scif/scif_peer_bus.h new file mode 100644 index 000000000..a3b8dd2ed --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_peer_bus.h @@ -0,0 +1,31 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * Intel SCIF driver. + */ +#ifndef _SCIF_PEER_BUS_H_ +#define _SCIF_PEER_BUS_H_ + +#include <linux/device.h> +#include <linux/mic_common.h> +#include <linux/scif.h> + +struct scif_dev; + +void scif_add_peer_device(struct work_struct *work); +void scif_peer_register_device(struct scif_dev *sdev); +int scif_peer_unregister_device(struct scif_dev *scifdev); +int scif_peer_bus_init(void); +void scif_peer_bus_exit(void); +#endif /* _SCIF_PEER_BUS_H */ diff --git a/kernel/drivers/misc/mic/scif/scif_ports.c b/kernel/drivers/misc/mic/scif/scif_ports.c new file mode 100644 index 000000000..594e18d27 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_ports.c @@ -0,0 +1,124 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include <linux/idr.h> + +#include "scif_main.h" + +#define SCIF_PORT_COUNT 0x10000 /* Ports available */ + +struct idr scif_ports; + +/* + * struct scif_port - SCIF port information + * + * @ref_cnt - Reference count since there can be multiple endpoints + * created via scif_accept(..) simultaneously using a port. + */ +struct scif_port { + int ref_cnt; +}; + +/** + * __scif_get_port - Reserve a specified port # for SCIF and add it + * to the global list. + * @port : port # to be reserved. + * + * @return : Allocated SCIF port #, or -ENOSPC if port unavailable. + * On memory allocation failure, returns -ENOMEM. 
+ */ +static int __scif_get_port(int start, int end) +{ + int id; + struct scif_port *port = kzalloc(sizeof(*port), GFP_ATOMIC); + + if (!port) + return -ENOMEM; + spin_lock(&scif_info.port_lock); + id = idr_alloc(&scif_ports, port, start, end, GFP_ATOMIC); + if (id >= 0) + port->ref_cnt++; + spin_unlock(&scif_info.port_lock); + return id; +} + +/** + * scif_rsrv_port - Reserve a specified port # for SCIF. + * @port : port # to be reserved. + * + * @return : Allocated SCIF port #, or -ENOSPC if port unavailable. + * On memory allocation failure, returns -ENOMEM. + */ +int scif_rsrv_port(u16 port) +{ + return __scif_get_port(port, port + 1); +} + +/** + * scif_get_new_port - Get and reserve any port # for SCIF in the range + * SCIF_PORT_RSVD + 1 to SCIF_PORT_COUNT - 1. + * + * @return : Allocated SCIF port #, or -ENOSPC if no ports available. + * On memory allocation failure, returns -ENOMEM. + */ +int scif_get_new_port(void) +{ + return __scif_get_port(SCIF_PORT_RSVD + 1, SCIF_PORT_COUNT); +} + +/** + * scif_get_port - Increment the reference count for a SCIF port + * @id : SCIF port + * + * @return : None + */ +void scif_get_port(u16 id) +{ + struct scif_port *port; + + if (!id) + return; + spin_lock(&scif_info.port_lock); + port = idr_find(&scif_ports, id); + if (port) + port->ref_cnt++; + spin_unlock(&scif_info.port_lock); +} + +/** + * scif_put_port - Release a reserved SCIF port + * @id : SCIF port to be released. 
+ * + * @return : None + */ +void scif_put_port(u16 id) +{ + struct scif_port *port; + + if (!id) + return; + spin_lock(&scif_info.port_lock); + port = idr_find(&scif_ports, id); + if (port) { + port->ref_cnt--; + if (!port->ref_cnt) { + idr_remove(&scif_ports, id); + kfree(port); + } + } + spin_unlock(&scif_info.port_lock); +} diff --git a/kernel/drivers/misc/mic/scif/scif_rb.c b/kernel/drivers/misc/mic/scif/scif_rb.c new file mode 100644 index 000000000..637cc4686 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_rb.c @@ -0,0 +1,249 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include <linux/circ_buf.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/errno.h> + +#include "scif_rb.h" + +#define scif_rb_ring_cnt(head, tail, size) CIRC_CNT(head, tail, size) +#define scif_rb_ring_space(head, tail, size) CIRC_SPACE(head, tail, size) + +/** + * scif_rb_init - Initializes the ring buffer + * @rb: ring buffer + * @read_ptr: A pointer to the read offset + * @write_ptr: A pointer to the write offset + * @rb_base: A pointer to the base of the ring buffer + * @size: The size of the ring buffer in powers of two + */ +void scif_rb_init(struct scif_rb *rb, u32 *read_ptr, u32 *write_ptr, + void *rb_base, u8 size) +{ + rb->rb_base = rb_base; + rb->size = (1 << size); + rb->read_ptr = read_ptr; + rb->write_ptr = write_ptr; + rb->current_read_offset = *read_ptr; + rb->current_write_offset = *write_ptr; +} + +/* Copies a message to the ring buffer -- handles the wrap around case */ +static void memcpy_torb(struct scif_rb *rb, void *header, + void *msg, u32 size) +{ + u32 size1, size2; + + if (header + size >= rb->rb_base + rb->size) { + /* Need to call two copies if it wraps around */ + size1 = (u32)(rb->rb_base + rb->size - header); + size2 = size - size1; + memcpy_toio((void __iomem __force *)header, msg, size1); + memcpy_toio((void __iomem __force *)rb->rb_base, + msg + size1, size2); + } else { + memcpy_toio((void __iomem __force *)header, msg, size); + } +} + +/* Copies a message from the ring buffer -- handles the wrap around case */ +static void memcpy_fromrb(struct scif_rb *rb, void *header, + void *msg, u32 size) +{ + u32 size1, size2; + + if (header + size >= rb->rb_base + rb->size) { + /* Need to call two copies if it wraps around */ + size1 = (u32)(rb->rb_base + rb->size - header); + size2 = size - size1; + memcpy_fromio(msg, (void __iomem __force *)header, size1); + memcpy_fromio(msg + size1, + (void __iomem __force *)rb->rb_base, size2); + } else { + memcpy_fromio(msg, (void __iomem __force *)header, size); + } +} 
+ +/** + * scif_rb_space - Query space available for writing to the RB + * @rb: ring buffer + * + * Return: size available for writing to RB in bytes. + */ +u32 scif_rb_space(struct scif_rb *rb) +{ + rb->current_read_offset = *rb->read_ptr; + /* + * Update from the HW read pointer only once the peer has exposed the + * new empty slot. This barrier is paired with the memory barrier + * scif_rb_update_read_ptr() + */ + mb(); + return scif_rb_ring_space(rb->current_write_offset, + rb->current_read_offset, rb->size); +} + +/** + * scif_rb_write - Write a message to the RB + * @rb: ring buffer + * @msg: buffer to send the message. Must be at least size bytes long + * @size: the size (in bytes) to be copied to the RB + * + * This API does not block if there isn't enough space in the RB. + * Returns: 0 on success or -ENOMEM on failure + */ +int scif_rb_write(struct scif_rb *rb, void *msg, u32 size) +{ + void *header; + + if (scif_rb_space(rb) < size) + return -ENOMEM; + header = rb->rb_base + rb->current_write_offset; + memcpy_torb(rb, header, msg, size); + /* + * Wait until scif_rb_commit(). Update the local ring + * buffer data, not the shared data until commit. + */ + rb->current_write_offset = + (rb->current_write_offset + size) & (rb->size - 1); + return 0; +} + +/** + * scif_rb_commit - To submit the message to let the peer fetch it + * @rb: ring buffer + */ +void scif_rb_commit(struct scif_rb *rb) +{ + /* + * We must ensure ordering between the all the data committed + * previously before we expose the new message to the peer by + * updating the write_ptr. This write barrier is paired with + * the read barrier in scif_rb_count(..) + */ + wmb(); + ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset; +#ifdef CONFIG_INTEL_MIC_CARD + /* + * X100 Si bug: For the case where a Core is performing an EXT_WR + * followed by a Doorbell Write, the Core must perform two EXT_WR to the + * same address with the same data before it does the Doorbell Write. 
+ * This way, if ordering is violated for the Interrupt Message, it will + * fall just behind the first Posted associated with the first EXT_WR. + */ + ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset; +#endif +} + +/** + * scif_rb_get - To get next message from the ring buffer + * @rb: ring buffer + * @size: Number of bytes to be read + * + * Return: NULL if no bytes to be read from the ring buffer, otherwise the + * pointer to the next byte + */ +static void *scif_rb_get(struct scif_rb *rb, u32 size) +{ + void *header = NULL; + + if (scif_rb_count(rb, size) >= size) + header = rb->rb_base + rb->current_read_offset; + return header; +} + +/* + * scif_rb_get_next - Read from ring buffer. + * @rb: ring buffer + * @msg: buffer to hold the message. Must be at least size bytes long + * @size: Number of bytes to be read + * + * Return: number of bytes read if available bytes are >= size, otherwise + * returns zero. + */ +u32 scif_rb_get_next(struct scif_rb *rb, void *msg, u32 size) +{ + void *header = NULL; + int read_size = 0; + + header = scif_rb_get(rb, size); + if (header) { + u32 next_cmd_offset = + (rb->current_read_offset + size) & (rb->size - 1); + + read_size = size; + rb->current_read_offset = next_cmd_offset; + memcpy_fromrb(rb, header, msg, size); + } + return read_size; +} + +/** + * scif_rb_update_read_ptr + * @rb: ring buffer + */ +void scif_rb_update_read_ptr(struct scif_rb *rb) +{ + u32 new_offset; + + new_offset = rb->current_read_offset; + /* + * We must ensure ordering between the all the data committed or read + * previously before we expose the empty slot to the peer by updating + * the read_ptr. This barrier is paired with the memory barrier in + * scif_rb_space(..) 
+ */ + mb(); + ACCESS_ONCE(*rb->read_ptr) = new_offset; +#ifdef CONFIG_INTEL_MIC_CARD + /* + * X100 Si Bug: For the case where a Core is performing an EXT_WR + * followed by a Doorbell Write, the Core must perform two EXT_WR to the + * same address with the same data before it does the Doorbell Write. + * This way, if ordering is violated for the Interrupt Message, it will + * fall just behind the first Posted associated with the first EXT_WR. + */ + ACCESS_ONCE(*rb->read_ptr) = new_offset; +#endif +} + +/** + * scif_rb_count + * @rb: ring buffer + * @size: Number of bytes expected to be read + * + * Return: number of bytes that can be read from the RB + */ +u32 scif_rb_count(struct scif_rb *rb, u32 size) +{ + if (scif_rb_ring_cnt(rb->current_write_offset, + rb->current_read_offset, + rb->size) < size) { + rb->current_write_offset = *rb->write_ptr; + /* + * Update from the HW write pointer if empty only once the peer + * has exposed the new message. This read barrier is paired + * with the write barrier in scif_rb_commit(..) + */ + smp_rmb(); + } + return scif_rb_ring_cnt(rb->current_write_offset, + rb->current_read_offset, + rb->size); +} diff --git a/kernel/drivers/misc/mic/scif/scif_rb.h b/kernel/drivers/misc/mic/scif/scif_rb.h new file mode 100644 index 000000000..166dffe30 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_rb.h @@ -0,0 +1,100 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel SCIF driver. + */ +#ifndef SCIF_RB_H +#define SCIF_RB_H +/* + * This file describes a general purpose, byte based ring buffer. 
Writers to the + * ring buffer need to synchronize using a lock. The same is true for readers, + * although in practice, the ring buffer has a single reader. It is lockless + * between producer and consumer so it can handle being used across the PCIe + * bus. The ring buffer ensures that there are no reads across the PCIe bus for + * performance reasons. Two of these are used to form a single bidirectional + * queue-pair across PCIe. + */ +/* + * struct scif_rb - SCIF Ring Buffer + * + * @rb_base: The base of the memory used for storing RB messages + * @read_ptr: Pointer to the read offset + * @write_ptr: Pointer to the write offset + * @size: Size of the memory in rb_base + * @current_read_offset: Cached read offset for performance + * @current_write_offset: Cached write offset for performance + */ +struct scif_rb { + void *rb_base; + u32 *read_ptr; + u32 *write_ptr; + u32 size; + u32 current_read_offset; + u32 current_write_offset; +}; + +/* methods used by both */ +void scif_rb_init(struct scif_rb *rb, u32 *read_ptr, u32 *write_ptr, + void *rb_base, u8 size); +/* writer only methods */ +/* write a new command, then scif_rb_commit() */ +int scif_rb_write(struct scif_rb *rb, void *msg, u32 size); +/* after write(), then scif_rb_commit() */ +void scif_rb_commit(struct scif_rb *rb); +/* query space available for writing to a RB. 
*/ +u32 scif_rb_space(struct scif_rb *rb); + +/* reader only methods */ +/* read a new message from the ring buffer of size bytes */ +u32 scif_rb_get_next(struct scif_rb *rb, void *msg, u32 size); +/* update the read pointer so that the space can be reused */ +void scif_rb_update_read_ptr(struct scif_rb *rb); +/* count the number of bytes that can be read */ +u32 scif_rb_count(struct scif_rb *rb, u32 size); +#endif diff --git a/kernel/drivers/misc/mic/scif/scif_rma.c b/kernel/drivers/misc/mic/scif/scif_rma.c new file mode 100644 index 000000000..8310b4dbf --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_rma.c @@ -0,0 +1,1775 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. + * + */ +#include <linux/dma_remapping.h> +#include <linux/pagemap.h> +#include "scif_main.h" +#include "scif_map.h" + +/* Used to skip ulimit checks for registrations with SCIF_MAP_KERNEL flag */ +#define SCIF_MAP_ULIMIT 0x40 + +bool scif_ulimit_check = 1; + +/** + * scif_rma_ep_init: + * @ep: end point + * + * Initialize RMA per EP data structures. 
+ */ +void scif_rma_ep_init(struct scif_endpt *ep) +{ + struct scif_endpt_rma_info *rma = &ep->rma_info; + + mutex_init(&rma->rma_lock); + init_iova_domain(&rma->iovad, PAGE_SIZE, SCIF_IOVA_START_PFN, + SCIF_DMA_64BIT_PFN); + spin_lock_init(&rma->tc_lock); + mutex_init(&rma->mmn_lock); + INIT_LIST_HEAD(&rma->reg_list); + INIT_LIST_HEAD(&rma->remote_reg_list); + atomic_set(&rma->tw_refcount, 0); + atomic_set(&rma->tcw_refcount, 0); + atomic_set(&rma->tcw_total_pages, 0); + atomic_set(&rma->fence_refcount, 0); + + rma->async_list_del = 0; + rma->dma_chan = NULL; + INIT_LIST_HEAD(&rma->mmn_list); + INIT_LIST_HEAD(&rma->vma_list); + init_waitqueue_head(&rma->markwq); +} + +/** + * scif_rma_ep_can_uninit: + * @ep: end point + * + * Returns 1 if an endpoint can be uninitialized and 0 otherwise. + */ +int scif_rma_ep_can_uninit(struct scif_endpt *ep) +{ + int ret = 0; + + mutex_lock(&ep->rma_info.rma_lock); + /* Destroy RMA Info only if both lists are empty */ + if (list_empty(&ep->rma_info.reg_list) && + list_empty(&ep->rma_info.remote_reg_list) && + list_empty(&ep->rma_info.mmn_list) && + !atomic_read(&ep->rma_info.tw_refcount) && + !atomic_read(&ep->rma_info.tcw_refcount) && + !atomic_read(&ep->rma_info.fence_refcount)) + ret = 1; + mutex_unlock(&ep->rma_info.rma_lock); + return ret; +} + +/** + * scif_create_pinned_pages: + * @nr_pages: number of pages in window + * @prot: read/write protection + * + * Allocate and prepare a set of pinned pages. 
+ */ +static struct scif_pinned_pages * +scif_create_pinned_pages(int nr_pages, int prot) +{ + struct scif_pinned_pages *pin; + + might_sleep(); + pin = scif_zalloc(sizeof(*pin)); + if (!pin) + goto error; + + pin->pages = scif_zalloc(nr_pages * sizeof(*pin->pages)); + if (!pin->pages) + goto error_free_pinned_pages; + + pin->prot = prot; + pin->magic = SCIFEP_MAGIC; + return pin; + +error_free_pinned_pages: + scif_free(pin, sizeof(*pin)); +error: + return NULL; +} + +/** + * scif_destroy_pinned_pages: + * @pin: A set of pinned pages. + * + * Deallocate resources for pinned pages. + */ +static int scif_destroy_pinned_pages(struct scif_pinned_pages *pin) +{ + int j; + int writeable = pin->prot & SCIF_PROT_WRITE; + int kernel = SCIF_MAP_KERNEL & pin->map_flags; + + for (j = 0; j < pin->nr_pages; j++) { + if (pin->pages[j] && !kernel) { + if (writeable) + SetPageDirty(pin->pages[j]); + put_page(pin->pages[j]); + } + } + + scif_free(pin->pages, + pin->nr_pages * sizeof(*pin->pages)); + scif_free(pin, sizeof(*pin)); + return 0; +} + +/* + * scif_create_window: + * @ep: end point + * @nr_pages: number of pages + * @offset: registration offset + * @temp: true if a temporary window is being created + * + * Allocate and prepare a self registration window. 
+ */ +struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages, + s64 offset, bool temp) +{ + struct scif_window *window; + + might_sleep(); + window = scif_zalloc(sizeof(*window)); + if (!window) + goto error; + + window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr)); + if (!window->dma_addr) + goto error_free_window; + + window->num_pages = scif_zalloc(nr_pages * sizeof(*window->num_pages)); + if (!window->num_pages) + goto error_free_window; + + window->offset = offset; + window->ep = (u64)ep; + window->magic = SCIFEP_MAGIC; + window->reg_state = OP_IDLE; + init_waitqueue_head(&window->regwq); + window->unreg_state = OP_IDLE; + init_waitqueue_head(&window->unregwq); + INIT_LIST_HEAD(&window->list); + window->type = SCIF_WINDOW_SELF; + window->temp = temp; + return window; + +error_free_window: + scif_free(window->dma_addr, + nr_pages * sizeof(*window->dma_addr)); + scif_free(window, sizeof(*window)); +error: + return NULL; +} + +/** + * scif_destroy_incomplete_window: + * @ep: end point + * @window: registration window + * + * Deallocate resources for self window. 
+ */ +static void scif_destroy_incomplete_window(struct scif_endpt *ep, + struct scif_window *window) +{ + int err; + int nr_pages = window->nr_pages; + struct scif_allocmsg *alloc = &window->alloc_handle; + struct scifmsg msg; + +retry: + /* Wait for a SCIF_ALLOC_GNT/REJ message */ + err = wait_event_timeout(alloc->allocwq, + alloc->state != OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + + mutex_lock(&ep->rma_info.rma_lock); + if (alloc->state == OP_COMPLETED) { + msg.uop = SCIF_FREE_VIRT; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->alloc_handle.vaddr; + msg.payload[2] = (u64)window; + msg.payload[3] = SCIF_REGISTER; + _scif_nodeqp_send(ep->remote_dev, &msg); + } + mutex_unlock(&ep->rma_info.rma_lock); + + scif_free_window_offset(ep, window, window->offset); + scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr)); + scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages)); + scif_free(window, sizeof(*window)); +} + +/** + * scif_unmap_window: + * @remote_dev: SCIF remote device + * @window: registration window + * + * Delete any DMA mappings created for a registered self window + */ +void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window) +{ + int j; + + if (scif_is_iommu_enabled() && !scifdev_self(remote_dev)) { + if (window->st) { + dma_unmap_sg(&remote_dev->sdev->dev, + window->st->sgl, window->st->nents, + DMA_BIDIRECTIONAL); + sg_free_table(window->st); + kfree(window->st); + window->st = NULL; + } + } else { + for (j = 0; j < window->nr_contig_chunks; j++) { + if (window->dma_addr[j]) { + scif_unmap_single(window->dma_addr[j], + remote_dev, + window->num_pages[j] << + PAGE_SHIFT); + window->dma_addr[j] = 0x0; + } + } + } +} + +static inline struct mm_struct *__scif_acquire_mm(void) +{ + if (scif_ulimit_check) + return get_task_mm(current); + return NULL; +} + +static inline void __scif_release_mm(struct mm_struct *mm) +{ + if 
(mm) + mmput(mm); +} + +static inline int +__scif_dec_pinned_vm_lock(struct mm_struct *mm, + int nr_pages, bool try_lock) +{ + if (!mm || !nr_pages || !scif_ulimit_check) + return 0; + if (try_lock) { + if (!down_write_trylock(&mm->mmap_sem)) { + dev_err(scif_info.mdev.this_device, + "%s %d err\n", __func__, __LINE__); + return -1; + } + } else { + down_write(&mm->mmap_sem); + } + mm->pinned_vm -= nr_pages; + up_write(&mm->mmap_sem); + return 0; +} + +static inline int __scif_check_inc_pinned_vm(struct mm_struct *mm, + int nr_pages) +{ + unsigned long locked, lock_limit; + + if (!mm || !nr_pages || !scif_ulimit_check) + return 0; + + locked = nr_pages; + locked += mm->pinned_vm; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + dev_err(scif_info.mdev.this_device, + "locked(%lu) > lock_limit(%lu)\n", + locked, lock_limit); + return -ENOMEM; + } + mm->pinned_vm = locked; + return 0; +} + +/** + * scif_destroy_window: + * @ep: end point + * @window: registration window + * + * Deallocate resources for self window. + */ +int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window) +{ + int j; + struct scif_pinned_pages *pinned_pages = window->pinned_pages; + int nr_pages = window->nr_pages; + + might_sleep(); + if (!window->temp && window->mm) { + __scif_dec_pinned_vm_lock(window->mm, window->nr_pages, 0); + __scif_release_mm(window->mm); + window->mm = NULL; + } + + scif_free_window_offset(ep, window, window->offset); + scif_unmap_window(ep->remote_dev, window); + /* + * Decrement references for this set of pinned pages from + * this window. + */ + j = atomic_sub_return(1, &pinned_pages->ref_count); + if (j < 0) + dev_err(scif_info.mdev.this_device, + "%s %d incorrect ref count %d\n", + __func__, __LINE__, j); + /* + * If the ref count for pinned_pages is zero then someone + * has already called scif_unpin_pages() for it and we should + * destroy the page cache. 
+ */ + if (!j) + scif_destroy_pinned_pages(window->pinned_pages); + scif_free(window->dma_addr, nr_pages * sizeof(*window->dma_addr)); + scif_free(window->num_pages, nr_pages * sizeof(*window->num_pages)); + window->magic = 0; + scif_free(window, sizeof(*window)); + return 0; +} + +/** + * scif_create_remote_lookup: + * @remote_dev: SCIF remote device + * @window: remote window + * + * Allocate and prepare lookup entries for the remote + * end to copy over the physical addresses. + * Returns 0 on success and appropriate errno on failure. + */ +static int scif_create_remote_lookup(struct scif_dev *remote_dev, + struct scif_window *window) +{ + int i, j, err = 0; + int nr_pages = window->nr_pages; + bool vmalloc_dma_phys, vmalloc_num_pages; + + might_sleep(); + /* Map window */ + err = scif_map_single(&window->mapped_offset, + window, remote_dev, sizeof(*window)); + if (err) + goto error_window; + + /* Compute the number of lookup entries. 21 == 2MB Shift */ + window->nr_lookup = ALIGN(nr_pages * PAGE_SIZE, + ((2) * 1024 * 1024)) >> 21; + + window->dma_addr_lookup.lookup = + scif_alloc_coherent(&window->dma_addr_lookup.offset, + remote_dev, window->nr_lookup * + sizeof(*window->dma_addr_lookup.lookup), + GFP_KERNEL | __GFP_ZERO); + if (!window->dma_addr_lookup.lookup) { + err = -ENOMEM; + goto error_window; + } + + window->num_pages_lookup.lookup = + scif_alloc_coherent(&window->num_pages_lookup.offset, + remote_dev, window->nr_lookup * + sizeof(*window->num_pages_lookup.lookup), + GFP_KERNEL | __GFP_ZERO); + if (!window->num_pages_lookup.lookup) { + err = -ENOMEM; + goto error_window; + } + + vmalloc_dma_phys = is_vmalloc_addr(&window->dma_addr[0]); + vmalloc_num_pages = is_vmalloc_addr(&window->num_pages[0]); + + /* Now map each of the pages containing physical addresses */ + for (i = 0, j = 0; i < nr_pages; i += SCIF_NR_ADDR_IN_PAGE, j++) { + err = scif_map_page(&window->dma_addr_lookup.lookup[j], + vmalloc_dma_phys ? 
+ vmalloc_to_page(&window->dma_addr[i]) : + virt_to_page(&window->dma_addr[i]), + remote_dev); + if (err) + goto error_window; + err = scif_map_page(&window->num_pages_lookup.lookup[j], + vmalloc_dma_phys ? + vmalloc_to_page(&window->num_pages[i]) : + virt_to_page(&window->num_pages[i]), + remote_dev); + if (err) + goto error_window; + } + return 0; +error_window: + return err; +} + +/** + * scif_destroy_remote_lookup: + * @remote_dev: SCIF remote device + * @window: remote window + * + * Destroy lookup entries used for the remote + * end to copy over the physical addresses. + */ +static void scif_destroy_remote_lookup(struct scif_dev *remote_dev, + struct scif_window *window) +{ + int i, j; + + if (window->nr_lookup) { + struct scif_rma_lookup *lup = &window->dma_addr_lookup; + struct scif_rma_lookup *npup = &window->num_pages_lookup; + + for (i = 0, j = 0; i < window->nr_pages; + i += SCIF_NR_ADDR_IN_PAGE, j++) { + if (lup->lookup && lup->lookup[j]) + scif_unmap_single(lup->lookup[j], + remote_dev, + PAGE_SIZE); + if (npup->lookup && npup->lookup[j]) + scif_unmap_single(npup->lookup[j], + remote_dev, + PAGE_SIZE); + } + if (lup->lookup) + scif_free_coherent(lup->lookup, lup->offset, + remote_dev, window->nr_lookup * + sizeof(*lup->lookup)); + if (npup->lookup) + scif_free_coherent(npup->lookup, npup->offset, + remote_dev, window->nr_lookup * + sizeof(*npup->lookup)); + if (window->mapped_offset) + scif_unmap_single(window->mapped_offset, + remote_dev, sizeof(*window)); + window->nr_lookup = 0; + } +} + +/** + * scif_create_remote_window: + * @ep: end point + * @nr_pages: number of pages in window + * + * Allocate and prepare a remote registration window. 
+ */ +static struct scif_window * +scif_create_remote_window(struct scif_dev *scifdev, int nr_pages) +{ + struct scif_window *window; + + might_sleep(); + window = scif_zalloc(sizeof(*window)); + if (!window) + goto error_ret; + + window->magic = SCIFEP_MAGIC; + window->nr_pages = nr_pages; + + window->dma_addr = scif_zalloc(nr_pages * sizeof(*window->dma_addr)); + if (!window->dma_addr) + goto error_window; + + window->num_pages = scif_zalloc(nr_pages * + sizeof(*window->num_pages)); + if (!window->num_pages) + goto error_window; + + if (scif_create_remote_lookup(scifdev, window)) + goto error_window; + + window->type = SCIF_WINDOW_PEER; + window->unreg_state = OP_IDLE; + INIT_LIST_HEAD(&window->list); + return window; +error_window: + scif_destroy_remote_window(window); +error_ret: + return NULL; +} + +/** + * scif_destroy_remote_window: + * @ep: end point + * @window: remote registration window + * + * Deallocate resources for remote window. + */ +void +scif_destroy_remote_window(struct scif_window *window) +{ + scif_free(window->dma_addr, window->nr_pages * + sizeof(*window->dma_addr)); + scif_free(window->num_pages, window->nr_pages * + sizeof(*window->num_pages)); + window->magic = 0; + scif_free(window, sizeof(*window)); +} + +/** + * scif_iommu_map: create DMA mappings if the IOMMU is enabled + * @remote_dev: SCIF remote device + * @window: remote registration window + * + * Map the physical pages using dma_map_sg(..) 
and then detect the number + * of contiguous DMA mappings allocated + */ +static int scif_iommu_map(struct scif_dev *remote_dev, + struct scif_window *window) +{ + struct scatterlist *sg; + int i, err; + scif_pinned_pages_t pin = window->pinned_pages; + + window->st = kzalloc(sizeof(*window->st), GFP_KERNEL); + if (!window->st) + return -ENOMEM; + + err = sg_alloc_table(window->st, window->nr_pages, GFP_KERNEL); + if (err) + return err; + + for_each_sg(window->st->sgl, sg, window->st->nents, i) + sg_set_page(sg, pin->pages[i], PAGE_SIZE, 0x0); + + err = dma_map_sg(&remote_dev->sdev->dev, window->st->sgl, + window->st->nents, DMA_BIDIRECTIONAL); + if (!err) + return -ENOMEM; + /* Detect contiguous ranges of DMA mappings */ + sg = window->st->sgl; + for (i = 0; sg; i++) { + dma_addr_t last_da; + + window->dma_addr[i] = sg_dma_address(sg); + window->num_pages[i] = sg_dma_len(sg) >> PAGE_SHIFT; + last_da = sg_dma_address(sg) + sg_dma_len(sg); + while ((sg = sg_next(sg)) && sg_dma_address(sg) == last_da) { + window->num_pages[i] += + (sg_dma_len(sg) >> PAGE_SHIFT); + last_da = window->dma_addr[i] + + sg_dma_len(sg); + } + window->nr_contig_chunks++; + } + return 0; +} + +/** + * scif_map_window: + * @remote_dev: SCIF remote device + * @window: self registration window + * + * Map pages of a window into the aperture/PCI. + * Also determine addresses required for DMA. 
+ */ +int +scif_map_window(struct scif_dev *remote_dev, struct scif_window *window) +{ + int i, j, k, err = 0, nr_contig_pages; + scif_pinned_pages_t pin; + phys_addr_t phys_prev, phys_curr; + + might_sleep(); + + pin = window->pinned_pages; + + if (intel_iommu_enabled && !scifdev_self(remote_dev)) + return scif_iommu_map(remote_dev, window); + + for (i = 0, j = 0; i < window->nr_pages; i += nr_contig_pages, j++) { + phys_prev = page_to_phys(pin->pages[i]); + nr_contig_pages = 1; + + /* Detect physically contiguous chunks */ + for (k = i + 1; k < window->nr_pages; k++) { + phys_curr = page_to_phys(pin->pages[k]); + if (phys_curr != (phys_prev + PAGE_SIZE)) + break; + phys_prev = phys_curr; + nr_contig_pages++; + } + window->num_pages[j] = nr_contig_pages; + window->nr_contig_chunks++; + if (scif_is_mgmt_node()) { + /* + * Management node has to deal with SMPT on X100 and + * hence the DMA mapping is required + */ + err = scif_map_single(&window->dma_addr[j], + phys_to_virt(page_to_phys( + pin->pages[i])), + remote_dev, + nr_contig_pages << PAGE_SHIFT); + if (err) + return err; + } else { + window->dma_addr[j] = page_to_phys(pin->pages[i]); + } + } + return err; +} + +/** + * scif_send_scif_unregister: + * @ep: end point + * @window: self registration window + * + * Send a SCIF_UNREGISTER message. + */ +static int scif_send_scif_unregister(struct scif_endpt *ep, + struct scif_window *window) +{ + struct scifmsg msg; + + msg.uop = SCIF_UNREGISTER; + msg.src = ep->port; + msg.payload[0] = window->alloc_handle.vaddr; + msg.payload[1] = (u64)window; + return scif_nodeqp_send(ep->remote_dev, &msg); +} + +/** + * scif_unregister_window: + * @window: self registration window + * + * Send an unregistration request and wait for a response. 
+ */ +int scif_unregister_window(struct scif_window *window) +{ + int err = 0; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + bool send_msg = false; + + might_sleep(); + switch (window->unreg_state) { + case OP_IDLE: + { + window->unreg_state = OP_IN_PROGRESS; + send_msg = true; + /* fall through */ + } + case OP_IN_PROGRESS: + { + scif_get_window(window, 1); + mutex_unlock(&ep->rma_info.rma_lock); + if (send_msg) { + err = scif_send_scif_unregister(ep, window); + if (err) { + window->unreg_state = OP_COMPLETED; + goto done; + } + } else { + /* Return ENXIO since unregistration is in progress */ + mutex_lock(&ep->rma_info.rma_lock); + return -ENXIO; + } +retry: + /* Wait for a SCIF_UNREGISTER_(N)ACK message */ + err = wait_event_timeout(window->unregwq, + window->unreg_state != OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + if (!err) { + err = -ENODEV; + window->unreg_state = OP_COMPLETED; + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", __func__, __LINE__, err); + } + if (err > 0) + err = 0; +done: + mutex_lock(&ep->rma_info.rma_lock); + scif_put_window(window, 1); + break; + } + case OP_FAILED: + { + if (!scifdev_alive(ep)) { + err = -ENODEV; + window->unreg_state = OP_COMPLETED; + } + break; + } + case OP_COMPLETED: + break; + default: + err = -ENODEV; + } + + if (window->unreg_state == OP_COMPLETED && window->ref_count) + scif_put_window(window, window->nr_pages); + + if (!window->ref_count) { + atomic_inc(&ep->rma_info.tw_refcount); + list_del_init(&window->list); + scif_free_window_offset(ep, window, window->offset); + mutex_unlock(&ep->rma_info.rma_lock); + if ((!!(window->pinned_pages->map_flags & SCIF_MAP_KERNEL)) && + scifdev_alive(ep)) { + scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan); + } else { + if (!__scif_dec_pinned_vm_lock(window->mm, + window->nr_pages, 1)) { + __scif_release_mm(window->mm); + window->mm = NULL; + } + } + scif_queue_for_cleanup(window, 
&scif_info.rma); + mutex_lock(&ep->rma_info.rma_lock); + } + return err; +} + +/** + * scif_send_alloc_request: + * @ep: end point + * @window: self registration window + * + * Send a remote window allocation request + */ +static int scif_send_alloc_request(struct scif_endpt *ep, + struct scif_window *window) +{ + struct scifmsg msg; + struct scif_allocmsg *alloc = &window->alloc_handle; + + /* Set up the Alloc Handle */ + alloc->state = OP_IN_PROGRESS; + init_waitqueue_head(&alloc->allocwq); + + /* Send out an allocation request */ + msg.uop = SCIF_ALLOC_REQ; + msg.payload[1] = window->nr_pages; + msg.payload[2] = (u64)&window->alloc_handle; + return _scif_nodeqp_send(ep->remote_dev, &msg); +} + +/** + * scif_prep_remote_window: + * @ep: end point + * @window: self registration window + * + * Send a remote window allocation request, wait for an allocation response, + * and prepares the remote window by copying over the page lists + */ +static int scif_prep_remote_window(struct scif_endpt *ep, + struct scif_window *window) +{ + struct scifmsg msg; + struct scif_window *remote_window; + struct scif_allocmsg *alloc = &window->alloc_handle; + dma_addr_t *dma_phys_lookup, *tmp, *num_pages_lookup, *tmp1; + int i = 0, j = 0; + int nr_contig_chunks, loop_nr_contig_chunks; + int remaining_nr_contig_chunks, nr_lookup; + int err, map_err; + + map_err = scif_map_window(ep->remote_dev, window); + if (map_err) + dev_err(&ep->remote_dev->sdev->dev, + "%s %d map_err %d\n", __func__, __LINE__, map_err); + remaining_nr_contig_chunks = window->nr_contig_chunks; + nr_contig_chunks = window->nr_contig_chunks; +retry: + /* Wait for a SCIF_ALLOC_GNT/REJ message */ + err = wait_event_timeout(alloc->allocwq, + alloc->state != OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + mutex_lock(&ep->rma_info.rma_lock); + /* Synchronize with the thread waking up allocwq */ + mutex_unlock(&ep->rma_info.rma_lock); + if (!err && scifdev_alive(ep)) + goto retry; + + if (!err) + err = -ENODEV; + + if (err > 
0) + err = 0; + else + return err; + + /* Bail out. The remote end rejected this request */ + if (alloc->state == OP_FAILED) + return -ENOMEM; + + if (map_err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, map_err); + msg.uop = SCIF_FREE_VIRT; + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->alloc_handle.vaddr; + msg.payload[2] = (u64)window; + msg.payload[3] = SCIF_REGISTER; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) + err = _scif_nodeqp_send(ep->remote_dev, &msg); + else + err = -ENOTCONN; + spin_unlock(&ep->lock); + return err; + } + + remote_window = scif_ioremap(alloc->phys_addr, sizeof(*window), + ep->remote_dev); + + /* Compute the number of lookup entries. 21 == 2MB Shift */ + nr_lookup = ALIGN(nr_contig_chunks, SCIF_NR_ADDR_IN_PAGE) + >> ilog2(SCIF_NR_ADDR_IN_PAGE); + + dma_phys_lookup = + scif_ioremap(remote_window->dma_addr_lookup.offset, + nr_lookup * + sizeof(*remote_window->dma_addr_lookup.lookup), + ep->remote_dev); + num_pages_lookup = + scif_ioremap(remote_window->num_pages_lookup.offset, + nr_lookup * + sizeof(*remote_window->num_pages_lookup.lookup), + ep->remote_dev); + + while (remaining_nr_contig_chunks) { + loop_nr_contig_chunks = min_t(int, remaining_nr_contig_chunks, + (int)SCIF_NR_ADDR_IN_PAGE); + /* #1/2 - Copy physical addresses over to the remote side */ + + /* #2/2 - Copy DMA addresses (addresses that are fed into the + * DMA engine) We transfer bus addresses which are then + * converted into a MIC physical address on the remote + * side if it is a MIC, if the remote node is a mgmt node we + * transfer the MIC physical address + */ + tmp = scif_ioremap(dma_phys_lookup[j], + loop_nr_contig_chunks * + sizeof(*window->dma_addr), + ep->remote_dev); + tmp1 = scif_ioremap(num_pages_lookup[j], + loop_nr_contig_chunks * + sizeof(*window->num_pages), + ep->remote_dev); + if (scif_is_mgmt_node()) { + memcpy_toio((void __force __iomem *)tmp, + 
&window->dma_addr[i], loop_nr_contig_chunks + * sizeof(*window->dma_addr)); + memcpy_toio((void __force __iomem *)tmp1, + &window->num_pages[i], loop_nr_contig_chunks + * sizeof(*window->num_pages)); + } else { + if (scifdev_is_p2p(ep->remote_dev)) { + /* + * add remote node's base address for this node + * to convert it into a MIC address + */ + int m; + dma_addr_t dma_addr; + + for (m = 0; m < loop_nr_contig_chunks; m++) { + dma_addr = window->dma_addr[i + m] + + ep->remote_dev->base_addr; + writeq(dma_addr, + (void __force __iomem *)&tmp[m]); + } + memcpy_toio((void __force __iomem *)tmp1, + &window->num_pages[i], + loop_nr_contig_chunks + * sizeof(*window->num_pages)); + } else { + /* Mgmt node or loopback - transfer DMA + * addresses as is, this is the same as a + * MIC physical address (we use the dma_addr + * and not the phys_addr array since the + * phys_addr is only setup if there is a mmap() + * request from the mgmt node) + */ + memcpy_toio((void __force __iomem *)tmp, + &window->dma_addr[i], + loop_nr_contig_chunks * + sizeof(*window->dma_addr)); + memcpy_toio((void __force __iomem *)tmp1, + &window->num_pages[i], + loop_nr_contig_chunks * + sizeof(*window->num_pages)); + } + } + remaining_nr_contig_chunks -= loop_nr_contig_chunks; + i += loop_nr_contig_chunks; + j++; + scif_iounmap(tmp, loop_nr_contig_chunks * + sizeof(*window->dma_addr), ep->remote_dev); + scif_iounmap(tmp1, loop_nr_contig_chunks * + sizeof(*window->num_pages), ep->remote_dev); + } + + /* Prepare the remote window for the peer */ + remote_window->peer_window = (u64)window; + remote_window->offset = window->offset; + remote_window->prot = window->prot; + remote_window->nr_contig_chunks = nr_contig_chunks; + remote_window->ep = ep->remote_ep; + scif_iounmap(num_pages_lookup, + nr_lookup * + sizeof(*remote_window->num_pages_lookup.lookup), + ep->remote_dev); + scif_iounmap(dma_phys_lookup, + nr_lookup * + sizeof(*remote_window->dma_addr_lookup.lookup), + ep->remote_dev); + 
scif_iounmap(remote_window, sizeof(*remote_window), ep->remote_dev); + window->peer_window = alloc->vaddr; + return err; +} + +/** + * scif_send_scif_register: + * @ep: end point + * @window: self registration window + * + * Send a SCIF_REGISTER message if EP is connected and wait for a + * SCIF_REGISTER_(N)ACK message else send a SCIF_FREE_VIRT + * message so that the peer can free its remote window allocated earlier. + */ +static int scif_send_scif_register(struct scif_endpt *ep, + struct scif_window *window) +{ + int err = 0; + struct scifmsg msg; + + msg.src = ep->port; + msg.payload[0] = ep->remote_ep; + msg.payload[1] = window->alloc_handle.vaddr; + msg.payload[2] = (u64)window; + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) { + msg.uop = SCIF_REGISTER; + window->reg_state = OP_IN_PROGRESS; + err = _scif_nodeqp_send(ep->remote_dev, &msg); + spin_unlock(&ep->lock); + if (!err) { +retry: + /* Wait for a SCIF_REGISTER_(N)ACK message */ + err = wait_event_timeout(window->regwq, + window->reg_state != + OP_IN_PROGRESS, + SCIF_NODE_ALIVE_TIMEOUT); + if (!err && scifdev_alive(ep)) + goto retry; + err = !err ? -ENODEV : 0; + if (window->reg_state == OP_FAILED) + err = -ENOTCONN; + } + } else { + msg.uop = SCIF_FREE_VIRT; + msg.payload[3] = SCIF_REGISTER; + err = _scif_nodeqp_send(ep->remote_dev, &msg); + spin_unlock(&ep->lock); + if (!err) + err = -ENOTCONN; + } + return err; +} + +/** + * scif_get_window_offset: + * @ep: end point descriptor + * @flags: flags + * @offset: offset hint + * @num_pages: number of pages + * @out_offset: computed offset returned by reference. + * + * Compute/Claim a new offset for this EP. 
+ */ +int scif_get_window_offset(struct scif_endpt *ep, int flags, s64 offset, + int num_pages, s64 *out_offset) +{ + s64 page_index; + struct iova *iova_ptr; + int err = 0; + + if (flags & SCIF_MAP_FIXED) { + page_index = SCIF_IOVA_PFN(offset); + iova_ptr = reserve_iova(&ep->rma_info.iovad, page_index, + page_index + num_pages - 1); + if (!iova_ptr) + err = -EADDRINUSE; + } else { + iova_ptr = alloc_iova(&ep->rma_info.iovad, num_pages, + SCIF_DMA_63BIT_PFN - 1, 0); + if (!iova_ptr) + err = -ENOMEM; + } + if (!err) + *out_offset = (iova_ptr->pfn_lo) << PAGE_SHIFT; + return err; +} + +/** + * scif_free_window_offset: + * @ep: end point descriptor + * @window: registration window + * @offset: Offset to be freed + * + * Free offset for this EP. The callee is supposed to grab + * the RMA mutex before calling this API. + */ +void scif_free_window_offset(struct scif_endpt *ep, + struct scif_window *window, s64 offset) +{ + if ((window && !window->offset_freed) || !window) { + free_iova(&ep->rma_info.iovad, offset >> PAGE_SHIFT); + if (window) + window->offset_freed = true; + } +} + +/** + * scif_alloc_req: Respond to SCIF_ALLOC_REQ interrupt message + * @msg: Interrupt message + * + * Remote side is requesting a memory allocation. 
+ */ +void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg) +{ + int err; + struct scif_window *window = NULL; + int nr_pages = msg->payload[1]; + + window = scif_create_remote_window(scifdev, nr_pages); + if (!window) { + err = -ENOMEM; + goto error; + } + + /* The peer's allocation request is granted */ + msg->uop = SCIF_ALLOC_GNT; + msg->payload[0] = (u64)window; + msg->payload[1] = window->mapped_offset; + err = scif_nodeqp_send(scifdev, msg); + if (err) + scif_destroy_remote_window(window); + return; +error: + /* The peer's allocation request is rejected */ + dev_err(&scifdev->sdev->dev, + "%s %d error %d alloc_ptr %p nr_pages 0x%x\n", + __func__, __LINE__, err, window, nr_pages); + msg->uop = SCIF_ALLOC_REJ; + scif_nodeqp_send(scifdev, msg); +} + +/** + * scif_alloc_gnt_rej: Respond to SCIF_ALLOC_GNT/REJ interrupt message + * @msg: Interrupt message + * + * Remote side responded to a memory allocation. + */ +void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_allocmsg *handle = (struct scif_allocmsg *)msg->payload[2]; + struct scif_window *window = container_of(handle, struct scif_window, + alloc_handle); + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + handle->vaddr = msg->payload[0]; + handle->phys_addr = msg->payload[1]; + if (msg->uop == SCIF_ALLOC_GNT) + handle->state = OP_COMPLETED; + else + handle->state = OP_FAILED; + wake_up(&handle->allocwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_free_virt: Respond to SCIF_FREE_VIRT interrupt message + * @msg: Interrupt message + * + * Free up memory kmalloc'd earlier. 
+ */ +void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = (struct scif_window *)msg->payload[1]; + + scif_destroy_remote_window(window); +} + +static void +scif_fixup_aper_base(struct scif_dev *dev, struct scif_window *window) +{ + int j; + struct scif_hw_dev *sdev = dev->sdev; + phys_addr_t apt_base = 0; + + /* + * Add the aperture base if the DMA address is not card relative + * since the DMA addresses need to be an offset into the bar + */ + if (!scifdev_self(dev) && window->type == SCIF_WINDOW_PEER && + sdev->aper && !sdev->card_rel_da) + apt_base = sdev->aper->pa; + else + return; + + for (j = 0; j < window->nr_contig_chunks; j++) { + if (window->num_pages[j]) + window->dma_addr[j] += apt_base; + else + break; + } +} + +/** + * scif_recv_reg: Respond to SCIF_REGISTER interrupt message + * @msg: Interrupt message + * + * Update remote window list with a new registered window. + */ +void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_endpt *ep = (struct scif_endpt *)msg->payload[0]; + struct scif_window *window = + (struct scif_window *)msg->payload[1]; + + mutex_lock(&ep->rma_info.rma_lock); + spin_lock(&ep->lock); + if (ep->state == SCIFEP_CONNECTED) { + msg->uop = SCIF_REGISTER_ACK; + scif_nodeqp_send(ep->remote_dev, msg); + scif_fixup_aper_base(ep->remote_dev, window); + /* No further failures expected. Insert new window */ + scif_insert_window(window, &ep->rma_info.remote_reg_list); + } else { + msg->uop = SCIF_REGISTER_NACK; + scif_nodeqp_send(ep->remote_dev, msg); + } + spin_unlock(&ep->lock); + mutex_unlock(&ep->rma_info.rma_lock); + /* free up any lookup resources now that page lists are transferred */ + scif_destroy_remote_lookup(ep->remote_dev, window); + /* + * We could not insert the window but we need to + * destroy the window. 
+ */ + if (msg->uop == SCIF_REGISTER_NACK) + scif_destroy_remote_window(window); +} + +/** + * scif_recv_unreg: Respond to SCIF_UNREGISTER interrupt message + * @msg: Interrupt message + * + * Remove window from remote registration list; + */ +void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_rma_req req; + struct scif_window *window = NULL; + struct scif_window *recv_window = + (struct scif_window *)msg->payload[0]; + struct scif_endpt *ep; + int del_window = 0; + + ep = (struct scif_endpt *)recv_window->ep; + req.out_window = &window; + req.offset = recv_window->offset; + req.prot = 0; + req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT; + req.type = SCIF_WINDOW_FULL; + req.head = &ep->rma_info.remote_reg_list; + msg->payload[0] = ep->remote_ep; + + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? */ + if (scif_query_window(&req)) { + dev_err(&scifdev->sdev->dev, + "%s %d -ENXIO\n", __func__, __LINE__); + msg->uop = SCIF_UNREGISTER_ACK; + goto error; + } + if (window) { + if (window->ref_count) + scif_put_window(window, window->nr_pages); + else + dev_err(&scifdev->sdev->dev, + "%s %d ref count should be +ve\n", + __func__, __LINE__); + window->unreg_state = OP_COMPLETED; + if (!window->ref_count) { + msg->uop = SCIF_UNREGISTER_ACK; + atomic_inc(&ep->rma_info.tw_refcount); + ep->rma_info.async_list_del = 1; + list_del_init(&window->list); + del_window = 1; + } else { + /* NACK! There are valid references to this window */ + msg->uop = SCIF_UNREGISTER_NACK; + } + } else { + /* The window did not make its way to the list at all. 
ACK */ + msg->uop = SCIF_UNREGISTER_ACK; + scif_destroy_remote_window(recv_window); + } +error: + mutex_unlock(&ep->rma_info.rma_lock); + if (del_window) + scif_drain_dma_intr(ep->remote_dev->sdev, + ep->rma_info.dma_chan); + scif_nodeqp_send(ep->remote_dev, msg); + if (del_window) + scif_queue_for_cleanup(window, &scif_info.rma); +} + +/** + * scif_recv_reg_ack: Respond to SCIF_REGISTER_ACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to complete registration. + */ +void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[2]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->reg_state = OP_COMPLETED; + wake_up(&window->regwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_reg_nack: Respond to SCIF_REGISTER_NACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to inform it that registration + * cannot be completed. + */ +void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[2]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->reg_state = OP_FAILED; + wake_up(&window->regwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_unreg_ack: Respond to SCIF_UNREGISTER_ACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to complete unregistration. 
+ */ +void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[1]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->unreg_state = OP_COMPLETED; + wake_up(&window->unregwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +/** + * scif_recv_unreg_nack: Respond to SCIF_UNREGISTER_NACK interrupt message + * @msg: Interrupt message + * + * Wake up the window waiting to inform it that unregistration + * cannot be completed immediately. + */ +void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg) +{ + struct scif_window *window = + (struct scif_window *)msg->payload[1]; + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + + mutex_lock(&ep->rma_info.rma_lock); + window->unreg_state = OP_FAILED; + wake_up(&window->unregwq); + mutex_unlock(&ep->rma_info.rma_lock); +} + +int __scif_pin_pages(void *addr, size_t len, int *out_prot, + int map_flags, scif_pinned_pages_t *pages) +{ + struct scif_pinned_pages *pinned_pages; + int nr_pages, err = 0, i; + bool vmalloc_addr = false; + bool try_upgrade = false; + int prot = *out_prot; + int ulimit = 0; + struct mm_struct *mm = NULL; + + /* Unsupported flags */ + if (map_flags & ~(SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT)) + return -EINVAL; + ulimit = !!(map_flags & SCIF_MAP_ULIMIT); + + /* Unsupported protection requested */ + if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) + return -EINVAL; + + /* addr/len must be page aligned. 
len should be non zero */ + if (!len || + (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) || + (ALIGN((u64)len, PAGE_SIZE) != (u64)len)) + return -EINVAL; + + might_sleep(); + + nr_pages = len >> PAGE_SHIFT; + + /* Allocate a set of pinned pages */ + pinned_pages = scif_create_pinned_pages(nr_pages, prot); + if (!pinned_pages) + return -ENOMEM; + + if (map_flags & SCIF_MAP_KERNEL) { + if (is_vmalloc_addr(addr)) + vmalloc_addr = true; + + for (i = 0; i < nr_pages; i++) { + if (vmalloc_addr) + pinned_pages->pages[i] = + vmalloc_to_page(addr + (i * PAGE_SIZE)); + else + pinned_pages->pages[i] = + virt_to_page(addr + (i * PAGE_SIZE)); + } + pinned_pages->nr_pages = nr_pages; + pinned_pages->map_flags = SCIF_MAP_KERNEL; + } else { + /* + * SCIF supports registration caching. If a registration has + * been requested with read only permissions, then we try + * to pin the pages with RW permissions so that a subsequent + * transfer with RW permission can hit the cache instead of + * invalidating it. If the upgrade fails with RW then we + * revert back to R permission and retry + */ + if (prot == SCIF_PROT_READ) + try_upgrade = true; + prot |= SCIF_PROT_WRITE; +retry: + mm = current->mm; + down_write(&mm->mmap_sem); + if (ulimit) { + err = __scif_check_inc_pinned_vm(mm, nr_pages); + if (err) { + up_write(&mm->mmap_sem); + pinned_pages->nr_pages = 0; + goto error_unmap; + } + } + + pinned_pages->nr_pages = get_user_pages( + current, + mm, + (u64)addr, + nr_pages, + !!(prot & SCIF_PROT_WRITE), + 0, + pinned_pages->pages, + NULL); + up_write(&mm->mmap_sem); + if (nr_pages != pinned_pages->nr_pages) { + if (try_upgrade) { + if (ulimit) + __scif_dec_pinned_vm_lock(mm, + nr_pages, 0); + /* Roll back any pinned pages */ + for (i = 0; i < pinned_pages->nr_pages; i++) { + if (pinned_pages->pages[i]) + put_page( + pinned_pages->pages[i]); + } + prot &= ~SCIF_PROT_WRITE; + try_upgrade = false; + goto retry; + } + } + pinned_pages->map_flags = 0; + } + + if (pinned_pages->nr_pages < 
nr_pages) { + err = -EFAULT; + pinned_pages->nr_pages = nr_pages; + goto dec_pinned; + } + + *out_prot = prot; + atomic_set(&pinned_pages->ref_count, 1); + *pages = pinned_pages; + return err; +dec_pinned: + if (ulimit) + __scif_dec_pinned_vm_lock(mm, nr_pages, 0); + /* Something went wrong! Rollback */ +error_unmap: + pinned_pages->nr_pages = nr_pages; + scif_destroy_pinned_pages(pinned_pages); + *pages = NULL; + dev_dbg(scif_info.mdev.this_device, + "%s %d err %d len 0x%lx\n", __func__, __LINE__, err, len); + return err; +} + +int scif_pin_pages(void *addr, size_t len, int prot, + int map_flags, scif_pinned_pages_t *pages) +{ + return __scif_pin_pages(addr, len, &prot, map_flags, pages); +} +EXPORT_SYMBOL_GPL(scif_pin_pages); + +int scif_unpin_pages(scif_pinned_pages_t pinned_pages) +{ + int err = 0, ret; + + if (!pinned_pages || SCIFEP_MAGIC != pinned_pages->magic) + return -EINVAL; + + ret = atomic_sub_return(1, &pinned_pages->ref_count); + if (ret < 0) { + dev_err(scif_info.mdev.this_device, + "%s %d scif_unpin_pages called without pinning? rc %d\n", + __func__, __LINE__, ret); + return -EINVAL; + } + /* + * Destroy the window if the ref count for this set of pinned + * pages has dropped to zero. If it is positive then there is + * a valid registered window which is backed by these pages and + * it will be destroyed once all such windows are unregistered. 
+ */ + if (!ret) + err = scif_destroy_pinned_pages(pinned_pages); + + return err; +} +EXPORT_SYMBOL_GPL(scif_unpin_pages); + +static inline void +scif_insert_local_window(struct scif_window *window, struct scif_endpt *ep) +{ + mutex_lock(&ep->rma_info.rma_lock); + scif_insert_window(window, &ep->rma_info.reg_list); + mutex_unlock(&ep->rma_info.rma_lock); +} + +off_t scif_register_pinned_pages(scif_epd_t epd, + scif_pinned_pages_t pinned_pages, + off_t offset, int map_flags) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + s64 computed_offset; + struct scif_window *window; + int err; + size_t len; + struct device *spdev; + + /* Unsupported flags */ + if (map_flags & ~SCIF_MAP_FIXED) + return -EINVAL; + + len = pinned_pages->nr_pages << PAGE_SHIFT; + + /* + * Offset is not page aligned/negative or offset+len + * wraps around with SCIF_MAP_FIXED. + */ + if ((map_flags & SCIF_MAP_FIXED) && + ((ALIGN(offset, PAGE_SIZE) != offset) || + (offset < 0) || + (offset + (off_t)len < offset))) + return -EINVAL; + + might_sleep(); + + err = scif_verify_epd(ep); + if (err) + return err; + /* + * It is an error to pass pinned_pages to scif_register_pinned_pages() + * after calling scif_unpin_pages(). 
+ */ + if (!atomic_add_unless(&pinned_pages->ref_count, 1, 0)) + return -EINVAL; + + /* Compute the offset for this registration */ + err = scif_get_window_offset(ep, map_flags, offset, + len, &computed_offset); + if (err) { + atomic_sub(1, &pinned_pages->ref_count); + return err; + } + + /* Allocate and prepare self registration window */ + window = scif_create_window(ep, pinned_pages->nr_pages, + computed_offset, false); + if (!window) { + atomic_sub(1, &pinned_pages->ref_count); + scif_free_window_offset(ep, NULL, computed_offset); + return -ENOMEM; + } + + window->pinned_pages = pinned_pages; + window->nr_pages = pinned_pages->nr_pages; + window->prot = pinned_pages->prot; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + scif_destroy_window(ep, window); + return err; + } + err = scif_send_alloc_request(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Prepare the remote registration window */ + err = scif_prep_remote_window(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Tell the peer about the new window */ + err = scif_send_scif_register(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error_unmap; + } + + scif_put_peer_dev(spdev); + /* No further failures expected. 
Insert new window */ + scif_insert_local_window(window, ep); + return computed_offset; +error_unmap: + scif_destroy_window(ep, window); + scif_put_peer_dev(spdev); + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_register_pinned_pages); + +off_t scif_register(scif_epd_t epd, void *addr, size_t len, off_t offset, + int prot, int map_flags) +{ + scif_pinned_pages_t pinned_pages; + off_t err; + struct scif_endpt *ep = (struct scif_endpt *)epd; + s64 computed_offset; + struct scif_window *window; + struct mm_struct *mm = NULL; + struct device *spdev; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI register: ep %p addr %p len 0x%lx offset 0x%lx prot 0x%x map_flags 0x%x\n", + epd, addr, len, offset, prot, map_flags); + /* Unsupported flags */ + if (map_flags & ~(SCIF_MAP_FIXED | SCIF_MAP_KERNEL)) + return -EINVAL; + + /* + * Offset is not page aligned/negative or offset+len + * wraps around with SCIF_MAP_FIXED. + */ + if ((map_flags & SCIF_MAP_FIXED) && + ((ALIGN(offset, PAGE_SIZE) != offset) || + (offset < 0) || + (offset + (off_t)len < offset))) + return -EINVAL; + + /* Unsupported protection requested */ + if (prot & ~(SCIF_PROT_READ | SCIF_PROT_WRITE)) + return -EINVAL; + + /* addr/len must be page aligned. 
len should be non zero */ + if (!len || (ALIGN((u64)addr, PAGE_SIZE) != (u64)addr) || + (ALIGN(len, PAGE_SIZE) != len)) + return -EINVAL; + + might_sleep(); + + err = scif_verify_epd(ep); + if (err) + return err; + + /* Compute the offset for this registration */ + err = scif_get_window_offset(ep, map_flags, offset, + len >> PAGE_SHIFT, &computed_offset); + if (err) + return err; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + scif_free_window_offset(ep, NULL, computed_offset); + return err; + } + /* Allocate and prepare self registration window */ + window = scif_create_window(ep, len >> PAGE_SHIFT, + computed_offset, false); + if (!window) { + scif_free_window_offset(ep, NULL, computed_offset); + scif_put_peer_dev(spdev); + return -ENOMEM; + } + + window->nr_pages = len >> PAGE_SHIFT; + + err = scif_send_alloc_request(ep, window); + if (err) { + scif_destroy_incomplete_window(ep, window); + scif_put_peer_dev(spdev); + return err; + } + + if (!(map_flags & SCIF_MAP_KERNEL)) { + mm = __scif_acquire_mm(); + map_flags |= SCIF_MAP_ULIMIT; + } + /* Pin down the pages */ + err = __scif_pin_pages(addr, len, &prot, + map_flags & (SCIF_MAP_KERNEL | SCIF_MAP_ULIMIT), + &pinned_pages); + if (err) { + scif_destroy_incomplete_window(ep, window); + __scif_release_mm(mm); + goto error; + } + + window->pinned_pages = pinned_pages; + window->prot = pinned_pages->prot; + window->mm = mm; + + /* Prepare the remote registration window */ + err = scif_prep_remote_window(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %ld\n", __func__, __LINE__, err); + goto error_unmap; + } + + /* Tell the peer about the new window */ + err = scif_send_scif_register(ep, window); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %ld\n", __func__, __LINE__, err); + goto error_unmap; + } + + scif_put_peer_dev(spdev); + /* No further failures expected. 
Insert new window */ + scif_insert_local_window(window, ep); + dev_dbg(&ep->remote_dev->sdev->dev, + "SCIFAPI register: ep %p addr %p len 0x%lx computed_offset 0x%llx\n", + epd, addr, len, computed_offset); + return computed_offset; +error_unmap: + scif_destroy_window(ep, window); +error: + scif_put_peer_dev(spdev); + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %ld\n", __func__, __LINE__, err); + return err; +} +EXPORT_SYMBOL_GPL(scif_register); + +int +scif_unregister(scif_epd_t epd, off_t offset, size_t len) +{ + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct scif_window *window = NULL; + struct scif_rma_req req; + int nr_pages, err; + struct device *spdev; + + dev_dbg(scif_info.mdev.this_device, + "SCIFAPI unregister: ep %p offset 0x%lx len 0x%lx\n", + ep, offset, len); + /* len must be page aligned. len should be non zero */ + if (!len || + (ALIGN((u64)len, PAGE_SIZE) != (u64)len)) + return -EINVAL; + + /* Offset is not page aligned or offset+len wraps around */ + if ((ALIGN(offset, PAGE_SIZE) != offset) || + (offset + (off_t)len < offset)) + return -EINVAL; + + err = scif_verify_epd(ep); + if (err) + return err; + + might_sleep(); + nr_pages = len >> PAGE_SHIFT; + + req.out_window = &window; + req.offset = offset; + req.prot = 0; + req.nr_bytes = len; + req.type = SCIF_WINDOW_FULL; + req.head = &ep->rma_info.reg_list; + + spdev = scif_get_peer_dev(ep->remote_dev); + if (IS_ERR(spdev)) { + err = PTR_ERR(spdev); + return err; + } + mutex_lock(&ep->rma_info.rma_lock); + /* Does a valid window exist? 
*/ + err = scif_query_window(&req); + if (err) { + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); + goto error; + } + /* Unregister all the windows in this range */ + err = scif_rma_list_unregister(window, offset, nr_pages); + if (err) + dev_err(&ep->remote_dev->sdev->dev, + "%s %d err %d\n", __func__, __LINE__, err); +error: + mutex_unlock(&ep->rma_info.rma_lock); + scif_put_peer_dev(spdev); + return err; +} +EXPORT_SYMBOL_GPL(scif_unregister); diff --git a/kernel/drivers/misc/mic/scif/scif_rma.h b/kernel/drivers/misc/mic/scif/scif_rma.h new file mode 100644 index 000000000..fa6722279 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_rma.h @@ -0,0 +1,464 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Intel SCIF driver. 
+ * + */ +#ifndef SCIF_RMA_H +#define SCIF_RMA_H + +#include <linux/dma_remapping.h> +#include <linux/mmu_notifier.h> + +#include "../bus/scif_bus.h" + +/* If this bit is set then the mark is a remote fence mark */ +#define SCIF_REMOTE_FENCE_BIT 31 +/* Magic value used to indicate a remote fence request */ +#define SCIF_REMOTE_FENCE BIT_ULL(SCIF_REMOTE_FENCE_BIT) + +#define SCIF_MAX_UNALIGNED_BUF_SIZE (1024 * 1024ULL) +#define SCIF_KMEM_UNALIGNED_BUF_SIZE (SCIF_MAX_UNALIGNED_BUF_SIZE + \ + (L1_CACHE_BYTES << 1)) + +#define SCIF_IOVA_START_PFN (1) +#define SCIF_IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) +#define SCIF_DMA_64BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(64)) +#define SCIF_DMA_63BIT_PFN SCIF_IOVA_PFN(DMA_BIT_MASK(63)) + +/* + * struct scif_endpt_rma_info - Per Endpoint Remote Memory Access Information + * + * @reg_list: List of registration windows for self + * @remote_reg_list: List of registration windows for peer + * @iovad: Offset generator + * @rma_lock: Synchronizes access to self/remote list and also protects the + * window from being destroyed while RMAs are in progress. + * @tc_lock: Synchronizes access to temporary cached windows list + * for SCIF Registration Caching. + * @mmn_lock: Synchronizes access to the list of MMU notifiers registered + * @tw_refcount: Keeps track of number of outstanding temporary registered + * windows created by scif_vreadfrom/scif_vwriteto which have + * not been destroyed. + * @tcw_refcount: Same as tw_refcount but for temporary cached windows + * @tcw_total_pages: Same as tcw_refcount but in terms of pages pinned + * @mmn_list: MMU notifier so that we can destroy the windows when required + * @fence_refcount: Keeps track of number of outstanding remote fence + * requests which have been received by the peer. + * @dma_chan: DMA channel used for all DMA transfers for this endpoint. 
+ * @async_list_del: Detect asynchronous list entry deletion + * @vma_list: List of vmas with remote memory mappings + * @markwq: Wait queue used for scif_fence_mark/scif_fence_wait + */ +struct scif_endpt_rma_info { + struct list_head reg_list; + struct list_head remote_reg_list; + struct iova_domain iovad; + struct mutex rma_lock; + spinlock_t tc_lock; + struct mutex mmn_lock; + atomic_t tw_refcount; + atomic_t tcw_refcount; + atomic_t tcw_total_pages; + struct list_head mmn_list; + atomic_t fence_refcount; + struct dma_chan *dma_chan; + int async_list_del; + struct list_head vma_list; + wait_queue_head_t markwq; +}; + +/* + * struct scif_fence_info - used for tracking fence requests + * + * @state: State of this transfer + * @comp: Completion that fence requests wait on + * @dma_mark: Used for storing the DMA mark + */ +struct scif_fence_info { + enum scif_msg_state state; + struct completion comp; + int dma_mark; +}; + +/* + * struct scif_remote_fence_info - used for tracking remote fence requests + * + * @msg: SCIF node QP fence message + * @list: Link to list of remote fence requests + */ +struct scif_remote_fence_info { + struct scifmsg msg; + struct list_head list; +}; + +/* + * Specifies whether an RMA operation can span across partial windows, a single + * window or multiple contiguous windows. Mmaps can span across partial windows. + * Unregistration can span across complete windows. scif_get_pages() can span a + * single window. A window can also be of type self or peer. + */ +enum scif_window_type { + SCIF_WINDOW_PARTIAL, + SCIF_WINDOW_SINGLE, + SCIF_WINDOW_FULL, + SCIF_WINDOW_SELF, + SCIF_WINDOW_PEER +}; + +/* The number of physical addresses that can be stored in a PAGE. */ +#define SCIF_NR_ADDR_IN_PAGE (0x1000 >> 3) + +/* + * struct scif_rma_lookup - RMA lookup data structure for page list transfers + * + * Store an array of lookup offsets. Each offset in this array maps + * one 4K page containing 512 physical addresses i.e. 2MB.
512 such + * offsets in a 4K page will correspond to 1GB of registered address space. + * + * @lookup: Array of offsets + * @offset: DMA offset of lookup array + */ +struct scif_rma_lookup { + dma_addr_t *lookup; + dma_addr_t offset; +}; + +/* + * struct scif_pinned_pages - A set of pinned pages obtained with + * scif_pin_pages() which could be part of multiple registered + * windows across different end points. + * + * @nr_pages: Number of pages which is defined as a s64 instead of an int + * to avoid sign extension with buffers >= 2GB + * @prot: read/write protections + * @map_flags: Flags specified during the pin operation + * @ref_count: Reference count, set to one when the pages are pinned and + * bumped by one for each window registered against these pages + * @magic: A magic value + * @pages: Array of pointers to struct pages populated with get_user_pages(..) + */ +struct scif_pinned_pages { + s64 nr_pages; + int prot; + int map_flags; + atomic_t ref_count; + u64 magic; + struct page **pages; +}; + +/* + * struct scif_status - Stores DMA status update information + * + * @src_dma_addr: Source buffer DMA address + * @val: src location for value to be written to the destination + * @ep: SCIF endpoint + */ +struct scif_status { + dma_addr_t src_dma_addr; + u64 val; + struct scif_endpt *ep; +}; + +/* + * struct scif_window - Registration Window for Self and Remote + * + * @nr_pages: Number of pages which is defined as a s64 instead of an int + * to avoid sign extension with buffers >= 2GB + * @nr_contig_chunks: Number of contiguous physical chunks + * @prot: read/write protections + * @ref_count: reference count in terms of number of pages + * @magic: Cookie to detect corruption + * @offset: registered offset + * @va_for_temp: virtual address that this window represents + * @dma_mark: Used to determine if all DMAs against the window are done + * @ep: Pointer to EP. Useful for passing EP around with messages to + * avoid expensive list traversals.
+ * @list: link to list of windows for the endpoint + * @type: self or peer window + * @peer_window: Pointer to peer window. Useful for sending messages to peer + * without requiring an extra list traversal + * @unreg_state: unregistration state + * @offset_freed: True if the offset has been freed + * @temp: True for temporary windows created via scif_vreadfrom/scif_vwriteto + * @mm: memory descriptor for the task_struct which initiated the RMA + * @st: scatter gather table for DMA mappings with IOMMU enabled + * @pinned_pages: The set of pinned_pages backing this window + * @alloc_handle: Handle for sending ALLOC_REQ + * @regwq: Wait Queue for a registration (N)ACK + * @reg_state: Registration state + * @unregwq: Wait Queue for an unregistration (N)ACK + * @dma_addr_lookup: Lookup for physical addresses used for DMA + * @nr_lookup: Number of entries in lookup + * @mapped_offset: Offset used to map the window by the peer + * @dma_addr: Array of physical addresses used for Mgmt node & MIC initiated DMA + * @num_pages: Array specifying number of pages for each physical address + */ +struct scif_window { + s64 nr_pages; + int nr_contig_chunks; + int prot; + int ref_count; + u64 magic; + s64 offset; + unsigned long va_for_temp; + int dma_mark; + u64 ep; + struct list_head list; + enum scif_window_type type; + u64 peer_window; + enum scif_msg_state unreg_state; + bool offset_freed; + bool temp; + struct mm_struct *mm; + struct sg_table *st; + union { + struct { + struct scif_pinned_pages *pinned_pages; + struct scif_allocmsg alloc_handle; + wait_queue_head_t regwq; + enum scif_msg_state reg_state; + wait_queue_head_t unregwq; + }; + struct { + struct scif_rma_lookup dma_addr_lookup; + struct scif_rma_lookup num_pages_lookup; + int nr_lookup; + dma_addr_t mapped_offset; + }; + }; + dma_addr_t *dma_addr; + u64 *num_pages; +} __packed; + +/* + * struct scif_mmu_notif - SCIF mmu notifier information + * + * @ep_mmu_notifier: MMU notifier operations + * @tc_reg_list:
List of temp registration windows for self + * @mm: memory descriptor for the task_struct which initiated the RMA + * @ep: SCIF endpoint + * @list: link to list of MMU notifier information + */ +struct scif_mmu_notif { +#ifdef CONFIG_MMU_NOTIFIER + struct mmu_notifier ep_mmu_notifier; +#endif + struct list_head tc_reg_list; + struct mm_struct *mm; + struct scif_endpt *ep; + struct list_head list; +}; + +enum scif_rma_dir { + SCIF_LOCAL_TO_REMOTE, + SCIF_REMOTE_TO_LOCAL +}; + +extern struct kmem_cache *unaligned_cache; +/* Initialize RMA for this EP */ +void scif_rma_ep_init(struct scif_endpt *ep); +/* Check if epd can be uninitialized */ +int scif_rma_ep_can_uninit(struct scif_endpt *ep); +/* Obtain a new offset. Callee must grab RMA lock */ +int scif_get_window_offset(struct scif_endpt *ep, int flags, + s64 offset, int nr_pages, s64 *out_offset); +/* Free offset. Callee must grab RMA lock */ +void scif_free_window_offset(struct scif_endpt *ep, + struct scif_window *window, s64 offset); +/* Create self registration window */ +struct scif_window *scif_create_window(struct scif_endpt *ep, int nr_pages, + s64 offset, bool temp); +/* Destroy self registration window.*/ +int scif_destroy_window(struct scif_endpt *ep, struct scif_window *window); +void scif_unmap_window(struct scif_dev *remote_dev, struct scif_window *window); +/* Map pages of self window to Aperture/PCI */ +int scif_map_window(struct scif_dev *remote_dev, + struct scif_window *window); +/* Unregister a self window */ +int scif_unregister_window(struct scif_window *window); +/* Destroy remote registration window */ +void +scif_destroy_remote_window(struct scif_window *window); +/* remove valid remote memory mappings from process address space */ +void scif_zap_mmaps(int node); +/* Query if any applications have remote memory mappings */ +bool scif_rma_do_apps_have_mmaps(int node); +/* Cleanup remote registration lists for zombie endpoints */ +void scif_cleanup_rma_for_zombies(int node); +/* Reserve a DMA 
channel for a particular endpoint */ +int scif_reserve_dma_chan(struct scif_endpt *ep); +/* Setup a DMA mark for an endpoint */ +int _scif_fence_mark(scif_epd_t epd, int *mark); +int scif_prog_signal(scif_epd_t epd, off_t offset, u64 val, + enum scif_window_type type); +void scif_alloc_req(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_alloc_gnt_rej(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_free_virt(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_reg(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_unreg(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_reg_ack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_reg_nack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_unreg_ack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_unreg_nack(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_mark(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_mark_resp(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_wait(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_wait_resp(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_sig_local(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_sig_remote(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_recv_sig_resp(struct scif_dev *scifdev, struct scifmsg *msg); +void scif_mmu_notif_handler(struct work_struct *work); +void scif_rma_handle_remote_fences(void); +void scif_rma_destroy_windows(void); +void scif_rma_destroy_tcw_invalid(void); +int scif_drain_dma_intr(struct scif_hw_dev *sdev, struct dma_chan *chan); + +struct scif_window_iter { + s64 offset; + int index; +}; + +static inline void +scif_init_window_iter(struct scif_window *window, struct scif_window_iter *iter) +{ + iter->offset = window->offset; + iter->index = 0; +} + +dma_addr_t 
scif_off_to_dma_addr(struct scif_window *window, s64 off, + size_t *nr_bytes, + struct scif_window_iter *iter); +static inline +dma_addr_t __scif_off_to_dma_addr(struct scif_window *window, s64 off) +{ + return scif_off_to_dma_addr(window, off, NULL, NULL); +} + +static inline bool scif_unaligned(off_t src_offset, off_t dst_offset) +{ + src_offset = src_offset & (L1_CACHE_BYTES - 1); + dst_offset = dst_offset & (L1_CACHE_BYTES - 1); + return !(src_offset == dst_offset); +} + +/* + * scif_zalloc: + * @size: Size of the allocation request. + * + * Helper API which attempts to allocate zeroed pages via + * __get_free_pages(..) first and then falls back on + * vzalloc(..) if that fails. + */ +static inline void *scif_zalloc(size_t size) +{ + void *ret = NULL; + size_t align = ALIGN(size, PAGE_SIZE); + + if (align && get_order(align) < MAX_ORDER) + ret = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(align)); + return ret ? ret : vzalloc(align); +} + +/* + * scif_free: + * @addr: Address to be freed. + * @size: Size of the allocation. + * Helper API which frees memory allocated via scif_zalloc(). 
+ */ +static inline void scif_free(void *addr, size_t size) +{ + size_t align = ALIGN(size, PAGE_SIZE); + + if (is_vmalloc_addr(addr)) + vfree(addr); + else + free_pages((unsigned long)addr, get_order(align)); +} + +static inline void scif_get_window(struct scif_window *window, int nr_pages) +{ + window->ref_count += nr_pages; +} + +static inline void scif_put_window(struct scif_window *window, int nr_pages) +{ + window->ref_count -= nr_pages; +} + +static inline void scif_set_window_ref(struct scif_window *window, int nr_pages) +{ + window->ref_count = nr_pages; +} + +static inline void +scif_queue_for_cleanup(struct scif_window *window, struct list_head *list) +{ + spin_lock(&scif_info.rmalock); + list_add_tail(&window->list, list); + spin_unlock(&scif_info.rmalock); + schedule_work(&scif_info.misc_work); +} + +static inline void __scif_rma_destroy_tcw_helper(struct scif_window *window) +{ + list_del_init(&window->list); + scif_queue_for_cleanup(window, &scif_info.rma_tc); +} + +static inline bool scif_is_iommu_enabled(void) +{ +#ifdef CONFIG_INTEL_IOMMU + return intel_iommu_enabled; +#else + return false; +#endif +} +#endif /* SCIF_RMA_H */ diff --git a/kernel/drivers/misc/mic/scif/scif_rma_list.c b/kernel/drivers/misc/mic/scif/scif_rma_list.c new file mode 100644 index 000000000..e1ef8daed --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_rma_list.c @@ -0,0 +1,291 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#include "scif_main.h" +#include <linux/mmu_notifier.h> +#include <linux/highmem.h> + +/* + * scif_insert_tcw: + * + * Insert a temp window to the temp registration list sorted by va_for_temp. + * RMA lock must be held. + */ +void scif_insert_tcw(struct scif_window *window, struct list_head *head) +{ + struct scif_window *curr = NULL; + struct scif_window *prev = list_entry(head, struct scif_window, list); + struct list_head *item; + + INIT_LIST_HEAD(&window->list); + /* Compare with tail and if the entry is new tail add it to the end */ + if (!list_empty(head)) { + curr = list_entry(head->prev, struct scif_window, list); + if (curr->va_for_temp < window->va_for_temp) { + list_add_tail(&window->list, head); + return; + } + } + list_for_each(item, head) { + curr = list_entry(item, struct scif_window, list); + if (curr->va_for_temp > window->va_for_temp) + break; + prev = curr; + } + list_add(&window->list, &prev->list); +} + +/* + * scif_insert_window: + * + * Insert a window to the self registration list sorted by offset. + * RMA lock must be held. + */ +void scif_insert_window(struct scif_window *window, struct list_head *head) +{ + struct scif_window *curr = NULL, *prev = NULL; + struct list_head *item; + + INIT_LIST_HEAD(&window->list); + list_for_each(item, head) { + curr = list_entry(item, struct scif_window, list); + if (curr->offset > window->offset) + break; + prev = curr; + } + if (!prev) + list_add(&window->list, head); + else + list_add(&window->list, &prev->list); + scif_set_window_ref(window, window->nr_pages); +} + +/* + * scif_query_tcw: + * + * Query the temp cached registration list of ep for an overlapping window + * in case of permission mismatch, destroy the previous window. if permissions + * match and overlap is partial, destroy the window but return the new range + * RMA lock must be held. 
+ */ +int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *req) +{ + struct list_head *item, *temp, *head = req->head; + struct scif_window *window; + u64 start_va_window, start_va_req = req->va_for_temp; + u64 end_va_window, end_va_req = start_va_req + req->nr_bytes; + + if (!req->nr_bytes) + return -EINVAL; + /* + * Avoid traversing the entire list to find out that there + * is no entry that matches + */ + if (!list_empty(head)) { + window = list_last_entry(head, struct scif_window, list); + end_va_window = window->va_for_temp + + (window->nr_pages << PAGE_SHIFT); + if (start_va_req > end_va_window) + return -ENXIO; + } + list_for_each_safe(item, temp, head) { + window = list_entry(item, struct scif_window, list); + start_va_window = window->va_for_temp; + end_va_window = window->va_for_temp + + (window->nr_pages << PAGE_SHIFT); + if (start_va_req < start_va_window && + end_va_req < start_va_window) + break; + if (start_va_req >= end_va_window) + continue; + if ((window->prot & req->prot) == req->prot) { + if (start_va_req >= start_va_window && + end_va_req <= end_va_window) { + *req->out_window = window; + return 0; + } + /* expand window */ + if (start_va_req < start_va_window) { + req->nr_bytes += + start_va_window - start_va_req; + req->va_for_temp = start_va_window; + } + if (end_va_req >= end_va_window) + req->nr_bytes += end_va_window - end_va_req; + } + /* Destroy the old window to create a new one */ + __scif_rma_destroy_tcw_helper(window); + break; + } + return -ENXIO; +} + +/* + * scif_query_window: + * + * Query the registration list and check if a valid contiguous + * range of windows exist. + * RMA lock must be held. 
+ */ +int scif_query_window(struct scif_rma_req *req) +{ + struct list_head *item; + struct scif_window *window; + s64 end_offset, offset = req->offset; + u64 tmp_min, nr_bytes_left = req->nr_bytes; + + if (!req->nr_bytes) + return -EINVAL; + + list_for_each(item, req->head) { + window = list_entry(item, struct scif_window, list); + end_offset = window->offset + + (window->nr_pages << PAGE_SHIFT); + if (offset < window->offset) + /* Offset not found! */ + return -ENXIO; + if (offset >= end_offset) + continue; + /* Check read/write protections. */ + if ((window->prot & req->prot) != req->prot) + return -EPERM; + if (nr_bytes_left == req->nr_bytes) + /* Store the first window */ + *req->out_window = window; + tmp_min = min((u64)end_offset - offset, nr_bytes_left); + nr_bytes_left -= tmp_min; + offset += tmp_min; + /* + * Range requested encompasses + * multiple windows contiguously. + */ + if (!nr_bytes_left) { + /* Done for partial window */ + if (req->type == SCIF_WINDOW_PARTIAL || + req->type == SCIF_WINDOW_SINGLE) + return 0; + /* Extra logic for full windows */ + if (offset == end_offset) + /* Spanning multiple whole windows */ + return 0; + /* Not spanning multiple whole windows */ + return -ENXIO; + } + if (req->type == SCIF_WINDOW_SINGLE) + break; + } + dev_err(scif_info.mdev.this_device, + "%s %d ENXIO\n", __func__, __LINE__); + return -ENXIO; +} + +/* + * scif_rma_list_unregister: + * + * Traverse the self registration list starting from window: + * 1) Call scif_unregister_window(..) + * RMA lock must be held. 
+ */ +int scif_rma_list_unregister(struct scif_window *window, + s64 offset, int nr_pages) +{ + struct scif_endpt *ep = (struct scif_endpt *)window->ep; + struct list_head *head = &ep->rma_info.reg_list; + s64 end_offset; + int err = 0; + int loop_nr_pages; + struct scif_window *_window; + + list_for_each_entry_safe_from(window, _window, head, list) { + end_offset = window->offset + (window->nr_pages << PAGE_SHIFT); + loop_nr_pages = min((int)((end_offset - offset) >> PAGE_SHIFT), + nr_pages); + err = scif_unregister_window(window); + if (err) + return err; + nr_pages -= loop_nr_pages; + offset += (loop_nr_pages << PAGE_SHIFT); + if (!nr_pages) + break; + } + return 0; +} + +/* + * scif_unmap_all_window: + * + * Traverse all the windows in the self registration list and: + * 1) Delete any DMA mappings created + */ +void scif_unmap_all_windows(scif_epd_t epd) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct list_head *head = &ep->rma_info.reg_list; + + mutex_lock(&ep->rma_info.rma_lock); + list_for_each_safe(item, tmp, head) { + window = list_entry(item, struct scif_window, list); + scif_unmap_window(ep->remote_dev, window); + } + mutex_unlock(&ep->rma_info.rma_lock); +} + +/* + * scif_unregister_all_window: + * + * Traverse all the windows in the self registration list and: + * 1) Call scif_unregister_window(..) + * RMA lock must be held. 
+ */ +int scif_unregister_all_windows(scif_epd_t epd) +{ + struct list_head *item, *tmp; + struct scif_window *window; + struct scif_endpt *ep = (struct scif_endpt *)epd; + struct list_head *head = &ep->rma_info.reg_list; + int err = 0; + + mutex_lock(&ep->rma_info.rma_lock); +retry: + item = NULL; + tmp = NULL; + list_for_each_safe(item, tmp, head) { + window = list_entry(item, struct scif_window, list); + ep->rma_info.async_list_del = 0; + err = scif_unregister_window(window); + if (err) + dev_err(scif_info.mdev.this_device, + "%s %d err %d\n", + __func__, __LINE__, err); + /* + * Need to restart list traversal if there has been + * an asynchronous list entry deletion. + */ + if (ACCESS_ONCE(ep->rma_info.async_list_del)) + goto retry; + } + mutex_unlock(&ep->rma_info.rma_lock); + if (!list_empty(&ep->rma_info.mmn_list)) { + spin_lock(&scif_info.rmalock); + list_add_tail(&ep->mmu_list, &scif_info.mmu_notif_cleanup); + spin_unlock(&scif_info.rmalock); + schedule_work(&scif_info.mmu_notif_work); + } + return err; +} diff --git a/kernel/drivers/misc/mic/scif/scif_rma_list.h b/kernel/drivers/misc/mic/scif/scif_rma_list.h new file mode 100644 index 000000000..7d58d1d55 --- /dev/null +++ b/kernel/drivers/misc/mic/scif/scif_rma_list.h @@ -0,0 +1,57 @@ +/* + * Intel MIC Platform Software Stack (MPSS) + * + * Copyright(c) 2015 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Intel SCIF driver. 
+ * + */ +#ifndef SCIF_RMA_LIST_H +#define SCIF_RMA_LIST_H + +/* + * struct scif_rma_req - Self Registration list RMA Request query + * + * @out_window - Returns the window if found + * @offset: Starting offset + * @nr_bytes: number of bytes + * @prot: protection requested i.e. read or write or both + * @type: Specify single, partial or multiple windows + * @head: Head of list on which to search + * @va_for_temp: VA for searching temporary cached windows + */ +struct scif_rma_req { + struct scif_window **out_window; + union { + s64 offset; + unsigned long va_for_temp; + }; + size_t nr_bytes; + int prot; + enum scif_window_type type; + struct list_head *head; +}; + +/* Insert */ +void scif_insert_window(struct scif_window *window, struct list_head *head); +void scif_insert_tcw(struct scif_window *window, + struct list_head *head); +/* Query */ +int scif_query_window(struct scif_rma_req *request); +int scif_query_tcw(struct scif_endpt *ep, struct scif_rma_req *request); +/* Called from close to unregister all self windows */ +int scif_unregister_all_windows(scif_epd_t epd); +void scif_unmap_all_windows(scif_epd_t epd); +/* Traverse list and unregister */ +int scif_rma_list_unregister(struct scif_window *window, s64 offset, + int nr_pages); +#endif /* SCIF_RMA_LIST_H */ |