From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/drivers/block/DAC960.c | 7241 ++++++++++++++++++++++++++ kernel/drivers/block/DAC960.h | 4415 ++++++++++++++++ kernel/drivers/block/Kconfig | 573 ++ kernel/drivers/block/Makefile | 50 + kernel/drivers/block/amiflop.c | 1893 +++++++ kernel/drivers/block/aoe/Makefile | 6 + kernel/drivers/block/aoe/aoe.h | 240 + kernel/drivers/block/aoe/aoeblk.c | 464 ++ kernel/drivers/block/aoe/aoechr.c | 320 ++ kernel/drivers/block/aoe/aoecmd.c | 1826 +++++++ kernel/drivers/block/aoe/aoedev.c | 526 ++ kernel/drivers/block/aoe/aoemain.c | 115 + kernel/drivers/block/aoe/aoenet.c | 223 + kernel/drivers/block/ataflop.c | 2056 ++++++++ kernel/drivers/block/brd.c | 651 +++ kernel/drivers/block/cciss.c | 5392 +++++++++++++++++++ kernel/drivers/block/cciss.h | 435 ++ kernel/drivers/block/cciss_cmd.h | 269 + kernel/drivers/block/cciss_scsi.c | 1712 ++++++ kernel/drivers/block/cciss_scsi.h | 79 + kernel/drivers/block/cpqarray.c | 1820 +++++++ kernel/drivers/block/cpqarray.h | 126 + kernel/drivers/block/cryptoloop.c | 216 + kernel/drivers/block/drbd/Kconfig | 73 + kernel/drivers/block/drbd/Makefile | 8 + kernel/drivers/block/drbd/drbd_actlog.c | 1219 +++++ kernel/drivers/block/drbd/drbd_bitmap.c | 1648 ++++++ kernel/drivers/block/drbd/drbd_debugfs.c | 958 ++++ kernel/drivers/block/drbd/drbd_debugfs.h | 39 + kernel/drivers/block/drbd/drbd_int.h | 2328 +++++++++ kernel/drivers/block/drbd/drbd_interval.c | 179 + kernel/drivers/block/drbd/drbd_interval.h | 42 + kernel/drivers/block/drbd/drbd_main.c | 3844 ++++++++++++++ kernel/drivers/block/drbd/drbd_nl.c | 3674 +++++++++++++ kernel/drivers/block/drbd/drbd_nla.c | 54 + kernel/drivers/block/drbd/drbd_nla.h | 8 + kernel/drivers/block/drbd/drbd_proc.c | 368 ++ kernel/drivers/block/drbd/drbd_protocol.h | 307 ++ kernel/drivers/block/drbd/drbd_receiver.c | 5661 ++++++++++++++++++++ kernel/drivers/block/drbd/drbd_req.c | 1638 ++++++ kernel/drivers/block/drbd/drbd_req.h | 351 ++ kernel/drivers/block/drbd/drbd_state.c | 1918 +++++++ kernel/drivers/block/drbd/drbd_state.h | 166 + kernel/drivers/block/drbd/drbd_strings.c | 118 + kernel/drivers/block/drbd/drbd_strings.h | 9 + kernel/drivers/block/drbd/drbd_vli.h | 351 ++ kernel/drivers/block/drbd/drbd_worker.c | 2156 ++++++++ kernel/drivers/block/floppy.c | 4645 +++++++++++++++++ kernel/drivers/block/hd.c | 804 +++ kernel/drivers/block/ida_cmd.h | 349 ++ kernel/drivers/block/ida_ioctl.h | 87 + kernel/drivers/block/loop.c | 1869 +++++++ kernel/drivers/block/loop.h | 91 + kernel/drivers/block/mg_disk.c | 1112 ++++ kernel/drivers/block/mtip32xx/Kconfig | 9 + kernel/drivers/block/mtip32xx/Makefile | 5 + kernel/drivers/block/mtip32xx/mtip32xx.c | 4745 +++++++++++++++++ kernel/drivers/block/mtip32xx/mtip32xx.h | 511 ++ kernel/drivers/block/nbd.c | 896 ++++ kernel/drivers/block/null_blk.c | 674 +++ kernel/drivers/block/nvme-core.c | 3178 +++++++++++ kernel/drivers/block/nvme-scsi.c | 3070 +++++++++++ kernel/drivers/block/osdblk.c | 699 +++ kernel/drivers/block/paride/Kconfig | 300 ++ kernel/drivers/block/paride/Makefile | 28 + kernel/drivers/block/paride/Transition-notes | 128 + kernel/drivers/block/paride/aten.c | 162 + kernel/drivers/block/paride/bpck.c | 477 ++ kernel/drivers/block/paride/bpck6.c | 267 + kernel/drivers/block/paride/comm.c | 218 + kernel/drivers/block/paride/dstr.c | 233 + kernel/drivers/block/paride/epat.c | 340 ++ kernel/drivers/block/paride/epia.c | 316 ++ kernel/drivers/block/paride/fit2.c | 151 + kernel/drivers/block/paride/fit3.c | 211 + kernel/drivers/block/paride/friq.c | 276 + kernel/drivers/block/paride/frpw.c | 313 ++ kernel/drivers/block/paride/kbic.c | 305 ++ kernel/drivers/block/paride/ktti.c | 128 + kernel/drivers/block/paride/mkd | 30 + kernel/drivers/block/paride/on20.c | 153 + kernel/drivers/block/paride/on26.c | 319 ++ kernel/drivers/block/paride/paride.c | 434 ++ kernel/drivers/block/paride/paride.h | 170 + kernel/drivers/block/paride/pcd.c | 991 ++++ kernel/drivers/block/paride/pd.c | 958 ++++ kernel/drivers/block/paride/pf.c | 1007 ++++ kernel/drivers/block/paride/pg.c | 726 +++ kernel/drivers/block/paride/ppc6lnx.c | 726 +++ kernel/drivers/block/paride/pseudo.h | 102 + kernel/drivers/block/paride/pt.c | 1016 ++++ kernel/drivers/block/pktcdvd.c | 3029 +++++++++++ kernel/drivers/block/pmem.c | 262 + kernel/drivers/block/ps3disk.c | 589 +++ kernel/drivers/block/ps3vram.c | 878 ++++ kernel/drivers/block/rbd.c | 5752 ++++++++++++++++++++ kernel/drivers/block/rbd_types.h | 81 + kernel/drivers/block/rsxx/Makefile | 2 + kernel/drivers/block/rsxx/config.c | 211 + kernel/drivers/block/rsxx/core.c | 1144 ++++ kernel/drivers/block/rsxx/cregs.c | 804 +++ kernel/drivers/block/rsxx/dev.c | 340 ++ kernel/drivers/block/rsxx/dma.c | 1104 ++++ kernel/drivers/block/rsxx/rsxx.h | 47 + kernel/drivers/block/rsxx/rsxx_cfg.h | 72 + kernel/drivers/block/rsxx/rsxx_priv.h | 434 ++ kernel/drivers/block/skd_main.c | 5411 +++++++++++++++++++ kernel/drivers/block/skd_s1120.h | 330 ++ kernel/drivers/block/smart1,2.h | 278 + kernel/drivers/block/sunvdc.c | 1129 ++++ kernel/drivers/block/swim.c | 994 ++++ kernel/drivers/block/swim3.c | 1289 +++++ kernel/drivers/block/swim_asm.S | 247 + kernel/drivers/block/sx8.c | 1749 +++++++ kernel/drivers/block/umem.c | 1136 ++++ kernel/drivers/block/umem.h | 133 + kernel/drivers/block/virtio_blk.c | 901 ++++ kernel/drivers/block/xen-blkback/Makefile | 3 + kernel/drivers/block/xen-blkback/blkback.c | 1456 ++++++ kernel/drivers/block/xen-blkback/common.h | 492 ++ kernel/drivers/block/xen-blkback/xenbus.c | 934 ++++ kernel/drivers/block/xen-blkfront.c | 2126 ++++++++ kernel/drivers/block/xsysace.c | 1245 +++++ kernel/drivers/block/z2ram.c | 418 ++ kernel/drivers/block/zram/Kconfig | 34 + kernel/drivers/block/zram/Makefile | 5 + kernel/drivers/block/zram/zcomp.c | 353 ++ kernel/drivers/block/zram/zcomp.h | 68 + kernel/drivers/block/zram/zcomp_lz4.c | 47 + kernel/drivers/block/zram/zcomp_lz4.h | 17 + kernel/drivers/block/zram/zcomp_lzo.c | 47 + kernel/drivers/block/zram/zcomp_lzo.h | 17 + kernel/drivers/block/zram/zram_drv.c | 1322 +++++ kernel/drivers/block/zram/zram_drv.h | 125 + 134 files changed, 128049 insertions(+) create mode 100644 kernel/drivers/block/DAC960.c create mode 100644 kernel/drivers/block/DAC960.h create mode 100644 kernel/drivers/block/Kconfig create mode 100644 kernel/drivers/block/Makefile create mode 100644 kernel/drivers/block/amiflop.c create mode 100644 kernel/drivers/block/aoe/Makefile create mode 100644 kernel/drivers/block/aoe/aoe.h create mode 100644 kernel/drivers/block/aoe/aoeblk.c create mode 100644 kernel/drivers/block/aoe/aoechr.c create mode 100644 kernel/drivers/block/aoe/aoecmd.c create mode 100644 kernel/drivers/block/aoe/aoedev.c create mode 100644 kernel/drivers/block/aoe/aoemain.c create mode 100644 kernel/drivers/block/aoe/aoenet.c create mode 100644 kernel/drivers/block/ataflop.c create mode 100644 kernel/drivers/block/brd.c create mode 100644 kernel/drivers/block/cciss.c create mode 100644 kernel/drivers/block/cciss.h create mode 100644 kernel/drivers/block/cciss_cmd.h create mode 100644 kernel/drivers/block/cciss_scsi.c create mode 100644 kernel/drivers/block/cciss_scsi.h create mode 100644 kernel/drivers/block/cpqarray.c create mode 100644 kernel/drivers/block/cpqarray.h create mode 100644 kernel/drivers/block/cryptoloop.c create mode 100644 kernel/drivers/block/drbd/Kconfig create mode 100644 kernel/drivers/block/drbd/Makefile create mode 100644 kernel/drivers/block/drbd/drbd_actlog.c create mode 100644 kernel/drivers/block/drbd/drbd_bitmap.c create mode 100644 kernel/drivers/block/drbd/drbd_debugfs.c create mode 100644 kernel/drivers/block/drbd/drbd_debugfs.h create mode 100644 kernel/drivers/block/drbd/drbd_int.h create mode 100644 kernel/drivers/block/drbd/drbd_interval.c create mode 100644 kernel/drivers/block/drbd/drbd_interval.h create mode 100644 kernel/drivers/block/drbd/drbd_main.c create mode 100644 kernel/drivers/block/drbd/drbd_nl.c create mode 100644 kernel/drivers/block/drbd/drbd_nla.c create mode 100644 kernel/drivers/block/drbd/drbd_nla.h create mode 100644 kernel/drivers/block/drbd/drbd_proc.c create mode 100644 kernel/drivers/block/drbd/drbd_protocol.h create mode 100644 kernel/drivers/block/drbd/drbd_receiver.c create mode 100644 kernel/drivers/block/drbd/drbd_req.c create mode 100644 kernel/drivers/block/drbd/drbd_req.h create mode 100644 kernel/drivers/block/drbd/drbd_state.c create mode 100644 kernel/drivers/block/drbd/drbd_state.h create mode 100644 kernel/drivers/block/drbd/drbd_strings.c create mode 100644 kernel/drivers/block/drbd/drbd_strings.h create mode 100644 kernel/drivers/block/drbd/drbd_vli.h create mode 100644 kernel/drivers/block/drbd/drbd_worker.c create mode 100644 kernel/drivers/block/floppy.c create mode 100644 kernel/drivers/block/hd.c create mode 100644 kernel/drivers/block/ida_cmd.h create mode 100644 kernel/drivers/block/ida_ioctl.h create mode 100644 kernel/drivers/block/loop.c create mode 100644 kernel/drivers/block/loop.h create mode 100644 kernel/drivers/block/mg_disk.c create mode 100644 kernel/drivers/block/mtip32xx/Kconfig create mode 100644 kernel/drivers/block/mtip32xx/Makefile create mode 100644 kernel/drivers/block/mtip32xx/mtip32xx.c create mode 100644 kernel/drivers/block/mtip32xx/mtip32xx.h create mode 100644 kernel/drivers/block/nbd.c create mode 100644 kernel/drivers/block/null_blk.c create mode 100644 kernel/drivers/block/nvme-core.c create mode 100644 kernel/drivers/block/nvme-scsi.c create mode 100644 kernel/drivers/block/osdblk.c create mode 100644 kernel/drivers/block/paride/Kconfig create mode 100644 kernel/drivers/block/paride/Makefile create mode 100644 kernel/drivers/block/paride/Transition-notes create mode 100644 kernel/drivers/block/paride/aten.c create mode 100644 kernel/drivers/block/paride/bpck.c create mode 100644 kernel/drivers/block/paride/bpck6.c create mode 100644 kernel/drivers/block/paride/comm.c create mode 100644 kernel/drivers/block/paride/dstr.c create mode 100644 kernel/drivers/block/paride/epat.c create mode 100644 kernel/drivers/block/paride/epia.c create mode 100644 kernel/drivers/block/paride/fit2.c create mode 100644 kernel/drivers/block/paride/fit3.c create mode 100644 kernel/drivers/block/paride/friq.c create mode 100644 kernel/drivers/block/paride/frpw.c create mode 100644 kernel/drivers/block/paride/kbic.c create mode 100644 kernel/drivers/block/paride/ktti.c create mode 100644 kernel/drivers/block/paride/mkd create mode 100644 kernel/drivers/block/paride/on20.c create mode 100644 kernel/drivers/block/paride/on26.c create mode 100644 kernel/drivers/block/paride/paride.c create mode 100644 kernel/drivers/block/paride/paride.h create mode 100644 kernel/drivers/block/paride/pcd.c create mode 100644 kernel/drivers/block/paride/pd.c create mode 100644 kernel/drivers/block/paride/pf.c create mode 100644 kernel/drivers/block/paride/pg.c create mode 100644 kernel/drivers/block/paride/ppc6lnx.c create mode 100644 kernel/drivers/block/paride/pseudo.h create mode 100644 kernel/drivers/block/paride/pt.c create mode 100644 kernel/drivers/block/pktcdvd.c create mode 100644 kernel/drivers/block/pmem.c create mode 100644 kernel/drivers/block/ps3disk.c create mode 100644 kernel/drivers/block/ps3vram.c create mode 100644 kernel/drivers/block/rbd.c create mode 100644 kernel/drivers/block/rbd_types.h create mode 100644 kernel/drivers/block/rsxx/Makefile create mode 100644 kernel/drivers/block/rsxx/config.c create mode 100644 kernel/drivers/block/rsxx/core.c create mode 100644 kernel/drivers/block/rsxx/cregs.c create mode 100644 kernel/drivers/block/rsxx/dev.c create mode 100644 kernel/drivers/block/rsxx/dma.c create mode 100644 kernel/drivers/block/rsxx/rsxx.h create mode 100644 kernel/drivers/block/rsxx/rsxx_cfg.h create mode 100644 kernel/drivers/block/rsxx/rsxx_priv.h create mode 100644 kernel/drivers/block/skd_main.c create mode 100644 kernel/drivers/block/skd_s1120.h create mode 100644 kernel/drivers/block/smart1,2.h create mode 100644 kernel/drivers/block/sunvdc.c create mode 100644 kernel/drivers/block/swim.c create mode 100644 kernel/drivers/block/swim3.c create mode 100644 kernel/drivers/block/swim_asm.S create mode 100644 kernel/drivers/block/sx8.c create mode 100644 kernel/drivers/block/umem.c create mode 100644 kernel/drivers/block/umem.h create mode 100644 kernel/drivers/block/virtio_blk.c create mode 100644 kernel/drivers/block/xen-blkback/Makefile create mode 100644 kernel/drivers/block/xen-blkback/blkback.c create mode 100644 kernel/drivers/block/xen-blkback/common.h create mode 100644 kernel/drivers/block/xen-blkback/xenbus.c create mode 100644 kernel/drivers/block/xen-blkfront.c create mode 100644 kernel/drivers/block/xsysace.c create mode 100644 kernel/drivers/block/z2ram.c create mode 100644 kernel/drivers/block/zram/Kconfig create mode 100644 kernel/drivers/block/zram/Makefile create mode 100644 kernel/drivers/block/zram/zcomp.c create mode 100644 kernel/drivers/block/zram/zcomp.h create mode 100644 kernel/drivers/block/zram/zcomp_lz4.c create mode 100644 kernel/drivers/block/zram/zcomp_lz4.h create mode 100644 kernel/drivers/block/zram/zcomp_lzo.c create mode 100644 kernel/drivers/block/zram/zcomp_lzo.h create mode 100644 kernel/drivers/block/zram/zram_drv.c create mode 100644 kernel/drivers/block/zram/zram_drv.h (limited to 'kernel/drivers/block') diff --git a/kernel/drivers/block/DAC960.c b/kernel/drivers/block/DAC960.c new file mode 100644 index 000000000..811e11c82 --- /dev/null +++ b/kernel/drivers/block/DAC960.c @@ -0,0 +1,7241 @@ +/* + + Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers + + Copyright 1998-2001 by Leonard N. Zubkoff + Portions Copyright 2002 by Mylex (An IBM Business Unit) + + This program is free software; you may redistribute and/or modify it under + the terms of the GNU General Public License Version 2 as published by the + Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY, without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for complete details. + +*/ + + +#define DAC960_DriverVersion "2.5.49" +#define DAC960_DriverDate "21 Aug 2007" + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "DAC960.h" + +#define DAC960_GAM_MINOR 252 + + +static DEFINE_MUTEX(DAC960_mutex); +static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers]; +static int DAC960_ControllerCount; +static struct proc_dir_entry *DAC960_ProcDirectoryEntry; + +static long disk_size(DAC960_Controller_T *p, int drive_nr) +{ + if (p->FirmwareType == DAC960_V1_Controller) { + if (drive_nr >= p->LogicalDriveCount) + return 0; + return p->V1.LogicalDriveInformation[drive_nr]. + LogicalDriveSize; + } else { + DAC960_V2_LogicalDeviceInfo_T *i = + p->V2.LogicalDeviceInformation[drive_nr]; + if (i == NULL) + return 0; + return i->ConfigurableDeviceSize; + } +} + +static int DAC960_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + DAC960_Controller_T *p = disk->queue->queuedata; + int drive_nr = (long)disk->private_data; + int ret = -ENXIO; + + mutex_lock(&DAC960_mutex); + if (p->FirmwareType == DAC960_V1_Controller) { + if (p->V1.LogicalDriveInformation[drive_nr]. + LogicalDriveState == DAC960_V1_LogicalDrive_Offline) + goto out; + } else { + DAC960_V2_LogicalDeviceInfo_T *i = + p->V2.LogicalDeviceInformation[drive_nr]; + if (!i || i->LogicalDeviceState == DAC960_V2_LogicalDevice_Offline) + goto out; + } + + check_disk_change(bdev); + + if (!get_capacity(p->disks[drive_nr])) + goto out; + ret = 0; +out: + mutex_unlock(&DAC960_mutex); + return ret; +} + +static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct gendisk *disk = bdev->bd_disk; + DAC960_Controller_T *p = disk->queue->queuedata; + int drive_nr = (long)disk->private_data; + + if (p->FirmwareType == DAC960_V1_Controller) { + geo->heads = p->V1.GeometryTranslationHeads; + geo->sectors = p->V1.GeometryTranslationSectors; + geo->cylinders = p->V1.LogicalDriveInformation[drive_nr]. + LogicalDriveSize / (geo->heads * geo->sectors); + } else { + DAC960_V2_LogicalDeviceInfo_T *i = + p->V2.LogicalDeviceInformation[drive_nr]; + switch (i->DriveGeometry) { + case DAC960_V2_Geometry_128_32: + geo->heads = 128; + geo->sectors = 32; + break; + case DAC960_V2_Geometry_255_63: + geo->heads = 255; + geo->sectors = 63; + break; + default: + DAC960_Error("Illegal Logical Device Geometry %d\n", + p, i->DriveGeometry); + return -EINVAL; + } + + geo->cylinders = i->ConfigurableDeviceSize / + (geo->heads * geo->sectors); + } + + return 0; +} + +static unsigned int DAC960_check_events(struct gendisk *disk, + unsigned int clearing) +{ + DAC960_Controller_T *p = disk->queue->queuedata; + int drive_nr = (long)disk->private_data; + + if (!p->LogicalDriveInitiallyAccessible[drive_nr]) + return DISK_EVENT_MEDIA_CHANGE; + return 0; +} + +static int DAC960_revalidate_disk(struct gendisk *disk) +{ + DAC960_Controller_T *p = disk->queue->queuedata; + int unit = (long)disk->private_data; + + set_capacity(disk, disk_size(p, unit)); + return 0; +} + +static const struct block_device_operations DAC960_BlockDeviceOperations = { + .owner = THIS_MODULE, + .open = DAC960_open, + .getgeo = DAC960_getgeo, + .check_events = DAC960_check_events, + .revalidate_disk = DAC960_revalidate_disk, +}; + + +/* + DAC960_AnnounceDriver announces the Driver Version and Date, Author's Name, + Copyright Notice, and Electronic Mail Address. +*/ + +static void DAC960_AnnounceDriver(DAC960_Controller_T *Controller) +{ + DAC960_Announce("***** DAC960 RAID Driver Version " + DAC960_DriverVersion " of " + DAC960_DriverDate " *****\n", Controller); + DAC960_Announce("Copyright 1998-2001 by Leonard N. Zubkoff " + "\n", Controller); +} + + +/* + DAC960_Failure prints a standardized error message, and then returns false. +*/ + +static bool DAC960_Failure(DAC960_Controller_T *Controller, + unsigned char *ErrorMessage) +{ + DAC960_Error("While configuring DAC960 PCI RAID Controller at\n", + Controller); + if (Controller->IO_Address == 0) + DAC960_Error("PCI Bus %d Device %d Function %d I/O Address N/A " + "PCI Address 0x%X\n", Controller, + Controller->Bus, Controller->Device, + Controller->Function, Controller->PCI_Address); + else DAC960_Error("PCI Bus %d Device %d Function %d I/O Address " + "0x%X PCI Address 0x%X\n", Controller, + Controller->Bus, Controller->Device, + Controller->Function, Controller->IO_Address, + Controller->PCI_Address); + DAC960_Error("%s FAILED - DETACHING\n", Controller, ErrorMessage); + return false; +} + +/* + init_dma_loaf() and slice_dma_loaf() are helper functions for + aggregating the dma-mapped memory for a well-known collection of + data structures that are of different lengths. + + These routines don't guarantee any alignment. The caller must + include any space needed for alignment in the sizes of the structures + that are passed in. + */ + +static bool init_dma_loaf(struct pci_dev *dev, struct dma_loaf *loaf, + size_t len) +{ + void *cpu_addr; + dma_addr_t dma_handle; + + cpu_addr = pci_alloc_consistent(dev, len, &dma_handle); + if (cpu_addr == NULL) + return false; + + loaf->cpu_free = loaf->cpu_base = cpu_addr; + loaf->dma_free =loaf->dma_base = dma_handle; + loaf->length = len; + memset(cpu_addr, 0, len); + return true; +} + +static void *slice_dma_loaf(struct dma_loaf *loaf, size_t len, + dma_addr_t *dma_handle) +{ + void *cpu_end = loaf->cpu_free + len; + void *cpu_addr = loaf->cpu_free; + + BUG_ON(cpu_end > loaf->cpu_base + loaf->length); + *dma_handle = loaf->dma_free; + loaf->cpu_free = cpu_end; + loaf->dma_free += len; + return cpu_addr; +} + +static void free_dma_loaf(struct pci_dev *dev, struct dma_loaf *loaf_handle) +{ + if (loaf_handle->cpu_base != NULL) + pci_free_consistent(dev, loaf_handle->length, + loaf_handle->cpu_base, loaf_handle->dma_base); +} + + +/* + DAC960_CreateAuxiliaryStructures allocates and initializes the auxiliary + data structures for Controller. It returns true on success and false on + failure. +*/ + +static bool DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller) +{ + int CommandAllocationLength, CommandAllocationGroupSize; + int CommandsRemaining = 0, CommandIdentifier, CommandGroupByteCount; + void *AllocationPointer = NULL; + void *ScatterGatherCPU = NULL; + dma_addr_t ScatterGatherDMA; + struct pci_pool *ScatterGatherPool; + void *RequestSenseCPU = NULL; + dma_addr_t RequestSenseDMA; + struct pci_pool *RequestSensePool = NULL; + + if (Controller->FirmwareType == DAC960_V1_Controller) + { + CommandAllocationLength = offsetof(DAC960_Command_T, V1.EndMarker); + CommandAllocationGroupSize = DAC960_V1_CommandAllocationGroupSize; + ScatterGatherPool = pci_pool_create("DAC960_V1_ScatterGather", + Controller->PCIDevice, + DAC960_V1_ScatterGatherLimit * sizeof(DAC960_V1_ScatterGatherSegment_T), + sizeof(DAC960_V1_ScatterGatherSegment_T), 0); + if (ScatterGatherPool == NULL) + return DAC960_Failure(Controller, + "AUXILIARY STRUCTURE CREATION (SG)"); + Controller->ScatterGatherPool = ScatterGatherPool; + } + else + { + CommandAllocationLength = offsetof(DAC960_Command_T, V2.EndMarker); + CommandAllocationGroupSize = DAC960_V2_CommandAllocationGroupSize; + ScatterGatherPool = pci_pool_create("DAC960_V2_ScatterGather", + Controller->PCIDevice, + DAC960_V2_ScatterGatherLimit * sizeof(DAC960_V2_ScatterGatherSegment_T), + sizeof(DAC960_V2_ScatterGatherSegment_T), 0); + if (ScatterGatherPool == NULL) + return DAC960_Failure(Controller, + "AUXILIARY STRUCTURE CREATION (SG)"); + RequestSensePool = pci_pool_create("DAC960_V2_RequestSense", + Controller->PCIDevice, sizeof(DAC960_SCSI_RequestSense_T), + sizeof(int), 0); + if (RequestSensePool == NULL) { + pci_pool_destroy(ScatterGatherPool); + return DAC960_Failure(Controller, + "AUXILIARY STRUCTURE CREATION (SG)"); + } + Controller->ScatterGatherPool = ScatterGatherPool; + Controller->V2.RequestSensePool = RequestSensePool; + } + Controller->CommandAllocationGroupSize = CommandAllocationGroupSize; + Controller->FreeCommands = NULL; + for (CommandIdentifier = 1; + CommandIdentifier <= Controller->DriverQueueDepth; + CommandIdentifier++) + { + DAC960_Command_T *Command; + if (--CommandsRemaining <= 0) + { + CommandsRemaining = + Controller->DriverQueueDepth - CommandIdentifier + 1; + if (CommandsRemaining > CommandAllocationGroupSize) + CommandsRemaining = CommandAllocationGroupSize; + CommandGroupByteCount = + CommandsRemaining * CommandAllocationLength; + AllocationPointer = kzalloc(CommandGroupByteCount, GFP_ATOMIC); + if (AllocationPointer == NULL) + return DAC960_Failure(Controller, + "AUXILIARY STRUCTURE CREATION"); + } + Command = (DAC960_Command_T *) AllocationPointer; + AllocationPointer += CommandAllocationLength; + Command->CommandIdentifier = CommandIdentifier; + Command->Controller = Controller; + Command->Next = Controller->FreeCommands; + Controller->FreeCommands = Command; + Controller->Commands[CommandIdentifier-1] = Command; + ScatterGatherCPU = pci_pool_alloc(ScatterGatherPool, GFP_ATOMIC, + &ScatterGatherDMA); + if (ScatterGatherCPU == NULL) + return DAC960_Failure(Controller, "AUXILIARY STRUCTURE CREATION"); + + if (RequestSensePool != NULL) { + RequestSenseCPU = pci_pool_alloc(RequestSensePool, GFP_ATOMIC, + &RequestSenseDMA); + if (RequestSenseCPU == NULL) { + pci_pool_free(ScatterGatherPool, ScatterGatherCPU, + ScatterGatherDMA); + return DAC960_Failure(Controller, + "AUXILIARY STRUCTURE CREATION"); + } + } + if (Controller->FirmwareType == DAC960_V1_Controller) { + Command->cmd_sglist = Command->V1.ScatterList; + Command->V1.ScatterGatherList = + (DAC960_V1_ScatterGatherSegment_T *)ScatterGatherCPU; + Command->V1.ScatterGatherListDMA = ScatterGatherDMA; + sg_init_table(Command->cmd_sglist, DAC960_V1_ScatterGatherLimit); + } else { + Command->cmd_sglist = Command->V2.ScatterList; + Command->V2.ScatterGatherList = + (DAC960_V2_ScatterGatherSegment_T *)ScatterGatherCPU; + Command->V2.ScatterGatherListDMA = ScatterGatherDMA; + Command->V2.RequestSense = + (DAC960_SCSI_RequestSense_T *)RequestSenseCPU; + Command->V2.RequestSenseDMA = RequestSenseDMA; + sg_init_table(Command->cmd_sglist, DAC960_V2_ScatterGatherLimit); + } + } + return true; +} + + +/* + DAC960_DestroyAuxiliaryStructures deallocates the auxiliary data + structures for Controller. +*/ + +static void DAC960_DestroyAuxiliaryStructures(DAC960_Controller_T *Controller) +{ + int i; + struct pci_pool *ScatterGatherPool = Controller->ScatterGatherPool; + struct pci_pool *RequestSensePool = NULL; + void *ScatterGatherCPU; + dma_addr_t ScatterGatherDMA; + void *RequestSenseCPU; + dma_addr_t RequestSenseDMA; + DAC960_Command_T *CommandGroup = NULL; + + + if (Controller->FirmwareType == DAC960_V2_Controller) + RequestSensePool = Controller->V2.RequestSensePool; + + Controller->FreeCommands = NULL; + for (i = 0; i < Controller->DriverQueueDepth; i++) + { + DAC960_Command_T *Command = Controller->Commands[i]; + + if (Command == NULL) + continue; + + if (Controller->FirmwareType == DAC960_V1_Controller) { + ScatterGatherCPU = (void *)Command->V1.ScatterGatherList; + ScatterGatherDMA = Command->V1.ScatterGatherListDMA; + RequestSenseCPU = NULL; + RequestSenseDMA = (dma_addr_t)0; + } else { + ScatterGatherCPU = (void *)Command->V2.ScatterGatherList; + ScatterGatherDMA = Command->V2.ScatterGatherListDMA; + RequestSenseCPU = (void *)Command->V2.RequestSense; + RequestSenseDMA = Command->V2.RequestSenseDMA; + } + if (ScatterGatherCPU != NULL) + pci_pool_free(ScatterGatherPool, ScatterGatherCPU, ScatterGatherDMA); + if (RequestSenseCPU != NULL) + pci_pool_free(RequestSensePool, RequestSenseCPU, RequestSenseDMA); + + if ((Command->CommandIdentifier + % Controller->CommandAllocationGroupSize) == 1) { + /* + * We can't free the group of commands until all of the + * request sense and scatter gather dma structures are free. + * Remember the beginning of the group, but don't free it + * until we've reached the beginning of the next group. + */ + kfree(CommandGroup); + CommandGroup = Command; + } + Controller->Commands[i] = NULL; + } + kfree(CommandGroup); + + if (Controller->CombinedStatusBuffer != NULL) + { + kfree(Controller->CombinedStatusBuffer); + Controller->CombinedStatusBuffer = NULL; + Controller->CurrentStatusBuffer = NULL; + } + + if (ScatterGatherPool != NULL) + pci_pool_destroy(ScatterGatherPool); + if (Controller->FirmwareType == DAC960_V1_Controller) + return; + + if (RequestSensePool != NULL) + pci_pool_destroy(RequestSensePool); + + for (i = 0; i < DAC960_MaxLogicalDrives; i++) { + kfree(Controller->V2.LogicalDeviceInformation[i]); + Controller->V2.LogicalDeviceInformation[i] = NULL; + } + + for (i = 0; i < DAC960_V2_MaxPhysicalDevices; i++) + { + kfree(Controller->V2.PhysicalDeviceInformation[i]); + Controller->V2.PhysicalDeviceInformation[i] = NULL; + kfree(Controller->V2.InquiryUnitSerialNumber[i]); + Controller->V2.InquiryUnitSerialNumber[i] = NULL; + } +} + + +/* + DAC960_V1_ClearCommand clears critical fields of Command for DAC960 V1 + Firmware Controllers. +*/ + +static inline void DAC960_V1_ClearCommand(DAC960_Command_T *Command) +{ + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + memset(CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T)); + Command->V1.CommandStatus = 0; +} + + +/* + DAC960_V2_ClearCommand clears critical fields of Command for DAC960 V2 + Firmware Controllers. +*/ + +static inline void DAC960_V2_ClearCommand(DAC960_Command_T *Command) +{ + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + memset(CommandMailbox, 0, sizeof(DAC960_V2_CommandMailbox_T)); + Command->V2.CommandStatus = 0; +} + + +/* + DAC960_AllocateCommand allocates a Command structure from Controller's + free list. During driver initialization, a special initialization command + has been placed on the free list to guarantee that command allocation can + never fail. +*/ + +static inline DAC960_Command_T *DAC960_AllocateCommand(DAC960_Controller_T + *Controller) +{ + DAC960_Command_T *Command = Controller->FreeCommands; + if (Command == NULL) return NULL; + Controller->FreeCommands = Command->Next; + Command->Next = NULL; + return Command; +} + + +/* + DAC960_DeallocateCommand deallocates Command, returning it to Controller's + free list. +*/ + +static inline void DAC960_DeallocateCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + + Command->Request = NULL; + Command->Next = Controller->FreeCommands; + Controller->FreeCommands = Command; +} + + +/* + DAC960_WaitForCommand waits for a wake_up on Controller's Command Wait Queue. +*/ + +static void DAC960_WaitForCommand(DAC960_Controller_T *Controller) +{ + spin_unlock_irq(&Controller->queue_lock); + __wait_event(Controller->CommandWaitQueue, Controller->FreeCommands); + spin_lock_irq(&Controller->queue_lock); +} + +/* + DAC960_GEM_QueueCommand queues Command for DAC960 GEM Series Controllers. +*/ + +static void DAC960_GEM_QueueCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandMailbox_T *NextCommandMailbox = + Controller->V2.NextCommandMailbox; + + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_GEM_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + + if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V2.PreviousCommandMailbox2->Words[0] == 0) + DAC960_GEM_MemoryMailboxNewCommand(ControllerBaseAddress); + + Controller->V2.PreviousCommandMailbox2 = + Controller->V2.PreviousCommandMailbox1; + Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox; + + if (++NextCommandMailbox > Controller->V2.LastCommandMailbox) + NextCommandMailbox = Controller->V2.FirstCommandMailbox; + + Controller->V2.NextCommandMailbox = NextCommandMailbox; +} + +/* + DAC960_BA_QueueCommand queues Command for DAC960 BA Series Controllers. +*/ + +static void DAC960_BA_QueueCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandMailbox_T *NextCommandMailbox = + Controller->V2.NextCommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_BA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V2.PreviousCommandMailbox2->Words[0] == 0) + DAC960_BA_MemoryMailboxNewCommand(ControllerBaseAddress); + Controller->V2.PreviousCommandMailbox2 = + Controller->V2.PreviousCommandMailbox1; + Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox; + if (++NextCommandMailbox > Controller->V2.LastCommandMailbox) + NextCommandMailbox = Controller->V2.FirstCommandMailbox; + Controller->V2.NextCommandMailbox = NextCommandMailbox; +} + + +/* + DAC960_LP_QueueCommand queues Command for DAC960 LP Series Controllers. +*/ + +static void DAC960_LP_QueueCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandMailbox_T *NextCommandMailbox = + Controller->V2.NextCommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_LP_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V2.PreviousCommandMailbox2->Words[0] == 0) + DAC960_LP_MemoryMailboxNewCommand(ControllerBaseAddress); + Controller->V2.PreviousCommandMailbox2 = + Controller->V2.PreviousCommandMailbox1; + Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox; + if (++NextCommandMailbox > Controller->V2.LastCommandMailbox) + NextCommandMailbox = Controller->V2.FirstCommandMailbox; + Controller->V2.NextCommandMailbox = NextCommandMailbox; +} + + +/* + DAC960_LA_QueueCommandDualMode queues Command for DAC960 LA Series + Controllers with Dual Mode Firmware. +*/ + +static void DAC960_LA_QueueCommandDualMode(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandMailbox_T *NextCommandMailbox = + Controller->V1.NextCommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_LA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V1.PreviousCommandMailbox2->Words[0] == 0) + DAC960_LA_MemoryMailboxNewCommand(ControllerBaseAddress); + Controller->V1.PreviousCommandMailbox2 = + Controller->V1.PreviousCommandMailbox1; + Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox; + if (++NextCommandMailbox > Controller->V1.LastCommandMailbox) + NextCommandMailbox = Controller->V1.FirstCommandMailbox; + Controller->V1.NextCommandMailbox = NextCommandMailbox; +} + + +/* + DAC960_LA_QueueCommandSingleMode queues Command for DAC960 LA Series + Controllers with Single Mode Firmware. +*/ + +static void DAC960_LA_QueueCommandSingleMode(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandMailbox_T *NextCommandMailbox = + Controller->V1.NextCommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_LA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V1.PreviousCommandMailbox2->Words[0] == 0) + DAC960_LA_HardwareMailboxNewCommand(ControllerBaseAddress); + Controller->V1.PreviousCommandMailbox2 = + Controller->V1.PreviousCommandMailbox1; + Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox; + if (++NextCommandMailbox > Controller->V1.LastCommandMailbox) + NextCommandMailbox = Controller->V1.FirstCommandMailbox; + Controller->V1.NextCommandMailbox = NextCommandMailbox; +} + + +/* + DAC960_PG_QueueCommandDualMode queues Command for DAC960 PG Series + Controllers with Dual Mode Firmware. +*/ + +static void DAC960_PG_QueueCommandDualMode(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandMailbox_T *NextCommandMailbox = + Controller->V1.NextCommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_PG_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V1.PreviousCommandMailbox2->Words[0] == 0) + DAC960_PG_MemoryMailboxNewCommand(ControllerBaseAddress); + Controller->V1.PreviousCommandMailbox2 = + Controller->V1.PreviousCommandMailbox1; + Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox; + if (++NextCommandMailbox > Controller->V1.LastCommandMailbox) + NextCommandMailbox = Controller->V1.FirstCommandMailbox; + Controller->V1.NextCommandMailbox = NextCommandMailbox; +} + + +/* + DAC960_PG_QueueCommandSingleMode queues Command for DAC960 PG Series + Controllers with Single Mode Firmware. +*/ + +static void DAC960_PG_QueueCommandSingleMode(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandMailbox_T *NextCommandMailbox = + Controller->V1.NextCommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + DAC960_PG_WriteCommandMailbox(NextCommandMailbox, CommandMailbox); + if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 || + Controller->V1.PreviousCommandMailbox2->Words[0] == 0) + DAC960_PG_HardwareMailboxNewCommand(ControllerBaseAddress); + Controller->V1.PreviousCommandMailbox2 = + Controller->V1.PreviousCommandMailbox1; + Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox; + if (++NextCommandMailbox > Controller->V1.LastCommandMailbox) + NextCommandMailbox = Controller->V1.FirstCommandMailbox; + Controller->V1.NextCommandMailbox = NextCommandMailbox; +} + + +/* + DAC960_PD_QueueCommand queues Command for DAC960 PD Series Controllers. +*/ + +static void DAC960_PD_QueueCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + while (DAC960_PD_MailboxFullP(ControllerBaseAddress)) + udelay(1); + DAC960_PD_WriteCommandMailbox(ControllerBaseAddress, CommandMailbox); + DAC960_PD_NewCommand(ControllerBaseAddress); +} + + +/* + DAC960_P_QueueCommand queues Command for DAC960 P Series Controllers. +*/ + +static void DAC960_P_QueueCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier; + switch (CommandMailbox->Common.CommandOpcode) + { + case DAC960_V1_Enquiry: + CommandMailbox->Common.CommandOpcode = DAC960_V1_Enquiry_Old; + break; + case DAC960_V1_GetDeviceState: + CommandMailbox->Common.CommandOpcode = DAC960_V1_GetDeviceState_Old; + break; + case DAC960_V1_Read: + CommandMailbox->Common.CommandOpcode = DAC960_V1_Read_Old; + DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox); + break; + case DAC960_V1_Write: + CommandMailbox->Common.CommandOpcode = DAC960_V1_Write_Old; + DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox); + break; + case DAC960_V1_ReadWithScatterGather: + CommandMailbox->Common.CommandOpcode = + DAC960_V1_ReadWithScatterGather_Old; + DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox); + break; + case DAC960_V1_WriteWithScatterGather: + CommandMailbox->Common.CommandOpcode = + DAC960_V1_WriteWithScatterGather_Old; + DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox); + break; + default: + break; + } + while (DAC960_PD_MailboxFullP(ControllerBaseAddress)) + udelay(1); + DAC960_PD_WriteCommandMailbox(ControllerBaseAddress, CommandMailbox); + DAC960_PD_NewCommand(ControllerBaseAddress); +} + + +/* + DAC960_ExecuteCommand executes Command and waits for completion. +*/ + +static void DAC960_ExecuteCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DECLARE_COMPLETION_ONSTACK(Completion); + unsigned long flags; + Command->Completion = &Completion; + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_QueueCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + + if (in_interrupt()) + return; + wait_for_completion(&Completion); +} + + +/* + DAC960_V1_ExecuteType3 executes a DAC960 V1 Firmware Controller Type 3 + Command and waits for completion. It returns true on success and false + on failure. +*/ + +static bool DAC960_V1_ExecuteType3(DAC960_Controller_T *Controller, + DAC960_V1_CommandOpcode_T CommandOpcode, + dma_addr_t DataDMA) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandStatus_T CommandStatus; + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->Type3.CommandOpcode = CommandOpcode; + CommandMailbox->Type3.BusAddress = DataDMA; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V1.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V1_NormalCompletion); +} + + +/* + DAC960_V1_ExecuteTypeB executes a DAC960 V1 Firmware Controller Type 3B + Command and waits for completion. It returns true on success and false + on failure. +*/ + +static bool DAC960_V1_ExecuteType3B(DAC960_Controller_T *Controller, + DAC960_V1_CommandOpcode_T CommandOpcode, + unsigned char CommandOpcode2, + dma_addr_t DataDMA) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandStatus_T CommandStatus; + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->Type3B.CommandOpcode = CommandOpcode; + CommandMailbox->Type3B.CommandOpcode2 = CommandOpcode2; + CommandMailbox->Type3B.BusAddress = DataDMA; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V1.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V1_NormalCompletion); +} + + +/* + DAC960_V1_ExecuteType3D executes a DAC960 V1 Firmware Controller Type 3D + Command and waits for completion. It returns true on success and false + on failure. +*/ + +static bool DAC960_V1_ExecuteType3D(DAC960_Controller_T *Controller, + DAC960_V1_CommandOpcode_T CommandOpcode, + unsigned char Channel, + unsigned char TargetID, + dma_addr_t DataDMA) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandStatus_T CommandStatus; + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->Type3D.CommandOpcode = CommandOpcode; + CommandMailbox->Type3D.Channel = Channel; + CommandMailbox->Type3D.TargetID = TargetID; + CommandMailbox->Type3D.BusAddress = DataDMA; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V1.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V1_NormalCompletion); +} + + +/* + DAC960_V2_GeneralInfo executes a DAC960 V2 Firmware General Information + Reading IOCTL Command and waits for completion. It returns true on success + and false on failure. + + Return data in The controller's HealthStatusBuffer, which is dma-able memory +*/ + +static bool DAC960_V2_GeneralInfo(DAC960_Controller_T *Controller) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->Common.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->Common.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->Common.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->Common.DataTransferSize = sizeof(DAC960_V2_HealthStatusBuffer_T); + CommandMailbox->Common.IOCTL_Opcode = DAC960_V2_GetHealthStatus; + CommandMailbox->Common.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.HealthStatusBufferDMA; + CommandMailbox->Common.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->Common.DataTransferSize; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V2_ControllerInfo executes a DAC960 V2 Firmware Controller + Information Reading IOCTL Command and waits for completion. It returns + true on success and false on failure. + + Data is returned in the controller's V2.NewControllerInformation dma-able + memory buffer. +*/ + +static bool DAC960_V2_NewControllerInfo(DAC960_Controller_T *Controller) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->ControllerInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->ControllerInfo.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->ControllerInfo.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->ControllerInfo.DataTransferSize = sizeof(DAC960_V2_ControllerInfo_T); + CommandMailbox->ControllerInfo.ControllerNumber = 0; + CommandMailbox->ControllerInfo.IOCTL_Opcode = DAC960_V2_GetControllerInfo; + CommandMailbox->ControllerInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewControllerInformationDMA; + CommandMailbox->ControllerInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->ControllerInfo.DataTransferSize; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V2_LogicalDeviceInfo executes a DAC960 V2 Firmware Controller Logical + Device Information Reading IOCTL Command and waits for completion. It + returns true on success and false on failure. + + Data is returned in the controller's V2.NewLogicalDeviceInformation +*/ + +static bool DAC960_V2_NewLogicalDeviceInfo(DAC960_Controller_T *Controller, + unsigned short LogicalDeviceNumber) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->LogicalDeviceInfo.CommandOpcode = + DAC960_V2_IOCTL; + CommandMailbox->LogicalDeviceInfo.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->LogicalDeviceInfo.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->LogicalDeviceInfo.DataTransferSize = + sizeof(DAC960_V2_LogicalDeviceInfo_T); + CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber = + LogicalDeviceNumber; + CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode = DAC960_V2_GetLogicalDeviceInfoValid; + CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewLogicalDeviceInformationDMA; + CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->LogicalDeviceInfo.DataTransferSize; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V2_PhysicalDeviceInfo executes a DAC960 V2 Firmware Controller "Read + Physical Device Information" IOCTL Command and waits for completion. It + returns true on success and false on failure. + + The Channel, TargetID, LogicalUnit arguments should be 0 the first time + this function is called for a given controller. This will return data + for the "first" device on that controller. The returned data includes a + Channel, TargetID, LogicalUnit that can be passed in to this routine to + get data for the NEXT device on that controller. + + Data is stored in the controller's V2.NewPhysicalDeviceInfo dma-able + memory buffer. + +*/ + +static bool DAC960_V2_NewPhysicalDeviceInfo(DAC960_Controller_T *Controller, + unsigned char Channel, + unsigned char TargetID, + unsigned char LogicalUnit) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->PhysicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->PhysicalDeviceInfo.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->PhysicalDeviceInfo.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->PhysicalDeviceInfo.DataTransferSize = + sizeof(DAC960_V2_PhysicalDeviceInfo_T); + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.LogicalUnit = LogicalUnit; + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.TargetID = TargetID; + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.Channel = Channel; + CommandMailbox->PhysicalDeviceInfo.IOCTL_Opcode = + DAC960_V2_GetPhysicalDeviceInfoValid; + CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewPhysicalDeviceInformationDMA; + CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->PhysicalDeviceInfo.DataTransferSize; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +static void DAC960_V2_ConstructNewUnitSerialNumber( + DAC960_Controller_T *Controller, + DAC960_V2_CommandMailbox_T *CommandMailbox, int Channel, int TargetID, + int LogicalUnit) +{ + CommandMailbox->SCSI_10.CommandOpcode = DAC960_V2_SCSI_10_Passthru; + CommandMailbox->SCSI_10.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->SCSI_10.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->SCSI_10.DataTransferSize = + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + CommandMailbox->SCSI_10.PhysicalDevice.LogicalUnit = LogicalUnit; + CommandMailbox->SCSI_10.PhysicalDevice.TargetID = TargetID; + CommandMailbox->SCSI_10.PhysicalDevice.Channel = Channel; + CommandMailbox->SCSI_10.CDBLength = 6; + CommandMailbox->SCSI_10.SCSI_CDB[0] = 0x12; /* INQUIRY */ + CommandMailbox->SCSI_10.SCSI_CDB[1] = 1; /* EVPD = 1 */ + CommandMailbox->SCSI_10.SCSI_CDB[2] = 0x80; /* Page Code */ + CommandMailbox->SCSI_10.SCSI_CDB[3] = 0; /* Reserved */ + CommandMailbox->SCSI_10.SCSI_CDB[4] = + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + CommandMailbox->SCSI_10.SCSI_CDB[5] = 0; /* Control */ + CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewInquiryUnitSerialNumberDMA; + CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->SCSI_10.DataTransferSize; +} + + +/* + DAC960_V2_NewUnitSerialNumber executes an SCSI pass-through + Inquiry command to a SCSI device identified by Channel number, + Target id, Logical Unit Number. This function Waits for completion + of the command. + + The return data includes Unit Serial Number information for the + specified device. + + Data is stored in the controller's V2.NewPhysicalDeviceInfo dma-able + memory buffer. +*/ + +static bool DAC960_V2_NewInquiryUnitSerialNumber(DAC960_Controller_T *Controller, + int Channel, int TargetID, int LogicalUnit) +{ + DAC960_Command_T *Command; + DAC960_V2_CommandMailbox_T *CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + + Command = DAC960_AllocateCommand(Controller); + CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + + DAC960_V2_ConstructNewUnitSerialNumber(Controller, CommandMailbox, + Channel, TargetID, LogicalUnit); + + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V2_DeviceOperation executes a DAC960 V2 Firmware Controller Device + Operation IOCTL Command and waits for completion. It returns true on + success and false on failure. +*/ + +static bool DAC960_V2_DeviceOperation(DAC960_Controller_T *Controller, + DAC960_V2_IOCTL_Opcode_T IOCTL_Opcode, + DAC960_V2_OperationDevice_T + OperationDevice) +{ + DAC960_Command_T *Command = DAC960_AllocateCommand(Controller); + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox->DeviceOperation.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->DeviceOperation.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->DeviceOperation.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->DeviceOperation.IOCTL_Opcode = IOCTL_Opcode; + CommandMailbox->DeviceOperation.OperationDevice = OperationDevice; + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + DAC960_DeallocateCommand(Command); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V1_EnableMemoryMailboxInterface enables the Memory Mailbox Interface + for DAC960 V1 Firmware Controllers. + + PD and P controller types have no memory mailbox, but still need the + other dma mapped memory. +*/ + +static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T + *Controller) +{ + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_HardwareType_T hw_type = Controller->HardwareType; + struct pci_dev *PCI_Device = Controller->PCIDevice; + struct dma_loaf *DmaPages = &Controller->DmaPages; + size_t DmaPagesSize; + size_t CommandMailboxesSize; + size_t StatusMailboxesSize; + + DAC960_V1_CommandMailbox_T *CommandMailboxesMemory; + dma_addr_t CommandMailboxesMemoryDMA; + + DAC960_V1_StatusMailbox_T *StatusMailboxesMemory; + dma_addr_t StatusMailboxesMemoryDMA; + + DAC960_V1_CommandMailbox_T CommandMailbox; + DAC960_V1_CommandStatus_T CommandStatus; + int TimeoutCounter; + int i; + + memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T)); + + if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) + return DAC960_Failure(Controller, "DMA mask out of range"); + Controller->BounceBufferLimit = DMA_BIT_MASK(32); + + if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) { + CommandMailboxesSize = 0; + StatusMailboxesSize = 0; + } else { + CommandMailboxesSize = DAC960_V1_CommandMailboxCount * sizeof(DAC960_V1_CommandMailbox_T); + StatusMailboxesSize = DAC960_V1_StatusMailboxCount * sizeof(DAC960_V1_StatusMailbox_T); + } + DmaPagesSize = CommandMailboxesSize + StatusMailboxesSize + + sizeof(DAC960_V1_DCDB_T) + sizeof(DAC960_V1_Enquiry_T) + + sizeof(DAC960_V1_ErrorTable_T) + sizeof(DAC960_V1_EventLogEntry_T) + + sizeof(DAC960_V1_RebuildProgress_T) + + sizeof(DAC960_V1_LogicalDriveInformationArray_T) + + sizeof(DAC960_V1_BackgroundInitializationStatus_T) + + sizeof(DAC960_V1_DeviceState_T) + sizeof(DAC960_SCSI_Inquiry_T) + + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + + if (!init_dma_loaf(PCI_Device, DmaPages, DmaPagesSize)) + return false; + + + if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) + goto skip_mailboxes; + + CommandMailboxesMemory = slice_dma_loaf(DmaPages, + CommandMailboxesSize, &CommandMailboxesMemoryDMA); + + /* These are the base addresses for the command memory mailbox array */ + Controller->V1.FirstCommandMailbox = CommandMailboxesMemory; + Controller->V1.FirstCommandMailboxDMA = CommandMailboxesMemoryDMA; + + CommandMailboxesMemory += DAC960_V1_CommandMailboxCount - 1; + Controller->V1.LastCommandMailbox = CommandMailboxesMemory; + Controller->V1.NextCommandMailbox = Controller->V1.FirstCommandMailbox; + Controller->V1.PreviousCommandMailbox1 = Controller->V1.LastCommandMailbox; + Controller->V1.PreviousCommandMailbox2 = + Controller->V1.LastCommandMailbox - 1; + + /* These are the base addresses for the status memory mailbox array */ + StatusMailboxesMemory = slice_dma_loaf(DmaPages, + StatusMailboxesSize, &StatusMailboxesMemoryDMA); + + Controller->V1.FirstStatusMailbox = StatusMailboxesMemory; + Controller->V1.FirstStatusMailboxDMA = StatusMailboxesMemoryDMA; + StatusMailboxesMemory += DAC960_V1_StatusMailboxCount - 1; + Controller->V1.LastStatusMailbox = StatusMailboxesMemory; + Controller->V1.NextStatusMailbox = Controller->V1.FirstStatusMailbox; + +skip_mailboxes: + Controller->V1.MonitoringDCDB = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_DCDB_T), + &Controller->V1.MonitoringDCDB_DMA); + + Controller->V1.NewEnquiry = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_Enquiry_T), + &Controller->V1.NewEnquiryDMA); + + Controller->V1.NewErrorTable = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_ErrorTable_T), + &Controller->V1.NewErrorTableDMA); + + Controller->V1.EventLogEntry = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_EventLogEntry_T), + &Controller->V1.EventLogEntryDMA); + + Controller->V1.RebuildProgress = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_RebuildProgress_T), + &Controller->V1.RebuildProgressDMA); + + Controller->V1.NewLogicalDriveInformation = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_LogicalDriveInformationArray_T), + &Controller->V1.NewLogicalDriveInformationDMA); + + Controller->V1.BackgroundInitializationStatus = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_BackgroundInitializationStatus_T), + &Controller->V1.BackgroundInitializationStatusDMA); + + Controller->V1.NewDeviceState = slice_dma_loaf(DmaPages, + sizeof(DAC960_V1_DeviceState_T), + &Controller->V1.NewDeviceStateDMA); + + Controller->V1.NewInquiryStandardData = slice_dma_loaf(DmaPages, + sizeof(DAC960_SCSI_Inquiry_T), + &Controller->V1.NewInquiryStandardDataDMA); + + Controller->V1.NewInquiryUnitSerialNumber = slice_dma_loaf(DmaPages, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T), + &Controller->V1.NewInquiryUnitSerialNumberDMA); + + if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) + return true; + + /* Enable the Memory Mailbox Interface. */ + Controller->V1.DualModeMemoryMailboxInterface = true; + CommandMailbox.TypeX.CommandOpcode = 0x2B; + CommandMailbox.TypeX.CommandIdentifier = 0; + CommandMailbox.TypeX.CommandOpcode2 = 0x14; + CommandMailbox.TypeX.CommandMailboxesBusAddress = + Controller->V1.FirstCommandMailboxDMA; + CommandMailbox.TypeX.StatusMailboxesBusAddress = + Controller->V1.FirstStatusMailboxDMA; +#define TIMEOUT_COUNT 1000000 + + for (i = 0; i < 2; i++) + switch (Controller->HardwareType) + { + case DAC960_LA_Controller: + TimeoutCounter = TIMEOUT_COUNT; + while (--TimeoutCounter >= 0) + { + if (!DAC960_LA_HardwareMailboxFullP(ControllerBaseAddress)) + break; + udelay(10); + } + if (TimeoutCounter < 0) return false; + DAC960_LA_WriteHardwareMailbox(ControllerBaseAddress, &CommandMailbox); + DAC960_LA_HardwareMailboxNewCommand(ControllerBaseAddress); + TimeoutCounter = TIMEOUT_COUNT; + while (--TimeoutCounter >= 0) + { + if (DAC960_LA_HardwareMailboxStatusAvailableP( + ControllerBaseAddress)) + break; + udelay(10); + } + if (TimeoutCounter < 0) return false; + CommandStatus = DAC960_LA_ReadStatusRegister(ControllerBaseAddress); + DAC960_LA_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress); + DAC960_LA_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress); + if (CommandStatus == DAC960_V1_NormalCompletion) return true; + Controller->V1.DualModeMemoryMailboxInterface = false; + CommandMailbox.TypeX.CommandOpcode2 = 0x10; + break; + case DAC960_PG_Controller: + TimeoutCounter = TIMEOUT_COUNT; + while (--TimeoutCounter >= 0) + { + if (!DAC960_PG_HardwareMailboxFullP(ControllerBaseAddress)) + break; + udelay(10); + } + if (TimeoutCounter < 0) return false; + DAC960_PG_WriteHardwareMailbox(ControllerBaseAddress, &CommandMailbox); + DAC960_PG_HardwareMailboxNewCommand(ControllerBaseAddress); + + TimeoutCounter = TIMEOUT_COUNT; + while (--TimeoutCounter >= 0) + { + if (DAC960_PG_HardwareMailboxStatusAvailableP( + ControllerBaseAddress)) + break; + udelay(10); + } + if (TimeoutCounter < 0) return false; + CommandStatus = DAC960_PG_ReadStatusRegister(ControllerBaseAddress); + DAC960_PG_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress); + DAC960_PG_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress); + if (CommandStatus == DAC960_V1_NormalCompletion) return true; + Controller->V1.DualModeMemoryMailboxInterface = false; + CommandMailbox.TypeX.CommandOpcode2 = 0x10; + break; + default: + DAC960_Failure(Controller, "Unknown Controller Type\n"); + break; + } + return false; +} + + +/* + DAC960_V2_EnableMemoryMailboxInterface enables the Memory Mailbox Interface + for DAC960 V2 Firmware Controllers. + + Aggregate the space needed for the controller's memory mailbox and + the other data structures that will be targets of dma transfers with + the controller. Allocate a dma-mapped region of memory to hold these + structures. Then, save CPU pointers and dma_addr_t values to reference + the structures that are contained in that region. +*/ + +static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T + *Controller) +{ + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + struct pci_dev *PCI_Device = Controller->PCIDevice; + struct dma_loaf *DmaPages = &Controller->DmaPages; + size_t DmaPagesSize; + size_t CommandMailboxesSize; + size_t StatusMailboxesSize; + + DAC960_V2_CommandMailbox_T *CommandMailboxesMemory; + dma_addr_t CommandMailboxesMemoryDMA; + + DAC960_V2_StatusMailbox_T *StatusMailboxesMemory; + dma_addr_t StatusMailboxesMemoryDMA; + + DAC960_V2_CommandMailbox_T *CommandMailbox; + dma_addr_t CommandMailboxDMA; + DAC960_V2_CommandStatus_T CommandStatus; + + if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64))) + Controller->BounceBufferLimit = DMA_BIT_MASK(64); + else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32))) + Controller->BounceBufferLimit = DMA_BIT_MASK(32); + else + return DAC960_Failure(Controller, "DMA mask out of range"); + + /* This is a temporary dma mapping, used only in the scope of this function */ + CommandMailbox = pci_alloc_consistent(PCI_Device, + sizeof(DAC960_V2_CommandMailbox_T), &CommandMailboxDMA); + if (CommandMailbox == NULL) + return false; + + CommandMailboxesSize = DAC960_V2_CommandMailboxCount * sizeof(DAC960_V2_CommandMailbox_T); + StatusMailboxesSize = DAC960_V2_StatusMailboxCount * sizeof(DAC960_V2_StatusMailbox_T); + DmaPagesSize = + CommandMailboxesSize + StatusMailboxesSize + + sizeof(DAC960_V2_HealthStatusBuffer_T) + + sizeof(DAC960_V2_ControllerInfo_T) + + sizeof(DAC960_V2_LogicalDeviceInfo_T) + + sizeof(DAC960_V2_PhysicalDeviceInfo_T) + + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T) + + sizeof(DAC960_V2_Event_T) + + sizeof(DAC960_V2_PhysicalToLogicalDevice_T); + + if (!init_dma_loaf(PCI_Device, DmaPages, DmaPagesSize)) { + pci_free_consistent(PCI_Device, sizeof(DAC960_V2_CommandMailbox_T), + CommandMailbox, CommandMailboxDMA); + return false; + } + + CommandMailboxesMemory = slice_dma_loaf(DmaPages, + CommandMailboxesSize, &CommandMailboxesMemoryDMA); + + /* These are the base addresses for the command memory mailbox array */ + Controller->V2.FirstCommandMailbox = CommandMailboxesMemory; + Controller->V2.FirstCommandMailboxDMA = CommandMailboxesMemoryDMA; + + CommandMailboxesMemory += DAC960_V2_CommandMailboxCount - 1; + Controller->V2.LastCommandMailbox = CommandMailboxesMemory; + Controller->V2.NextCommandMailbox = Controller->V2.FirstCommandMailbox; + Controller->V2.PreviousCommandMailbox1 = Controller->V2.LastCommandMailbox; + Controller->V2.PreviousCommandMailbox2 = + Controller->V2.LastCommandMailbox - 1; + + /* These are the base addresses for the status memory mailbox array */ + StatusMailboxesMemory = slice_dma_loaf(DmaPages, + StatusMailboxesSize, &StatusMailboxesMemoryDMA); + + Controller->V2.FirstStatusMailbox = StatusMailboxesMemory; + Controller->V2.FirstStatusMailboxDMA = StatusMailboxesMemoryDMA; + StatusMailboxesMemory += DAC960_V2_StatusMailboxCount - 1; + Controller->V2.LastStatusMailbox = StatusMailboxesMemory; + Controller->V2.NextStatusMailbox = Controller->V2.FirstStatusMailbox; + + Controller->V2.HealthStatusBuffer = slice_dma_loaf(DmaPages, + sizeof(DAC960_V2_HealthStatusBuffer_T), + &Controller->V2.HealthStatusBufferDMA); + + Controller->V2.NewControllerInformation = slice_dma_loaf(DmaPages, + sizeof(DAC960_V2_ControllerInfo_T), + &Controller->V2.NewControllerInformationDMA); + + Controller->V2.NewLogicalDeviceInformation = slice_dma_loaf(DmaPages, + sizeof(DAC960_V2_LogicalDeviceInfo_T), + &Controller->V2.NewLogicalDeviceInformationDMA); + + Controller->V2.NewPhysicalDeviceInformation = slice_dma_loaf(DmaPages, + sizeof(DAC960_V2_PhysicalDeviceInfo_T), + &Controller->V2.NewPhysicalDeviceInformationDMA); + + Controller->V2.NewInquiryUnitSerialNumber = slice_dma_loaf(DmaPages, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T), + &Controller->V2.NewInquiryUnitSerialNumberDMA); + + Controller->V2.Event = slice_dma_loaf(DmaPages, + sizeof(DAC960_V2_Event_T), + &Controller->V2.EventDMA); + + Controller->V2.PhysicalToLogicalDevice = slice_dma_loaf(DmaPages, + sizeof(DAC960_V2_PhysicalToLogicalDevice_T), + &Controller->V2.PhysicalToLogicalDeviceDMA); + + /* + Enable the Memory Mailbox Interface. + + I don't know why we can't just use one of the memory mailboxes + we just allocated to do this, instead of using this temporary one. + Try this change later. + */ + memset(CommandMailbox, 0, sizeof(DAC960_V2_CommandMailbox_T)); + CommandMailbox->SetMemoryMailbox.CommandIdentifier = 1; + CommandMailbox->SetMemoryMailbox.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->SetMemoryMailbox.CommandControlBits.NoAutoRequestSense = true; + CommandMailbox->SetMemoryMailbox.FirstCommandMailboxSizeKB = + (DAC960_V2_CommandMailboxCount * sizeof(DAC960_V2_CommandMailbox_T)) >> 10; + CommandMailbox->SetMemoryMailbox.FirstStatusMailboxSizeKB = + (DAC960_V2_StatusMailboxCount * sizeof(DAC960_V2_StatusMailbox_T)) >> 10; + CommandMailbox->SetMemoryMailbox.SecondCommandMailboxSizeKB = 0; + CommandMailbox->SetMemoryMailbox.SecondStatusMailboxSizeKB = 0; + CommandMailbox->SetMemoryMailbox.RequestSenseSize = 0; + CommandMailbox->SetMemoryMailbox.IOCTL_Opcode = DAC960_V2_SetMemoryMailbox; + CommandMailbox->SetMemoryMailbox.HealthStatusBufferSizeKB = 1; + CommandMailbox->SetMemoryMailbox.HealthStatusBufferBusAddress = + Controller->V2.HealthStatusBufferDMA; + CommandMailbox->SetMemoryMailbox.FirstCommandMailboxBusAddress = + Controller->V2.FirstCommandMailboxDMA; + CommandMailbox->SetMemoryMailbox.FirstStatusMailboxBusAddress = + Controller->V2.FirstStatusMailboxDMA; + switch (Controller->HardwareType) + { + case DAC960_GEM_Controller: + while (DAC960_GEM_HardwareMailboxFullP(ControllerBaseAddress)) + udelay(1); + DAC960_GEM_WriteHardwareMailbox(ControllerBaseAddress, CommandMailboxDMA); + DAC960_GEM_HardwareMailboxNewCommand(ControllerBaseAddress); + while (!DAC960_GEM_HardwareMailboxStatusAvailableP(ControllerBaseAddress)) + udelay(1); + CommandStatus = DAC960_GEM_ReadCommandStatus(ControllerBaseAddress); + DAC960_GEM_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress); + DAC960_GEM_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress); + break; + case DAC960_BA_Controller: + while (DAC960_BA_HardwareMailboxFullP(ControllerBaseAddress)) + udelay(1); + DAC960_BA_WriteHardwareMailbox(ControllerBaseAddress, CommandMailboxDMA); + DAC960_BA_HardwareMailboxNewCommand(ControllerBaseAddress); + while (!DAC960_BA_HardwareMailboxStatusAvailableP(ControllerBaseAddress)) + udelay(1); + CommandStatus = DAC960_BA_ReadCommandStatus(ControllerBaseAddress); + DAC960_BA_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress); + DAC960_BA_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress); + break; + case DAC960_LP_Controller: + while (DAC960_LP_HardwareMailboxFullP(ControllerBaseAddress)) + udelay(1); + DAC960_LP_WriteHardwareMailbox(ControllerBaseAddress, CommandMailboxDMA); + DAC960_LP_HardwareMailboxNewCommand(ControllerBaseAddress); + while (!DAC960_LP_HardwareMailboxStatusAvailableP(ControllerBaseAddress)) + udelay(1); + CommandStatus = DAC960_LP_ReadCommandStatus(ControllerBaseAddress); + DAC960_LP_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress); + DAC960_LP_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress); + break; + default: + DAC960_Failure(Controller, "Unknown Controller Type\n"); + CommandStatus = DAC960_V2_AbormalCompletion; + break; + } + pci_free_consistent(PCI_Device, sizeof(DAC960_V2_CommandMailbox_T), + CommandMailbox, CommandMailboxDMA); + return (CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V1_ReadControllerConfiguration reads the Configuration Information + from DAC960 V1 Firmware Controllers and initializes the Controller structure. +*/ + +static bool DAC960_V1_ReadControllerConfiguration(DAC960_Controller_T + *Controller) +{ + DAC960_V1_Enquiry2_T *Enquiry2; + dma_addr_t Enquiry2DMA; + DAC960_V1_Config2_T *Config2; + dma_addr_t Config2DMA; + int LogicalDriveNumber, Channel, TargetID; + struct dma_loaf local_dma; + + if (!init_dma_loaf(Controller->PCIDevice, &local_dma, + sizeof(DAC960_V1_Enquiry2_T) + sizeof(DAC960_V1_Config2_T))) + return DAC960_Failure(Controller, "LOGICAL DEVICE ALLOCATION"); + + Enquiry2 = slice_dma_loaf(&local_dma, sizeof(DAC960_V1_Enquiry2_T), &Enquiry2DMA); + Config2 = slice_dma_loaf(&local_dma, sizeof(DAC960_V1_Config2_T), &Config2DMA); + + if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_Enquiry, + Controller->V1.NewEnquiryDMA)) { + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "ENQUIRY"); + } + memcpy(&Controller->V1.Enquiry, Controller->V1.NewEnquiry, + sizeof(DAC960_V1_Enquiry_T)); + + if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_Enquiry2, Enquiry2DMA)) { + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "ENQUIRY2"); + } + + if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_ReadConfig2, Config2DMA)) { + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "READ CONFIG2"); + } + + if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_GetLogicalDriveInformation, + Controller->V1.NewLogicalDriveInformationDMA)) { + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "GET LOGICAL DRIVE INFORMATION"); + } + memcpy(&Controller->V1.LogicalDriveInformation, + Controller->V1.NewLogicalDriveInformation, + sizeof(DAC960_V1_LogicalDriveInformationArray_T)); + + for (Channel = 0; Channel < Enquiry2->ActualChannels; Channel++) + for (TargetID = 0; TargetID < Enquiry2->MaxTargets; TargetID++) { + if (!DAC960_V1_ExecuteType3D(Controller, DAC960_V1_GetDeviceState, + Channel, TargetID, + Controller->V1.NewDeviceStateDMA)) { + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "GET DEVICE STATE"); + } + memcpy(&Controller->V1.DeviceState[Channel][TargetID], + Controller->V1.NewDeviceState, sizeof(DAC960_V1_DeviceState_T)); + } + /* + Initialize the Controller Model Name and Full Model Name fields. + */ + switch (Enquiry2->HardwareID.SubModel) + { + case DAC960_V1_P_PD_PU: + if (Enquiry2->SCSICapability.BusSpeed == DAC960_V1_Ultra) + strcpy(Controller->ModelName, "DAC960PU"); + else strcpy(Controller->ModelName, "DAC960PD"); + break; + case DAC960_V1_PL: + strcpy(Controller->ModelName, "DAC960PL"); + break; + case DAC960_V1_PG: + strcpy(Controller->ModelName, "DAC960PG"); + break; + case DAC960_V1_PJ: + strcpy(Controller->ModelName, "DAC960PJ"); + break; + case DAC960_V1_PR: + strcpy(Controller->ModelName, "DAC960PR"); + break; + case DAC960_V1_PT: + strcpy(Controller->ModelName, "DAC960PT"); + break; + case DAC960_V1_PTL0: + strcpy(Controller->ModelName, "DAC960PTL0"); + break; + case DAC960_V1_PRL: + strcpy(Controller->ModelName, "DAC960PRL"); + break; + case DAC960_V1_PTL1: + strcpy(Controller->ModelName, "DAC960PTL1"); + break; + case DAC960_V1_1164P: + strcpy(Controller->ModelName, "DAC1164P"); + break; + default: + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "MODEL VERIFICATION"); + } + strcpy(Controller->FullModelName, "Mylex "); + strcat(Controller->FullModelName, Controller->ModelName); + /* + Initialize the Controller Firmware Version field and verify that it + is a supported firmware version. The supported firmware versions are: + + DAC1164P 5.06 and above + DAC960PTL/PRL/PJ/PG 4.06 and above + DAC960PU/PD/PL 3.51 and above + DAC960PU/PD/PL/P 2.73 and above + */ +#if defined(CONFIG_ALPHA) + /* + DEC Alpha machines were often equipped with DAC960 cards that were + OEMed from Mylex, and had their own custom firmware. Version 2.70, + the last custom FW revision to be released by DEC for these older + controllers, appears to work quite well with this driver. + + Cards tested successfully were several versions each of the PD and + PU, called by DEC the KZPSC and KZPAC, respectively, and having + the Manufacturer Numbers (from Mylex), usually on a sticker on the + back of the board, of: + + KZPSC: D040347 (1-channel) or D040348 (2-channel) or D040349 (3-channel) + KZPAC: D040395 (1-channel) or D040396 (2-channel) or D040397 (3-channel) + */ +# define FIRMWARE_27X "2.70" +#else +# define FIRMWARE_27X "2.73" +#endif + + if (Enquiry2->FirmwareID.MajorVersion == 0) + { + Enquiry2->FirmwareID.MajorVersion = + Controller->V1.Enquiry.MajorFirmwareVersion; + Enquiry2->FirmwareID.MinorVersion = + Controller->V1.Enquiry.MinorFirmwareVersion; + Enquiry2->FirmwareID.FirmwareType = '0'; + Enquiry2->FirmwareID.TurnID = 0; + } + sprintf(Controller->FirmwareVersion, "%d.%02d-%c-%02d", + Enquiry2->FirmwareID.MajorVersion, Enquiry2->FirmwareID.MinorVersion, + Enquiry2->FirmwareID.FirmwareType, Enquiry2->FirmwareID.TurnID); + if (!((Controller->FirmwareVersion[0] == '5' && + strcmp(Controller->FirmwareVersion, "5.06") >= 0) || + (Controller->FirmwareVersion[0] == '4' && + strcmp(Controller->FirmwareVersion, "4.06") >= 0) || + (Controller->FirmwareVersion[0] == '3' && + strcmp(Controller->FirmwareVersion, "3.51") >= 0) || + (Controller->FirmwareVersion[0] == '2' && + strcmp(Controller->FirmwareVersion, FIRMWARE_27X) >= 0))) + { + DAC960_Failure(Controller, "FIRMWARE VERSION VERIFICATION"); + DAC960_Error("Firmware Version = '%s'\n", Controller, + Controller->FirmwareVersion); + free_dma_loaf(Controller->PCIDevice, &local_dma); + return false; + } + /* + Initialize the Controller Channels, Targets, Memory Size, and SAF-TE + Enclosure Management Enabled fields. + */ + Controller->Channels = Enquiry2->ActualChannels; + Controller->Targets = Enquiry2->MaxTargets; + Controller->MemorySize = Enquiry2->MemorySize >> 20; + Controller->V1.SAFTE_EnclosureManagementEnabled = + (Enquiry2->FaultManagementType == DAC960_V1_SAFTE); + /* + Initialize the Controller Queue Depth, Driver Queue Depth, Logical Drive + Count, Maximum Blocks per Command, Controller Scatter/Gather Limit, and + Driver Scatter/Gather Limit. The Driver Queue Depth must be at most one + less than the Controller Queue Depth to allow for an automatic drive + rebuild operation. + */ + Controller->ControllerQueueDepth = Controller->V1.Enquiry.MaxCommands; + Controller->DriverQueueDepth = Controller->ControllerQueueDepth - 1; + if (Controller->DriverQueueDepth > DAC960_MaxDriverQueueDepth) + Controller->DriverQueueDepth = DAC960_MaxDriverQueueDepth; + Controller->LogicalDriveCount = + Controller->V1.Enquiry.NumberOfLogicalDrives; + Controller->MaxBlocksPerCommand = Enquiry2->MaxBlocksPerCommand; + Controller->ControllerScatterGatherLimit = Enquiry2->MaxScatterGatherEntries; + Controller->DriverScatterGatherLimit = + Controller->ControllerScatterGatherLimit; + if (Controller->DriverScatterGatherLimit > DAC960_V1_ScatterGatherLimit) + Controller->DriverScatterGatherLimit = DAC960_V1_ScatterGatherLimit; + /* + Initialize the Stripe Size, Segment Size, and Geometry Translation. + */ + Controller->V1.StripeSize = Config2->BlocksPerStripe * Config2->BlockFactor + >> (10 - DAC960_BlockSizeBits); + Controller->V1.SegmentSize = Config2->BlocksPerCacheLine * Config2->BlockFactor + >> (10 - DAC960_BlockSizeBits); + switch (Config2->DriveGeometry) + { + case DAC960_V1_Geometry_128_32: + Controller->V1.GeometryTranslationHeads = 128; + Controller->V1.GeometryTranslationSectors = 32; + break; + case DAC960_V1_Geometry_255_63: + Controller->V1.GeometryTranslationHeads = 255; + Controller->V1.GeometryTranslationSectors = 63; + break; + default: + free_dma_loaf(Controller->PCIDevice, &local_dma); + return DAC960_Failure(Controller, "CONFIG2 DRIVE GEOMETRY"); + } + /* + Initialize the Background Initialization Status. + */ + if ((Controller->FirmwareVersion[0] == '4' && + strcmp(Controller->FirmwareVersion, "4.08") >= 0) || + (Controller->FirmwareVersion[0] == '5' && + strcmp(Controller->FirmwareVersion, "5.08") >= 0)) + { + Controller->V1.BackgroundInitializationStatusSupported = true; + DAC960_V1_ExecuteType3B(Controller, + DAC960_V1_BackgroundInitializationControl, 0x20, + Controller-> + V1.BackgroundInitializationStatusDMA); + memcpy(&Controller->V1.LastBackgroundInitializationStatus, + Controller->V1.BackgroundInitializationStatus, + sizeof(DAC960_V1_BackgroundInitializationStatus_T)); + } + /* + Initialize the Logical Drive Initially Accessible flag. + */ + for (LogicalDriveNumber = 0; + LogicalDriveNumber < Controller->LogicalDriveCount; + LogicalDriveNumber++) + if (Controller->V1.LogicalDriveInformation + [LogicalDriveNumber].LogicalDriveState != + DAC960_V1_LogicalDrive_Offline) + Controller->LogicalDriveInitiallyAccessible[LogicalDriveNumber] = true; + Controller->V1.LastRebuildStatus = DAC960_V1_NoRebuildOrCheckInProgress; + free_dma_loaf(Controller->PCIDevice, &local_dma); + return true; +} + + +/* + DAC960_V2_ReadControllerConfiguration reads the Configuration Information + from DAC960 V2 Firmware Controllers and initializes the Controller structure. +*/ + +static bool DAC960_V2_ReadControllerConfiguration(DAC960_Controller_T + *Controller) +{ + DAC960_V2_ControllerInfo_T *ControllerInfo = + &Controller->V2.ControllerInformation; + unsigned short LogicalDeviceNumber = 0; + int ModelNameLength; + + /* Get data into dma-able area, then copy into permanent location */ + if (!DAC960_V2_NewControllerInfo(Controller)) + return DAC960_Failure(Controller, "GET CONTROLLER INFO"); + memcpy(ControllerInfo, Controller->V2.NewControllerInformation, + sizeof(DAC960_V2_ControllerInfo_T)); + + + if (!DAC960_V2_GeneralInfo(Controller)) + return DAC960_Failure(Controller, "GET HEALTH STATUS"); + + /* + Initialize the Controller Model Name and Full Model Name fields. + */ + ModelNameLength = sizeof(ControllerInfo->ControllerName); + if (ModelNameLength > sizeof(Controller->ModelName)-1) + ModelNameLength = sizeof(Controller->ModelName)-1; + memcpy(Controller->ModelName, ControllerInfo->ControllerName, + ModelNameLength); + ModelNameLength--; + while (Controller->ModelName[ModelNameLength] == ' ' || + Controller->ModelName[ModelNameLength] == '\0') + ModelNameLength--; + Controller->ModelName[++ModelNameLength] = '\0'; + strcpy(Controller->FullModelName, "Mylex "); + strcat(Controller->FullModelName, Controller->ModelName); + /* + Initialize the Controller Firmware Version field. + */ + sprintf(Controller->FirmwareVersion, "%d.%02d-%02d", + ControllerInfo->FirmwareMajorVersion, + ControllerInfo->FirmwareMinorVersion, + ControllerInfo->FirmwareTurnNumber); + if (ControllerInfo->FirmwareMajorVersion == 6 && + ControllerInfo->FirmwareMinorVersion == 0 && + ControllerInfo->FirmwareTurnNumber < 1) + { + DAC960_Info("FIRMWARE VERSION %s DOES NOT PROVIDE THE CONTROLLER\n", + Controller, Controller->FirmwareVersion); + DAC960_Info("STATUS MONITORING FUNCTIONALITY NEEDED BY THIS DRIVER.\n", + Controller); + DAC960_Info("PLEASE UPGRADE TO VERSION 6.00-01 OR ABOVE.\n", + Controller); + } + /* + Initialize the Controller Channels, Targets, and Memory Size. + */ + Controller->Channels = ControllerInfo->NumberOfPhysicalChannelsPresent; + Controller->Targets = + ControllerInfo->MaximumTargetsPerChannel + [ControllerInfo->NumberOfPhysicalChannelsPresent-1]; + Controller->MemorySize = ControllerInfo->MemorySizeMB; + /* + Initialize the Controller Queue Depth, Driver Queue Depth, Logical Drive + Count, Maximum Blocks per Command, Controller Scatter/Gather Limit, and + Driver Scatter/Gather Limit. The Driver Queue Depth must be at most one + less than the Controller Queue Depth to allow for an automatic drive + rebuild operation. + */ + Controller->ControllerQueueDepth = ControllerInfo->MaximumParallelCommands; + Controller->DriverQueueDepth = Controller->ControllerQueueDepth - 1; + if (Controller->DriverQueueDepth > DAC960_MaxDriverQueueDepth) + Controller->DriverQueueDepth = DAC960_MaxDriverQueueDepth; + Controller->LogicalDriveCount = ControllerInfo->LogicalDevicesPresent; + Controller->MaxBlocksPerCommand = + ControllerInfo->MaximumDataTransferSizeInBlocks; + Controller->ControllerScatterGatherLimit = + ControllerInfo->MaximumScatterGatherEntries; + Controller->DriverScatterGatherLimit = + Controller->ControllerScatterGatherLimit; + if (Controller->DriverScatterGatherLimit > DAC960_V2_ScatterGatherLimit) + Controller->DriverScatterGatherLimit = DAC960_V2_ScatterGatherLimit; + /* + Initialize the Logical Device Information. + */ + while (true) + { + DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo = + Controller->V2.NewLogicalDeviceInformation; + DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo; + DAC960_V2_PhysicalDevice_T PhysicalDevice; + + if (!DAC960_V2_NewLogicalDeviceInfo(Controller, LogicalDeviceNumber)) + break; + LogicalDeviceNumber = NewLogicalDeviceInfo->LogicalDeviceNumber; + if (LogicalDeviceNumber >= DAC960_MaxLogicalDrives) { + DAC960_Error("DAC960: Logical Drive Number %d not supported\n", + Controller, LogicalDeviceNumber); + break; + } + if (NewLogicalDeviceInfo->DeviceBlockSizeInBytes != DAC960_BlockSize) { + DAC960_Error("DAC960: Logical Drive Block Size %d not supported\n", + Controller, NewLogicalDeviceInfo->DeviceBlockSizeInBytes); + LogicalDeviceNumber++; + continue; + } + PhysicalDevice.Controller = 0; + PhysicalDevice.Channel = NewLogicalDeviceInfo->Channel; + PhysicalDevice.TargetID = NewLogicalDeviceInfo->TargetID; + PhysicalDevice.LogicalUnit = NewLogicalDeviceInfo->LogicalUnit; + Controller->V2.LogicalDriveToVirtualDevice[LogicalDeviceNumber] = + PhysicalDevice; + if (NewLogicalDeviceInfo->LogicalDeviceState != + DAC960_V2_LogicalDevice_Offline) + Controller->LogicalDriveInitiallyAccessible[LogicalDeviceNumber] = true; + LogicalDeviceInfo = kmalloc(sizeof(DAC960_V2_LogicalDeviceInfo_T), + GFP_ATOMIC); + if (LogicalDeviceInfo == NULL) + return DAC960_Failure(Controller, "LOGICAL DEVICE ALLOCATION"); + Controller->V2.LogicalDeviceInformation[LogicalDeviceNumber] = + LogicalDeviceInfo; + memcpy(LogicalDeviceInfo, NewLogicalDeviceInfo, + sizeof(DAC960_V2_LogicalDeviceInfo_T)); + LogicalDeviceNumber++; + } + return true; +} + + +/* + DAC960_ReportControllerConfiguration reports the Configuration Information + for Controller. +*/ + +static bool DAC960_ReportControllerConfiguration(DAC960_Controller_T + *Controller) +{ + DAC960_Info("Configuring Mylex %s PCI RAID Controller\n", + Controller, Controller->ModelName); + DAC960_Info(" Firmware Version: %s, Channels: %d, Memory Size: %dMB\n", + Controller, Controller->FirmwareVersion, + Controller->Channels, Controller->MemorySize); + DAC960_Info(" PCI Bus: %d, Device: %d, Function: %d, I/O Address: ", + Controller, Controller->Bus, + Controller->Device, Controller->Function); + if (Controller->IO_Address == 0) + DAC960_Info("Unassigned\n", Controller); + else DAC960_Info("0x%X\n", Controller, Controller->IO_Address); + DAC960_Info(" PCI Address: 0x%X mapped at 0x%lX, IRQ Channel: %d\n", + Controller, Controller->PCI_Address, + (unsigned long) Controller->BaseAddress, + Controller->IRQ_Channel); + DAC960_Info(" Controller Queue Depth: %d, " + "Maximum Blocks per Command: %d\n", + Controller, Controller->ControllerQueueDepth, + Controller->MaxBlocksPerCommand); + DAC960_Info(" Driver Queue Depth: %d, " + "Scatter/Gather Limit: %d of %d Segments\n", + Controller, Controller->DriverQueueDepth, + Controller->DriverScatterGatherLimit, + Controller->ControllerScatterGatherLimit); + if (Controller->FirmwareType == DAC960_V1_Controller) + { + DAC960_Info(" Stripe Size: %dKB, Segment Size: %dKB, " + "BIOS Geometry: %d/%d\n", Controller, + Controller->V1.StripeSize, + Controller->V1.SegmentSize, + Controller->V1.GeometryTranslationHeads, + Controller->V1.GeometryTranslationSectors); + if (Controller->V1.SAFTE_EnclosureManagementEnabled) + DAC960_Info(" SAF-TE Enclosure Management Enabled\n", Controller); + } + return true; +} + + +/* + DAC960_V1_ReadDeviceConfiguration reads the Device Configuration Information + for DAC960 V1 Firmware Controllers by requesting the SCSI Inquiry and SCSI + Inquiry Unit Serial Number information for each device connected to + Controller. +*/ + +static bool DAC960_V1_ReadDeviceConfiguration(DAC960_Controller_T + *Controller) +{ + struct dma_loaf local_dma; + + dma_addr_t DCDBs_dma[DAC960_V1_MaxChannels]; + DAC960_V1_DCDB_T *DCDBs_cpu[DAC960_V1_MaxChannels]; + + dma_addr_t SCSI_Inquiry_dma[DAC960_V1_MaxChannels]; + DAC960_SCSI_Inquiry_T *SCSI_Inquiry_cpu[DAC960_V1_MaxChannels]; + + dma_addr_t SCSI_NewInquiryUnitSerialNumberDMA[DAC960_V1_MaxChannels]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *SCSI_NewInquiryUnitSerialNumberCPU[DAC960_V1_MaxChannels]; + + struct completion Completions[DAC960_V1_MaxChannels]; + unsigned long flags; + int Channel, TargetID; + + if (!init_dma_loaf(Controller->PCIDevice, &local_dma, + DAC960_V1_MaxChannels*(sizeof(DAC960_V1_DCDB_T) + + sizeof(DAC960_SCSI_Inquiry_T) + + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)))) + return DAC960_Failure(Controller, + "DMA ALLOCATION FAILED IN ReadDeviceConfiguration"); + + for (Channel = 0; Channel < Controller->Channels; Channel++) { + DCDBs_cpu[Channel] = slice_dma_loaf(&local_dma, + sizeof(DAC960_V1_DCDB_T), DCDBs_dma + Channel); + SCSI_Inquiry_cpu[Channel] = slice_dma_loaf(&local_dma, + sizeof(DAC960_SCSI_Inquiry_T), + SCSI_Inquiry_dma + Channel); + SCSI_NewInquiryUnitSerialNumberCPU[Channel] = slice_dma_loaf(&local_dma, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T), + SCSI_NewInquiryUnitSerialNumberDMA + Channel); + } + + for (TargetID = 0; TargetID < Controller->Targets; TargetID++) + { + /* + * For each channel, submit a probe for a device on that channel. + * The timeout interval for a device that is present is 10 seconds. + * With this approach, the timeout periods can elapse in parallel + * on each channel. + */ + for (Channel = 0; Channel < Controller->Channels; Channel++) + { + dma_addr_t NewInquiryStandardDataDMA = SCSI_Inquiry_dma[Channel]; + DAC960_V1_DCDB_T *DCDB = DCDBs_cpu[Channel]; + dma_addr_t DCDB_dma = DCDBs_dma[Channel]; + DAC960_Command_T *Command = Controller->Commands[Channel]; + struct completion *Completion = &Completions[Channel]; + + init_completion(Completion); + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + Command->Completion = Completion; + Command->V1.CommandMailbox.Type3.CommandOpcode = DAC960_V1_DCDB; + Command->V1.CommandMailbox.Type3.BusAddress = DCDB_dma; + DCDB->Channel = Channel; + DCDB->TargetID = TargetID; + DCDB->Direction = DAC960_V1_DCDB_DataTransferDeviceToSystem; + DCDB->EarlyStatus = false; + DCDB->Timeout = DAC960_V1_DCDB_Timeout_10_seconds; + DCDB->NoAutomaticRequestSense = false; + DCDB->DisconnectPermitted = true; + DCDB->TransferLength = sizeof(DAC960_SCSI_Inquiry_T); + DCDB->BusAddress = NewInquiryStandardDataDMA; + DCDB->CDBLength = 6; + DCDB->TransferLengthHigh4 = 0; + DCDB->SenseLength = sizeof(DCDB->SenseData); + DCDB->CDB[0] = 0x12; /* INQUIRY */ + DCDB->CDB[1] = 0; /* EVPD = 0 */ + DCDB->CDB[2] = 0; /* Page Code */ + DCDB->CDB[3] = 0; /* Reserved */ + DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_T); + DCDB->CDB[5] = 0; /* Control */ + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_QueueCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + } + /* + * Wait for the problems submitted in the previous loop + * to complete. On the probes that are successful, + * get the serial number of the device that was found. + */ + for (Channel = 0; Channel < Controller->Channels; Channel++) + { + DAC960_SCSI_Inquiry_T *InquiryStandardData = + &Controller->V1.InquiryStandardData[Channel][TargetID]; + DAC960_SCSI_Inquiry_T *NewInquiryStandardData = SCSI_Inquiry_cpu[Channel]; + dma_addr_t NewInquiryUnitSerialNumberDMA = + SCSI_NewInquiryUnitSerialNumberDMA[Channel]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber = + SCSI_NewInquiryUnitSerialNumberCPU[Channel]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + &Controller->V1.InquiryUnitSerialNumber[Channel][TargetID]; + DAC960_Command_T *Command = Controller->Commands[Channel]; + DAC960_V1_DCDB_T *DCDB = DCDBs_cpu[Channel]; + struct completion *Completion = &Completions[Channel]; + + wait_for_completion(Completion); + + if (Command->V1.CommandStatus != DAC960_V1_NormalCompletion) { + memset(InquiryStandardData, 0, sizeof(DAC960_SCSI_Inquiry_T)); + InquiryStandardData->PeripheralDeviceType = 0x1F; + continue; + } else + memcpy(InquiryStandardData, NewInquiryStandardData, sizeof(DAC960_SCSI_Inquiry_T)); + + /* Preserve Channel and TargetID values from the previous loop */ + Command->Completion = Completion; + DCDB->TransferLength = sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + DCDB->BusAddress = NewInquiryUnitSerialNumberDMA; + DCDB->SenseLength = sizeof(DCDB->SenseData); + DCDB->CDB[0] = 0x12; /* INQUIRY */ + DCDB->CDB[1] = 1; /* EVPD = 1 */ + DCDB->CDB[2] = 0x80; /* Page Code */ + DCDB->CDB[3] = 0; /* Reserved */ + DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + DCDB->CDB[5] = 0; /* Control */ + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_QueueCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + wait_for_completion(Completion); + + if (Command->V1.CommandStatus != DAC960_V1_NormalCompletion) { + memset(InquiryUnitSerialNumber, 0, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F; + } else + memcpy(InquiryUnitSerialNumber, NewInquiryUnitSerialNumber, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + } + } + free_dma_loaf(Controller->PCIDevice, &local_dma); + return true; +} + + +/* + DAC960_V2_ReadDeviceConfiguration reads the Device Configuration Information + for DAC960 V2 Firmware Controllers by requesting the Physical Device + Information and SCSI Inquiry Unit Serial Number information for each + device connected to Controller. +*/ + +static bool DAC960_V2_ReadDeviceConfiguration(DAC960_Controller_T + *Controller) +{ + unsigned char Channel = 0, TargetID = 0, LogicalUnit = 0; + unsigned short PhysicalDeviceIndex = 0; + + while (true) + { + DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo = + Controller->V2.NewPhysicalDeviceInformation; + DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber = + Controller->V2.NewInquiryUnitSerialNumber; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber; + + if (!DAC960_V2_NewPhysicalDeviceInfo(Controller, Channel, TargetID, LogicalUnit)) + break; + + PhysicalDeviceInfo = kmalloc(sizeof(DAC960_V2_PhysicalDeviceInfo_T), + GFP_ATOMIC); + if (PhysicalDeviceInfo == NULL) + return DAC960_Failure(Controller, "PHYSICAL DEVICE ALLOCATION"); + Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex] = + PhysicalDeviceInfo; + memcpy(PhysicalDeviceInfo, NewPhysicalDeviceInfo, + sizeof(DAC960_V2_PhysicalDeviceInfo_T)); + + InquiryUnitSerialNumber = kmalloc( + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T), GFP_ATOMIC); + if (InquiryUnitSerialNumber == NULL) { + kfree(PhysicalDeviceInfo); + return DAC960_Failure(Controller, "SERIAL NUMBER ALLOCATION"); + } + Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex] = + InquiryUnitSerialNumber; + + Channel = NewPhysicalDeviceInfo->Channel; + TargetID = NewPhysicalDeviceInfo->TargetID; + LogicalUnit = NewPhysicalDeviceInfo->LogicalUnit; + + /* + Some devices do NOT have Unit Serial Numbers. + This command fails for them. But, we still want to + remember those devices are there. Construct a + UnitSerialNumber structure for the failure case. + */ + if (!DAC960_V2_NewInquiryUnitSerialNumber(Controller, Channel, TargetID, LogicalUnit)) { + memset(InquiryUnitSerialNumber, 0, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F; + } else + memcpy(InquiryUnitSerialNumber, NewInquiryUnitSerialNumber, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + + PhysicalDeviceIndex++; + LogicalUnit++; + } + return true; +} + + +/* + DAC960_SanitizeInquiryData sanitizes the Vendor, Model, Revision, and + Product Serial Number fields of the Inquiry Standard Data and Inquiry + Unit Serial Number structures. +*/ + +static void DAC960_SanitizeInquiryData(DAC960_SCSI_Inquiry_T + *InquiryStandardData, + DAC960_SCSI_Inquiry_UnitSerialNumber_T + *InquiryUnitSerialNumber, + unsigned char *Vendor, + unsigned char *Model, + unsigned char *Revision, + unsigned char *SerialNumber) +{ + int SerialNumberLength, i; + if (InquiryStandardData->PeripheralDeviceType == 0x1F) return; + for (i = 0; i < sizeof(InquiryStandardData->VendorIdentification); i++) + { + unsigned char VendorCharacter = + InquiryStandardData->VendorIdentification[i]; + Vendor[i] = (VendorCharacter >= ' ' && VendorCharacter <= '~' + ? VendorCharacter : ' '); + } + Vendor[sizeof(InquiryStandardData->VendorIdentification)] = '\0'; + for (i = 0; i < sizeof(InquiryStandardData->ProductIdentification); i++) + { + unsigned char ModelCharacter = + InquiryStandardData->ProductIdentification[i]; + Model[i] = (ModelCharacter >= ' ' && ModelCharacter <= '~' + ? ModelCharacter : ' '); + } + Model[sizeof(InquiryStandardData->ProductIdentification)] = '\0'; + for (i = 0; i < sizeof(InquiryStandardData->ProductRevisionLevel); i++) + { + unsigned char RevisionCharacter = + InquiryStandardData->ProductRevisionLevel[i]; + Revision[i] = (RevisionCharacter >= ' ' && RevisionCharacter <= '~' + ? RevisionCharacter : ' '); + } + Revision[sizeof(InquiryStandardData->ProductRevisionLevel)] = '\0'; + if (InquiryUnitSerialNumber->PeripheralDeviceType == 0x1F) return; + SerialNumberLength = InquiryUnitSerialNumber->PageLength; + if (SerialNumberLength > + sizeof(InquiryUnitSerialNumber->ProductSerialNumber)) + SerialNumberLength = sizeof(InquiryUnitSerialNumber->ProductSerialNumber); + for (i = 0; i < SerialNumberLength; i++) + { + unsigned char SerialNumberCharacter = + InquiryUnitSerialNumber->ProductSerialNumber[i]; + SerialNumber[i] = + (SerialNumberCharacter >= ' ' && SerialNumberCharacter <= '~' + ? SerialNumberCharacter : ' '); + } + SerialNumber[SerialNumberLength] = '\0'; +} + + +/* + DAC960_V1_ReportDeviceConfiguration reports the Device Configuration + Information for DAC960 V1 Firmware Controllers. +*/ + +static bool DAC960_V1_ReportDeviceConfiguration(DAC960_Controller_T + *Controller) +{ + int LogicalDriveNumber, Channel, TargetID; + DAC960_Info(" Physical Devices:\n", Controller); + for (Channel = 0; Channel < Controller->Channels; Channel++) + for (TargetID = 0; TargetID < Controller->Targets; TargetID++) + { + DAC960_SCSI_Inquiry_T *InquiryStandardData = + &Controller->V1.InquiryStandardData[Channel][TargetID]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + &Controller->V1.InquiryUnitSerialNumber[Channel][TargetID]; + DAC960_V1_DeviceState_T *DeviceState = + &Controller->V1.DeviceState[Channel][TargetID]; + DAC960_V1_ErrorTableEntry_T *ErrorEntry = + &Controller->V1.ErrorTable.ErrorTableEntries[Channel][TargetID]; + char Vendor[1+sizeof(InquiryStandardData->VendorIdentification)]; + char Model[1+sizeof(InquiryStandardData->ProductIdentification)]; + char Revision[1+sizeof(InquiryStandardData->ProductRevisionLevel)]; + char SerialNumber[1+sizeof(InquiryUnitSerialNumber + ->ProductSerialNumber)]; + if (InquiryStandardData->PeripheralDeviceType == 0x1F) continue; + DAC960_SanitizeInquiryData(InquiryStandardData, InquiryUnitSerialNumber, + Vendor, Model, Revision, SerialNumber); + DAC960_Info(" %d:%d%s Vendor: %s Model: %s Revision: %s\n", + Controller, Channel, TargetID, (TargetID < 10 ? " " : ""), + Vendor, Model, Revision); + if (InquiryUnitSerialNumber->PeripheralDeviceType != 0x1F) + DAC960_Info(" Serial Number: %s\n", Controller, SerialNumber); + if (DeviceState->Present && + DeviceState->DeviceType == DAC960_V1_DiskType) + { + if (Controller->V1.DeviceResetCount[Channel][TargetID] > 0) + DAC960_Info(" Disk Status: %s, %u blocks, %d resets\n", + Controller, + (DeviceState->DeviceState == DAC960_V1_Device_Dead + ? "Dead" + : DeviceState->DeviceState + == DAC960_V1_Device_WriteOnly + ? "Write-Only" + : DeviceState->DeviceState + == DAC960_V1_Device_Online + ? "Online" : "Standby"), + DeviceState->DiskSize, + Controller->V1.DeviceResetCount[Channel][TargetID]); + else + DAC960_Info(" Disk Status: %s, %u blocks\n", Controller, + (DeviceState->DeviceState == DAC960_V1_Device_Dead + ? "Dead" + : DeviceState->DeviceState + == DAC960_V1_Device_WriteOnly + ? "Write-Only" + : DeviceState->DeviceState + == DAC960_V1_Device_Online + ? "Online" : "Standby"), + DeviceState->DiskSize); + } + if (ErrorEntry->ParityErrorCount > 0 || + ErrorEntry->SoftErrorCount > 0 || + ErrorEntry->HardErrorCount > 0 || + ErrorEntry->MiscErrorCount > 0) + DAC960_Info(" Errors - Parity: %d, Soft: %d, " + "Hard: %d, Misc: %d\n", Controller, + ErrorEntry->ParityErrorCount, + ErrorEntry->SoftErrorCount, + ErrorEntry->HardErrorCount, + ErrorEntry->MiscErrorCount); + } + DAC960_Info(" Logical Drives:\n", Controller); + for (LogicalDriveNumber = 0; + LogicalDriveNumber < Controller->LogicalDriveCount; + LogicalDriveNumber++) + { + DAC960_V1_LogicalDriveInformation_T *LogicalDriveInformation = + &Controller->V1.LogicalDriveInformation[LogicalDriveNumber]; + DAC960_Info(" /dev/rd/c%dd%d: RAID-%d, %s, %u blocks, %s\n", + Controller, Controller->ControllerNumber, LogicalDriveNumber, + LogicalDriveInformation->RAIDLevel, + (LogicalDriveInformation->LogicalDriveState + == DAC960_V1_LogicalDrive_Online + ? "Online" + : LogicalDriveInformation->LogicalDriveState + == DAC960_V1_LogicalDrive_Critical + ? "Critical" : "Offline"), + LogicalDriveInformation->LogicalDriveSize, + (LogicalDriveInformation->WriteBack + ? "Write Back" : "Write Thru")); + } + return true; +} + + +/* + DAC960_V2_ReportDeviceConfiguration reports the Device Configuration + Information for DAC960 V2 Firmware Controllers. +*/ + +static bool DAC960_V2_ReportDeviceConfiguration(DAC960_Controller_T + *Controller) +{ + int PhysicalDeviceIndex, LogicalDriveNumber; + DAC960_Info(" Physical Devices:\n", Controller); + for (PhysicalDeviceIndex = 0; + PhysicalDeviceIndex < DAC960_V2_MaxPhysicalDevices; + PhysicalDeviceIndex++) + { + DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo = + Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex]; + DAC960_SCSI_Inquiry_T *InquiryStandardData = + (DAC960_SCSI_Inquiry_T *) &PhysicalDeviceInfo->SCSI_InquiryData; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex]; + char Vendor[1+sizeof(InquiryStandardData->VendorIdentification)]; + char Model[1+sizeof(InquiryStandardData->ProductIdentification)]; + char Revision[1+sizeof(InquiryStandardData->ProductRevisionLevel)]; + char SerialNumber[1+sizeof(InquiryUnitSerialNumber->ProductSerialNumber)]; + if (PhysicalDeviceInfo == NULL) break; + DAC960_SanitizeInquiryData(InquiryStandardData, InquiryUnitSerialNumber, + Vendor, Model, Revision, SerialNumber); + DAC960_Info(" %d:%d%s Vendor: %s Model: %s Revision: %s\n", + Controller, + PhysicalDeviceInfo->Channel, + PhysicalDeviceInfo->TargetID, + (PhysicalDeviceInfo->TargetID < 10 ? " " : ""), + Vendor, Model, Revision); + if (PhysicalDeviceInfo->NegotiatedSynchronousMegaTransfers == 0) + DAC960_Info(" %sAsynchronous\n", Controller, + (PhysicalDeviceInfo->NegotiatedDataWidthBits == 16 + ? "Wide " :"")); + else + DAC960_Info(" %sSynchronous at %d MB/sec\n", Controller, + (PhysicalDeviceInfo->NegotiatedDataWidthBits == 16 + ? "Wide " :""), + (PhysicalDeviceInfo->NegotiatedSynchronousMegaTransfers + * PhysicalDeviceInfo->NegotiatedDataWidthBits/8)); + if (InquiryUnitSerialNumber->PeripheralDeviceType != 0x1F) + DAC960_Info(" Serial Number: %s\n", Controller, SerialNumber); + if (PhysicalDeviceInfo->PhysicalDeviceState == + DAC960_V2_Device_Unconfigured) + continue; + DAC960_Info(" Disk Status: %s, %u blocks\n", Controller, + (PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Online + ? "Online" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Rebuild + ? "Rebuild" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Missing + ? "Missing" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Critical + ? "Critical" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Dead + ? "Dead" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_SuspectedDead + ? "Suspected-Dead" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_CommandedOffline + ? "Commanded-Offline" + : PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Standby + ? "Standby" : "Unknown"), + PhysicalDeviceInfo->ConfigurableDeviceSize); + if (PhysicalDeviceInfo->ParityErrors == 0 && + PhysicalDeviceInfo->SoftErrors == 0 && + PhysicalDeviceInfo->HardErrors == 0 && + PhysicalDeviceInfo->MiscellaneousErrors == 0 && + PhysicalDeviceInfo->CommandTimeouts == 0 && + PhysicalDeviceInfo->Retries == 0 && + PhysicalDeviceInfo->Aborts == 0 && + PhysicalDeviceInfo->PredictedFailuresDetected == 0) + continue; + DAC960_Info(" Errors - Parity: %d, Soft: %d, " + "Hard: %d, Misc: %d\n", Controller, + PhysicalDeviceInfo->ParityErrors, + PhysicalDeviceInfo->SoftErrors, + PhysicalDeviceInfo->HardErrors, + PhysicalDeviceInfo->MiscellaneousErrors); + DAC960_Info(" Timeouts: %d, Retries: %d, " + "Aborts: %d, Predicted: %d\n", Controller, + PhysicalDeviceInfo->CommandTimeouts, + PhysicalDeviceInfo->Retries, + PhysicalDeviceInfo->Aborts, + PhysicalDeviceInfo->PredictedFailuresDetected); + } + DAC960_Info(" Logical Drives:\n", Controller); + for (LogicalDriveNumber = 0; + LogicalDriveNumber < DAC960_MaxLogicalDrives; + LogicalDriveNumber++) + { + DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo = + Controller->V2.LogicalDeviceInformation[LogicalDriveNumber]; + unsigned char *ReadCacheStatus[] = { "Read Cache Disabled", + "Read Cache Enabled", + "Read Ahead Enabled", + "Intelligent Read Ahead Enabled", + "-", "-", "-", "-" }; + unsigned char *WriteCacheStatus[] = { "Write Cache Disabled", + "Logical Device Read Only", + "Write Cache Enabled", + "Intelligent Write Cache Enabled", + "-", "-", "-", "-" }; + unsigned char *GeometryTranslation; + if (LogicalDeviceInfo == NULL) continue; + switch (LogicalDeviceInfo->DriveGeometry) + { + case DAC960_V2_Geometry_128_32: + GeometryTranslation = "128/32"; + break; + case DAC960_V2_Geometry_255_63: + GeometryTranslation = "255/63"; + break; + default: + GeometryTranslation = "Invalid"; + DAC960_Error("Illegal Logical Device Geometry %d\n", + Controller, LogicalDeviceInfo->DriveGeometry); + break; + } + DAC960_Info(" /dev/rd/c%dd%d: RAID-%d, %s, %u blocks\n", + Controller, Controller->ControllerNumber, LogicalDriveNumber, + LogicalDeviceInfo->RAIDLevel, + (LogicalDeviceInfo->LogicalDeviceState + == DAC960_V2_LogicalDevice_Online + ? "Online" + : LogicalDeviceInfo->LogicalDeviceState + == DAC960_V2_LogicalDevice_Critical + ? "Critical" : "Offline"), + LogicalDeviceInfo->ConfigurableDeviceSize); + DAC960_Info(" Logical Device %s, BIOS Geometry: %s\n", + Controller, + (LogicalDeviceInfo->LogicalDeviceControl + .LogicalDeviceInitialized + ? "Initialized" : "Uninitialized"), + GeometryTranslation); + if (LogicalDeviceInfo->StripeSize == 0) + { + if (LogicalDeviceInfo->CacheLineSize == 0) + DAC960_Info(" Stripe Size: N/A, " + "Segment Size: N/A\n", Controller); + else + DAC960_Info(" Stripe Size: N/A, " + "Segment Size: %dKB\n", Controller, + 1 << (LogicalDeviceInfo->CacheLineSize - 2)); + } + else + { + if (LogicalDeviceInfo->CacheLineSize == 0) + DAC960_Info(" Stripe Size: %dKB, " + "Segment Size: N/A\n", Controller, + 1 << (LogicalDeviceInfo->StripeSize - 2)); + else + DAC960_Info(" Stripe Size: %dKB, " + "Segment Size: %dKB\n", Controller, + 1 << (LogicalDeviceInfo->StripeSize - 2), + 1 << (LogicalDeviceInfo->CacheLineSize - 2)); + } + DAC960_Info(" %s, %s\n", Controller, + ReadCacheStatus[ + LogicalDeviceInfo->LogicalDeviceControl.ReadCache], + WriteCacheStatus[ + LogicalDeviceInfo->LogicalDeviceControl.WriteCache]); + if (LogicalDeviceInfo->SoftErrors > 0 || + LogicalDeviceInfo->CommandsFailed > 0 || + LogicalDeviceInfo->DeferredWriteErrors) + DAC960_Info(" Errors - Soft: %d, Failed: %d, " + "Deferred Write: %d\n", Controller, + LogicalDeviceInfo->SoftErrors, + LogicalDeviceInfo->CommandsFailed, + LogicalDeviceInfo->DeferredWriteErrors); + + } + return true; +} + +/* + DAC960_RegisterBlockDevice registers the Block Device structures + associated with Controller. +*/ + +static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller) +{ + int MajorNumber = DAC960_MAJOR + Controller->ControllerNumber; + int n; + + /* + Register the Block Device Major Number for this DAC960 Controller. + */ + if (register_blkdev(MajorNumber, "dac960") < 0) + return false; + + for (n = 0; n < DAC960_MaxLogicalDrives; n++) { + struct gendisk *disk = Controller->disks[n]; + struct request_queue *RequestQueue; + + /* for now, let all request queues share controller's lock */ + RequestQueue = blk_init_queue(DAC960_RequestFunction,&Controller->queue_lock); + if (!RequestQueue) { + printk("DAC960: failure to allocate request queue\n"); + continue; + } + Controller->RequestQueue[n] = RequestQueue; + blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit); + RequestQueue->queuedata = Controller; + blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit); + blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand); + disk->queue = RequestQueue; + sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n); + disk->major = MajorNumber; + disk->first_minor = n << DAC960_MaxPartitionsBits; + disk->fops = &DAC960_BlockDeviceOperations; + } + /* + Indicate the Block Device Registration completed successfully, + */ + return true; +} + + +/* + DAC960_UnregisterBlockDevice unregisters the Block Device structures + associated with Controller. +*/ + +static void DAC960_UnregisterBlockDevice(DAC960_Controller_T *Controller) +{ + int MajorNumber = DAC960_MAJOR + Controller->ControllerNumber; + int disk; + + /* does order matter when deleting gendisk and cleanup in request queue? */ + for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++) { + del_gendisk(Controller->disks[disk]); + blk_cleanup_queue(Controller->RequestQueue[disk]); + Controller->RequestQueue[disk] = NULL; + } + + /* + Unregister the Block Device Major Number for this DAC960 Controller. + */ + unregister_blkdev(MajorNumber, "dac960"); +} + +/* + DAC960_ComputeGenericDiskInfo computes the values for the Generic Disk + Information Partition Sector Counts and Block Sizes. +*/ + +static void DAC960_ComputeGenericDiskInfo(DAC960_Controller_T *Controller) +{ + int disk; + for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++) + set_capacity(Controller->disks[disk], disk_size(Controller, disk)); +} + +/* + DAC960_ReportErrorStatus reports Controller BIOS Messages passed through + the Error Status Register when the driver performs the BIOS handshaking. + It returns true for fatal errors and false otherwise. +*/ + +static bool DAC960_ReportErrorStatus(DAC960_Controller_T *Controller, + unsigned char ErrorStatus, + unsigned char Parameter0, + unsigned char Parameter1) +{ + switch (ErrorStatus) + { + case 0x00: + DAC960_Notice("Physical Device %d:%d Not Responding\n", + Controller, Parameter1, Parameter0); + break; + case 0x08: + if (Controller->DriveSpinUpMessageDisplayed) break; + DAC960_Notice("Spinning Up Drives\n", Controller); + Controller->DriveSpinUpMessageDisplayed = true; + break; + case 0x30: + DAC960_Notice("Configuration Checksum Error\n", Controller); + break; + case 0x60: + DAC960_Notice("Mirror Race Recovery Failed\n", Controller); + break; + case 0x70: + DAC960_Notice("Mirror Race Recovery In Progress\n", Controller); + break; + case 0x90: + DAC960_Notice("Physical Device %d:%d COD Mismatch\n", + Controller, Parameter1, Parameter0); + break; + case 0xA0: + DAC960_Notice("Logical Drive Installation Aborted\n", Controller); + break; + case 0xB0: + DAC960_Notice("Mirror Race On A Critical Logical Drive\n", Controller); + break; + case 0xD0: + DAC960_Notice("New Controller Configuration Found\n", Controller); + break; + case 0xF0: + DAC960_Error("Fatal Memory Parity Error for Controller at\n", Controller); + return true; + default: + DAC960_Error("Unknown Initialization Error %02X for Controller at\n", + Controller, ErrorStatus); + return true; + } + return false; +} + + +/* + * DAC960_DetectCleanup releases the resources that were allocated + * during DAC960_DetectController(). DAC960_DetectController can + * has several internal failure points, so not ALL resources may + * have been allocated. It's important to free only + * resources that HAVE been allocated. The code below always + * tests that the resource has been allocated before attempting to + * free it. + */ +static void DAC960_DetectCleanup(DAC960_Controller_T *Controller) +{ + int i; + + /* Free the memory mailbox, status, and related structures */ + free_dma_loaf(Controller->PCIDevice, &Controller->DmaPages); + if (Controller->MemoryMappedAddress) { + switch(Controller->HardwareType) + { + case DAC960_GEM_Controller: + DAC960_GEM_DisableInterrupts(Controller->BaseAddress); + break; + case DAC960_BA_Controller: + DAC960_BA_DisableInterrupts(Controller->BaseAddress); + break; + case DAC960_LP_Controller: + DAC960_LP_DisableInterrupts(Controller->BaseAddress); + break; + case DAC960_LA_Controller: + DAC960_LA_DisableInterrupts(Controller->BaseAddress); + break; + case DAC960_PG_Controller: + DAC960_PG_DisableInterrupts(Controller->BaseAddress); + break; + case DAC960_PD_Controller: + DAC960_PD_DisableInterrupts(Controller->BaseAddress); + break; + case DAC960_P_Controller: + DAC960_PD_DisableInterrupts(Controller->BaseAddress); + break; + } + iounmap(Controller->MemoryMappedAddress); + } + if (Controller->IRQ_Channel) + free_irq(Controller->IRQ_Channel, Controller); + if (Controller->IO_Address) + release_region(Controller->IO_Address, 0x80); + pci_disable_device(Controller->PCIDevice); + for (i = 0; (i < DAC960_MaxLogicalDrives) && Controller->disks[i]; i++) + put_disk(Controller->disks[i]); + DAC960_Controllers[Controller->ControllerNumber] = NULL; + kfree(Controller); +} + + +/* + DAC960_DetectController detects Mylex DAC960/AcceleRAID/eXtremeRAID + PCI RAID Controllers by interrogating the PCI Configuration Space for + Controller Type. +*/ + +static DAC960_Controller_T * +DAC960_DetectController(struct pci_dev *PCI_Device, + const struct pci_device_id *entry) +{ + struct DAC960_privdata *privdata = + (struct DAC960_privdata *)entry->driver_data; + irq_handler_t InterruptHandler = privdata->InterruptHandler; + unsigned int MemoryWindowSize = privdata->MemoryWindowSize; + DAC960_Controller_T *Controller = NULL; + unsigned char DeviceFunction = PCI_Device->devfn; + unsigned char ErrorStatus, Parameter0, Parameter1; + unsigned int IRQ_Channel; + void __iomem *BaseAddress; + int i; + + Controller = kzalloc(sizeof(DAC960_Controller_T), GFP_ATOMIC); + if (Controller == NULL) { + DAC960_Error("Unable to allocate Controller structure for " + "Controller at\n", NULL); + return NULL; + } + Controller->ControllerNumber = DAC960_ControllerCount; + DAC960_Controllers[DAC960_ControllerCount++] = Controller; + Controller->Bus = PCI_Device->bus->number; + Controller->FirmwareType = privdata->FirmwareType; + Controller->HardwareType = privdata->HardwareType; + Controller->Device = DeviceFunction >> 3; + Controller->Function = DeviceFunction & 0x7; + Controller->PCIDevice = PCI_Device; + strcpy(Controller->FullModelName, "DAC960"); + + if (pci_enable_device(PCI_Device)) + goto Failure; + + switch (Controller->HardwareType) + { + case DAC960_GEM_Controller: + Controller->PCI_Address = pci_resource_start(PCI_Device, 0); + break; + case DAC960_BA_Controller: + Controller->PCI_Address = pci_resource_start(PCI_Device, 0); + break; + case DAC960_LP_Controller: + Controller->PCI_Address = pci_resource_start(PCI_Device, 0); + break; + case DAC960_LA_Controller: + Controller->PCI_Address = pci_resource_start(PCI_Device, 0); + break; + case DAC960_PG_Controller: + Controller->PCI_Address = pci_resource_start(PCI_Device, 0); + break; + case DAC960_PD_Controller: + Controller->IO_Address = pci_resource_start(PCI_Device, 0); + Controller->PCI_Address = pci_resource_start(PCI_Device, 1); + break; + case DAC960_P_Controller: + Controller->IO_Address = pci_resource_start(PCI_Device, 0); + Controller->PCI_Address = pci_resource_start(PCI_Device, 1); + break; + } + + pci_set_drvdata(PCI_Device, (void *)((long)Controller->ControllerNumber)); + for (i = 0; i < DAC960_MaxLogicalDrives; i++) { + Controller->disks[i] = alloc_disk(1<disks[i]) + goto Failure; + Controller->disks[i]->private_data = (void *)((long)i); + } + init_waitqueue_head(&Controller->CommandWaitQueue); + init_waitqueue_head(&Controller->HealthStatusWaitQueue); + spin_lock_init(&Controller->queue_lock); + DAC960_AnnounceDriver(Controller); + /* + Map the Controller Register Window. + */ + if (MemoryWindowSize < PAGE_SIZE) + MemoryWindowSize = PAGE_SIZE; + Controller->MemoryMappedAddress = + ioremap_nocache(Controller->PCI_Address & PAGE_MASK, MemoryWindowSize); + Controller->BaseAddress = + Controller->MemoryMappedAddress + (Controller->PCI_Address & ~PAGE_MASK); + if (Controller->MemoryMappedAddress == NULL) + { + DAC960_Error("Unable to map Controller Register Window for " + "Controller at\n", Controller); + goto Failure; + } + BaseAddress = Controller->BaseAddress; + switch (Controller->HardwareType) + { + case DAC960_GEM_Controller: + DAC960_GEM_DisableInterrupts(BaseAddress); + DAC960_GEM_AcknowledgeHardwareMailboxStatus(BaseAddress); + udelay(1000); + while (DAC960_GEM_InitializationInProgressP(BaseAddress)) + { + if (DAC960_GEM_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V2_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to Enable Memory Mailbox Interface " + "for Controller at\n", Controller); + goto Failure; + } + DAC960_GEM_EnableInterrupts(BaseAddress); + Controller->QueueCommand = DAC960_GEM_QueueCommand; + Controller->ReadControllerConfiguration = + DAC960_V2_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V2_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V2_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V2_QueueReadWriteCommand; + break; + case DAC960_BA_Controller: + DAC960_BA_DisableInterrupts(BaseAddress); + DAC960_BA_AcknowledgeHardwareMailboxStatus(BaseAddress); + udelay(1000); + while (DAC960_BA_InitializationInProgressP(BaseAddress)) + { + if (DAC960_BA_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V2_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to Enable Memory Mailbox Interface " + "for Controller at\n", Controller); + goto Failure; + } + DAC960_BA_EnableInterrupts(BaseAddress); + Controller->QueueCommand = DAC960_BA_QueueCommand; + Controller->ReadControllerConfiguration = + DAC960_V2_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V2_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V2_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V2_QueueReadWriteCommand; + break; + case DAC960_LP_Controller: + DAC960_LP_DisableInterrupts(BaseAddress); + DAC960_LP_AcknowledgeHardwareMailboxStatus(BaseAddress); + udelay(1000); + while (DAC960_LP_InitializationInProgressP(BaseAddress)) + { + if (DAC960_LP_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V2_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to Enable Memory Mailbox Interface " + "for Controller at\n", Controller); + goto Failure; + } + DAC960_LP_EnableInterrupts(BaseAddress); + Controller->QueueCommand = DAC960_LP_QueueCommand; + Controller->ReadControllerConfiguration = + DAC960_V2_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V2_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V2_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V2_QueueReadWriteCommand; + break; + case DAC960_LA_Controller: + DAC960_LA_DisableInterrupts(BaseAddress); + DAC960_LA_AcknowledgeHardwareMailboxStatus(BaseAddress); + udelay(1000); + while (DAC960_LA_InitializationInProgressP(BaseAddress)) + { + if (DAC960_LA_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V1_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to Enable Memory Mailbox Interface " + "for Controller at\n", Controller); + goto Failure; + } + DAC960_LA_EnableInterrupts(BaseAddress); + if (Controller->V1.DualModeMemoryMailboxInterface) + Controller->QueueCommand = DAC960_LA_QueueCommandDualMode; + else Controller->QueueCommand = DAC960_LA_QueueCommandSingleMode; + Controller->ReadControllerConfiguration = + DAC960_V1_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V1_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V1_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V1_QueueReadWriteCommand; + break; + case DAC960_PG_Controller: + DAC960_PG_DisableInterrupts(BaseAddress); + DAC960_PG_AcknowledgeHardwareMailboxStatus(BaseAddress); + udelay(1000); + while (DAC960_PG_InitializationInProgressP(BaseAddress)) + { + if (DAC960_PG_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V1_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to Enable Memory Mailbox Interface " + "for Controller at\n", Controller); + goto Failure; + } + DAC960_PG_EnableInterrupts(BaseAddress); + if (Controller->V1.DualModeMemoryMailboxInterface) + Controller->QueueCommand = DAC960_PG_QueueCommandDualMode; + else Controller->QueueCommand = DAC960_PG_QueueCommandSingleMode; + Controller->ReadControllerConfiguration = + DAC960_V1_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V1_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V1_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V1_QueueReadWriteCommand; + break; + case DAC960_PD_Controller: + if (!request_region(Controller->IO_Address, 0x80, + Controller->FullModelName)) { + DAC960_Error("IO port 0x%d busy for Controller at\n", + Controller, Controller->IO_Address); + goto Failure; + } + DAC960_PD_DisableInterrupts(BaseAddress); + DAC960_PD_AcknowledgeStatus(BaseAddress); + udelay(1000); + while (DAC960_PD_InitializationInProgressP(BaseAddress)) + { + if (DAC960_PD_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V1_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to allocate DMA mapped memory " + "for Controller at\n", Controller); + goto Failure; + } + DAC960_PD_EnableInterrupts(BaseAddress); + Controller->QueueCommand = DAC960_PD_QueueCommand; + Controller->ReadControllerConfiguration = + DAC960_V1_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V1_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V1_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V1_QueueReadWriteCommand; + break; + case DAC960_P_Controller: + if (!request_region(Controller->IO_Address, 0x80, + Controller->FullModelName)){ + DAC960_Error("IO port 0x%d busy for Controller at\n", + Controller, Controller->IO_Address); + goto Failure; + } + DAC960_PD_DisableInterrupts(BaseAddress); + DAC960_PD_AcknowledgeStatus(BaseAddress); + udelay(1000); + while (DAC960_PD_InitializationInProgressP(BaseAddress)) + { + if (DAC960_PD_ReadErrorStatus(BaseAddress, &ErrorStatus, + &Parameter0, &Parameter1) && + DAC960_ReportErrorStatus(Controller, ErrorStatus, + Parameter0, Parameter1)) + goto Failure; + udelay(10); + } + if (!DAC960_V1_EnableMemoryMailboxInterface(Controller)) + { + DAC960_Error("Unable to allocate DMA mapped memory" + "for Controller at\n", Controller); + goto Failure; + } + DAC960_PD_EnableInterrupts(BaseAddress); + Controller->QueueCommand = DAC960_P_QueueCommand; + Controller->ReadControllerConfiguration = + DAC960_V1_ReadControllerConfiguration; + Controller->ReadDeviceConfiguration = + DAC960_V1_ReadDeviceConfiguration; + Controller->ReportDeviceConfiguration = + DAC960_V1_ReportDeviceConfiguration; + Controller->QueueReadWriteCommand = + DAC960_V1_QueueReadWriteCommand; + break; + } + /* + Acquire shared access to the IRQ Channel. + */ + IRQ_Channel = PCI_Device->irq; + if (request_irq(IRQ_Channel, InterruptHandler, IRQF_SHARED, + Controller->FullModelName, Controller) < 0) + { + DAC960_Error("Unable to acquire IRQ Channel %d for Controller at\n", + Controller, Controller->IRQ_Channel); + goto Failure; + } + Controller->IRQ_Channel = IRQ_Channel; + Controller->InitialCommand.CommandIdentifier = 1; + Controller->InitialCommand.Controller = Controller; + Controller->Commands[0] = &Controller->InitialCommand; + Controller->FreeCommands = &Controller->InitialCommand; + return Controller; + +Failure: + if (Controller->IO_Address == 0) + DAC960_Error("PCI Bus %d Device %d Function %d I/O Address N/A " + "PCI Address 0x%X\n", Controller, + Controller->Bus, Controller->Device, + Controller->Function, Controller->PCI_Address); + else + DAC960_Error("PCI Bus %d Device %d Function %d I/O Address " + "0x%X PCI Address 0x%X\n", Controller, + Controller->Bus, Controller->Device, + Controller->Function, Controller->IO_Address, + Controller->PCI_Address); + DAC960_DetectCleanup(Controller); + DAC960_ControllerCount--; + return NULL; +} + +/* + DAC960_InitializeController initializes Controller. +*/ + +static bool +DAC960_InitializeController(DAC960_Controller_T *Controller) +{ + if (DAC960_ReadControllerConfiguration(Controller) && + DAC960_ReportControllerConfiguration(Controller) && + DAC960_CreateAuxiliaryStructures(Controller) && + DAC960_ReadDeviceConfiguration(Controller) && + DAC960_ReportDeviceConfiguration(Controller) && + DAC960_RegisterBlockDevice(Controller)) + { + /* + Initialize the Monitoring Timer. + */ + init_timer(&Controller->MonitoringTimer); + Controller->MonitoringTimer.expires = + jiffies + DAC960_MonitoringTimerInterval; + Controller->MonitoringTimer.data = (unsigned long) Controller; + Controller->MonitoringTimer.function = DAC960_MonitoringTimerFunction; + add_timer(&Controller->MonitoringTimer); + Controller->ControllerInitialized = true; + return true; + } + return false; +} + + +/* + DAC960_FinalizeController finalizes Controller. +*/ + +static void DAC960_FinalizeController(DAC960_Controller_T *Controller) +{ + if (Controller->ControllerInitialized) + { + unsigned long flags; + + /* + * Acquiring and releasing lock here eliminates + * a very low probability race. + * + * The code below allocates controller command structures + * from the free list without holding the controller lock. + * This is safe assuming there is no other activity on + * the controller at the time. + * + * But, there might be a monitoring command still + * in progress. Setting the Shutdown flag while holding + * the lock ensures that there is no monitoring command + * in the interrupt handler currently, and any monitoring + * commands that complete from this time on will NOT return + * their command structure to the free list. + */ + + spin_lock_irqsave(&Controller->queue_lock, flags); + Controller->ShutdownMonitoringTimer = 1; + spin_unlock_irqrestore(&Controller->queue_lock, flags); + + del_timer_sync(&Controller->MonitoringTimer); + if (Controller->FirmwareType == DAC960_V1_Controller) + { + DAC960_Notice("Flushing Cache...", Controller); + DAC960_V1_ExecuteType3(Controller, DAC960_V1_Flush, 0); + DAC960_Notice("done\n", Controller); + + if (Controller->HardwareType == DAC960_PD_Controller) + release_region(Controller->IO_Address, 0x80); + } + else + { + DAC960_Notice("Flushing Cache...", Controller); + DAC960_V2_DeviceOperation(Controller, DAC960_V2_PauseDevice, + DAC960_V2_RAID_Controller); + DAC960_Notice("done\n", Controller); + } + } + DAC960_UnregisterBlockDevice(Controller); + DAC960_DestroyAuxiliaryStructures(Controller); + DAC960_DestroyProcEntries(Controller); + DAC960_DetectCleanup(Controller); +} + + +/* + DAC960_Probe verifies controller's existence and + initializes the DAC960 Driver for that controller. +*/ + +static int +DAC960_Probe(struct pci_dev *dev, const struct pci_device_id *entry) +{ + int disk; + DAC960_Controller_T *Controller; + + if (DAC960_ControllerCount == DAC960_MaxControllers) + { + DAC960_Error("More than %d DAC960 Controllers detected - " + "ignoring from Controller at\n", + NULL, DAC960_MaxControllers); + return -ENODEV; + } + + Controller = DAC960_DetectController(dev, entry); + if (!Controller) + return -ENODEV; + + if (!DAC960_InitializeController(Controller)) { + DAC960_FinalizeController(Controller); + return -ENODEV; + } + + for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++) { + set_capacity(Controller->disks[disk], disk_size(Controller, disk)); + add_disk(Controller->disks[disk]); + } + DAC960_CreateProcEntries(Controller); + return 0; +} + + +/* + DAC960_Finalize finalizes the DAC960 Driver. +*/ + +static void DAC960_Remove(struct pci_dev *PCI_Device) +{ + int Controller_Number = (long)pci_get_drvdata(PCI_Device); + DAC960_Controller_T *Controller = DAC960_Controllers[Controller_Number]; + if (Controller != NULL) + DAC960_FinalizeController(Controller); +} + + +/* + DAC960_V1_QueueReadWriteCommand prepares and queues a Read/Write Command for + DAC960 V1 Firmware Controllers. +*/ + +static void DAC960_V1_QueueReadWriteCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_ScatterGatherSegment_T *ScatterGatherList = + Command->V1.ScatterGatherList; + struct scatterlist *ScatterList = Command->V1.ScatterList; + + DAC960_V1_ClearCommand(Command); + + if (Command->SegmentCount == 1) + { + if (Command->DmaDirection == PCI_DMA_FROMDEVICE) + CommandMailbox->Type5.CommandOpcode = DAC960_V1_Read; + else + CommandMailbox->Type5.CommandOpcode = DAC960_V1_Write; + + CommandMailbox->Type5.LD.TransferLength = Command->BlockCount; + CommandMailbox->Type5.LD.LogicalDriveNumber = Command->LogicalDriveNumber; + CommandMailbox->Type5.LogicalBlockAddress = Command->BlockNumber; + CommandMailbox->Type5.BusAddress = + (DAC960_BusAddress32_T)sg_dma_address(ScatterList); + } + else + { + int i; + + if (Command->DmaDirection == PCI_DMA_FROMDEVICE) + CommandMailbox->Type5.CommandOpcode = DAC960_V1_ReadWithScatterGather; + else + CommandMailbox->Type5.CommandOpcode = DAC960_V1_WriteWithScatterGather; + + CommandMailbox->Type5.LD.TransferLength = Command->BlockCount; + CommandMailbox->Type5.LD.LogicalDriveNumber = Command->LogicalDriveNumber; + CommandMailbox->Type5.LogicalBlockAddress = Command->BlockNumber; + CommandMailbox->Type5.BusAddress = Command->V1.ScatterGatherListDMA; + + CommandMailbox->Type5.ScatterGatherCount = Command->SegmentCount; + + for (i = 0; i < Command->SegmentCount; i++, ScatterList++, ScatterGatherList++) { + ScatterGatherList->SegmentDataPointer = + (DAC960_BusAddress32_T)sg_dma_address(ScatterList); + ScatterGatherList->SegmentByteCount = + (DAC960_ByteCount32_T)sg_dma_len(ScatterList); + } + } + DAC960_QueueCommand(Command); +} + + +/* + DAC960_V2_QueueReadWriteCommand prepares and queues a Read/Write Command for + DAC960 V2 Firmware Controllers. +*/ + +static void DAC960_V2_QueueReadWriteCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + struct scatterlist *ScatterList = Command->V2.ScatterList; + + DAC960_V2_ClearCommand(Command); + + CommandMailbox->SCSI_10.CommandOpcode = DAC960_V2_SCSI_10; + CommandMailbox->SCSI_10.CommandControlBits.DataTransferControllerToHost = + (Command->DmaDirection == PCI_DMA_FROMDEVICE); + CommandMailbox->SCSI_10.DataTransferSize = + Command->BlockCount << DAC960_BlockSizeBits; + CommandMailbox->SCSI_10.RequestSenseBusAddress = Command->V2.RequestSenseDMA; + CommandMailbox->SCSI_10.PhysicalDevice = + Controller->V2.LogicalDriveToVirtualDevice[Command->LogicalDriveNumber]; + CommandMailbox->SCSI_10.RequestSenseSize = sizeof(DAC960_SCSI_RequestSense_T); + CommandMailbox->SCSI_10.CDBLength = 10; + CommandMailbox->SCSI_10.SCSI_CDB[0] = + (Command->DmaDirection == PCI_DMA_FROMDEVICE ? 0x28 : 0x2A); + CommandMailbox->SCSI_10.SCSI_CDB[2] = Command->BlockNumber >> 24; + CommandMailbox->SCSI_10.SCSI_CDB[3] = Command->BlockNumber >> 16; + CommandMailbox->SCSI_10.SCSI_CDB[4] = Command->BlockNumber >> 8; + CommandMailbox->SCSI_10.SCSI_CDB[5] = Command->BlockNumber; + CommandMailbox->SCSI_10.SCSI_CDB[7] = Command->BlockCount >> 8; + CommandMailbox->SCSI_10.SCSI_CDB[8] = Command->BlockCount; + + if (Command->SegmentCount == 1) + { + CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + (DAC960_BusAddress64_T)sg_dma_address(ScatterList); + CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->SCSI_10.DataTransferSize; + } + else + { + DAC960_V2_ScatterGatherSegment_T *ScatterGatherList; + int i; + + if (Command->SegmentCount > 2) + { + ScatterGatherList = Command->V2.ScatterGatherList; + CommandMailbox->SCSI_10.CommandControlBits + .AdditionalScatterGatherListMemory = true; + CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ExtendedScatterGather.ScatterGatherList0Length = Command->SegmentCount; + CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ExtendedScatterGather.ScatterGatherList0Address = + Command->V2.ScatterGatherListDMA; + } + else + ScatterGatherList = CommandMailbox->SCSI_10.DataTransferMemoryAddress + .ScatterGatherSegments; + + for (i = 0; i < Command->SegmentCount; i++, ScatterList++, ScatterGatherList++) { + ScatterGatherList->SegmentDataPointer = + (DAC960_BusAddress64_T)sg_dma_address(ScatterList); + ScatterGatherList->SegmentByteCount = + (DAC960_ByteCount64_T)sg_dma_len(ScatterList); + } + } + DAC960_QueueCommand(Command); +} + + +static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_queue *req_q) +{ + struct request *Request; + DAC960_Command_T *Command; + + while(1) { + Request = blk_peek_request(req_q); + if (!Request) + return 1; + + Command = DAC960_AllocateCommand(Controller); + if (Command == NULL) + return 0; + + if (rq_data_dir(Request) == READ) { + Command->DmaDirection = PCI_DMA_FROMDEVICE; + Command->CommandType = DAC960_ReadCommand; + } else { + Command->DmaDirection = PCI_DMA_TODEVICE; + Command->CommandType = DAC960_WriteCommand; + } + Command->Completion = Request->end_io_data; + Command->LogicalDriveNumber = (long)Request->rq_disk->private_data; + Command->BlockNumber = blk_rq_pos(Request); + Command->BlockCount = blk_rq_sectors(Request); + Command->Request = Request; + blk_start_request(Request); + Command->SegmentCount = blk_rq_map_sg(req_q, + Command->Request, Command->cmd_sglist); + /* pci_map_sg MAY change the value of SegCount */ + Command->SegmentCount = pci_map_sg(Controller->PCIDevice, Command->cmd_sglist, + Command->SegmentCount, Command->DmaDirection); + + DAC960_QueueReadWriteCommand(Command); + } +} + +/* + DAC960_ProcessRequest attempts to remove one I/O Request from Controller's + I/O Request Queue and queues it to the Controller. WaitForCommand is true if + this function should wait for a Command to become available if necessary. + This function returns true if an I/O Request was queued and false otherwise. +*/ +static void DAC960_ProcessRequest(DAC960_Controller_T *controller) +{ + int i; + + if (!controller->ControllerInitialized) + return; + + /* Do this better later! */ + for (i = controller->req_q_index; i < DAC960_MaxLogicalDrives; i++) { + struct request_queue *req_q = controller->RequestQueue[i]; + + if (req_q == NULL) + continue; + + if (!DAC960_process_queue(controller, req_q)) { + controller->req_q_index = i; + return; + } + } + + if (controller->req_q_index == 0) + return; + + for (i = 0; i < controller->req_q_index; i++) { + struct request_queue *req_q = controller->RequestQueue[i]; + + if (req_q == NULL) + continue; + + if (!DAC960_process_queue(controller, req_q)) { + controller->req_q_index = i; + return; + } + } +} + + +/* + DAC960_queue_partial_rw extracts one bio from the request already + associated with argument command, and construct a new command block to retry I/O + only on that bio. Queue that command to the controller. + + This function re-uses a previously-allocated Command, + there is no failure mode from trying to allocate a command. +*/ + +static void DAC960_queue_partial_rw(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + struct request *Request = Command->Request; + struct request_queue *req_q = Controller->RequestQueue[Command->LogicalDriveNumber]; + + if (Command->DmaDirection == PCI_DMA_FROMDEVICE) + Command->CommandType = DAC960_ReadRetryCommand; + else + Command->CommandType = DAC960_WriteRetryCommand; + + /* + * We could be more efficient with these mapping requests + * and map only the portions that we need. But since this + * code should almost never be called, just go with a + * simple coding. + */ + (void)blk_rq_map_sg(req_q, Command->Request, Command->cmd_sglist); + + (void)pci_map_sg(Controller->PCIDevice, Command->cmd_sglist, 1, Command->DmaDirection); + /* + * Resubmitting the request sector at a time is really tedious. + * But, this should almost never happen. So, we're willing to pay + * this price so that in the end, as much of the transfer is completed + * successfully as possible. + */ + Command->SegmentCount = 1; + Command->BlockNumber = blk_rq_pos(Request); + Command->BlockCount = 1; + DAC960_QueueReadWriteCommand(Command); + return; +} + +/* + DAC960_RequestFunction is the I/O Request Function for DAC960 Controllers. +*/ + +static void DAC960_RequestFunction(struct request_queue *RequestQueue) +{ + DAC960_ProcessRequest(RequestQueue->queuedata); +} + +/* + DAC960_ProcessCompletedBuffer performs completion processing for an + individual Buffer. +*/ + +static inline bool DAC960_ProcessCompletedRequest(DAC960_Command_T *Command, + bool SuccessfulIO) +{ + struct request *Request = Command->Request; + int Error = SuccessfulIO ? 0 : -EIO; + + pci_unmap_sg(Command->Controller->PCIDevice, Command->cmd_sglist, + Command->SegmentCount, Command->DmaDirection); + + if (!__blk_end_request(Request, Error, Command->BlockCount << 9)) { + if (Command->Completion) { + complete(Command->Completion); + Command->Completion = NULL; + } + return true; + } + return false; +} + +/* + DAC960_V1_ReadWriteError prints an appropriate error message for Command + when an error occurs on a Read or Write operation. +*/ + +static void DAC960_V1_ReadWriteError(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + unsigned char *CommandName = "UNKNOWN"; + switch (Command->CommandType) + { + case DAC960_ReadCommand: + case DAC960_ReadRetryCommand: + CommandName = "READ"; + break; + case DAC960_WriteCommand: + case DAC960_WriteRetryCommand: + CommandName = "WRITE"; + break; + case DAC960_MonitoringCommand: + case DAC960_ImmediateCommand: + case DAC960_QueuedCommand: + break; + } + switch (Command->V1.CommandStatus) + { + case DAC960_V1_IrrecoverableDataError: + DAC960_Error("Irrecoverable Data Error on %s:\n", + Controller, CommandName); + break; + case DAC960_V1_LogicalDriveNonexistentOrOffline: + DAC960_Error("Logical Drive Nonexistent or Offline on %s:\n", + Controller, CommandName); + break; + case DAC960_V1_AccessBeyondEndOfLogicalDrive: + DAC960_Error("Attempt to Access Beyond End of Logical Drive " + "on %s:\n", Controller, CommandName); + break; + case DAC960_V1_BadDataEncountered: + DAC960_Error("Bad Data Encountered on %s:\n", Controller, CommandName); + break; + default: + DAC960_Error("Unexpected Error Status %04X on %s:\n", + Controller, Command->V1.CommandStatus, CommandName); + break; + } + DAC960_Error(" /dev/rd/c%dd%d: absolute blocks %u..%u\n", + Controller, Controller->ControllerNumber, + Command->LogicalDriveNumber, Command->BlockNumber, + Command->BlockNumber + Command->BlockCount - 1); +} + + +/* + DAC960_V1_ProcessCompletedCommand performs completion processing for Command + for DAC960 V1 Firmware Controllers. +*/ + +static void DAC960_V1_ProcessCompletedCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DAC960_CommandType_T CommandType = Command->CommandType; + DAC960_V1_CommandOpcode_T CommandOpcode = + Command->V1.CommandMailbox.Common.CommandOpcode; + DAC960_V1_CommandStatus_T CommandStatus = Command->V1.CommandStatus; + + if (CommandType == DAC960_ReadCommand || + CommandType == DAC960_WriteCommand) + { + +#ifdef FORCE_RETRY_DEBUG + CommandStatus = DAC960_V1_IrrecoverableDataError; +#endif + + if (CommandStatus == DAC960_V1_NormalCompletion) { + + if (!DAC960_ProcessCompletedRequest(Command, true)) + BUG(); + + } else if (CommandStatus == DAC960_V1_IrrecoverableDataError || + CommandStatus == DAC960_V1_BadDataEncountered) + { + /* + * break the command down into pieces and resubmit each + * piece, hoping that some of them will succeed. + */ + DAC960_queue_partial_rw(Command); + return; + } + else + { + if (CommandStatus != DAC960_V1_LogicalDriveNonexistentOrOffline) + DAC960_V1_ReadWriteError(Command); + + if (!DAC960_ProcessCompletedRequest(Command, false)) + BUG(); + } + } + else if (CommandType == DAC960_ReadRetryCommand || + CommandType == DAC960_WriteRetryCommand) + { + bool normal_completion; +#ifdef FORCE_RETRY_FAILURE_DEBUG + static int retry_count = 1; +#endif + /* + Perform completion processing for the portion that was + retried, and submit the next portion, if any. + */ + normal_completion = true; + if (CommandStatus != DAC960_V1_NormalCompletion) { + normal_completion = false; + if (CommandStatus != DAC960_V1_LogicalDriveNonexistentOrOffline) + DAC960_V1_ReadWriteError(Command); + } + +#ifdef FORCE_RETRY_FAILURE_DEBUG + if (!(++retry_count % 10000)) { + printk("V1 error retry failure test\n"); + normal_completion = false; + DAC960_V1_ReadWriteError(Command); + } +#endif + + if (!DAC960_ProcessCompletedRequest(Command, normal_completion)) { + DAC960_queue_partial_rw(Command); + return; + } + } + + else if (CommandType == DAC960_MonitoringCommand) + { + if (Controller->ShutdownMonitoringTimer) + return; + if (CommandOpcode == DAC960_V1_Enquiry) + { + DAC960_V1_Enquiry_T *OldEnquiry = &Controller->V1.Enquiry; + DAC960_V1_Enquiry_T *NewEnquiry = Controller->V1.NewEnquiry; + unsigned int OldCriticalLogicalDriveCount = + OldEnquiry->CriticalLogicalDriveCount; + unsigned int NewCriticalLogicalDriveCount = + NewEnquiry->CriticalLogicalDriveCount; + if (NewEnquiry->NumberOfLogicalDrives > Controller->LogicalDriveCount) + { + int LogicalDriveNumber = Controller->LogicalDriveCount - 1; + while (++LogicalDriveNumber < NewEnquiry->NumberOfLogicalDrives) + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "Now Exists\n", Controller, + LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + Controller->LogicalDriveCount = NewEnquiry->NumberOfLogicalDrives; + DAC960_ComputeGenericDiskInfo(Controller); + } + if (NewEnquiry->NumberOfLogicalDrives < Controller->LogicalDriveCount) + { + int LogicalDriveNumber = NewEnquiry->NumberOfLogicalDrives - 1; + while (++LogicalDriveNumber < Controller->LogicalDriveCount) + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "No Longer Exists\n", Controller, + LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + Controller->LogicalDriveCount = NewEnquiry->NumberOfLogicalDrives; + DAC960_ComputeGenericDiskInfo(Controller); + } + if (NewEnquiry->StatusFlags.DeferredWriteError != + OldEnquiry->StatusFlags.DeferredWriteError) + DAC960_Critical("Deferred Write Error Flag is now %s\n", Controller, + (NewEnquiry->StatusFlags.DeferredWriteError + ? "TRUE" : "FALSE")); + if ((NewCriticalLogicalDriveCount > 0 || + NewCriticalLogicalDriveCount != OldCriticalLogicalDriveCount) || + (NewEnquiry->OfflineLogicalDriveCount > 0 || + NewEnquiry->OfflineLogicalDriveCount != + OldEnquiry->OfflineLogicalDriveCount) || + (NewEnquiry->DeadDriveCount > 0 || + NewEnquiry->DeadDriveCount != + OldEnquiry->DeadDriveCount) || + (NewEnquiry->EventLogSequenceNumber != + OldEnquiry->EventLogSequenceNumber) || + Controller->MonitoringTimerCount == 0 || + time_after_eq(jiffies, Controller->SecondaryMonitoringTime + + DAC960_SecondaryMonitoringInterval)) + { + Controller->V1.NeedLogicalDriveInformation = true; + Controller->V1.NewEventLogSequenceNumber = + NewEnquiry->EventLogSequenceNumber; + Controller->V1.NeedErrorTableInformation = true; + Controller->V1.NeedDeviceStateInformation = true; + Controller->V1.StartDeviceStateScan = true; + Controller->V1.NeedBackgroundInitializationStatus = + Controller->V1.BackgroundInitializationStatusSupported; + Controller->SecondaryMonitoringTime = jiffies; + } + if (NewEnquiry->RebuildFlag == DAC960_V1_StandbyRebuildInProgress || + NewEnquiry->RebuildFlag + == DAC960_V1_BackgroundRebuildInProgress || + OldEnquiry->RebuildFlag == DAC960_V1_StandbyRebuildInProgress || + OldEnquiry->RebuildFlag == DAC960_V1_BackgroundRebuildInProgress) + { + Controller->V1.NeedRebuildProgress = true; + Controller->V1.RebuildProgressFirst = + (NewEnquiry->CriticalLogicalDriveCount < + OldEnquiry->CriticalLogicalDriveCount); + } + if (OldEnquiry->RebuildFlag == DAC960_V1_BackgroundCheckInProgress) + switch (NewEnquiry->RebuildFlag) + { + case DAC960_V1_NoStandbyRebuildOrCheckInProgress: + DAC960_Progress("Consistency Check Completed Successfully\n", + Controller); + break; + case DAC960_V1_StandbyRebuildInProgress: + case DAC960_V1_BackgroundRebuildInProgress: + break; + case DAC960_V1_BackgroundCheckInProgress: + Controller->V1.NeedConsistencyCheckProgress = true; + break; + case DAC960_V1_StandbyRebuildCompletedWithError: + DAC960_Progress("Consistency Check Completed with Error\n", + Controller); + break; + case DAC960_V1_BackgroundRebuildOrCheckFailed_DriveFailed: + DAC960_Progress("Consistency Check Failed - " + "Physical Device Failed\n", Controller); + break; + case DAC960_V1_BackgroundRebuildOrCheckFailed_LogicalDriveFailed: + DAC960_Progress("Consistency Check Failed - " + "Logical Drive Failed\n", Controller); + break; + case DAC960_V1_BackgroundRebuildOrCheckFailed_OtherCauses: + DAC960_Progress("Consistency Check Failed - Other Causes\n", + Controller); + break; + case DAC960_V1_BackgroundRebuildOrCheckSuccessfullyTerminated: + DAC960_Progress("Consistency Check Successfully Terminated\n", + Controller); + break; + } + else if (NewEnquiry->RebuildFlag + == DAC960_V1_BackgroundCheckInProgress) + Controller->V1.NeedConsistencyCheckProgress = true; + Controller->MonitoringAlertMode = + (NewEnquiry->CriticalLogicalDriveCount > 0 || + NewEnquiry->OfflineLogicalDriveCount > 0 || + NewEnquiry->DeadDriveCount > 0); + if (NewEnquiry->RebuildFlag > DAC960_V1_BackgroundCheckInProgress) + { + Controller->V1.PendingRebuildFlag = NewEnquiry->RebuildFlag; + Controller->V1.RebuildFlagPending = true; + } + memcpy(&Controller->V1.Enquiry, &Controller->V1.NewEnquiry, + sizeof(DAC960_V1_Enquiry_T)); + } + else if (CommandOpcode == DAC960_V1_PerformEventLogOperation) + { + static char + *DAC960_EventMessages[] = + { "killed because write recovery failed", + "killed because of SCSI bus reset failure", + "killed because of double check condition", + "killed because it was removed", + "killed because of gross error on SCSI chip", + "killed because of bad tag returned from drive", + "killed because of timeout on SCSI command", + "killed because of reset SCSI command issued from system", + "killed because busy or parity error count exceeded limit", + "killed because of 'kill drive' command from system", + "killed because of selection timeout", + "killed due to SCSI phase sequence error", + "killed due to unknown status" }; + DAC960_V1_EventLogEntry_T *EventLogEntry = + Controller->V1.EventLogEntry; + if (EventLogEntry->SequenceNumber == + Controller->V1.OldEventLogSequenceNumber) + { + unsigned char SenseKey = EventLogEntry->SenseKey; + unsigned char AdditionalSenseCode = + EventLogEntry->AdditionalSenseCode; + unsigned char AdditionalSenseCodeQualifier = + EventLogEntry->AdditionalSenseCodeQualifier; + if (SenseKey == DAC960_SenseKey_VendorSpecific && + AdditionalSenseCode == 0x80 && + AdditionalSenseCodeQualifier < + ARRAY_SIZE(DAC960_EventMessages)) + DAC960_Critical("Physical Device %d:%d %s\n", Controller, + EventLogEntry->Channel, + EventLogEntry->TargetID, + DAC960_EventMessages[ + AdditionalSenseCodeQualifier]); + else if (SenseKey == DAC960_SenseKey_UnitAttention && + AdditionalSenseCode == 0x29) + { + if (Controller->MonitoringTimerCount > 0) + Controller->V1.DeviceResetCount[EventLogEntry->Channel] + [EventLogEntry->TargetID]++; + } + else if (!(SenseKey == DAC960_SenseKey_NoSense || + (SenseKey == DAC960_SenseKey_NotReady && + AdditionalSenseCode == 0x04 && + (AdditionalSenseCodeQualifier == 0x01 || + AdditionalSenseCodeQualifier == 0x02)))) + { + DAC960_Critical("Physical Device %d:%d Error Log: " + "Sense Key = %X, ASC = %02X, ASCQ = %02X\n", + Controller, + EventLogEntry->Channel, + EventLogEntry->TargetID, + SenseKey, + AdditionalSenseCode, + AdditionalSenseCodeQualifier); + DAC960_Critical("Physical Device %d:%d Error Log: " + "Information = %02X%02X%02X%02X " + "%02X%02X%02X%02X\n", + Controller, + EventLogEntry->Channel, + EventLogEntry->TargetID, + EventLogEntry->Information[0], + EventLogEntry->Information[1], + EventLogEntry->Information[2], + EventLogEntry->Information[3], + EventLogEntry->CommandSpecificInformation[0], + EventLogEntry->CommandSpecificInformation[1], + EventLogEntry->CommandSpecificInformation[2], + EventLogEntry->CommandSpecificInformation[3]); + } + } + Controller->V1.OldEventLogSequenceNumber++; + } + else if (CommandOpcode == DAC960_V1_GetErrorTable) + { + DAC960_V1_ErrorTable_T *OldErrorTable = &Controller->V1.ErrorTable; + DAC960_V1_ErrorTable_T *NewErrorTable = Controller->V1.NewErrorTable; + int Channel, TargetID; + for (Channel = 0; Channel < Controller->Channels; Channel++) + for (TargetID = 0; TargetID < Controller->Targets; TargetID++) + { + DAC960_V1_ErrorTableEntry_T *NewErrorEntry = + &NewErrorTable->ErrorTableEntries[Channel][TargetID]; + DAC960_V1_ErrorTableEntry_T *OldErrorEntry = + &OldErrorTable->ErrorTableEntries[Channel][TargetID]; + if ((NewErrorEntry->ParityErrorCount != + OldErrorEntry->ParityErrorCount) || + (NewErrorEntry->SoftErrorCount != + OldErrorEntry->SoftErrorCount) || + (NewErrorEntry->HardErrorCount != + OldErrorEntry->HardErrorCount) || + (NewErrorEntry->MiscErrorCount != + OldErrorEntry->MiscErrorCount)) + DAC960_Critical("Physical Device %d:%d Errors: " + "Parity = %d, Soft = %d, " + "Hard = %d, Misc = %d\n", + Controller, Channel, TargetID, + NewErrorEntry->ParityErrorCount, + NewErrorEntry->SoftErrorCount, + NewErrorEntry->HardErrorCount, + NewErrorEntry->MiscErrorCount); + } + memcpy(&Controller->V1.ErrorTable, Controller->V1.NewErrorTable, + sizeof(DAC960_V1_ErrorTable_T)); + } + else if (CommandOpcode == DAC960_V1_GetDeviceState) + { + DAC960_V1_DeviceState_T *OldDeviceState = + &Controller->V1.DeviceState[Controller->V1.DeviceStateChannel] + [Controller->V1.DeviceStateTargetID]; + DAC960_V1_DeviceState_T *NewDeviceState = + Controller->V1.NewDeviceState; + if (NewDeviceState->DeviceState != OldDeviceState->DeviceState) + DAC960_Critical("Physical Device %d:%d is now %s\n", Controller, + Controller->V1.DeviceStateChannel, + Controller->V1.DeviceStateTargetID, + (NewDeviceState->DeviceState + == DAC960_V1_Device_Dead + ? "DEAD" + : NewDeviceState->DeviceState + == DAC960_V1_Device_WriteOnly + ? "WRITE-ONLY" + : NewDeviceState->DeviceState + == DAC960_V1_Device_Online + ? "ONLINE" : "STANDBY")); + if (OldDeviceState->DeviceState == DAC960_V1_Device_Dead && + NewDeviceState->DeviceState != DAC960_V1_Device_Dead) + { + Controller->V1.NeedDeviceInquiryInformation = true; + Controller->V1.NeedDeviceSerialNumberInformation = true; + Controller->V1.DeviceResetCount + [Controller->V1.DeviceStateChannel] + [Controller->V1.DeviceStateTargetID] = 0; + } + memcpy(OldDeviceState, NewDeviceState, + sizeof(DAC960_V1_DeviceState_T)); + } + else if (CommandOpcode == DAC960_V1_GetLogicalDriveInformation) + { + int LogicalDriveNumber; + for (LogicalDriveNumber = 0; + LogicalDriveNumber < Controller->LogicalDriveCount; + LogicalDriveNumber++) + { + DAC960_V1_LogicalDriveInformation_T *OldLogicalDriveInformation = + &Controller->V1.LogicalDriveInformation[LogicalDriveNumber]; + DAC960_V1_LogicalDriveInformation_T *NewLogicalDriveInformation = + &(*Controller->V1.NewLogicalDriveInformation)[LogicalDriveNumber]; + if (NewLogicalDriveInformation->LogicalDriveState != + OldLogicalDriveInformation->LogicalDriveState) + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "is now %s\n", Controller, + LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (NewLogicalDriveInformation->LogicalDriveState + == DAC960_V1_LogicalDrive_Online + ? "ONLINE" + : NewLogicalDriveInformation->LogicalDriveState + == DAC960_V1_LogicalDrive_Critical + ? "CRITICAL" : "OFFLINE")); + if (NewLogicalDriveInformation->WriteBack != + OldLogicalDriveInformation->WriteBack) + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "is now %s\n", Controller, + LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (NewLogicalDriveInformation->WriteBack + ? "WRITE BACK" : "WRITE THRU")); + } + memcpy(&Controller->V1.LogicalDriveInformation, + Controller->V1.NewLogicalDriveInformation, + sizeof(DAC960_V1_LogicalDriveInformationArray_T)); + } + else if (CommandOpcode == DAC960_V1_GetRebuildProgress) + { + unsigned int LogicalDriveNumber = + Controller->V1.RebuildProgress->LogicalDriveNumber; + unsigned int LogicalDriveSize = + Controller->V1.RebuildProgress->LogicalDriveSize; + unsigned int BlocksCompleted = + LogicalDriveSize - Controller->V1.RebuildProgress->RemainingBlocks; + if (CommandStatus == DAC960_V1_NoRebuildOrCheckInProgress && + Controller->V1.LastRebuildStatus == DAC960_V1_NormalCompletion) + CommandStatus = DAC960_V1_RebuildSuccessful; + switch (CommandStatus) + { + case DAC960_V1_NormalCompletion: + Controller->EphemeralProgressMessage = true; + DAC960_Progress("Rebuild in Progress: " + "Logical Drive %d (/dev/rd/c%dd%d) " + "%d%% completed\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (100 * (BlocksCompleted >> 7)) + / (LogicalDriveSize >> 7)); + Controller->EphemeralProgressMessage = false; + break; + case DAC960_V1_RebuildFailed_LogicalDriveFailure: + DAC960_Progress("Rebuild Failed due to " + "Logical Drive Failure\n", Controller); + break; + case DAC960_V1_RebuildFailed_BadBlocksOnOther: + DAC960_Progress("Rebuild Failed due to " + "Bad Blocks on Other Drives\n", Controller); + break; + case DAC960_V1_RebuildFailed_NewDriveFailed: + DAC960_Progress("Rebuild Failed due to " + "Failure of Drive Being Rebuilt\n", Controller); + break; + case DAC960_V1_NoRebuildOrCheckInProgress: + break; + case DAC960_V1_RebuildSuccessful: + DAC960_Progress("Rebuild Completed Successfully\n", Controller); + break; + case DAC960_V1_RebuildSuccessfullyTerminated: + DAC960_Progress("Rebuild Successfully Terminated\n", Controller); + break; + } + Controller->V1.LastRebuildStatus = CommandStatus; + if (CommandType != DAC960_MonitoringCommand && + Controller->V1.RebuildStatusPending) + { + Command->V1.CommandStatus = Controller->V1.PendingRebuildStatus; + Controller->V1.RebuildStatusPending = false; + } + else if (CommandType == DAC960_MonitoringCommand && + CommandStatus != DAC960_V1_NormalCompletion && + CommandStatus != DAC960_V1_NoRebuildOrCheckInProgress) + { + Controller->V1.PendingRebuildStatus = CommandStatus; + Controller->V1.RebuildStatusPending = true; + } + } + else if (CommandOpcode == DAC960_V1_RebuildStat) + { + unsigned int LogicalDriveNumber = + Controller->V1.RebuildProgress->LogicalDriveNumber; + unsigned int LogicalDriveSize = + Controller->V1.RebuildProgress->LogicalDriveSize; + unsigned int BlocksCompleted = + LogicalDriveSize - Controller->V1.RebuildProgress->RemainingBlocks; + if (CommandStatus == DAC960_V1_NormalCompletion) + { + Controller->EphemeralProgressMessage = true; + DAC960_Progress("Consistency Check in Progress: " + "Logical Drive %d (/dev/rd/c%dd%d) " + "%d%% completed\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (100 * (BlocksCompleted >> 7)) + / (LogicalDriveSize >> 7)); + Controller->EphemeralProgressMessage = false; + } + } + else if (CommandOpcode == DAC960_V1_BackgroundInitializationControl) + { + unsigned int LogicalDriveNumber = + Controller->V1.BackgroundInitializationStatus->LogicalDriveNumber; + unsigned int LogicalDriveSize = + Controller->V1.BackgroundInitializationStatus->LogicalDriveSize; + unsigned int BlocksCompleted = + Controller->V1.BackgroundInitializationStatus->BlocksCompleted; + switch (CommandStatus) + { + case DAC960_V1_NormalCompletion: + switch (Controller->V1.BackgroundInitializationStatus->Status) + { + case DAC960_V1_BackgroundInitializationInvalid: + break; + case DAC960_V1_BackgroundInitializationStarted: + DAC960_Progress("Background Initialization Started\n", + Controller); + break; + case DAC960_V1_BackgroundInitializationInProgress: + if (BlocksCompleted == + Controller->V1.LastBackgroundInitializationStatus. + BlocksCompleted && + LogicalDriveNumber == + Controller->V1.LastBackgroundInitializationStatus. + LogicalDriveNumber) + break; + Controller->EphemeralProgressMessage = true; + DAC960_Progress("Background Initialization in Progress: " + "Logical Drive %d (/dev/rd/c%dd%d) " + "%d%% completed\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (100 * (BlocksCompleted >> 7)) + / (LogicalDriveSize >> 7)); + Controller->EphemeralProgressMessage = false; + break; + case DAC960_V1_BackgroundInitializationSuspended: + DAC960_Progress("Background Initialization Suspended\n", + Controller); + break; + case DAC960_V1_BackgroundInitializationCancelled: + DAC960_Progress("Background Initialization Cancelled\n", + Controller); + break; + } + memcpy(&Controller->V1.LastBackgroundInitializationStatus, + Controller->V1.BackgroundInitializationStatus, + sizeof(DAC960_V1_BackgroundInitializationStatus_T)); + break; + case DAC960_V1_BackgroundInitSuccessful: + if (Controller->V1.BackgroundInitializationStatus->Status == + DAC960_V1_BackgroundInitializationInProgress) + DAC960_Progress("Background Initialization " + "Completed Successfully\n", Controller); + Controller->V1.BackgroundInitializationStatus->Status = + DAC960_V1_BackgroundInitializationInvalid; + break; + case DAC960_V1_BackgroundInitAborted: + if (Controller->V1.BackgroundInitializationStatus->Status == + DAC960_V1_BackgroundInitializationInProgress) + DAC960_Progress("Background Initialization Aborted\n", + Controller); + Controller->V1.BackgroundInitializationStatus->Status = + DAC960_V1_BackgroundInitializationInvalid; + break; + case DAC960_V1_NoBackgroundInitInProgress: + break; + } + } + else if (CommandOpcode == DAC960_V1_DCDB) + { + /* + This is a bit ugly. + + The InquiryStandardData and + the InquiryUntitSerialNumber information + retrieval operations BOTH use the DAC960_V1_DCDB + commands. the test above can't distinguish between + these two cases. + + Instead, we rely on the order of code later in this + function to ensure that DeviceInquiryInformation commands + are submitted before DeviceSerialNumber commands. + */ + if (Controller->V1.NeedDeviceInquiryInformation) + { + DAC960_SCSI_Inquiry_T *InquiryStandardData = + &Controller->V1.InquiryStandardData + [Controller->V1.DeviceStateChannel] + [Controller->V1.DeviceStateTargetID]; + if (CommandStatus != DAC960_V1_NormalCompletion) + { + memset(InquiryStandardData, 0, + sizeof(DAC960_SCSI_Inquiry_T)); + InquiryStandardData->PeripheralDeviceType = 0x1F; + } + else + memcpy(InquiryStandardData, + Controller->V1.NewInquiryStandardData, + sizeof(DAC960_SCSI_Inquiry_T)); + Controller->V1.NeedDeviceInquiryInformation = false; + } + else if (Controller->V1.NeedDeviceSerialNumberInformation) + { + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + &Controller->V1.InquiryUnitSerialNumber + [Controller->V1.DeviceStateChannel] + [Controller->V1.DeviceStateTargetID]; + if (CommandStatus != DAC960_V1_NormalCompletion) + { + memset(InquiryUnitSerialNumber, 0, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F; + } + else + memcpy(InquiryUnitSerialNumber, + Controller->V1.NewInquiryUnitSerialNumber, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + Controller->V1.NeedDeviceSerialNumberInformation = false; + } + } + /* + Begin submitting new monitoring commands. + */ + if (Controller->V1.NewEventLogSequenceNumber + - Controller->V1.OldEventLogSequenceNumber > 0) + { + Command->V1.CommandMailbox.Type3E.CommandOpcode = + DAC960_V1_PerformEventLogOperation; + Command->V1.CommandMailbox.Type3E.OperationType = + DAC960_V1_GetEventLogEntry; + Command->V1.CommandMailbox.Type3E.OperationQualifier = 1; + Command->V1.CommandMailbox.Type3E.SequenceNumber = + Controller->V1.OldEventLogSequenceNumber; + Command->V1.CommandMailbox.Type3E.BusAddress = + Controller->V1.EventLogEntryDMA; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedErrorTableInformation) + { + Controller->V1.NeedErrorTableInformation = false; + Command->V1.CommandMailbox.Type3.CommandOpcode = + DAC960_V1_GetErrorTable; + Command->V1.CommandMailbox.Type3.BusAddress = + Controller->V1.NewErrorTableDMA; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedRebuildProgress && + Controller->V1.RebuildProgressFirst) + { + Controller->V1.NeedRebuildProgress = false; + Command->V1.CommandMailbox.Type3.CommandOpcode = + DAC960_V1_GetRebuildProgress; + Command->V1.CommandMailbox.Type3.BusAddress = + Controller->V1.RebuildProgressDMA; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedDeviceStateInformation) + { + if (Controller->V1.NeedDeviceInquiryInformation) + { + DAC960_V1_DCDB_T *DCDB = Controller->V1.MonitoringDCDB; + dma_addr_t DCDB_DMA = Controller->V1.MonitoringDCDB_DMA; + + dma_addr_t NewInquiryStandardDataDMA = + Controller->V1.NewInquiryStandardDataDMA; + + Command->V1.CommandMailbox.Type3.CommandOpcode = DAC960_V1_DCDB; + Command->V1.CommandMailbox.Type3.BusAddress = DCDB_DMA; + DCDB->Channel = Controller->V1.DeviceStateChannel; + DCDB->TargetID = Controller->V1.DeviceStateTargetID; + DCDB->Direction = DAC960_V1_DCDB_DataTransferDeviceToSystem; + DCDB->EarlyStatus = false; + DCDB->Timeout = DAC960_V1_DCDB_Timeout_10_seconds; + DCDB->NoAutomaticRequestSense = false; + DCDB->DisconnectPermitted = true; + DCDB->TransferLength = sizeof(DAC960_SCSI_Inquiry_T); + DCDB->BusAddress = NewInquiryStandardDataDMA; + DCDB->CDBLength = 6; + DCDB->TransferLengthHigh4 = 0; + DCDB->SenseLength = sizeof(DCDB->SenseData); + DCDB->CDB[0] = 0x12; /* INQUIRY */ + DCDB->CDB[1] = 0; /* EVPD = 0 */ + DCDB->CDB[2] = 0; /* Page Code */ + DCDB->CDB[3] = 0; /* Reserved */ + DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_T); + DCDB->CDB[5] = 0; /* Control */ + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedDeviceSerialNumberInformation) + { + DAC960_V1_DCDB_T *DCDB = Controller->V1.MonitoringDCDB; + dma_addr_t DCDB_DMA = Controller->V1.MonitoringDCDB_DMA; + dma_addr_t NewInquiryUnitSerialNumberDMA = + Controller->V1.NewInquiryUnitSerialNumberDMA; + + Command->V1.CommandMailbox.Type3.CommandOpcode = DAC960_V1_DCDB; + Command->V1.CommandMailbox.Type3.BusAddress = DCDB_DMA; + DCDB->Channel = Controller->V1.DeviceStateChannel; + DCDB->TargetID = Controller->V1.DeviceStateTargetID; + DCDB->Direction = DAC960_V1_DCDB_DataTransferDeviceToSystem; + DCDB->EarlyStatus = false; + DCDB->Timeout = DAC960_V1_DCDB_Timeout_10_seconds; + DCDB->NoAutomaticRequestSense = false; + DCDB->DisconnectPermitted = true; + DCDB->TransferLength = + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + DCDB->BusAddress = NewInquiryUnitSerialNumberDMA; + DCDB->CDBLength = 6; + DCDB->TransferLengthHigh4 = 0; + DCDB->SenseLength = sizeof(DCDB->SenseData); + DCDB->CDB[0] = 0x12; /* INQUIRY */ + DCDB->CDB[1] = 1; /* EVPD = 1 */ + DCDB->CDB[2] = 0x80; /* Page Code */ + DCDB->CDB[3] = 0; /* Reserved */ + DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T); + DCDB->CDB[5] = 0; /* Control */ + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.StartDeviceStateScan) + { + Controller->V1.DeviceStateChannel = 0; + Controller->V1.DeviceStateTargetID = 0; + Controller->V1.StartDeviceStateScan = false; + } + else if (++Controller->V1.DeviceStateTargetID == Controller->Targets) + { + Controller->V1.DeviceStateChannel++; + Controller->V1.DeviceStateTargetID = 0; + } + if (Controller->V1.DeviceStateChannel < Controller->Channels) + { + Controller->V1.NewDeviceState->DeviceState = + DAC960_V1_Device_Dead; + Command->V1.CommandMailbox.Type3D.CommandOpcode = + DAC960_V1_GetDeviceState; + Command->V1.CommandMailbox.Type3D.Channel = + Controller->V1.DeviceStateChannel; + Command->V1.CommandMailbox.Type3D.TargetID = + Controller->V1.DeviceStateTargetID; + Command->V1.CommandMailbox.Type3D.BusAddress = + Controller->V1.NewDeviceStateDMA; + DAC960_QueueCommand(Command); + return; + } + Controller->V1.NeedDeviceStateInformation = false; + } + if (Controller->V1.NeedLogicalDriveInformation) + { + Controller->V1.NeedLogicalDriveInformation = false; + Command->V1.CommandMailbox.Type3.CommandOpcode = + DAC960_V1_GetLogicalDriveInformation; + Command->V1.CommandMailbox.Type3.BusAddress = + Controller->V1.NewLogicalDriveInformationDMA; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedRebuildProgress) + { + Controller->V1.NeedRebuildProgress = false; + Command->V1.CommandMailbox.Type3.CommandOpcode = + DAC960_V1_GetRebuildProgress; + Command->V1.CommandMailbox.Type3.BusAddress = + Controller->V1.RebuildProgressDMA; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedConsistencyCheckProgress) + { + Controller->V1.NeedConsistencyCheckProgress = false; + Command->V1.CommandMailbox.Type3.CommandOpcode = + DAC960_V1_RebuildStat; + Command->V1.CommandMailbox.Type3.BusAddress = + Controller->V1.RebuildProgressDMA; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V1.NeedBackgroundInitializationStatus) + { + Controller->V1.NeedBackgroundInitializationStatus = false; + Command->V1.CommandMailbox.Type3B.CommandOpcode = + DAC960_V1_BackgroundInitializationControl; + Command->V1.CommandMailbox.Type3B.CommandOpcode2 = 0x20; + Command->V1.CommandMailbox.Type3B.BusAddress = + Controller->V1.BackgroundInitializationStatusDMA; + DAC960_QueueCommand(Command); + return; + } + Controller->MonitoringTimerCount++; + Controller->MonitoringTimer.expires = + jiffies + DAC960_MonitoringTimerInterval; + add_timer(&Controller->MonitoringTimer); + } + if (CommandType == DAC960_ImmediateCommand) + { + complete(Command->Completion); + Command->Completion = NULL; + return; + } + if (CommandType == DAC960_QueuedCommand) + { + DAC960_V1_KernelCommand_T *KernelCommand = Command->V1.KernelCommand; + KernelCommand->CommandStatus = Command->V1.CommandStatus; + Command->V1.KernelCommand = NULL; + if (CommandOpcode == DAC960_V1_DCDB) + Controller->V1.DirectCommandActive[KernelCommand->DCDB->Channel] + [KernelCommand->DCDB->TargetID] = + false; + DAC960_DeallocateCommand(Command); + KernelCommand->CompletionFunction(KernelCommand); + return; + } + /* + Queue a Status Monitoring Command to the Controller using the just + completed Command if one was deferred previously due to lack of a + free Command when the Monitoring Timer Function was called. + */ + if (Controller->MonitoringCommandDeferred) + { + Controller->MonitoringCommandDeferred = false; + DAC960_V1_QueueMonitoringCommand(Command); + return; + } + /* + Deallocate the Command. + */ + DAC960_DeallocateCommand(Command); + /* + Wake up any processes waiting on a free Command. + */ + wake_up(&Controller->CommandWaitQueue); +} + + +/* + DAC960_V2_ReadWriteError prints an appropriate error message for Command + when an error occurs on a Read or Write operation. +*/ + +static void DAC960_V2_ReadWriteError(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + unsigned char *SenseErrors[] = { "NO SENSE", "RECOVERED ERROR", + "NOT READY", "MEDIUM ERROR", + "HARDWARE ERROR", "ILLEGAL REQUEST", + "UNIT ATTENTION", "DATA PROTECT", + "BLANK CHECK", "VENDOR-SPECIFIC", + "COPY ABORTED", "ABORTED COMMAND", + "EQUAL", "VOLUME OVERFLOW", + "MISCOMPARE", "RESERVED" }; + unsigned char *CommandName = "UNKNOWN"; + switch (Command->CommandType) + { + case DAC960_ReadCommand: + case DAC960_ReadRetryCommand: + CommandName = "READ"; + break; + case DAC960_WriteCommand: + case DAC960_WriteRetryCommand: + CommandName = "WRITE"; + break; + case DAC960_MonitoringCommand: + case DAC960_ImmediateCommand: + case DAC960_QueuedCommand: + break; + } + DAC960_Error("Error Condition %s on %s:\n", Controller, + SenseErrors[Command->V2.RequestSense->SenseKey], CommandName); + DAC960_Error(" /dev/rd/c%dd%d: absolute blocks %u..%u\n", + Controller, Controller->ControllerNumber, + Command->LogicalDriveNumber, Command->BlockNumber, + Command->BlockNumber + Command->BlockCount - 1); +} + + +/* + DAC960_V2_ReportEvent prints an appropriate message when a Controller Event + occurs. +*/ + +static void DAC960_V2_ReportEvent(DAC960_Controller_T *Controller, + DAC960_V2_Event_T *Event) +{ + DAC960_SCSI_RequestSense_T *RequestSense = + (DAC960_SCSI_RequestSense_T *) &Event->RequestSenseData; + unsigned char MessageBuffer[DAC960_LineBufferSize]; + static struct { int EventCode; unsigned char *EventMessage; } EventList[] = + { /* Physical Device Events (0x0000 - 0x007F) */ + { 0x0001, "P Online" }, + { 0x0002, "P Standby" }, + { 0x0005, "P Automatic Rebuild Started" }, + { 0x0006, "P Manual Rebuild Started" }, + { 0x0007, "P Rebuild Completed" }, + { 0x0008, "P Rebuild Cancelled" }, + { 0x0009, "P Rebuild Failed for Unknown Reasons" }, + { 0x000A, "P Rebuild Failed due to New Physical Device" }, + { 0x000B, "P Rebuild Failed due to Logical Drive Failure" }, + { 0x000C, "S Offline" }, + { 0x000D, "P Found" }, + { 0x000E, "P Removed" }, + { 0x000F, "P Unconfigured" }, + { 0x0010, "P Expand Capacity Started" }, + { 0x0011, "P Expand Capacity Completed" }, + { 0x0012, "P Expand Capacity Failed" }, + { 0x0013, "P Command Timed Out" }, + { 0x0014, "P Command Aborted" }, + { 0x0015, "P Command Retried" }, + { 0x0016, "P Parity Error" }, + { 0x0017, "P Soft Error" }, + { 0x0018, "P Miscellaneous Error" }, + { 0x0019, "P Reset" }, + { 0x001A, "P Active Spare Found" }, + { 0x001B, "P Warm Spare Found" }, + { 0x001C, "S Sense Data Received" }, + { 0x001D, "P Initialization Started" }, + { 0x001E, "P Initialization Completed" }, + { 0x001F, "P Initialization Failed" }, + { 0x0020, "P Initialization Cancelled" }, + { 0x0021, "P Failed because Write Recovery Failed" }, + { 0x0022, "P Failed because SCSI Bus Reset Failed" }, + { 0x0023, "P Failed because of Double Check Condition" }, + { 0x0024, "P Failed because Device Cannot Be Accessed" }, + { 0x0025, "P Failed because of Gross Error on SCSI Processor" }, + { 0x0026, "P Failed because of Bad Tag from Device" }, + { 0x0027, "P Failed because of Command Timeout" }, + { 0x0028, "P Failed because of System Reset" }, + { 0x0029, "P Failed because of Busy Status or Parity Error" }, + { 0x002A, "P Failed because Host Set Device to Failed State" }, + { 0x002B, "P Failed because of Selection Timeout" }, + { 0x002C, "P Failed because of SCSI Bus Phase Error" }, + { 0x002D, "P Failed because Device Returned Unknown Status" }, + { 0x002E, "P Failed because Device Not Ready" }, + { 0x002F, "P Failed because Device Not Found at Startup" }, + { 0x0030, "P Failed because COD Write Operation Failed" }, + { 0x0031, "P Failed because BDT Write Operation Failed" }, + { 0x0039, "P Missing at Startup" }, + { 0x003A, "P Start Rebuild Failed due to Physical Drive Too Small" }, + { 0x003C, "P Temporarily Offline Device Automatically Made Online" }, + { 0x003D, "P Standby Rebuild Started" }, + /* Logical Device Events (0x0080 - 0x00FF) */ + { 0x0080, "M Consistency Check Started" }, + { 0x0081, "M Consistency Check Completed" }, + { 0x0082, "M Consistency Check Cancelled" }, + { 0x0083, "M Consistency Check Completed With Errors" }, + { 0x0084, "M Consistency Check Failed due to Logical Drive Failure" }, + { 0x0085, "M Consistency Check Failed due to Physical Device Failure" }, + { 0x0086, "L Offline" }, + { 0x0087, "L Critical" }, + { 0x0088, "L Online" }, + { 0x0089, "M Automatic Rebuild Started" }, + { 0x008A, "M Manual Rebuild Started" }, + { 0x008B, "M Rebuild Completed" }, + { 0x008C, "M Rebuild Cancelled" }, + { 0x008D, "M Rebuild Failed for Unknown Reasons" }, + { 0x008E, "M Rebuild Failed due to New Physical Device" }, + { 0x008F, "M Rebuild Failed due to Logical Drive Failure" }, + { 0x0090, "M Initialization Started" }, + { 0x0091, "M Initialization Completed" }, + { 0x0092, "M Initialization Cancelled" }, + { 0x0093, "M Initialization Failed" }, + { 0x0094, "L Found" }, + { 0x0095, "L Deleted" }, + { 0x0096, "M Expand Capacity Started" }, + { 0x0097, "M Expand Capacity Completed" }, + { 0x0098, "M Expand Capacity Failed" }, + { 0x0099, "L Bad Block Found" }, + { 0x009A, "L Size Changed" }, + { 0x009B, "L Type Changed" }, + { 0x009C, "L Bad Data Block Found" }, + { 0x009E, "L Read of Data Block in BDT" }, + { 0x009F, "L Write Back Data for Disk Block Lost" }, + { 0x00A0, "L Temporarily Offline RAID-5/3 Drive Made Online" }, + { 0x00A1, "L Temporarily Offline RAID-6/1/0/7 Drive Made Online" }, + { 0x00A2, "L Standby Rebuild Started" }, + /* Fault Management Events (0x0100 - 0x017F) */ + { 0x0140, "E Fan %d Failed" }, + { 0x0141, "E Fan %d OK" }, + { 0x0142, "E Fan %d Not Present" }, + { 0x0143, "E Power Supply %d Failed" }, + { 0x0144, "E Power Supply %d OK" }, + { 0x0145, "E Power Supply %d Not Present" }, + { 0x0146, "E Temperature Sensor %d Temperature Exceeds Safe Limit" }, + { 0x0147, "E Temperature Sensor %d Temperature Exceeds Working Limit" }, + { 0x0148, "E Temperature Sensor %d Temperature Normal" }, + { 0x0149, "E Temperature Sensor %d Not Present" }, + { 0x014A, "E Enclosure Management Unit %d Access Critical" }, + { 0x014B, "E Enclosure Management Unit %d Access OK" }, + { 0x014C, "E Enclosure Management Unit %d Access Offline" }, + /* Controller Events (0x0180 - 0x01FF) */ + { 0x0181, "C Cache Write Back Error" }, + { 0x0188, "C Battery Backup Unit Found" }, + { 0x0189, "C Battery Backup Unit Charge Level Low" }, + { 0x018A, "C Battery Backup Unit Charge Level OK" }, + { 0x0193, "C Installation Aborted" }, + { 0x0195, "C Battery Backup Unit Physically Removed" }, + { 0x0196, "C Memory Error During Warm Boot" }, + { 0x019E, "C Memory Soft ECC Error Corrected" }, + { 0x019F, "C Memory Hard ECC Error Corrected" }, + { 0x01A2, "C Battery Backup Unit Failed" }, + { 0x01AB, "C Mirror Race Recovery Failed" }, + { 0x01AC, "C Mirror Race on Critical Drive" }, + /* Controller Internal Processor Events */ + { 0x0380, "C Internal Controller Hung" }, + { 0x0381, "C Internal Controller Firmware Breakpoint" }, + { 0x0390, "C Internal Controller i960 Processor Specific Error" }, + { 0x03A0, "C Internal Controller StrongARM Processor Specific Error" }, + { 0, "" } }; + int EventListIndex = 0, EventCode; + unsigned char EventType, *EventMessage; + if (Event->EventCode == 0x1C && + RequestSense->SenseKey == DAC960_SenseKey_VendorSpecific && + (RequestSense->AdditionalSenseCode == 0x80 || + RequestSense->AdditionalSenseCode == 0x81)) + Event->EventCode = ((RequestSense->AdditionalSenseCode - 0x80) << 8) | + RequestSense->AdditionalSenseCodeQualifier; + while (true) + { + EventCode = EventList[EventListIndex].EventCode; + if (EventCode == Event->EventCode || EventCode == 0) break; + EventListIndex++; + } + EventType = EventList[EventListIndex].EventMessage[0]; + EventMessage = &EventList[EventListIndex].EventMessage[2]; + if (EventCode == 0) + { + DAC960_Critical("Unknown Controller Event Code %04X\n", + Controller, Event->EventCode); + return; + } + switch (EventType) + { + case 'P': + DAC960_Critical("Physical Device %d:%d %s\n", Controller, + Event->Channel, Event->TargetID, EventMessage); + break; + case 'L': + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) %s\n", Controller, + Event->LogicalUnit, Controller->ControllerNumber, + Event->LogicalUnit, EventMessage); + break; + case 'M': + DAC960_Progress("Logical Drive %d (/dev/rd/c%dd%d) %s\n", Controller, + Event->LogicalUnit, Controller->ControllerNumber, + Event->LogicalUnit, EventMessage); + break; + case 'S': + if (RequestSense->SenseKey == DAC960_SenseKey_NoSense || + (RequestSense->SenseKey == DAC960_SenseKey_NotReady && + RequestSense->AdditionalSenseCode == 0x04 && + (RequestSense->AdditionalSenseCodeQualifier == 0x01 || + RequestSense->AdditionalSenseCodeQualifier == 0x02))) + break; + DAC960_Critical("Physical Device %d:%d %s\n", Controller, + Event->Channel, Event->TargetID, EventMessage); + DAC960_Critical("Physical Device %d:%d Request Sense: " + "Sense Key = %X, ASC = %02X, ASCQ = %02X\n", + Controller, + Event->Channel, + Event->TargetID, + RequestSense->SenseKey, + RequestSense->AdditionalSenseCode, + RequestSense->AdditionalSenseCodeQualifier); + DAC960_Critical("Physical Device %d:%d Request Sense: " + "Information = %02X%02X%02X%02X " + "%02X%02X%02X%02X\n", + Controller, + Event->Channel, + Event->TargetID, + RequestSense->Information[0], + RequestSense->Information[1], + RequestSense->Information[2], + RequestSense->Information[3], + RequestSense->CommandSpecificInformation[0], + RequestSense->CommandSpecificInformation[1], + RequestSense->CommandSpecificInformation[2], + RequestSense->CommandSpecificInformation[3]); + break; + case 'E': + if (Controller->SuppressEnclosureMessages) break; + sprintf(MessageBuffer, EventMessage, Event->LogicalUnit); + DAC960_Critical("Enclosure %d %s\n", Controller, + Event->TargetID, MessageBuffer); + break; + case 'C': + DAC960_Critical("Controller %s\n", Controller, EventMessage); + break; + default: + DAC960_Critical("Unknown Controller Event Code %04X\n", + Controller, Event->EventCode); + break; + } +} + + +/* + DAC960_V2_ReportProgress prints an appropriate progress message for + Logical Device Long Operations. +*/ + +static void DAC960_V2_ReportProgress(DAC960_Controller_T *Controller, + unsigned char *MessageString, + unsigned int LogicalDeviceNumber, + unsigned long BlocksCompleted, + unsigned long LogicalDeviceSize) +{ + Controller->EphemeralProgressMessage = true; + DAC960_Progress("%s in Progress: Logical Drive %d (/dev/rd/c%dd%d) " + "%d%% completed\n", Controller, + MessageString, + LogicalDeviceNumber, + Controller->ControllerNumber, + LogicalDeviceNumber, + (100 * (BlocksCompleted >> 7)) / (LogicalDeviceSize >> 7)); + Controller->EphemeralProgressMessage = false; +} + + +/* + DAC960_V2_ProcessCompletedCommand performs completion processing for Command + for DAC960 V2 Firmware Controllers. +*/ + +static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DAC960_CommandType_T CommandType = Command->CommandType; + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode; + DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode; + DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus; + + if (CommandType == DAC960_ReadCommand || + CommandType == DAC960_WriteCommand) + { + +#ifdef FORCE_RETRY_DEBUG + CommandStatus = DAC960_V2_AbormalCompletion; +#endif + Command->V2.RequestSense->SenseKey = DAC960_SenseKey_MediumError; + + if (CommandStatus == DAC960_V2_NormalCompletion) { + + if (!DAC960_ProcessCompletedRequest(Command, true)) + BUG(); + + } else if (Command->V2.RequestSense->SenseKey == DAC960_SenseKey_MediumError) + { + /* + * break the command down into pieces and resubmit each + * piece, hoping that some of them will succeed. + */ + DAC960_queue_partial_rw(Command); + return; + } + else + { + if (Command->V2.RequestSense->SenseKey != DAC960_SenseKey_NotReady) + DAC960_V2_ReadWriteError(Command); + /* + Perform completion processing for all buffers in this I/O Request. + */ + (void)DAC960_ProcessCompletedRequest(Command, false); + } + } + else if (CommandType == DAC960_ReadRetryCommand || + CommandType == DAC960_WriteRetryCommand) + { + bool normal_completion; + +#ifdef FORCE_RETRY_FAILURE_DEBUG + static int retry_count = 1; +#endif + /* + Perform completion processing for the portion that was + retried, and submit the next portion, if any. + */ + normal_completion = true; + if (CommandStatus != DAC960_V2_NormalCompletion) { + normal_completion = false; + if (Command->V2.RequestSense->SenseKey != DAC960_SenseKey_NotReady) + DAC960_V2_ReadWriteError(Command); + } + +#ifdef FORCE_RETRY_FAILURE_DEBUG + if (!(++retry_count % 10000)) { + printk("V2 error retry failure test\n"); + normal_completion = false; + DAC960_V2_ReadWriteError(Command); + } +#endif + + if (!DAC960_ProcessCompletedRequest(Command, normal_completion)) { + DAC960_queue_partial_rw(Command); + return; + } + } + else if (CommandType == DAC960_MonitoringCommand) + { + if (Controller->ShutdownMonitoringTimer) + return; + if (IOCTLOpcode == DAC960_V2_GetControllerInfo) + { + DAC960_V2_ControllerInfo_T *NewControllerInfo = + Controller->V2.NewControllerInformation; + DAC960_V2_ControllerInfo_T *ControllerInfo = + &Controller->V2.ControllerInformation; + Controller->LogicalDriveCount = + NewControllerInfo->LogicalDevicesPresent; + Controller->V2.NeedLogicalDeviceInformation = true; + Controller->V2.NeedPhysicalDeviceInformation = true; + Controller->V2.StartLogicalDeviceInformationScan = true; + Controller->V2.StartPhysicalDeviceInformationScan = true; + Controller->MonitoringAlertMode = + (NewControllerInfo->LogicalDevicesCritical > 0 || + NewControllerInfo->LogicalDevicesOffline > 0 || + NewControllerInfo->PhysicalDisksCritical > 0 || + NewControllerInfo->PhysicalDisksOffline > 0); + memcpy(ControllerInfo, NewControllerInfo, + sizeof(DAC960_V2_ControllerInfo_T)); + } + else if (IOCTLOpcode == DAC960_V2_GetEvent) + { + if (CommandStatus == DAC960_V2_NormalCompletion) { + DAC960_V2_ReportEvent(Controller, Controller->V2.Event); + } + Controller->V2.NextEventSequenceNumber++; + } + else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid && + CommandStatus == DAC960_V2_NormalCompletion) + { + DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo = + Controller->V2.NewPhysicalDeviceInformation; + unsigned int PhysicalDeviceIndex = Controller->V2.PhysicalDeviceIndex; + DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo = + Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex]; + unsigned int DeviceIndex; + while (PhysicalDeviceInfo != NULL && + (NewPhysicalDeviceInfo->Channel > + PhysicalDeviceInfo->Channel || + (NewPhysicalDeviceInfo->Channel == + PhysicalDeviceInfo->Channel && + (NewPhysicalDeviceInfo->TargetID > + PhysicalDeviceInfo->TargetID || + (NewPhysicalDeviceInfo->TargetID == + PhysicalDeviceInfo->TargetID && + NewPhysicalDeviceInfo->LogicalUnit > + PhysicalDeviceInfo->LogicalUnit))))) + { + DAC960_Critical("Physical Device %d:%d No Longer Exists\n", + Controller, + PhysicalDeviceInfo->Channel, + PhysicalDeviceInfo->TargetID); + Controller->V2.PhysicalDeviceInformation + [PhysicalDeviceIndex] = NULL; + Controller->V2.InquiryUnitSerialNumber + [PhysicalDeviceIndex] = NULL; + kfree(PhysicalDeviceInfo); + kfree(InquiryUnitSerialNumber); + for (DeviceIndex = PhysicalDeviceIndex; + DeviceIndex < DAC960_V2_MaxPhysicalDevices - 1; + DeviceIndex++) + { + Controller->V2.PhysicalDeviceInformation[DeviceIndex] = + Controller->V2.PhysicalDeviceInformation[DeviceIndex+1]; + Controller->V2.InquiryUnitSerialNumber[DeviceIndex] = + Controller->V2.InquiryUnitSerialNumber[DeviceIndex+1]; + } + Controller->V2.PhysicalDeviceInformation + [DAC960_V2_MaxPhysicalDevices-1] = NULL; + Controller->V2.InquiryUnitSerialNumber + [DAC960_V2_MaxPhysicalDevices-1] = NULL; + PhysicalDeviceInfo = + Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex]; + InquiryUnitSerialNumber = + Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex]; + } + if (PhysicalDeviceInfo == NULL || + (NewPhysicalDeviceInfo->Channel != + PhysicalDeviceInfo->Channel) || + (NewPhysicalDeviceInfo->TargetID != + PhysicalDeviceInfo->TargetID) || + (NewPhysicalDeviceInfo->LogicalUnit != + PhysicalDeviceInfo->LogicalUnit)) + { + PhysicalDeviceInfo = + kmalloc(sizeof(DAC960_V2_PhysicalDeviceInfo_T), GFP_ATOMIC); + InquiryUnitSerialNumber = + kmalloc(sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T), + GFP_ATOMIC); + if (InquiryUnitSerialNumber == NULL || + PhysicalDeviceInfo == NULL) + { + kfree(InquiryUnitSerialNumber); + InquiryUnitSerialNumber = NULL; + kfree(PhysicalDeviceInfo); + PhysicalDeviceInfo = NULL; + } + DAC960_Critical("Physical Device %d:%d Now Exists%s\n", + Controller, + NewPhysicalDeviceInfo->Channel, + NewPhysicalDeviceInfo->TargetID, + (PhysicalDeviceInfo != NULL + ? "" : " - Allocation Failed")); + if (PhysicalDeviceInfo != NULL) + { + memset(PhysicalDeviceInfo, 0, + sizeof(DAC960_V2_PhysicalDeviceInfo_T)); + PhysicalDeviceInfo->PhysicalDeviceState = + DAC960_V2_Device_InvalidState; + memset(InquiryUnitSerialNumber, 0, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F; + for (DeviceIndex = DAC960_V2_MaxPhysicalDevices - 1; + DeviceIndex > PhysicalDeviceIndex; + DeviceIndex--) + { + Controller->V2.PhysicalDeviceInformation[DeviceIndex] = + Controller->V2.PhysicalDeviceInformation[DeviceIndex-1]; + Controller->V2.InquiryUnitSerialNumber[DeviceIndex] = + Controller->V2.InquiryUnitSerialNumber[DeviceIndex-1]; + } + Controller->V2.PhysicalDeviceInformation + [PhysicalDeviceIndex] = + PhysicalDeviceInfo; + Controller->V2.InquiryUnitSerialNumber + [PhysicalDeviceIndex] = + InquiryUnitSerialNumber; + Controller->V2.NeedDeviceSerialNumberInformation = true; + } + } + if (PhysicalDeviceInfo != NULL) + { + if (NewPhysicalDeviceInfo->PhysicalDeviceState != + PhysicalDeviceInfo->PhysicalDeviceState) + DAC960_Critical( + "Physical Device %d:%d is now %s\n", Controller, + NewPhysicalDeviceInfo->Channel, + NewPhysicalDeviceInfo->TargetID, + (NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Online + ? "ONLINE" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Rebuild + ? "REBUILD" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Missing + ? "MISSING" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Critical + ? "CRITICAL" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Dead + ? "DEAD" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_SuspectedDead + ? "SUSPECTED-DEAD" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_CommandedOffline + ? "COMMANDED-OFFLINE" + : NewPhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Standby + ? "STANDBY" : "UNKNOWN")); + if ((NewPhysicalDeviceInfo->ParityErrors != + PhysicalDeviceInfo->ParityErrors) || + (NewPhysicalDeviceInfo->SoftErrors != + PhysicalDeviceInfo->SoftErrors) || + (NewPhysicalDeviceInfo->HardErrors != + PhysicalDeviceInfo->HardErrors) || + (NewPhysicalDeviceInfo->MiscellaneousErrors != + PhysicalDeviceInfo->MiscellaneousErrors) || + (NewPhysicalDeviceInfo->CommandTimeouts != + PhysicalDeviceInfo->CommandTimeouts) || + (NewPhysicalDeviceInfo->Retries != + PhysicalDeviceInfo->Retries) || + (NewPhysicalDeviceInfo->Aborts != + PhysicalDeviceInfo->Aborts) || + (NewPhysicalDeviceInfo->PredictedFailuresDetected != + PhysicalDeviceInfo->PredictedFailuresDetected)) + { + DAC960_Critical("Physical Device %d:%d Errors: " + "Parity = %d, Soft = %d, " + "Hard = %d, Misc = %d\n", + Controller, + NewPhysicalDeviceInfo->Channel, + NewPhysicalDeviceInfo->TargetID, + NewPhysicalDeviceInfo->ParityErrors, + NewPhysicalDeviceInfo->SoftErrors, + NewPhysicalDeviceInfo->HardErrors, + NewPhysicalDeviceInfo->MiscellaneousErrors); + DAC960_Critical("Physical Device %d:%d Errors: " + "Timeouts = %d, Retries = %d, " + "Aborts = %d, Predicted = %d\n", + Controller, + NewPhysicalDeviceInfo->Channel, + NewPhysicalDeviceInfo->TargetID, + NewPhysicalDeviceInfo->CommandTimeouts, + NewPhysicalDeviceInfo->Retries, + NewPhysicalDeviceInfo->Aborts, + NewPhysicalDeviceInfo + ->PredictedFailuresDetected); + } + if ((PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_Dead || + PhysicalDeviceInfo->PhysicalDeviceState + == DAC960_V2_Device_InvalidState) && + NewPhysicalDeviceInfo->PhysicalDeviceState + != DAC960_V2_Device_Dead) + Controller->V2.NeedDeviceSerialNumberInformation = true; + memcpy(PhysicalDeviceInfo, NewPhysicalDeviceInfo, + sizeof(DAC960_V2_PhysicalDeviceInfo_T)); + } + NewPhysicalDeviceInfo->LogicalUnit++; + Controller->V2.PhysicalDeviceIndex++; + } + else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid) + { + unsigned int DeviceIndex; + for (DeviceIndex = Controller->V2.PhysicalDeviceIndex; + DeviceIndex < DAC960_V2_MaxPhysicalDevices; + DeviceIndex++) + { + DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo = + Controller->V2.PhysicalDeviceInformation[DeviceIndex]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + Controller->V2.InquiryUnitSerialNumber[DeviceIndex]; + if (PhysicalDeviceInfo == NULL) break; + DAC960_Critical("Physical Device %d:%d No Longer Exists\n", + Controller, + PhysicalDeviceInfo->Channel, + PhysicalDeviceInfo->TargetID); + Controller->V2.PhysicalDeviceInformation[DeviceIndex] = NULL; + Controller->V2.InquiryUnitSerialNumber[DeviceIndex] = NULL; + kfree(PhysicalDeviceInfo); + kfree(InquiryUnitSerialNumber); + } + Controller->V2.NeedPhysicalDeviceInformation = false; + } + else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid && + CommandStatus == DAC960_V2_NormalCompletion) + { + DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo = + Controller->V2.NewLogicalDeviceInformation; + unsigned short LogicalDeviceNumber = + NewLogicalDeviceInfo->LogicalDeviceNumber; + DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo = + Controller->V2.LogicalDeviceInformation[LogicalDeviceNumber]; + if (LogicalDeviceInfo == NULL) + { + DAC960_V2_PhysicalDevice_T PhysicalDevice; + PhysicalDevice.Controller = 0; + PhysicalDevice.Channel = NewLogicalDeviceInfo->Channel; + PhysicalDevice.TargetID = NewLogicalDeviceInfo->TargetID; + PhysicalDevice.LogicalUnit = NewLogicalDeviceInfo->LogicalUnit; + Controller->V2.LogicalDriveToVirtualDevice[LogicalDeviceNumber] = + PhysicalDevice; + LogicalDeviceInfo = kmalloc(sizeof(DAC960_V2_LogicalDeviceInfo_T), + GFP_ATOMIC); + Controller->V2.LogicalDeviceInformation[LogicalDeviceNumber] = + LogicalDeviceInfo; + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "Now Exists%s\n", Controller, + LogicalDeviceNumber, + Controller->ControllerNumber, + LogicalDeviceNumber, + (LogicalDeviceInfo != NULL + ? "" : " - Allocation Failed")); + if (LogicalDeviceInfo != NULL) + { + memset(LogicalDeviceInfo, 0, + sizeof(DAC960_V2_LogicalDeviceInfo_T)); + DAC960_ComputeGenericDiskInfo(Controller); + } + } + if (LogicalDeviceInfo != NULL) + { + unsigned long LogicalDeviceSize = + NewLogicalDeviceInfo->ConfigurableDeviceSize; + if (NewLogicalDeviceInfo->LogicalDeviceState != + LogicalDeviceInfo->LogicalDeviceState) + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "is now %s\n", Controller, + LogicalDeviceNumber, + Controller->ControllerNumber, + LogicalDeviceNumber, + (NewLogicalDeviceInfo->LogicalDeviceState + == DAC960_V2_LogicalDevice_Online + ? "ONLINE" + : NewLogicalDeviceInfo->LogicalDeviceState + == DAC960_V2_LogicalDevice_Critical + ? "CRITICAL" : "OFFLINE")); + if ((NewLogicalDeviceInfo->SoftErrors != + LogicalDeviceInfo->SoftErrors) || + (NewLogicalDeviceInfo->CommandsFailed != + LogicalDeviceInfo->CommandsFailed) || + (NewLogicalDeviceInfo->DeferredWriteErrors != + LogicalDeviceInfo->DeferredWriteErrors)) + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) Errors: " + "Soft = %d, Failed = %d, Deferred Write = %d\n", + Controller, LogicalDeviceNumber, + Controller->ControllerNumber, + LogicalDeviceNumber, + NewLogicalDeviceInfo->SoftErrors, + NewLogicalDeviceInfo->CommandsFailed, + NewLogicalDeviceInfo->DeferredWriteErrors); + if (NewLogicalDeviceInfo->ConsistencyCheckInProgress) + DAC960_V2_ReportProgress(Controller, + "Consistency Check", + LogicalDeviceNumber, + NewLogicalDeviceInfo + ->ConsistencyCheckBlockNumber, + LogicalDeviceSize); + else if (NewLogicalDeviceInfo->RebuildInProgress) + DAC960_V2_ReportProgress(Controller, + "Rebuild", + LogicalDeviceNumber, + NewLogicalDeviceInfo + ->RebuildBlockNumber, + LogicalDeviceSize); + else if (NewLogicalDeviceInfo->BackgroundInitializationInProgress) + DAC960_V2_ReportProgress(Controller, + "Background Initialization", + LogicalDeviceNumber, + NewLogicalDeviceInfo + ->BackgroundInitializationBlockNumber, + LogicalDeviceSize); + else if (NewLogicalDeviceInfo->ForegroundInitializationInProgress) + DAC960_V2_ReportProgress(Controller, + "Foreground Initialization", + LogicalDeviceNumber, + NewLogicalDeviceInfo + ->ForegroundInitializationBlockNumber, + LogicalDeviceSize); + else if (NewLogicalDeviceInfo->DataMigrationInProgress) + DAC960_V2_ReportProgress(Controller, + "Data Migration", + LogicalDeviceNumber, + NewLogicalDeviceInfo + ->DataMigrationBlockNumber, + LogicalDeviceSize); + else if (NewLogicalDeviceInfo->PatrolOperationInProgress) + DAC960_V2_ReportProgress(Controller, + "Patrol Operation", + LogicalDeviceNumber, + NewLogicalDeviceInfo + ->PatrolOperationBlockNumber, + LogicalDeviceSize); + if (LogicalDeviceInfo->BackgroundInitializationInProgress && + !NewLogicalDeviceInfo->BackgroundInitializationInProgress) + DAC960_Progress("Logical Drive %d (/dev/rd/c%dd%d) " + "Background Initialization %s\n", + Controller, + LogicalDeviceNumber, + Controller->ControllerNumber, + LogicalDeviceNumber, + (NewLogicalDeviceInfo->LogicalDeviceControl + .LogicalDeviceInitialized + ? "Completed" : "Failed")); + memcpy(LogicalDeviceInfo, NewLogicalDeviceInfo, + sizeof(DAC960_V2_LogicalDeviceInfo_T)); + } + Controller->V2.LogicalDriveFoundDuringScan + [LogicalDeviceNumber] = true; + NewLogicalDeviceInfo->LogicalDeviceNumber++; + } + else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid) + { + int LogicalDriveNumber; + for (LogicalDriveNumber = 0; + LogicalDriveNumber < DAC960_MaxLogicalDrives; + LogicalDriveNumber++) + { + DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo = + Controller->V2.LogicalDeviceInformation[LogicalDriveNumber]; + if (LogicalDeviceInfo == NULL || + Controller->V2.LogicalDriveFoundDuringScan + [LogicalDriveNumber]) + continue; + DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) " + "No Longer Exists\n", Controller, + LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + Controller->V2.LogicalDeviceInformation + [LogicalDriveNumber] = NULL; + kfree(LogicalDeviceInfo); + Controller->LogicalDriveInitiallyAccessible + [LogicalDriveNumber] = false; + DAC960_ComputeGenericDiskInfo(Controller); + } + Controller->V2.NeedLogicalDeviceInformation = false; + } + else if (CommandOpcode == DAC960_V2_SCSI_10_Passthru) + { + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + Controller->V2.InquiryUnitSerialNumber[Controller->V2.PhysicalDeviceIndex - 1]; + + if (CommandStatus != DAC960_V2_NormalCompletion) { + memset(InquiryUnitSerialNumber, + 0, sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F; + } else + memcpy(InquiryUnitSerialNumber, + Controller->V2.NewInquiryUnitSerialNumber, + sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T)); + + Controller->V2.NeedDeviceSerialNumberInformation = false; + } + + if (Controller->V2.HealthStatusBuffer->NextEventSequenceNumber + - Controller->V2.NextEventSequenceNumber > 0) + { + CommandMailbox->GetEvent.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->GetEvent.DataTransferSize = sizeof(DAC960_V2_Event_T); + CommandMailbox->GetEvent.EventSequenceNumberHigh16 = + Controller->V2.NextEventSequenceNumber >> 16; + CommandMailbox->GetEvent.ControllerNumber = 0; + CommandMailbox->GetEvent.IOCTL_Opcode = + DAC960_V2_GetEvent; + CommandMailbox->GetEvent.EventSequenceNumberLow16 = + Controller->V2.NextEventSequenceNumber & 0xFFFF; + CommandMailbox->GetEvent.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.EventDMA; + CommandMailbox->GetEvent.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->GetEvent.DataTransferSize; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V2.NeedPhysicalDeviceInformation) + { + if (Controller->V2.NeedDeviceSerialNumberInformation) + { + DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber = + Controller->V2.NewInquiryUnitSerialNumber; + InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F; + + DAC960_V2_ConstructNewUnitSerialNumber(Controller, CommandMailbox, + Controller->V2.NewPhysicalDeviceInformation->Channel, + Controller->V2.NewPhysicalDeviceInformation->TargetID, + Controller->V2.NewPhysicalDeviceInformation->LogicalUnit - 1); + + + DAC960_QueueCommand(Command); + return; + } + if (Controller->V2.StartPhysicalDeviceInformationScan) + { + Controller->V2.PhysicalDeviceIndex = 0; + Controller->V2.NewPhysicalDeviceInformation->Channel = 0; + Controller->V2.NewPhysicalDeviceInformation->TargetID = 0; + Controller->V2.NewPhysicalDeviceInformation->LogicalUnit = 0; + Controller->V2.StartPhysicalDeviceInformationScan = false; + } + CommandMailbox->PhysicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->PhysicalDeviceInfo.DataTransferSize = + sizeof(DAC960_V2_PhysicalDeviceInfo_T); + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.LogicalUnit = + Controller->V2.NewPhysicalDeviceInformation->LogicalUnit; + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.TargetID = + Controller->V2.NewPhysicalDeviceInformation->TargetID; + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.Channel = + Controller->V2.NewPhysicalDeviceInformation->Channel; + CommandMailbox->PhysicalDeviceInfo.IOCTL_Opcode = + DAC960_V2_GetPhysicalDeviceInfoValid; + CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewPhysicalDeviceInformationDMA; + CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->PhysicalDeviceInfo.DataTransferSize; + DAC960_QueueCommand(Command); + return; + } + if (Controller->V2.NeedLogicalDeviceInformation) + { + if (Controller->V2.StartLogicalDeviceInformationScan) + { + int LogicalDriveNumber; + for (LogicalDriveNumber = 0; + LogicalDriveNumber < DAC960_MaxLogicalDrives; + LogicalDriveNumber++) + Controller->V2.LogicalDriveFoundDuringScan + [LogicalDriveNumber] = false; + Controller->V2.NewLogicalDeviceInformation->LogicalDeviceNumber = 0; + Controller->V2.StartLogicalDeviceInformationScan = false; + } + CommandMailbox->LogicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->LogicalDeviceInfo.DataTransferSize = + sizeof(DAC960_V2_LogicalDeviceInfo_T); + CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber = + Controller->V2.NewLogicalDeviceInformation->LogicalDeviceNumber; + CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode = + DAC960_V2_GetLogicalDeviceInfoValid; + CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewLogicalDeviceInformationDMA; + CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->LogicalDeviceInfo.DataTransferSize; + DAC960_QueueCommand(Command); + return; + } + Controller->MonitoringTimerCount++; + Controller->MonitoringTimer.expires = + jiffies + DAC960_HealthStatusMonitoringInterval; + add_timer(&Controller->MonitoringTimer); + } + if (CommandType == DAC960_ImmediateCommand) + { + complete(Command->Completion); + Command->Completion = NULL; + return; + } + if (CommandType == DAC960_QueuedCommand) + { + DAC960_V2_KernelCommand_T *KernelCommand = Command->V2.KernelCommand; + KernelCommand->CommandStatus = CommandStatus; + KernelCommand->RequestSenseLength = Command->V2.RequestSenseLength; + KernelCommand->DataTransferLength = Command->V2.DataTransferResidue; + Command->V2.KernelCommand = NULL; + DAC960_DeallocateCommand(Command); + KernelCommand->CompletionFunction(KernelCommand); + return; + } + /* + Queue a Status Monitoring Command to the Controller using the just + completed Command if one was deferred previously due to lack of a + free Command when the Monitoring Timer Function was called. + */ + if (Controller->MonitoringCommandDeferred) + { + Controller->MonitoringCommandDeferred = false; + DAC960_V2_QueueMonitoringCommand(Command); + return; + } + /* + Deallocate the Command. + */ + DAC960_DeallocateCommand(Command); + /* + Wake up any processes waiting on a free Command. + */ + wake_up(&Controller->CommandWaitQueue); +} + +/* + DAC960_GEM_InterruptHandler handles hardware interrupts from DAC960 GEM Series + Controllers. +*/ + +static irqreturn_t DAC960_GEM_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V2_StatusMailbox_T *NextStatusMailbox; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_GEM_AcknowledgeInterrupt(ControllerBaseAddress); + NextStatusMailbox = Controller->V2.NextStatusMailbox; + while (NextStatusMailbox->Fields.CommandIdentifier > 0) + { + DAC960_V2_CommandIdentifier_T CommandIdentifier = + NextStatusMailbox->Fields.CommandIdentifier; + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + Command->V2.CommandStatus = NextStatusMailbox->Fields.CommandStatus; + Command->V2.RequestSenseLength = + NextStatusMailbox->Fields.RequestSenseLength; + Command->V2.DataTransferResidue = + NextStatusMailbox->Fields.DataTransferResidue; + NextStatusMailbox->Words[0] = 0; + if (++NextStatusMailbox > Controller->V2.LastStatusMailbox) + NextStatusMailbox = Controller->V2.FirstStatusMailbox; + DAC960_V2_ProcessCompletedCommand(Command); + } + Controller->V2.NextStatusMailbox = NextStatusMailbox; + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + +/* + DAC960_BA_InterruptHandler handles hardware interrupts from DAC960 BA Series + Controllers. +*/ + +static irqreturn_t DAC960_BA_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V2_StatusMailbox_T *NextStatusMailbox; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_BA_AcknowledgeInterrupt(ControllerBaseAddress); + NextStatusMailbox = Controller->V2.NextStatusMailbox; + while (NextStatusMailbox->Fields.CommandIdentifier > 0) + { + DAC960_V2_CommandIdentifier_T CommandIdentifier = + NextStatusMailbox->Fields.CommandIdentifier; + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + Command->V2.CommandStatus = NextStatusMailbox->Fields.CommandStatus; + Command->V2.RequestSenseLength = + NextStatusMailbox->Fields.RequestSenseLength; + Command->V2.DataTransferResidue = + NextStatusMailbox->Fields.DataTransferResidue; + NextStatusMailbox->Words[0] = 0; + if (++NextStatusMailbox > Controller->V2.LastStatusMailbox) + NextStatusMailbox = Controller->V2.FirstStatusMailbox; + DAC960_V2_ProcessCompletedCommand(Command); + } + Controller->V2.NextStatusMailbox = NextStatusMailbox; + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + + +/* + DAC960_LP_InterruptHandler handles hardware interrupts from DAC960 LP Series + Controllers. +*/ + +static irqreturn_t DAC960_LP_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V2_StatusMailbox_T *NextStatusMailbox; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_LP_AcknowledgeInterrupt(ControllerBaseAddress); + NextStatusMailbox = Controller->V2.NextStatusMailbox; + while (NextStatusMailbox->Fields.CommandIdentifier > 0) + { + DAC960_V2_CommandIdentifier_T CommandIdentifier = + NextStatusMailbox->Fields.CommandIdentifier; + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + Command->V2.CommandStatus = NextStatusMailbox->Fields.CommandStatus; + Command->V2.RequestSenseLength = + NextStatusMailbox->Fields.RequestSenseLength; + Command->V2.DataTransferResidue = + NextStatusMailbox->Fields.DataTransferResidue; + NextStatusMailbox->Words[0] = 0; + if (++NextStatusMailbox > Controller->V2.LastStatusMailbox) + NextStatusMailbox = Controller->V2.FirstStatusMailbox; + DAC960_V2_ProcessCompletedCommand(Command); + } + Controller->V2.NextStatusMailbox = NextStatusMailbox; + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + + +/* + DAC960_LA_InterruptHandler handles hardware interrupts from DAC960 LA Series + Controllers. +*/ + +static irqreturn_t DAC960_LA_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_StatusMailbox_T *NextStatusMailbox; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_LA_AcknowledgeInterrupt(ControllerBaseAddress); + NextStatusMailbox = Controller->V1.NextStatusMailbox; + while (NextStatusMailbox->Fields.Valid) + { + DAC960_V1_CommandIdentifier_T CommandIdentifier = + NextStatusMailbox->Fields.CommandIdentifier; + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + Command->V1.CommandStatus = NextStatusMailbox->Fields.CommandStatus; + NextStatusMailbox->Word = 0; + if (++NextStatusMailbox > Controller->V1.LastStatusMailbox) + NextStatusMailbox = Controller->V1.FirstStatusMailbox; + DAC960_V1_ProcessCompletedCommand(Command); + } + Controller->V1.NextStatusMailbox = NextStatusMailbox; + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + + +/* + DAC960_PG_InterruptHandler handles hardware interrupts from DAC960 PG Series + Controllers. +*/ + +static irqreturn_t DAC960_PG_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + DAC960_V1_StatusMailbox_T *NextStatusMailbox; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_PG_AcknowledgeInterrupt(ControllerBaseAddress); + NextStatusMailbox = Controller->V1.NextStatusMailbox; + while (NextStatusMailbox->Fields.Valid) + { + DAC960_V1_CommandIdentifier_T CommandIdentifier = + NextStatusMailbox->Fields.CommandIdentifier; + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + Command->V1.CommandStatus = NextStatusMailbox->Fields.CommandStatus; + NextStatusMailbox->Word = 0; + if (++NextStatusMailbox > Controller->V1.LastStatusMailbox) + NextStatusMailbox = Controller->V1.FirstStatusMailbox; + DAC960_V1_ProcessCompletedCommand(Command); + } + Controller->V1.NextStatusMailbox = NextStatusMailbox; + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + + +/* + DAC960_PD_InterruptHandler handles hardware interrupts from DAC960 PD Series + Controllers. +*/ + +static irqreturn_t DAC960_PD_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + while (DAC960_PD_StatusAvailableP(ControllerBaseAddress)) + { + DAC960_V1_CommandIdentifier_T CommandIdentifier = + DAC960_PD_ReadStatusCommandIdentifier(ControllerBaseAddress); + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + Command->V1.CommandStatus = + DAC960_PD_ReadStatusRegister(ControllerBaseAddress); + DAC960_PD_AcknowledgeInterrupt(ControllerBaseAddress); + DAC960_PD_AcknowledgeStatus(ControllerBaseAddress); + DAC960_V1_ProcessCompletedCommand(Command); + } + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + + +/* + DAC960_P_InterruptHandler handles hardware interrupts from DAC960 P Series + Controllers. + + Translations of DAC960_V1_Enquiry and DAC960_V1_GetDeviceState rely + on the data having been placed into DAC960_Controller_T, rather than + an arbitrary buffer. +*/ + +static irqreturn_t DAC960_P_InterruptHandler(int IRQ_Channel, + void *DeviceIdentifier) +{ + DAC960_Controller_T *Controller = DeviceIdentifier; + void __iomem *ControllerBaseAddress = Controller->BaseAddress; + unsigned long flags; + + spin_lock_irqsave(&Controller->queue_lock, flags); + while (DAC960_PD_StatusAvailableP(ControllerBaseAddress)) + { + DAC960_V1_CommandIdentifier_T CommandIdentifier = + DAC960_PD_ReadStatusCommandIdentifier(ControllerBaseAddress); + DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1]; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_CommandOpcode_T CommandOpcode = + CommandMailbox->Common.CommandOpcode; + Command->V1.CommandStatus = + DAC960_PD_ReadStatusRegister(ControllerBaseAddress); + DAC960_PD_AcknowledgeInterrupt(ControllerBaseAddress); + DAC960_PD_AcknowledgeStatus(ControllerBaseAddress); + switch (CommandOpcode) + { + case DAC960_V1_Enquiry_Old: + Command->V1.CommandMailbox.Common.CommandOpcode = DAC960_V1_Enquiry; + DAC960_P_To_PD_TranslateEnquiry(Controller->V1.NewEnquiry); + break; + case DAC960_V1_GetDeviceState_Old: + Command->V1.CommandMailbox.Common.CommandOpcode = + DAC960_V1_GetDeviceState; + DAC960_P_To_PD_TranslateDeviceState(Controller->V1.NewDeviceState); + break; + case DAC960_V1_Read_Old: + Command->V1.CommandMailbox.Common.CommandOpcode = DAC960_V1_Read; + DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox); + break; + case DAC960_V1_Write_Old: + Command->V1.CommandMailbox.Common.CommandOpcode = DAC960_V1_Write; + DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox); + break; + case DAC960_V1_ReadWithScatterGather_Old: + Command->V1.CommandMailbox.Common.CommandOpcode = + DAC960_V1_ReadWithScatterGather; + DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox); + break; + case DAC960_V1_WriteWithScatterGather_Old: + Command->V1.CommandMailbox.Common.CommandOpcode = + DAC960_V1_WriteWithScatterGather; + DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox); + break; + default: + break; + } + DAC960_V1_ProcessCompletedCommand(Command); + } + /* + Attempt to remove additional I/O Requests from the Controller's + I/O Request Queue and queue them to the Controller. + */ + DAC960_ProcessRequest(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return IRQ_HANDLED; +} + + +/* + DAC960_V1_QueueMonitoringCommand queues a Monitoring Command to DAC960 V1 + Firmware Controllers. +*/ + +static void DAC960_V1_QueueMonitoringCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_MonitoringCommand; + CommandMailbox->Type3.CommandOpcode = DAC960_V1_Enquiry; + CommandMailbox->Type3.BusAddress = Controller->V1.NewEnquiryDMA; + DAC960_QueueCommand(Command); +} + + +/* + DAC960_V2_QueueMonitoringCommand queues a Monitoring Command to DAC960 V2 + Firmware Controllers. +*/ + +static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *Command) +{ + DAC960_Controller_T *Controller = Command->Controller; + DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox; + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_MonitoringCommand; + CommandMailbox->ControllerInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->ControllerInfo.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->ControllerInfo.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->ControllerInfo.DataTransferSize = + sizeof(DAC960_V2_ControllerInfo_T); + CommandMailbox->ControllerInfo.ControllerNumber = 0; + CommandMailbox->ControllerInfo.IOCTL_Opcode = DAC960_V2_GetControllerInfo; + CommandMailbox->ControllerInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewControllerInformationDMA; + CommandMailbox->ControllerInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->ControllerInfo.DataTransferSize; + DAC960_QueueCommand(Command); +} + + +/* + DAC960_MonitoringTimerFunction is the timer function for monitoring + the status of DAC960 Controllers. +*/ + +static void DAC960_MonitoringTimerFunction(unsigned long TimerData) +{ + DAC960_Controller_T *Controller = (DAC960_Controller_T *) TimerData; + DAC960_Command_T *Command; + unsigned long flags; + + if (Controller->FirmwareType == DAC960_V1_Controller) + { + spin_lock_irqsave(&Controller->queue_lock, flags); + /* + Queue a Status Monitoring Command to Controller. + */ + Command = DAC960_AllocateCommand(Controller); + if (Command != NULL) + DAC960_V1_QueueMonitoringCommand(Command); + else Controller->MonitoringCommandDeferred = true; + spin_unlock_irqrestore(&Controller->queue_lock, flags); + } + else + { + DAC960_V2_ControllerInfo_T *ControllerInfo = + &Controller->V2.ControllerInformation; + unsigned int StatusChangeCounter = + Controller->V2.HealthStatusBuffer->StatusChangeCounter; + bool ForceMonitoringCommand = false; + if (time_after(jiffies, Controller->SecondaryMonitoringTime + + DAC960_SecondaryMonitoringInterval)) + { + int LogicalDriveNumber; + for (LogicalDriveNumber = 0; + LogicalDriveNumber < DAC960_MaxLogicalDrives; + LogicalDriveNumber++) + { + DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo = + Controller->V2.LogicalDeviceInformation[LogicalDriveNumber]; + if (LogicalDeviceInfo == NULL) continue; + if (!LogicalDeviceInfo->LogicalDeviceControl + .LogicalDeviceInitialized) + { + ForceMonitoringCommand = true; + break; + } + } + Controller->SecondaryMonitoringTime = jiffies; + } + if (StatusChangeCounter == Controller->V2.StatusChangeCounter && + Controller->V2.HealthStatusBuffer->NextEventSequenceNumber + == Controller->V2.NextEventSequenceNumber && + (ControllerInfo->BackgroundInitializationsActive + + ControllerInfo->LogicalDeviceInitializationsActive + + ControllerInfo->PhysicalDeviceInitializationsActive + + ControllerInfo->ConsistencyChecksActive + + ControllerInfo->RebuildsActive + + ControllerInfo->OnlineExpansionsActive == 0 || + time_before(jiffies, Controller->PrimaryMonitoringTime + + DAC960_MonitoringTimerInterval)) && + !ForceMonitoringCommand) + { + Controller->MonitoringTimer.expires = + jiffies + DAC960_HealthStatusMonitoringInterval; + add_timer(&Controller->MonitoringTimer); + return; + } + Controller->V2.StatusChangeCounter = StatusChangeCounter; + Controller->PrimaryMonitoringTime = jiffies; + + spin_lock_irqsave(&Controller->queue_lock, flags); + /* + Queue a Status Monitoring Command to Controller. + */ + Command = DAC960_AllocateCommand(Controller); + if (Command != NULL) + DAC960_V2_QueueMonitoringCommand(Command); + else Controller->MonitoringCommandDeferred = true; + spin_unlock_irqrestore(&Controller->queue_lock, flags); + /* + Wake up any processes waiting on a Health Status Buffer change. + */ + wake_up(&Controller->HealthStatusWaitQueue); + } +} + +/* + DAC960_CheckStatusBuffer verifies that there is room to hold ByteCount + additional bytes in the Combined Status Buffer and grows the buffer if + necessary. It returns true if there is enough room and false otherwise. +*/ + +static bool DAC960_CheckStatusBuffer(DAC960_Controller_T *Controller, + unsigned int ByteCount) +{ + unsigned char *NewStatusBuffer; + if (Controller->InitialStatusLength + 1 + + Controller->CurrentStatusLength + ByteCount + 1 <= + Controller->CombinedStatusBufferLength) + return true; + if (Controller->CombinedStatusBufferLength == 0) + { + unsigned int NewStatusBufferLength = DAC960_InitialStatusBufferSize; + while (NewStatusBufferLength < ByteCount) + NewStatusBufferLength *= 2; + Controller->CombinedStatusBuffer = kmalloc(NewStatusBufferLength, + GFP_ATOMIC); + if (Controller->CombinedStatusBuffer == NULL) return false; + Controller->CombinedStatusBufferLength = NewStatusBufferLength; + return true; + } + NewStatusBuffer = kmalloc(2 * Controller->CombinedStatusBufferLength, + GFP_ATOMIC); + if (NewStatusBuffer == NULL) + { + DAC960_Warning("Unable to expand Combined Status Buffer - Truncating\n", + Controller); + return false; + } + memcpy(NewStatusBuffer, Controller->CombinedStatusBuffer, + Controller->CombinedStatusBufferLength); + kfree(Controller->CombinedStatusBuffer); + Controller->CombinedStatusBuffer = NewStatusBuffer; + Controller->CombinedStatusBufferLength *= 2; + Controller->CurrentStatusBuffer = + &NewStatusBuffer[Controller->InitialStatusLength + 1]; + return true; +} + + +/* + DAC960_Message prints Driver Messages. +*/ + +static void DAC960_Message(DAC960_MessageLevel_T MessageLevel, + unsigned char *Format, + DAC960_Controller_T *Controller, + ...) +{ + static unsigned char Buffer[DAC960_LineBufferSize]; + static bool BeginningOfLine = true; + va_list Arguments; + int Length = 0; + va_start(Arguments, Controller); + Length = vsprintf(Buffer, Format, Arguments); + va_end(Arguments); + if (Controller == NULL) + printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel], + DAC960_ControllerCount, Buffer); + else if (MessageLevel == DAC960_AnnounceLevel || + MessageLevel == DAC960_InfoLevel) + { + if (!Controller->ControllerInitialized) + { + if (DAC960_CheckStatusBuffer(Controller, Length)) + { + strcpy(&Controller->CombinedStatusBuffer + [Controller->InitialStatusLength], + Buffer); + Controller->InitialStatusLength += Length; + Controller->CurrentStatusBuffer = + &Controller->CombinedStatusBuffer + [Controller->InitialStatusLength + 1]; + } + if (MessageLevel == DAC960_AnnounceLevel) + { + static int AnnouncementLines = 0; + if (++AnnouncementLines <= 2) + printk("%sDAC960: %s", DAC960_MessageLevelMap[MessageLevel], + Buffer); + } + else + { + if (BeginningOfLine) + { + if (Buffer[0] != '\n' || Length > 1) + printk("%sDAC960#%d: %s", + DAC960_MessageLevelMap[MessageLevel], + Controller->ControllerNumber, Buffer); + } + else printk("%s", Buffer); + } + } + else if (DAC960_CheckStatusBuffer(Controller, Length)) + { + strcpy(&Controller->CurrentStatusBuffer[ + Controller->CurrentStatusLength], Buffer); + Controller->CurrentStatusLength += Length; + } + } + else if (MessageLevel == DAC960_ProgressLevel) + { + strcpy(Controller->ProgressBuffer, Buffer); + Controller->ProgressBufferLength = Length; + if (Controller->EphemeralProgressMessage) + { + if (time_after_eq(jiffies, Controller->LastProgressReportTime + + DAC960_ProgressReportingInterval)) + { + printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel], + Controller->ControllerNumber, Buffer); + Controller->LastProgressReportTime = jiffies; + } + } + else printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel], + Controller->ControllerNumber, Buffer); + } + else if (MessageLevel == DAC960_UserCriticalLevel) + { + strcpy(&Controller->UserStatusBuffer[Controller->UserStatusLength], + Buffer); + Controller->UserStatusLength += Length; + if (Buffer[0] != '\n' || Length > 1) + printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel], + Controller->ControllerNumber, Buffer); + } + else + { + if (BeginningOfLine) + printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel], + Controller->ControllerNumber, Buffer); + else printk("%s", Buffer); + } + BeginningOfLine = (Buffer[Length-1] == '\n'); +} + + +/* + DAC960_ParsePhysicalDevice parses spaces followed by a Physical Device + Channel:TargetID specification from a User Command string. It updates + Channel and TargetID and returns true on success and false on failure. +*/ + +static bool DAC960_ParsePhysicalDevice(DAC960_Controller_T *Controller, + char *UserCommandString, + unsigned char *Channel, + unsigned char *TargetID) +{ + char *NewUserCommandString = UserCommandString; + unsigned long XChannel, XTargetID; + while (*UserCommandString == ' ') UserCommandString++; + if (UserCommandString == NewUserCommandString) + return false; + XChannel = simple_strtoul(UserCommandString, &NewUserCommandString, 10); + if (NewUserCommandString == UserCommandString || + *NewUserCommandString != ':' || + XChannel >= Controller->Channels) + return false; + UserCommandString = ++NewUserCommandString; + XTargetID = simple_strtoul(UserCommandString, &NewUserCommandString, 10); + if (NewUserCommandString == UserCommandString || + *NewUserCommandString != '\0' || + XTargetID >= Controller->Targets) + return false; + *Channel = XChannel; + *TargetID = XTargetID; + return true; +} + + +/* + DAC960_ParseLogicalDrive parses spaces followed by a Logical Drive Number + specification from a User Command string. It updates LogicalDriveNumber and + returns true on success and false on failure. +*/ + +static bool DAC960_ParseLogicalDrive(DAC960_Controller_T *Controller, + char *UserCommandString, + unsigned char *LogicalDriveNumber) +{ + char *NewUserCommandString = UserCommandString; + unsigned long XLogicalDriveNumber; + while (*UserCommandString == ' ') UserCommandString++; + if (UserCommandString == NewUserCommandString) + return false; + XLogicalDriveNumber = + simple_strtoul(UserCommandString, &NewUserCommandString, 10); + if (NewUserCommandString == UserCommandString || + *NewUserCommandString != '\0' || + XLogicalDriveNumber > DAC960_MaxLogicalDrives - 1) + return false; + *LogicalDriveNumber = XLogicalDriveNumber; + return true; +} + + +/* + DAC960_V1_SetDeviceState sets the Device State for a Physical Device for + DAC960 V1 Firmware Controllers. +*/ + +static void DAC960_V1_SetDeviceState(DAC960_Controller_T *Controller, + DAC960_Command_T *Command, + unsigned char Channel, + unsigned char TargetID, + DAC960_V1_PhysicalDeviceState_T + DeviceState, + const unsigned char *DeviceStateString) +{ + DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox; + CommandMailbox->Type3D.CommandOpcode = DAC960_V1_StartDevice; + CommandMailbox->Type3D.Channel = Channel; + CommandMailbox->Type3D.TargetID = TargetID; + CommandMailbox->Type3D.DeviceState = DeviceState; + CommandMailbox->Type3D.Modifier = 0; + DAC960_ExecuteCommand(Command); + switch (Command->V1.CommandStatus) + { + case DAC960_V1_NormalCompletion: + DAC960_UserCritical("%s of Physical Device %d:%d Succeeded\n", Controller, + DeviceStateString, Channel, TargetID); + break; + case DAC960_V1_UnableToStartDevice: + DAC960_UserCritical("%s of Physical Device %d:%d Failed - " + "Unable to Start Device\n", Controller, + DeviceStateString, Channel, TargetID); + break; + case DAC960_V1_NoDeviceAtAddress: + DAC960_UserCritical("%s of Physical Device %d:%d Failed - " + "No Device at Address\n", Controller, + DeviceStateString, Channel, TargetID); + break; + case DAC960_V1_InvalidChannelOrTargetOrModifier: + DAC960_UserCritical("%s of Physical Device %d:%d Failed - " + "Invalid Channel or Target or Modifier\n", + Controller, DeviceStateString, Channel, TargetID); + break; + case DAC960_V1_ChannelBusy: + DAC960_UserCritical("%s of Physical Device %d:%d Failed - " + "Channel Busy\n", Controller, + DeviceStateString, Channel, TargetID); + break; + default: + DAC960_UserCritical("%s of Physical Device %d:%d Failed - " + "Unexpected Status %04X\n", Controller, + DeviceStateString, Channel, TargetID, + Command->V1.CommandStatus); + break; + } +} + + +/* + DAC960_V1_ExecuteUserCommand executes a User Command for DAC960 V1 Firmware + Controllers. +*/ + +static bool DAC960_V1_ExecuteUserCommand(DAC960_Controller_T *Controller, + unsigned char *UserCommand) +{ + DAC960_Command_T *Command; + DAC960_V1_CommandMailbox_T *CommandMailbox; + unsigned long flags; + unsigned char Channel, TargetID, LogicalDriveNumber; + + spin_lock_irqsave(&Controller->queue_lock, flags); + while ((Command = DAC960_AllocateCommand(Controller)) == NULL) + DAC960_WaitForCommand(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + Controller->UserStatusLength = 0; + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox = &Command->V1.CommandMailbox; + if (strcmp(UserCommand, "flush-cache") == 0) + { + CommandMailbox->Type3.CommandOpcode = DAC960_V1_Flush; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Cache Flush Completed\n", Controller); + } + else if (strncmp(UserCommand, "kill", 4) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[4], + &Channel, &TargetID)) + { + DAC960_V1_DeviceState_T *DeviceState = + &Controller->V1.DeviceState[Channel][TargetID]; + if (DeviceState->Present && + DeviceState->DeviceType == DAC960_V1_DiskType && + DeviceState->DeviceState != DAC960_V1_Device_Dead) + DAC960_V1_SetDeviceState(Controller, Command, Channel, TargetID, + DAC960_V1_Device_Dead, "Kill"); + else DAC960_UserCritical("Kill of Physical Device %d:%d Illegal\n", + Controller, Channel, TargetID); + } + else if (strncmp(UserCommand, "make-online", 11) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[11], + &Channel, &TargetID)) + { + DAC960_V1_DeviceState_T *DeviceState = + &Controller->V1.DeviceState[Channel][TargetID]; + if (DeviceState->Present && + DeviceState->DeviceType == DAC960_V1_DiskType && + DeviceState->DeviceState == DAC960_V1_Device_Dead) + DAC960_V1_SetDeviceState(Controller, Command, Channel, TargetID, + DAC960_V1_Device_Online, "Make Online"); + else DAC960_UserCritical("Make Online of Physical Device %d:%d Illegal\n", + Controller, Channel, TargetID); + + } + else if (strncmp(UserCommand, "make-standby", 12) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[12], + &Channel, &TargetID)) + { + DAC960_V1_DeviceState_T *DeviceState = + &Controller->V1.DeviceState[Channel][TargetID]; + if (DeviceState->Present && + DeviceState->DeviceType == DAC960_V1_DiskType && + DeviceState->DeviceState == DAC960_V1_Device_Dead) + DAC960_V1_SetDeviceState(Controller, Command, Channel, TargetID, + DAC960_V1_Device_Standby, "Make Standby"); + else DAC960_UserCritical("Make Standby of Physical " + "Device %d:%d Illegal\n", + Controller, Channel, TargetID); + } + else if (strncmp(UserCommand, "rebuild", 7) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[7], + &Channel, &TargetID)) + { + CommandMailbox->Type3D.CommandOpcode = DAC960_V1_RebuildAsync; + CommandMailbox->Type3D.Channel = Channel; + CommandMailbox->Type3D.TargetID = TargetID; + DAC960_ExecuteCommand(Command); + switch (Command->V1.CommandStatus) + { + case DAC960_V1_NormalCompletion: + DAC960_UserCritical("Rebuild of Physical Device %d:%d Initiated\n", + Controller, Channel, TargetID); + break; + case DAC960_V1_AttemptToRebuildOnlineDrive: + DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - " + "Attempt to Rebuild Online or " + "Unresponsive Drive\n", + Controller, Channel, TargetID); + break; + case DAC960_V1_NewDiskFailedDuringRebuild: + DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - " + "New Disk Failed During Rebuild\n", + Controller, Channel, TargetID); + break; + case DAC960_V1_InvalidDeviceAddress: + DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - " + "Invalid Device Address\n", + Controller, Channel, TargetID); + break; + case DAC960_V1_RebuildOrCheckAlreadyInProgress: + DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - " + "Rebuild or Consistency Check Already " + "in Progress\n", Controller, Channel, TargetID); + break; + default: + DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - " + "Unexpected Status %04X\n", Controller, + Channel, TargetID, Command->V1.CommandStatus); + break; + } + } + else if (strncmp(UserCommand, "check-consistency", 17) == 0 && + DAC960_ParseLogicalDrive(Controller, &UserCommand[17], + &LogicalDriveNumber)) + { + CommandMailbox->Type3C.CommandOpcode = DAC960_V1_CheckConsistencyAsync; + CommandMailbox->Type3C.LogicalDriveNumber = LogicalDriveNumber; + CommandMailbox->Type3C.AutoRestore = true; + DAC960_ExecuteCommand(Command); + switch (Command->V1.CommandStatus) + { + case DAC960_V1_NormalCompletion: + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) Initiated\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + break; + case DAC960_V1_DependentDiskIsDead: + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) Failed - " + "Dependent Physical Device is DEAD\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + break; + case DAC960_V1_InvalidOrNonredundantLogicalDrive: + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) Failed - " + "Invalid or Nonredundant Logical Drive\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + break; + case DAC960_V1_RebuildOrCheckAlreadyInProgress: + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) Failed - Rebuild or " + "Consistency Check Already in Progress\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber); + break; + default: + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) Failed - " + "Unexpected Status %04X\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, Command->V1.CommandStatus); + break; + } + } + else if (strcmp(UserCommand, "cancel-rebuild") == 0 || + strcmp(UserCommand, "cancel-consistency-check") == 0) + { + /* + the OldRebuildRateConstant is never actually used + once its value is retrieved from the controller. + */ + unsigned char *OldRebuildRateConstant; + dma_addr_t OldRebuildRateConstantDMA; + + OldRebuildRateConstant = pci_alloc_consistent( Controller->PCIDevice, + sizeof(char), &OldRebuildRateConstantDMA); + if (OldRebuildRateConstant == NULL) { + DAC960_UserCritical("Cancellation of Rebuild or " + "Consistency Check Failed - " + "Out of Memory", + Controller); + goto failure; + } + CommandMailbox->Type3R.CommandOpcode = DAC960_V1_RebuildControl; + CommandMailbox->Type3R.RebuildRateConstant = 0xFF; + CommandMailbox->Type3R.BusAddress = OldRebuildRateConstantDMA; + DAC960_ExecuteCommand(Command); + switch (Command->V1.CommandStatus) + { + case DAC960_V1_NormalCompletion: + DAC960_UserCritical("Rebuild or Consistency Check Cancelled\n", + Controller); + break; + default: + DAC960_UserCritical("Cancellation of Rebuild or " + "Consistency Check Failed - " + "Unexpected Status %04X\n", + Controller, Command->V1.CommandStatus); + break; + } +failure: + pci_free_consistent(Controller->PCIDevice, sizeof(char), + OldRebuildRateConstant, OldRebuildRateConstantDMA); + } + else DAC960_UserCritical("Illegal User Command: '%s'\n", + Controller, UserCommand); + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_DeallocateCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return true; +} + + +/* + DAC960_V2_TranslatePhysicalDevice translates a Physical Device Channel and + TargetID into a Logical Device. It returns true on success and false + on failure. +*/ + +static bool DAC960_V2_TranslatePhysicalDevice(DAC960_Command_T *Command, + unsigned char Channel, + unsigned char TargetID, + unsigned short + *LogicalDeviceNumber) +{ + DAC960_V2_CommandMailbox_T SavedCommandMailbox, *CommandMailbox; + DAC960_Controller_T *Controller = Command->Controller; + + CommandMailbox = &Command->V2.CommandMailbox; + memcpy(&SavedCommandMailbox, CommandMailbox, + sizeof(DAC960_V2_CommandMailbox_T)); + + CommandMailbox->PhysicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->PhysicalDeviceInfo.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->PhysicalDeviceInfo.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->PhysicalDeviceInfo.DataTransferSize = + sizeof(DAC960_V2_PhysicalToLogicalDevice_T); + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.TargetID = TargetID; + CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.Channel = Channel; + CommandMailbox->PhysicalDeviceInfo.IOCTL_Opcode = + DAC960_V2_TranslatePhysicalToLogicalDevice; + CommandMailbox->Common.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.PhysicalToLogicalDeviceDMA; + CommandMailbox->Common.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->Common.DataTransferSize; + + DAC960_ExecuteCommand(Command); + *LogicalDeviceNumber = Controller->V2.PhysicalToLogicalDevice->LogicalDeviceNumber; + + memcpy(CommandMailbox, &SavedCommandMailbox, + sizeof(DAC960_V2_CommandMailbox_T)); + return (Command->V2.CommandStatus == DAC960_V2_NormalCompletion); +} + + +/* + DAC960_V2_ExecuteUserCommand executes a User Command for DAC960 V2 Firmware + Controllers. +*/ + +static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller, + unsigned char *UserCommand) +{ + DAC960_Command_T *Command; + DAC960_V2_CommandMailbox_T *CommandMailbox; + unsigned long flags; + unsigned char Channel, TargetID, LogicalDriveNumber; + unsigned short LogicalDeviceNumber; + + spin_lock_irqsave(&Controller->queue_lock, flags); + while ((Command = DAC960_AllocateCommand(Controller)) == NULL) + DAC960_WaitForCommand(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + Controller->UserStatusLength = 0; + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox = &Command->V2.CommandMailbox; + CommandMailbox->Common.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->Common.CommandControlBits.DataTransferControllerToHost = true; + CommandMailbox->Common.CommandControlBits.NoAutoRequestSense = true; + if (strcmp(UserCommand, "flush-cache") == 0) + { + CommandMailbox->DeviceOperation.IOCTL_Opcode = DAC960_V2_PauseDevice; + CommandMailbox->DeviceOperation.OperationDevice = + DAC960_V2_RAID_Controller; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Cache Flush Completed\n", Controller); + } + else if (strncmp(UserCommand, "kill", 4) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[4], + &Channel, &TargetID) && + DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID, + &LogicalDeviceNumber)) + { + CommandMailbox->SetDeviceState.LogicalDevice.LogicalDeviceNumber = + LogicalDeviceNumber; + CommandMailbox->SetDeviceState.IOCTL_Opcode = + DAC960_V2_SetDeviceState; + CommandMailbox->SetDeviceState.DeviceState.PhysicalDeviceState = + DAC960_V2_Device_Dead; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Kill of Physical Device %d:%d %s\n", + Controller, Channel, TargetID, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Succeeded" : "Failed")); + } + else if (strncmp(UserCommand, "make-online", 11) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[11], + &Channel, &TargetID) && + DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID, + &LogicalDeviceNumber)) + { + CommandMailbox->SetDeviceState.LogicalDevice.LogicalDeviceNumber = + LogicalDeviceNumber; + CommandMailbox->SetDeviceState.IOCTL_Opcode = + DAC960_V2_SetDeviceState; + CommandMailbox->SetDeviceState.DeviceState.PhysicalDeviceState = + DAC960_V2_Device_Online; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Make Online of Physical Device %d:%d %s\n", + Controller, Channel, TargetID, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Succeeded" : "Failed")); + } + else if (strncmp(UserCommand, "make-standby", 12) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[12], + &Channel, &TargetID) && + DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID, + &LogicalDeviceNumber)) + { + CommandMailbox->SetDeviceState.LogicalDevice.LogicalDeviceNumber = + LogicalDeviceNumber; + CommandMailbox->SetDeviceState.IOCTL_Opcode = + DAC960_V2_SetDeviceState; + CommandMailbox->SetDeviceState.DeviceState.PhysicalDeviceState = + DAC960_V2_Device_Standby; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Make Standby of Physical Device %d:%d %s\n", + Controller, Channel, TargetID, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Succeeded" : "Failed")); + } + else if (strncmp(UserCommand, "rebuild", 7) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[7], + &Channel, &TargetID) && + DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID, + &LogicalDeviceNumber)) + { + CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber = + LogicalDeviceNumber; + CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode = + DAC960_V2_RebuildDeviceStart; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Rebuild of Physical Device %d:%d %s\n", + Controller, Channel, TargetID, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Initiated" : "Not Initiated")); + } + else if (strncmp(UserCommand, "cancel-rebuild", 14) == 0 && + DAC960_ParsePhysicalDevice(Controller, &UserCommand[14], + &Channel, &TargetID) && + DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID, + &LogicalDeviceNumber)) + { + CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber = + LogicalDeviceNumber; + CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode = + DAC960_V2_RebuildDeviceStop; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Rebuild of Physical Device %d:%d %s\n", + Controller, Channel, TargetID, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Cancelled" : "Not Cancelled")); + } + else if (strncmp(UserCommand, "check-consistency", 17) == 0 && + DAC960_ParseLogicalDrive(Controller, &UserCommand[17], + &LogicalDriveNumber)) + { + CommandMailbox->ConsistencyCheck.LogicalDevice.LogicalDeviceNumber = + LogicalDriveNumber; + CommandMailbox->ConsistencyCheck.IOCTL_Opcode = + DAC960_V2_ConsistencyCheckStart; + CommandMailbox->ConsistencyCheck.RestoreConsistency = true; + CommandMailbox->ConsistencyCheck.InitializedAreaOnly = false; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) %s\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Initiated" : "Not Initiated")); + } + else if (strncmp(UserCommand, "cancel-consistency-check", 24) == 0 && + DAC960_ParseLogicalDrive(Controller, &UserCommand[24], + &LogicalDriveNumber)) + { + CommandMailbox->ConsistencyCheck.LogicalDevice.LogicalDeviceNumber = + LogicalDriveNumber; + CommandMailbox->ConsistencyCheck.IOCTL_Opcode = + DAC960_V2_ConsistencyCheckStop; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Consistency Check of Logical Drive %d " + "(/dev/rd/c%dd%d) %s\n", + Controller, LogicalDriveNumber, + Controller->ControllerNumber, + LogicalDriveNumber, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Cancelled" : "Not Cancelled")); + } + else if (strcmp(UserCommand, "perform-discovery") == 0) + { + CommandMailbox->Common.IOCTL_Opcode = DAC960_V2_StartDiscovery; + DAC960_ExecuteCommand(Command); + DAC960_UserCritical("Discovery %s\n", Controller, + (Command->V2.CommandStatus + == DAC960_V2_NormalCompletion + ? "Initiated" : "Not Initiated")); + if (Command->V2.CommandStatus == DAC960_V2_NormalCompletion) + { + CommandMailbox->ControllerInfo.CommandOpcode = DAC960_V2_IOCTL; + CommandMailbox->ControllerInfo.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->ControllerInfo.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->ControllerInfo.DataTransferSize = + sizeof(DAC960_V2_ControllerInfo_T); + CommandMailbox->ControllerInfo.ControllerNumber = 0; + CommandMailbox->ControllerInfo.IOCTL_Opcode = + DAC960_V2_GetControllerInfo; + /* + * How does this NOT race with the queued Monitoring + * usage of this structure? + */ + CommandMailbox->ControllerInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = + Controller->V2.NewControllerInformationDMA; + CommandMailbox->ControllerInfo.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->ControllerInfo.DataTransferSize; + while (1) { + DAC960_ExecuteCommand(Command); + if (!Controller->V2.NewControllerInformation->PhysicalScanActive) + break; + msleep(1000); + } + DAC960_UserCritical("Discovery Completed\n", Controller); + } + } + else if (strcmp(UserCommand, "suppress-enclosure-messages") == 0) + Controller->SuppressEnclosureMessages = true; + else DAC960_UserCritical("Illegal User Command: '%s'\n", + Controller, UserCommand); + + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_DeallocateCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + return true; +} + +static int dac960_proc_show(struct seq_file *m, void *v) +{ + unsigned char *StatusMessage = "OK\n"; + int ControllerNumber; + for (ControllerNumber = 0; + ControllerNumber < DAC960_ControllerCount; + ControllerNumber++) + { + DAC960_Controller_T *Controller = DAC960_Controllers[ControllerNumber]; + if (Controller == NULL) continue; + if (Controller->MonitoringAlertMode) + { + StatusMessage = "ALERT\n"; + break; + } + } + seq_puts(m, StatusMessage); + return 0; +} + +static int dac960_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, dac960_proc_show, NULL); +} + +static const struct file_operations dac960_proc_fops = { + .owner = THIS_MODULE, + .open = dac960_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int dac960_initial_status_proc_show(struct seq_file *m, void *v) +{ + DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private; + seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer); + return 0; +} + +static int dac960_initial_status_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, dac960_initial_status_proc_show, PDE_DATA(inode)); +} + +static const struct file_operations dac960_initial_status_proc_fops = { + .owner = THIS_MODULE, + .open = dac960_initial_status_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int dac960_current_status_proc_show(struct seq_file *m, void *v) +{ + DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private; + unsigned char *StatusMessage = + "No Rebuild or Consistency Check in Progress\n"; + int ProgressMessageLength = strlen(StatusMessage); + if (jiffies != Controller->LastCurrentStatusTime) + { + Controller->CurrentStatusLength = 0; + DAC960_AnnounceDriver(Controller); + DAC960_ReportControllerConfiguration(Controller); + DAC960_ReportDeviceConfiguration(Controller); + if (Controller->ProgressBufferLength > 0) + ProgressMessageLength = Controller->ProgressBufferLength; + if (DAC960_CheckStatusBuffer(Controller, 2 + ProgressMessageLength)) + { + unsigned char *CurrentStatusBuffer = Controller->CurrentStatusBuffer; + CurrentStatusBuffer[Controller->CurrentStatusLength++] = ' '; + CurrentStatusBuffer[Controller->CurrentStatusLength++] = ' '; + if (Controller->ProgressBufferLength > 0) + strcpy(&CurrentStatusBuffer[Controller->CurrentStatusLength], + Controller->ProgressBuffer); + else + strcpy(&CurrentStatusBuffer[Controller->CurrentStatusLength], + StatusMessage); + Controller->CurrentStatusLength += ProgressMessageLength; + } + Controller->LastCurrentStatusTime = jiffies; + } + seq_printf(m, "%.*s", Controller->CurrentStatusLength, Controller->CurrentStatusBuffer); + return 0; +} + +static int dac960_current_status_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, dac960_current_status_proc_show, PDE_DATA(inode)); +} + +static const struct file_operations dac960_current_status_proc_fops = { + .owner = THIS_MODULE, + .open = dac960_current_status_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int dac960_user_command_proc_show(struct seq_file *m, void *v) +{ + DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private; + + seq_printf(m, "%.*s", Controller->UserStatusLength, Controller->UserStatusBuffer); + return 0; +} + +static int dac960_user_command_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, dac960_user_command_proc_show, PDE_DATA(inode)); +} + +static ssize_t dac960_user_command_proc_write(struct file *file, + const char __user *Buffer, + size_t Count, loff_t *pos) +{ + DAC960_Controller_T *Controller = PDE_DATA(file_inode(file)); + unsigned char CommandBuffer[80]; + int Length; + if (Count > sizeof(CommandBuffer)-1) return -EINVAL; + if (copy_from_user(CommandBuffer, Buffer, Count)) return -EFAULT; + CommandBuffer[Count] = '\0'; + Length = strlen(CommandBuffer); + if (Length > 0 && CommandBuffer[Length-1] == '\n') + CommandBuffer[--Length] = '\0'; + if (Controller->FirmwareType == DAC960_V1_Controller) + return (DAC960_V1_ExecuteUserCommand(Controller, CommandBuffer) + ? Count : -EBUSY); + else + return (DAC960_V2_ExecuteUserCommand(Controller, CommandBuffer) + ? Count : -EBUSY); +} + +static const struct file_operations dac960_user_command_proc_fops = { + .owner = THIS_MODULE, + .open = dac960_user_command_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = dac960_user_command_proc_write, +}; + +/* + DAC960_CreateProcEntries creates the /proc/rd/... entries for the + DAC960 Driver. +*/ + +static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller) +{ + struct proc_dir_entry *ControllerProcEntry; + + if (DAC960_ProcDirectoryEntry == NULL) { + DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); + proc_create("status", 0, DAC960_ProcDirectoryEntry, + &dac960_proc_fops); + } + + sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); + ControllerProcEntry = proc_mkdir(Controller->ControllerName, + DAC960_ProcDirectoryEntry); + proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); + proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); + proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); + Controller->ControllerProcEntry = ControllerProcEntry; +} + + +/* + DAC960_DestroyProcEntries destroys the /proc/rd/... entries for the + DAC960 Driver. +*/ + +static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller) +{ + if (Controller->ControllerProcEntry == NULL) + return; + remove_proc_entry("initial_status", Controller->ControllerProcEntry); + remove_proc_entry("current_status", Controller->ControllerProcEntry); + remove_proc_entry("user_command", Controller->ControllerProcEntry); + remove_proc_entry(Controller->ControllerName, DAC960_ProcDirectoryEntry); + Controller->ControllerProcEntry = NULL; +} + +#ifdef DAC960_GAM_MINOR + +/* + * DAC960_gam_ioctl is the ioctl function for performing RAID operations. +*/ + +static long DAC960_gam_ioctl(struct file *file, unsigned int Request, + unsigned long Argument) +{ + long ErrorCode = 0; + if (!capable(CAP_SYS_ADMIN)) return -EACCES; + + mutex_lock(&DAC960_mutex); + switch (Request) + { + case DAC960_IOCTL_GET_CONTROLLER_COUNT: + ErrorCode = DAC960_ControllerCount; + break; + case DAC960_IOCTL_GET_CONTROLLER_INFO: + { + DAC960_ControllerInfo_T __user *UserSpaceControllerInfo = + (DAC960_ControllerInfo_T __user *) Argument; + DAC960_ControllerInfo_T ControllerInfo; + DAC960_Controller_T *Controller; + int ControllerNumber; + if (UserSpaceControllerInfo == NULL) + ErrorCode = -EINVAL; + else ErrorCode = get_user(ControllerNumber, + &UserSpaceControllerInfo->ControllerNumber); + if (ErrorCode != 0) + break; + ErrorCode = -ENXIO; + if (ControllerNumber < 0 || + ControllerNumber > DAC960_ControllerCount - 1) { + break; + } + Controller = DAC960_Controllers[ControllerNumber]; + if (Controller == NULL) + break; + memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T)); + ControllerInfo.ControllerNumber = ControllerNumber; + ControllerInfo.FirmwareType = Controller->FirmwareType; + ControllerInfo.Channels = Controller->Channels; + ControllerInfo.Targets = Controller->Targets; + ControllerInfo.PCI_Bus = Controller->Bus; + ControllerInfo.PCI_Device = Controller->Device; + ControllerInfo.PCI_Function = Controller->Function; + ControllerInfo.IRQ_Channel = Controller->IRQ_Channel; + ControllerInfo.PCI_Address = Controller->PCI_Address; + strcpy(ControllerInfo.ModelName, Controller->ModelName); + strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion); + ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo, + sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0); + break; + } + case DAC960_IOCTL_V1_EXECUTE_COMMAND: + { + DAC960_V1_UserCommand_T __user *UserSpaceUserCommand = + (DAC960_V1_UserCommand_T __user *) Argument; + DAC960_V1_UserCommand_T UserCommand; + DAC960_Controller_T *Controller; + DAC960_Command_T *Command = NULL; + DAC960_V1_CommandOpcode_T CommandOpcode; + DAC960_V1_CommandStatus_T CommandStatus; + DAC960_V1_DCDB_T DCDB; + DAC960_V1_DCDB_T *DCDB_IOBUF = NULL; + dma_addr_t DCDB_IOBUFDMA; + unsigned long flags; + int ControllerNumber, DataTransferLength; + unsigned char *DataTransferBuffer = NULL; + dma_addr_t DataTransferBufferDMA; + if (UserSpaceUserCommand == NULL) { + ErrorCode = -EINVAL; + break; + } + if (copy_from_user(&UserCommand, UserSpaceUserCommand, + sizeof(DAC960_V1_UserCommand_T))) { + ErrorCode = -EFAULT; + break; + } + ControllerNumber = UserCommand.ControllerNumber; + ErrorCode = -ENXIO; + if (ControllerNumber < 0 || + ControllerNumber > DAC960_ControllerCount - 1) + break; + Controller = DAC960_Controllers[ControllerNumber]; + if (Controller == NULL) + break; + ErrorCode = -EINVAL; + if (Controller->FirmwareType != DAC960_V1_Controller) + break; + CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode; + DataTransferLength = UserCommand.DataTransferLength; + if (CommandOpcode & 0x80) + break; + if (CommandOpcode == DAC960_V1_DCDB) + { + if (copy_from_user(&DCDB, UserCommand.DCDB, + sizeof(DAC960_V1_DCDB_T))) { + ErrorCode = -EFAULT; + break; + } + if (DCDB.Channel >= DAC960_V1_MaxChannels) + break; + if (!((DataTransferLength == 0 && + DCDB.Direction + == DAC960_V1_DCDB_NoDataTransfer) || + (DataTransferLength > 0 && + DCDB.Direction + == DAC960_V1_DCDB_DataTransferDeviceToSystem) || + (DataTransferLength < 0 && + DCDB.Direction + == DAC960_V1_DCDB_DataTransferSystemToDevice))) + break; + if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength) + != abs(DataTransferLength)) + break; + DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice, + sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA); + if (DCDB_IOBUF == NULL) { + ErrorCode = -ENOMEM; + break; + } + } + ErrorCode = -ENOMEM; + if (DataTransferLength > 0) + { + DataTransferBuffer = pci_zalloc_consistent(Controller->PCIDevice, + DataTransferLength, + &DataTransferBufferDMA); + if (DataTransferBuffer == NULL) + break; + } + else if (DataTransferLength < 0) + { + DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, + -DataTransferLength, &DataTransferBufferDMA); + if (DataTransferBuffer == NULL) + break; + if (copy_from_user(DataTransferBuffer, + UserCommand.DataTransferBuffer, + -DataTransferLength)) { + ErrorCode = -EFAULT; + break; + } + } + if (CommandOpcode == DAC960_V1_DCDB) + { + spin_lock_irqsave(&Controller->queue_lock, flags); + while ((Command = DAC960_AllocateCommand(Controller)) == NULL) + DAC960_WaitForCommand(Controller); + while (Controller->V1.DirectCommandActive[DCDB.Channel] + [DCDB.TargetID]) + { + spin_unlock_irq(&Controller->queue_lock); + __wait_event(Controller->CommandWaitQueue, + !Controller->V1.DirectCommandActive + [DCDB.Channel][DCDB.TargetID]); + spin_lock_irq(&Controller->queue_lock); + } + Controller->V1.DirectCommandActive[DCDB.Channel] + [DCDB.TargetID] = true; + spin_unlock_irqrestore(&Controller->queue_lock, flags); + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + memcpy(&Command->V1.CommandMailbox, &UserCommand.CommandMailbox, + sizeof(DAC960_V1_CommandMailbox_T)); + Command->V1.CommandMailbox.Type3.BusAddress = DCDB_IOBUFDMA; + DCDB.BusAddress = DataTransferBufferDMA; + memcpy(DCDB_IOBUF, &DCDB, sizeof(DAC960_V1_DCDB_T)); + } + else + { + spin_lock_irqsave(&Controller->queue_lock, flags); + while ((Command = DAC960_AllocateCommand(Controller)) == NULL) + DAC960_WaitForCommand(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + DAC960_V1_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + memcpy(&Command->V1.CommandMailbox, &UserCommand.CommandMailbox, + sizeof(DAC960_V1_CommandMailbox_T)); + if (DataTransferBuffer != NULL) + Command->V1.CommandMailbox.Type3.BusAddress = + DataTransferBufferDMA; + } + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V1.CommandStatus; + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_DeallocateCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + if (DataTransferLength > 0) + { + if (copy_to_user(UserCommand.DataTransferBuffer, + DataTransferBuffer, DataTransferLength)) { + ErrorCode = -EFAULT; + goto Failure1; + } + } + if (CommandOpcode == DAC960_V1_DCDB) + { + /* + I don't believe Target or Channel in the DCDB_IOBUF + should be any different from the contents of DCDB. + */ + Controller->V1.DirectCommandActive[DCDB.Channel] + [DCDB.TargetID] = false; + if (copy_to_user(UserCommand.DCDB, DCDB_IOBUF, + sizeof(DAC960_V1_DCDB_T))) { + ErrorCode = -EFAULT; + goto Failure1; + } + } + ErrorCode = CommandStatus; + Failure1: + if (DataTransferBuffer != NULL) + pci_free_consistent(Controller->PCIDevice, abs(DataTransferLength), + DataTransferBuffer, DataTransferBufferDMA); + if (DCDB_IOBUF != NULL) + pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T), + DCDB_IOBUF, DCDB_IOBUFDMA); + break; + } + case DAC960_IOCTL_V2_EXECUTE_COMMAND: + { + DAC960_V2_UserCommand_T __user *UserSpaceUserCommand = + (DAC960_V2_UserCommand_T __user *) Argument; + DAC960_V2_UserCommand_T UserCommand; + DAC960_Controller_T *Controller; + DAC960_Command_T *Command = NULL; + DAC960_V2_CommandMailbox_T *CommandMailbox; + DAC960_V2_CommandStatus_T CommandStatus; + unsigned long flags; + int ControllerNumber, DataTransferLength; + int DataTransferResidue, RequestSenseLength; + unsigned char *DataTransferBuffer = NULL; + dma_addr_t DataTransferBufferDMA; + unsigned char *RequestSenseBuffer = NULL; + dma_addr_t RequestSenseBufferDMA; + + ErrorCode = -EINVAL; + if (UserSpaceUserCommand == NULL) + break; + if (copy_from_user(&UserCommand, UserSpaceUserCommand, + sizeof(DAC960_V2_UserCommand_T))) { + ErrorCode = -EFAULT; + break; + } + ErrorCode = -ENXIO; + ControllerNumber = UserCommand.ControllerNumber; + if (ControllerNumber < 0 || + ControllerNumber > DAC960_ControllerCount - 1) + break; + Controller = DAC960_Controllers[ControllerNumber]; + if (Controller == NULL) + break; + if (Controller->FirmwareType != DAC960_V2_Controller){ + ErrorCode = -EINVAL; + break; + } + DataTransferLength = UserCommand.DataTransferLength; + ErrorCode = -ENOMEM; + if (DataTransferLength > 0) + { + DataTransferBuffer = pci_zalloc_consistent(Controller->PCIDevice, + DataTransferLength, + &DataTransferBufferDMA); + if (DataTransferBuffer == NULL) + break; + } + else if (DataTransferLength < 0) + { + DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice, + -DataTransferLength, &DataTransferBufferDMA); + if (DataTransferBuffer == NULL) + break; + if (copy_from_user(DataTransferBuffer, + UserCommand.DataTransferBuffer, + -DataTransferLength)) { + ErrorCode = -EFAULT; + goto Failure2; + } + } + RequestSenseLength = UserCommand.RequestSenseLength; + if (RequestSenseLength > 0) + { + RequestSenseBuffer = pci_zalloc_consistent(Controller->PCIDevice, + RequestSenseLength, + &RequestSenseBufferDMA); + if (RequestSenseBuffer == NULL) + { + ErrorCode = -ENOMEM; + goto Failure2; + } + } + spin_lock_irqsave(&Controller->queue_lock, flags); + while ((Command = DAC960_AllocateCommand(Controller)) == NULL) + DAC960_WaitForCommand(Controller); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + DAC960_V2_ClearCommand(Command); + Command->CommandType = DAC960_ImmediateCommand; + CommandMailbox = &Command->V2.CommandMailbox; + memcpy(CommandMailbox, &UserCommand.CommandMailbox, + sizeof(DAC960_V2_CommandMailbox_T)); + CommandMailbox->Common.CommandControlBits + .AdditionalScatterGatherListMemory = false; + CommandMailbox->Common.CommandControlBits + .NoAutoRequestSense = true; + CommandMailbox->Common.DataTransferSize = 0; + CommandMailbox->Common.DataTransferPageNumber = 0; + memset(&CommandMailbox->Common.DataTransferMemoryAddress, 0, + sizeof(DAC960_V2_DataTransferMemoryAddress_T)); + if (DataTransferLength != 0) + { + if (DataTransferLength > 0) + { + CommandMailbox->Common.CommandControlBits + .DataTransferControllerToHost = true; + CommandMailbox->Common.DataTransferSize = DataTransferLength; + } + else + { + CommandMailbox->Common.CommandControlBits + .DataTransferControllerToHost = false; + CommandMailbox->Common.DataTransferSize = -DataTransferLength; + } + CommandMailbox->Common.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentDataPointer = DataTransferBufferDMA; + CommandMailbox->Common.DataTransferMemoryAddress + .ScatterGatherSegments[0] + .SegmentByteCount = + CommandMailbox->Common.DataTransferSize; + } + if (RequestSenseLength > 0) + { + CommandMailbox->Common.CommandControlBits + .NoAutoRequestSense = false; + CommandMailbox->Common.RequestSenseSize = RequestSenseLength; + CommandMailbox->Common.RequestSenseBusAddress = + RequestSenseBufferDMA; + } + DAC960_ExecuteCommand(Command); + CommandStatus = Command->V2.CommandStatus; + RequestSenseLength = Command->V2.RequestSenseLength; + DataTransferResidue = Command->V2.DataTransferResidue; + spin_lock_irqsave(&Controller->queue_lock, flags); + DAC960_DeallocateCommand(Command); + spin_unlock_irqrestore(&Controller->queue_lock, flags); + if (RequestSenseLength > UserCommand.RequestSenseLength) + RequestSenseLength = UserCommand.RequestSenseLength; + if (copy_to_user(&UserSpaceUserCommand->DataTransferLength, + &DataTransferResidue, + sizeof(DataTransferResidue))) { + ErrorCode = -EFAULT; + goto Failure2; + } + if (copy_to_user(&UserSpaceUserCommand->RequestSenseLength, + &RequestSenseLength, sizeof(RequestSenseLength))) { + ErrorCode = -EFAULT; + goto Failure2; + } + if (DataTransferLength > 0) + { + if (copy_to_user(UserCommand.DataTransferBuffer, + DataTransferBuffer, DataTransferLength)) { + ErrorCode = -EFAULT; + goto Failure2; + } + } + if (RequestSenseLength > 0) + { + if (copy_to_user(UserCommand.RequestSenseBuffer, + RequestSenseBuffer, RequestSenseLength)) { + ErrorCode = -EFAULT; + goto Failure2; + } + } + ErrorCode = CommandStatus; + Failure2: + pci_free_consistent(Controller->PCIDevice, abs(DataTransferLength), + DataTransferBuffer, DataTransferBufferDMA); + if (RequestSenseBuffer != NULL) + pci_free_consistent(Controller->PCIDevice, RequestSenseLength, + RequestSenseBuffer, RequestSenseBufferDMA); + break; + } + case DAC960_IOCTL_V2_GET_HEALTH_STATUS: + { + DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus = + (DAC960_V2_GetHealthStatus_T __user *) Argument; + DAC960_V2_GetHealthStatus_T GetHealthStatus; + DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer; + DAC960_Controller_T *Controller; + int ControllerNumber; + if (UserSpaceGetHealthStatus == NULL) { + ErrorCode = -EINVAL; + break; + } + if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus, + sizeof(DAC960_V2_GetHealthStatus_T))) { + ErrorCode = -EFAULT; + break; + } + ErrorCode = -ENXIO; + ControllerNumber = GetHealthStatus.ControllerNumber; + if (ControllerNumber < 0 || + ControllerNumber > DAC960_ControllerCount - 1) + break; + Controller = DAC960_Controllers[ControllerNumber]; + if (Controller == NULL) + break; + if (Controller->FirmwareType != DAC960_V2_Controller) { + ErrorCode = -EINVAL; + break; + } + if (copy_from_user(&HealthStatusBuffer, + GetHealthStatus.HealthStatusBuffer, + sizeof(DAC960_V2_HealthStatusBuffer_T))) { + ErrorCode = -EFAULT; + break; + } + ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue, + !(Controller->V2.HealthStatusBuffer->StatusChangeCounter + == HealthStatusBuffer.StatusChangeCounter && + Controller->V2.HealthStatusBuffer->NextEventSequenceNumber + == HealthStatusBuffer.NextEventSequenceNumber), + DAC960_MonitoringTimerInterval); + if (ErrorCode == -ERESTARTSYS) { + ErrorCode = -EINTR; + break; + } + if (copy_to_user(GetHealthStatus.HealthStatusBuffer, + Controller->V2.HealthStatusBuffer, + sizeof(DAC960_V2_HealthStatusBuffer_T))) + ErrorCode = -EFAULT; + else + ErrorCode = 0; + } + break; + default: + ErrorCode = -ENOTTY; + } + mutex_unlock(&DAC960_mutex); + return ErrorCode; +} + +static const struct file_operations DAC960_gam_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = DAC960_gam_ioctl, + .llseek = noop_llseek, +}; + +static struct miscdevice DAC960_gam_dev = { + DAC960_GAM_MINOR, + "dac960_gam", + &DAC960_gam_fops +}; + +static int DAC960_gam_init(void) +{ + int ret; + + ret = misc_register(&DAC960_gam_dev); + if (ret) + printk(KERN_ERR "DAC960_gam: can't misc_register on minor %d\n", DAC960_GAM_MINOR); + return ret; +} + +static void DAC960_gam_cleanup(void) +{ + misc_deregister(&DAC960_gam_dev); +} + +#endif /* DAC960_GAM_MINOR */ + +static struct DAC960_privdata DAC960_GEM_privdata = { + .HardwareType = DAC960_GEM_Controller, + .FirmwareType = DAC960_V2_Controller, + .InterruptHandler = DAC960_GEM_InterruptHandler, + .MemoryWindowSize = DAC960_GEM_RegisterWindowSize, +}; + + +static struct DAC960_privdata DAC960_BA_privdata = { + .HardwareType = DAC960_BA_Controller, + .FirmwareType = DAC960_V2_Controller, + .InterruptHandler = DAC960_BA_InterruptHandler, + .MemoryWindowSize = DAC960_BA_RegisterWindowSize, +}; + +static struct DAC960_privdata DAC960_LP_privdata = { + .HardwareType = DAC960_LP_Controller, + .FirmwareType = DAC960_V2_Controller, + .InterruptHandler = DAC960_LP_InterruptHandler, + .MemoryWindowSize = DAC960_LP_RegisterWindowSize, +}; + +static struct DAC960_privdata DAC960_LA_privdata = { + .HardwareType = DAC960_LA_Controller, + .FirmwareType = DAC960_V1_Controller, + .InterruptHandler = DAC960_LA_InterruptHandler, + .MemoryWindowSize = DAC960_LA_RegisterWindowSize, +}; + +static struct DAC960_privdata DAC960_PG_privdata = { + .HardwareType = DAC960_PG_Controller, + .FirmwareType = DAC960_V1_Controller, + .InterruptHandler = DAC960_PG_InterruptHandler, + .MemoryWindowSize = DAC960_PG_RegisterWindowSize, +}; + +static struct DAC960_privdata DAC960_PD_privdata = { + .HardwareType = DAC960_PD_Controller, + .FirmwareType = DAC960_V1_Controller, + .InterruptHandler = DAC960_PD_InterruptHandler, + .MemoryWindowSize = DAC960_PD_RegisterWindowSize, +}; + +static struct DAC960_privdata DAC960_P_privdata = { + .HardwareType = DAC960_P_Controller, + .FirmwareType = DAC960_V1_Controller, + .InterruptHandler = DAC960_P_InterruptHandler, + .MemoryWindowSize = DAC960_PD_RegisterWindowSize, +}; + +static const struct pci_device_id DAC960_id_table[] = { + { + .vendor = PCI_VENDOR_ID_MYLEX, + .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM, + .subvendor = PCI_VENDOR_ID_MYLEX, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long) &DAC960_GEM_privdata, + }, + { + .vendor = PCI_VENDOR_ID_MYLEX, + .device = PCI_DEVICE_ID_MYLEX_DAC960_BA, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long) &DAC960_BA_privdata, + }, + { + .vendor = PCI_VENDOR_ID_MYLEX, + .device = PCI_DEVICE_ID_MYLEX_DAC960_LP, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long) &DAC960_LP_privdata, + }, + { + .vendor = PCI_VENDOR_ID_DEC, + .device = PCI_DEVICE_ID_DEC_21285, + .subvendor = PCI_VENDOR_ID_MYLEX, + .subdevice = PCI_DEVICE_ID_MYLEX_DAC960_LA, + .driver_data = (unsigned long) &DAC960_LA_privdata, + }, + { + .vendor = PCI_VENDOR_ID_MYLEX, + .device = PCI_DEVICE_ID_MYLEX_DAC960_PG, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long) &DAC960_PG_privdata, + }, + { + .vendor = PCI_VENDOR_ID_MYLEX, + .device = PCI_DEVICE_ID_MYLEX_DAC960_PD, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long) &DAC960_PD_privdata, + }, + { + .vendor = PCI_VENDOR_ID_MYLEX, + .device = PCI_DEVICE_ID_MYLEX_DAC960_P, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .driver_data = (unsigned long) &DAC960_P_privdata, + }, + {0, }, +}; + +MODULE_DEVICE_TABLE(pci, DAC960_id_table); + +static struct pci_driver DAC960_pci_driver = { + .name = "DAC960", + .id_table = DAC960_id_table, + .probe = DAC960_Probe, + .remove = DAC960_Remove, +}; + +static int __init DAC960_init_module(void) +{ + int ret; + + ret = pci_register_driver(&DAC960_pci_driver); +#ifdef DAC960_GAM_MINOR + if (!ret) + DAC960_gam_init(); +#endif + return ret; +} + +static void __exit DAC960_cleanup_module(void) +{ + int i; + +#ifdef DAC960_GAM_MINOR + DAC960_gam_cleanup(); +#endif + + for (i = 0; i < DAC960_ControllerCount; i++) { + DAC960_Controller_T *Controller = DAC960_Controllers[i]; + if (Controller == NULL) + continue; + DAC960_FinalizeController(Controller); + } + if (DAC960_ProcDirectoryEntry != NULL) { + remove_proc_entry("rd/status", NULL); + remove_proc_entry("rd", NULL); + } + DAC960_ControllerCount = 0; + pci_unregister_driver(&DAC960_pci_driver); +} + +module_init(DAC960_init_module); +module_exit(DAC960_cleanup_module); + +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/DAC960.h b/kernel/drivers/block/DAC960.h new file mode 100644 index 000000000..85fa9bb63 --- /dev/null +++ b/kernel/drivers/block/DAC960.h @@ -0,0 +1,4415 @@ +/* + + Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers + + Copyright 1998-2001 by Leonard N. Zubkoff + + This program is free software; you may redistribute and/or modify it under + the terms of the GNU General Public License Version 2 as published by the + Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY, without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + for complete details. + + The author respectfully requests that any modifications to this software be + sent directly to him for evaluation and testing. + +*/ + + +/* + Define the maximum number of DAC960 Controllers supported by this driver. +*/ + +#define DAC960_MaxControllers 8 + + +/* + Define the maximum number of Controller Channels supported by DAC960 + V1 and V2 Firmware Controllers. +*/ + +#define DAC960_V1_MaxChannels 3 +#define DAC960_V2_MaxChannels 4 + + +/* + Define the maximum number of Targets per Channel supported by DAC960 + V1 and V2 Firmware Controllers. +*/ + +#define DAC960_V1_MaxTargets 16 +#define DAC960_V2_MaxTargets 128 + + +/* + Define the maximum number of Logical Drives supported by DAC960 + V1 and V2 Firmware Controllers. +*/ + +#define DAC960_MaxLogicalDrives 32 + + +/* + Define the maximum number of Physical Devices supported by DAC960 + V1 and V2 Firmware Controllers. +*/ + +#define DAC960_V1_MaxPhysicalDevices 45 +#define DAC960_V2_MaxPhysicalDevices 272 + +/* + Define a 32/64 bit I/O Address data type. +*/ + +typedef unsigned long DAC960_IO_Address_T; + + +/* + Define a 32/64 bit PCI Bus Address data type. +*/ + +typedef unsigned long DAC960_PCI_Address_T; + + +/* + Define a 32 bit Bus Address data type. +*/ + +typedef unsigned int DAC960_BusAddress32_T; + + +/* + Define a 64 bit Bus Address data type. +*/ + +typedef unsigned long long DAC960_BusAddress64_T; + + +/* + Define a 32 bit Byte Count data type. +*/ + +typedef unsigned int DAC960_ByteCount32_T; + + +/* + Define a 64 bit Byte Count data type. +*/ + +typedef unsigned long long DAC960_ByteCount64_T; + + +/* + dma_loaf is used by helper routines to divide a region of + dma mapped memory into smaller pieces, where those pieces + are not of uniform size. + */ + +struct dma_loaf { + void *cpu_base; + dma_addr_t dma_base; + size_t length; + void *cpu_free; + dma_addr_t dma_free; +}; + +/* + Define the SCSI INQUIRY Standard Data structure. +*/ + +typedef struct DAC960_SCSI_Inquiry +{ + unsigned char PeripheralDeviceType:5; /* Byte 0 Bits 0-4 */ + unsigned char PeripheralQualifier:3; /* Byte 0 Bits 5-7 */ + unsigned char DeviceTypeModifier:7; /* Byte 1 Bits 0-6 */ + bool RMB:1; /* Byte 1 Bit 7 */ + unsigned char ANSI_ApprovedVersion:3; /* Byte 2 Bits 0-2 */ + unsigned char ECMA_Version:3; /* Byte 2 Bits 3-5 */ + unsigned char ISO_Version:2; /* Byte 2 Bits 6-7 */ + unsigned char ResponseDataFormat:4; /* Byte 3 Bits 0-3 */ + unsigned char :2; /* Byte 3 Bits 4-5 */ + bool TrmIOP:1; /* Byte 3 Bit 6 */ + bool AENC:1; /* Byte 3 Bit 7 */ + unsigned char AdditionalLength; /* Byte 4 */ + unsigned char :8; /* Byte 5 */ + unsigned char :8; /* Byte 6 */ + bool SftRe:1; /* Byte 7 Bit 0 */ + bool CmdQue:1; /* Byte 7 Bit 1 */ + bool :1; /* Byte 7 Bit 2 */ + bool Linked:1; /* Byte 7 Bit 3 */ + bool Sync:1; /* Byte 7 Bit 4 */ + bool WBus16:1; /* Byte 7 Bit 5 */ + bool WBus32:1; /* Byte 7 Bit 6 */ + bool RelAdr:1; /* Byte 7 Bit 7 */ + unsigned char VendorIdentification[8]; /* Bytes 8-15 */ + unsigned char ProductIdentification[16]; /* Bytes 16-31 */ + unsigned char ProductRevisionLevel[4]; /* Bytes 32-35 */ +} +DAC960_SCSI_Inquiry_T; + + +/* + Define the SCSI INQUIRY Unit Serial Number structure. +*/ + +typedef struct DAC960_SCSI_Inquiry_UnitSerialNumber +{ + unsigned char PeripheralDeviceType:5; /* Byte 0 Bits 0-4 */ + unsigned char PeripheralQualifier:3; /* Byte 0 Bits 5-7 */ + unsigned char PageCode; /* Byte 1 */ + unsigned char :8; /* Byte 2 */ + unsigned char PageLength; /* Byte 3 */ + unsigned char ProductSerialNumber[28]; /* Bytes 4-31 */ +} +DAC960_SCSI_Inquiry_UnitSerialNumber_T; + + +/* + Define the SCSI REQUEST SENSE Sense Key type. +*/ + +typedef enum +{ + DAC960_SenseKey_NoSense = 0x0, + DAC960_SenseKey_RecoveredError = 0x1, + DAC960_SenseKey_NotReady = 0x2, + DAC960_SenseKey_MediumError = 0x3, + DAC960_SenseKey_HardwareError = 0x4, + DAC960_SenseKey_IllegalRequest = 0x5, + DAC960_SenseKey_UnitAttention = 0x6, + DAC960_SenseKey_DataProtect = 0x7, + DAC960_SenseKey_BlankCheck = 0x8, + DAC960_SenseKey_VendorSpecific = 0x9, + DAC960_SenseKey_CopyAborted = 0xA, + DAC960_SenseKey_AbortedCommand = 0xB, + DAC960_SenseKey_Equal = 0xC, + DAC960_SenseKey_VolumeOverflow = 0xD, + DAC960_SenseKey_Miscompare = 0xE, + DAC960_SenseKey_Reserved = 0xF +} +__attribute__ ((packed)) +DAC960_SCSI_RequestSenseKey_T; + + +/* + Define the SCSI REQUEST SENSE structure. +*/ + +typedef struct DAC960_SCSI_RequestSense +{ + unsigned char ErrorCode:7; /* Byte 0 Bits 0-6 */ + bool Valid:1; /* Byte 0 Bit 7 */ + unsigned char SegmentNumber; /* Byte 1 */ + DAC960_SCSI_RequestSenseKey_T SenseKey:4; /* Byte 2 Bits 0-3 */ + unsigned char :1; /* Byte 2 Bit 4 */ + bool ILI:1; /* Byte 2 Bit 5 */ + bool EOM:1; /* Byte 2 Bit 6 */ + bool Filemark:1; /* Byte 2 Bit 7 */ + unsigned char Information[4]; /* Bytes 3-6 */ + unsigned char AdditionalSenseLength; /* Byte 7 */ + unsigned char CommandSpecificInformation[4]; /* Bytes 8-11 */ + unsigned char AdditionalSenseCode; /* Byte 12 */ + unsigned char AdditionalSenseCodeQualifier; /* Byte 13 */ +} +DAC960_SCSI_RequestSense_T; + + +/* + Define the DAC960 V1 Firmware Command Opcodes. +*/ + +typedef enum +{ + /* I/O Commands */ + DAC960_V1_ReadExtended = 0x33, + DAC960_V1_WriteExtended = 0x34, + DAC960_V1_ReadAheadExtended = 0x35, + DAC960_V1_ReadExtendedWithScatterGather = 0xB3, + DAC960_V1_WriteExtendedWithScatterGather = 0xB4, + DAC960_V1_Read = 0x36, + DAC960_V1_ReadWithScatterGather = 0xB6, + DAC960_V1_Write = 0x37, + DAC960_V1_WriteWithScatterGather = 0xB7, + DAC960_V1_DCDB = 0x04, + DAC960_V1_DCDBWithScatterGather = 0x84, + DAC960_V1_Flush = 0x0A, + /* Controller Status Related Commands */ + DAC960_V1_Enquiry = 0x53, + DAC960_V1_Enquiry2 = 0x1C, + DAC960_V1_GetLogicalDriveElement = 0x55, + DAC960_V1_GetLogicalDriveInformation = 0x19, + DAC960_V1_IOPortRead = 0x39, + DAC960_V1_IOPortWrite = 0x3A, + DAC960_V1_GetSDStats = 0x3E, + DAC960_V1_GetPDStats = 0x3F, + DAC960_V1_PerformEventLogOperation = 0x72, + /* Device Related Commands */ + DAC960_V1_StartDevice = 0x10, + DAC960_V1_GetDeviceState = 0x50, + DAC960_V1_StopChannel = 0x13, + DAC960_V1_StartChannel = 0x12, + DAC960_V1_ResetChannel = 0x1A, + /* Commands Associated with Data Consistency and Errors */ + DAC960_V1_Rebuild = 0x09, + DAC960_V1_RebuildAsync = 0x16, + DAC960_V1_CheckConsistency = 0x0F, + DAC960_V1_CheckConsistencyAsync = 0x1E, + DAC960_V1_RebuildStat = 0x0C, + DAC960_V1_GetRebuildProgress = 0x27, + DAC960_V1_RebuildControl = 0x1F, + DAC960_V1_ReadBadBlockTable = 0x0B, + DAC960_V1_ReadBadDataTable = 0x25, + DAC960_V1_ClearBadDataTable = 0x26, + DAC960_V1_GetErrorTable = 0x17, + DAC960_V1_AddCapacityAsync = 0x2A, + DAC960_V1_BackgroundInitializationControl = 0x2B, + /* Configuration Related Commands */ + DAC960_V1_ReadConfig2 = 0x3D, + DAC960_V1_WriteConfig2 = 0x3C, + DAC960_V1_ReadConfigurationOnDisk = 0x4A, + DAC960_V1_WriteConfigurationOnDisk = 0x4B, + DAC960_V1_ReadConfiguration = 0x4E, + DAC960_V1_ReadBackupConfiguration = 0x4D, + DAC960_V1_WriteConfiguration = 0x4F, + DAC960_V1_AddConfiguration = 0x4C, + DAC960_V1_ReadConfigurationLabel = 0x48, + DAC960_V1_WriteConfigurationLabel = 0x49, + /* Firmware Upgrade Related Commands */ + DAC960_V1_LoadImage = 0x20, + DAC960_V1_StoreImage = 0x21, + DAC960_V1_ProgramImage = 0x22, + /* Diagnostic Commands */ + DAC960_V1_SetDiagnosticMode = 0x31, + DAC960_V1_RunDiagnostic = 0x32, + /* Subsystem Service Commands */ + DAC960_V1_GetSubsystemData = 0x70, + DAC960_V1_SetSubsystemParameters = 0x71, + /* Version 2.xx Firmware Commands */ + DAC960_V1_Enquiry_Old = 0x05, + DAC960_V1_GetDeviceState_Old = 0x14, + DAC960_V1_Read_Old = 0x02, + DAC960_V1_Write_Old = 0x03, + DAC960_V1_ReadWithScatterGather_Old = 0x82, + DAC960_V1_WriteWithScatterGather_Old = 0x83 +} +__attribute__ ((packed)) +DAC960_V1_CommandOpcode_T; + + +/* + Define the DAC960 V1 Firmware Command Identifier type. +*/ + +typedef unsigned char DAC960_V1_CommandIdentifier_T; + + +/* + Define the DAC960 V1 Firmware Command Status Codes. +*/ + +#define DAC960_V1_NormalCompletion 0x0000 /* Common */ +#define DAC960_V1_CheckConditionReceived 0x0002 /* Common */ +#define DAC960_V1_NoDeviceAtAddress 0x0102 /* Common */ +#define DAC960_V1_InvalidDeviceAddress 0x0105 /* Common */ +#define DAC960_V1_InvalidParameter 0x0105 /* Common */ +#define DAC960_V1_IrrecoverableDataError 0x0001 /* I/O */ +#define DAC960_V1_LogicalDriveNonexistentOrOffline 0x0002 /* I/O */ +#define DAC960_V1_AccessBeyondEndOfLogicalDrive 0x0105 /* I/O */ +#define DAC960_V1_BadDataEncountered 0x010C /* I/O */ +#define DAC960_V1_DeviceBusy 0x0008 /* DCDB */ +#define DAC960_V1_DeviceNonresponsive 0x000E /* DCDB */ +#define DAC960_V1_CommandTerminatedAbnormally 0x000F /* DCDB */ +#define DAC960_V1_UnableToStartDevice 0x0002 /* Device */ +#define DAC960_V1_InvalidChannelOrTargetOrModifier 0x0105 /* Device */ +#define DAC960_V1_ChannelBusy 0x0106 /* Device */ +#define DAC960_V1_ChannelNotStopped 0x0002 /* Device */ +#define DAC960_V1_AttemptToRebuildOnlineDrive 0x0002 /* Consistency */ +#define DAC960_V1_RebuildBadBlocksEncountered 0x0003 /* Consistency */ +#define DAC960_V1_NewDiskFailedDuringRebuild 0x0004 /* Consistency */ +#define DAC960_V1_RebuildOrCheckAlreadyInProgress 0x0106 /* Consistency */ +#define DAC960_V1_DependentDiskIsDead 0x0002 /* Consistency */ +#define DAC960_V1_InconsistentBlocksFound 0x0003 /* Consistency */ +#define DAC960_V1_InvalidOrNonredundantLogicalDrive 0x0105 /* Consistency */ +#define DAC960_V1_NoRebuildOrCheckInProgress 0x0105 /* Consistency */ +#define DAC960_V1_RebuildInProgress_DataValid 0x0000 /* Consistency */ +#define DAC960_V1_RebuildFailed_LogicalDriveFailure 0x0002 /* Consistency */ +#define DAC960_V1_RebuildFailed_BadBlocksOnOther 0x0003 /* Consistency */ +#define DAC960_V1_RebuildFailed_NewDriveFailed 0x0004 /* Consistency */ +#define DAC960_V1_RebuildSuccessful 0x0100 /* Consistency */ +#define DAC960_V1_RebuildSuccessfullyTerminated 0x0107 /* Consistency */ +#define DAC960_V1_BackgroundInitSuccessful 0x0100 /* Consistency */ +#define DAC960_V1_BackgroundInitAborted 0x0005 /* Consistency */ +#define DAC960_V1_NoBackgroundInitInProgress 0x0105 /* Consistency */ +#define DAC960_V1_AddCapacityInProgress 0x0004 /* Consistency */ +#define DAC960_V1_AddCapacityFailedOrSuspended 0x00F4 /* Consistency */ +#define DAC960_V1_Config2ChecksumError 0x0002 /* Configuration */ +#define DAC960_V1_ConfigurationSuspended 0x0106 /* Configuration */ +#define DAC960_V1_FailedToConfigureNVRAM 0x0105 /* Configuration */ +#define DAC960_V1_ConfigurationNotSavedStateChange 0x0106 /* Configuration */ +#define DAC960_V1_SubsystemNotInstalled 0x0001 /* Subsystem */ +#define DAC960_V1_SubsystemFailed 0x0002 /* Subsystem */ +#define DAC960_V1_SubsystemBusy 0x0106 /* Subsystem */ + +typedef unsigned short DAC960_V1_CommandStatus_T; + + +/* + Define the DAC960 V1 Firmware Enquiry Command reply structure. +*/ + +typedef struct DAC960_V1_Enquiry +{ + unsigned char NumberOfLogicalDrives; /* Byte 0 */ + unsigned int :24; /* Bytes 1-3 */ + unsigned int LogicalDriveSizes[32]; /* Bytes 4-131 */ + unsigned short FlashAge; /* Bytes 132-133 */ + struct { + bool DeferredWriteError:1; /* Byte 134 Bit 0 */ + bool BatteryLow:1; /* Byte 134 Bit 1 */ + unsigned char :6; /* Byte 134 Bits 2-7 */ + } StatusFlags; + unsigned char :8; /* Byte 135 */ + unsigned char MinorFirmwareVersion; /* Byte 136 */ + unsigned char MajorFirmwareVersion; /* Byte 137 */ + enum { + DAC960_V1_NoStandbyRebuildOrCheckInProgress = 0x00, + DAC960_V1_StandbyRebuildInProgress = 0x01, + DAC960_V1_BackgroundRebuildInProgress = 0x02, + DAC960_V1_BackgroundCheckInProgress = 0x03, + DAC960_V1_StandbyRebuildCompletedWithError = 0xFF, + DAC960_V1_BackgroundRebuildOrCheckFailed_DriveFailed = 0xF0, + DAC960_V1_BackgroundRebuildOrCheckFailed_LogicalDriveFailed = 0xF1, + DAC960_V1_BackgroundRebuildOrCheckFailed_OtherCauses = 0xF2, + DAC960_V1_BackgroundRebuildOrCheckSuccessfullyTerminated = 0xF3 + } __attribute__ ((packed)) RebuildFlag; /* Byte 138 */ + unsigned char MaxCommands; /* Byte 139 */ + unsigned char OfflineLogicalDriveCount; /* Byte 140 */ + unsigned char :8; /* Byte 141 */ + unsigned short EventLogSequenceNumber; /* Bytes 142-143 */ + unsigned char CriticalLogicalDriveCount; /* Byte 144 */ + unsigned int :24; /* Bytes 145-147 */ + unsigned char DeadDriveCount; /* Byte 148 */ + unsigned char :8; /* Byte 149 */ + unsigned char RebuildCount; /* Byte 150 */ + struct { + unsigned char :3; /* Byte 151 Bits 0-2 */ + bool BatteryBackupUnitPresent:1; /* Byte 151 Bit 3 */ + unsigned char :3; /* Byte 151 Bits 4-6 */ + unsigned char :1; /* Byte 151 Bit 7 */ + } MiscFlags; + struct { + unsigned char TargetID; + unsigned char Channel; + } DeadDrives[21]; /* Bytes 152-194 */ + unsigned char Reserved[62]; /* Bytes 195-255 */ +} +__attribute__ ((packed)) +DAC960_V1_Enquiry_T; + + +/* + Define the DAC960 V1 Firmware Enquiry2 Command reply structure. +*/ + +typedef struct DAC960_V1_Enquiry2 +{ + struct { + enum { + DAC960_V1_P_PD_PU = 0x01, + DAC960_V1_PL = 0x02, + DAC960_V1_PG = 0x10, + DAC960_V1_PJ = 0x11, + DAC960_V1_PR = 0x12, + DAC960_V1_PT = 0x13, + DAC960_V1_PTL0 = 0x14, + DAC960_V1_PRL = 0x15, + DAC960_V1_PTL1 = 0x16, + DAC960_V1_1164P = 0x20 + } __attribute__ ((packed)) SubModel; /* Byte 0 */ + unsigned char ActualChannels; /* Byte 1 */ + enum { + DAC960_V1_FiveChannelBoard = 0x01, + DAC960_V1_ThreeChannelBoard = 0x02, + DAC960_V1_TwoChannelBoard = 0x03, + DAC960_V1_ThreeChannelASIC_DAC = 0x04 + } __attribute__ ((packed)) Model; /* Byte 2 */ + enum { + DAC960_V1_EISA_Controller = 0x01, + DAC960_V1_MicroChannel_Controller = 0x02, + DAC960_V1_PCI_Controller = 0x03, + DAC960_V1_SCSItoSCSI_Controller = 0x08 + } __attribute__ ((packed)) ProductFamily; /* Byte 3 */ + } HardwareID; /* Bytes 0-3 */ + /* MajorVersion.MinorVersion-FirmwareType-TurnID */ + struct { + unsigned char MajorVersion; /* Byte 4 */ + unsigned char MinorVersion; /* Byte 5 */ + unsigned char TurnID; /* Byte 6 */ + char FirmwareType; /* Byte 7 */ + } FirmwareID; /* Bytes 4-7 */ + unsigned char :8; /* Byte 8 */ + unsigned int :24; /* Bytes 9-11 */ + unsigned char ConfiguredChannels; /* Byte 12 */ + unsigned char ActualChannels; /* Byte 13 */ + unsigned char MaxTargets; /* Byte 14 */ + unsigned char MaxTags; /* Byte 15 */ + unsigned char MaxLogicalDrives; /* Byte 16 */ + unsigned char MaxArms; /* Byte 17 */ + unsigned char MaxSpans; /* Byte 18 */ + unsigned char :8; /* Byte 19 */ + unsigned int :32; /* Bytes 20-23 */ + unsigned int MemorySize; /* Bytes 24-27 */ + unsigned int CacheSize; /* Bytes 28-31 */ + unsigned int FlashMemorySize; /* Bytes 32-35 */ + unsigned int NonVolatileMemorySize; /* Bytes 36-39 */ + struct { + enum { + DAC960_V1_RamType_DRAM = 0x0, + DAC960_V1_RamType_EDO = 0x1, + DAC960_V1_RamType_SDRAM = 0x2, + DAC960_V1_RamType_Last = 0x7 + } __attribute__ ((packed)) RamType:3; /* Byte 40 Bits 0-2 */ + enum { + DAC960_V1_ErrorCorrection_None = 0x0, + DAC960_V1_ErrorCorrection_Parity = 0x1, + DAC960_V1_ErrorCorrection_ECC = 0x2, + DAC960_V1_ErrorCorrection_Last = 0x7 + } __attribute__ ((packed)) ErrorCorrection:3; /* Byte 40 Bits 3-5 */ + bool FastPageMode:1; /* Byte 40 Bit 6 */ + bool LowPowerMemory:1; /* Byte 40 Bit 7 */ + unsigned char :8; /* Bytes 41 */ + } MemoryType; + unsigned short ClockSpeed; /* Bytes 42-43 */ + unsigned short MemorySpeed; /* Bytes 44-45 */ + unsigned short HardwareSpeed; /* Bytes 46-47 */ + unsigned int :32; /* Bytes 48-51 */ + unsigned int :32; /* Bytes 52-55 */ + unsigned char :8; /* Byte 56 */ + unsigned char :8; /* Byte 57 */ + unsigned short :16; /* Bytes 58-59 */ + unsigned short MaxCommands; /* Bytes 60-61 */ + unsigned short MaxScatterGatherEntries; /* Bytes 62-63 */ + unsigned short MaxDriveCommands; /* Bytes 64-65 */ + unsigned short MaxIODescriptors; /* Bytes 66-67 */ + unsigned short MaxCombinedSectors; /* Bytes 68-69 */ + unsigned char Latency; /* Byte 70 */ + unsigned char :8; /* Byte 71 */ + unsigned char SCSITimeout; /* Byte 72 */ + unsigned char :8; /* Byte 73 */ + unsigned short MinFreeLines; /* Bytes 74-75 */ + unsigned int :32; /* Bytes 76-79 */ + unsigned int :32; /* Bytes 80-83 */ + unsigned char RebuildRateConstant; /* Byte 84 */ + unsigned char :8; /* Byte 85 */ + unsigned char :8; /* Byte 86 */ + unsigned char :8; /* Byte 87 */ + unsigned int :32; /* Bytes 88-91 */ + unsigned int :32; /* Bytes 92-95 */ + unsigned short PhysicalDriveBlockSize; /* Bytes 96-97 */ + unsigned short LogicalDriveBlockSize; /* Bytes 98-99 */ + unsigned short MaxBlocksPerCommand; /* Bytes 100-101 */ + unsigned short BlockFactor; /* Bytes 102-103 */ + unsigned short CacheLineSize; /* Bytes 104-105 */ + struct { + enum { + DAC960_V1_Narrow_8bit = 0x0, + DAC960_V1_Wide_16bit = 0x1, + DAC960_V1_Wide_32bit = 0x2 + } __attribute__ ((packed)) BusWidth:2; /* Byte 106 Bits 0-1 */ + enum { + DAC960_V1_Fast = 0x0, + DAC960_V1_Ultra = 0x1, + DAC960_V1_Ultra2 = 0x2 + } __attribute__ ((packed)) BusSpeed:2; /* Byte 106 Bits 2-3 */ + bool Differential:1; /* Byte 106 Bit 4 */ + unsigned char :3; /* Byte 106 Bits 5-7 */ + } SCSICapability; + unsigned char :8; /* Byte 107 */ + unsigned int :32; /* Bytes 108-111 */ + unsigned short FirmwareBuildNumber; /* Bytes 112-113 */ + enum { + DAC960_V1_AEMI = 0x01, + DAC960_V1_OEM1 = 0x02, + DAC960_V1_OEM2 = 0x04, + DAC960_V1_OEM3 = 0x08, + DAC960_V1_Conner = 0x10, + DAC960_V1_SAFTE = 0x20 + } __attribute__ ((packed)) FaultManagementType; /* Byte 114 */ + unsigned char :8; /* Byte 115 */ + struct { + bool Clustering:1; /* Byte 116 Bit 0 */ + bool MylexOnlineRAIDExpansion:1; /* Byte 116 Bit 1 */ + bool ReadAhead:1; /* Byte 116 Bit 2 */ + bool BackgroundInitialization:1; /* Byte 116 Bit 3 */ + unsigned int :28; /* Bytes 116-119 */ + } FirmwareFeatures; + unsigned int :32; /* Bytes 120-123 */ + unsigned int :32; /* Bytes 124-127 */ +} +DAC960_V1_Enquiry2_T; + + +/* + Define the DAC960 V1 Firmware Logical Drive State type. +*/ + +typedef enum +{ + DAC960_V1_LogicalDrive_Online = 0x03, + DAC960_V1_LogicalDrive_Critical = 0x04, + DAC960_V1_LogicalDrive_Offline = 0xFF +} +__attribute__ ((packed)) +DAC960_V1_LogicalDriveState_T; + + +/* + Define the DAC960 V1 Firmware Logical Drive Information structure. +*/ + +typedef struct DAC960_V1_LogicalDriveInformation +{ + unsigned int LogicalDriveSize; /* Bytes 0-3 */ + DAC960_V1_LogicalDriveState_T LogicalDriveState; /* Byte 4 */ + unsigned char RAIDLevel:7; /* Byte 5 Bits 0-6 */ + bool WriteBack:1; /* Byte 5 Bit 7 */ + unsigned short :16; /* Bytes 6-7 */ +} +DAC960_V1_LogicalDriveInformation_T; + + +/* + Define the DAC960 V1 Firmware Get Logical Drive Information Command + reply structure. +*/ + +typedef DAC960_V1_LogicalDriveInformation_T + DAC960_V1_LogicalDriveInformationArray_T[DAC960_MaxLogicalDrives]; + + +/* + Define the DAC960 V1 Firmware Perform Event Log Operation Types. +*/ + +typedef enum +{ + DAC960_V1_GetEventLogEntry = 0x00 +} +__attribute__ ((packed)) +DAC960_V1_PerformEventLogOpType_T; + + +/* + Define the DAC960 V1 Firmware Get Event Log Entry Command reply structure. +*/ + +typedef struct DAC960_V1_EventLogEntry +{ + unsigned char MessageType; /* Byte 0 */ + unsigned char MessageLength; /* Byte 1 */ + unsigned char TargetID:5; /* Byte 2 Bits 0-4 */ + unsigned char Channel:3; /* Byte 2 Bits 5-7 */ + unsigned char LogicalUnit:6; /* Byte 3 Bits 0-5 */ + unsigned char :2; /* Byte 3 Bits 6-7 */ + unsigned short SequenceNumber; /* Bytes 4-5 */ + unsigned char ErrorCode:7; /* Byte 6 Bits 0-6 */ + bool Valid:1; /* Byte 6 Bit 7 */ + unsigned char SegmentNumber; /* Byte 7 */ + DAC960_SCSI_RequestSenseKey_T SenseKey:4; /* Byte 8 Bits 0-3 */ + unsigned char :1; /* Byte 8 Bit 4 */ + bool ILI:1; /* Byte 8 Bit 5 */ + bool EOM:1; /* Byte 8 Bit 6 */ + bool Filemark:1; /* Byte 8 Bit 7 */ + unsigned char Information[4]; /* Bytes 9-12 */ + unsigned char AdditionalSenseLength; /* Byte 13 */ + unsigned char CommandSpecificInformation[4]; /* Bytes 14-17 */ + unsigned char AdditionalSenseCode; /* Byte 18 */ + unsigned char AdditionalSenseCodeQualifier; /* Byte 19 */ + unsigned char Dummy[12]; /* Bytes 20-31 */ +} +DAC960_V1_EventLogEntry_T; + + +/* + Define the DAC960 V1 Firmware Physical Device State type. +*/ + +typedef enum +{ + DAC960_V1_Device_Dead = 0x00, + DAC960_V1_Device_WriteOnly = 0x02, + DAC960_V1_Device_Online = 0x03, + DAC960_V1_Device_Standby = 0x10 +} +__attribute__ ((packed)) +DAC960_V1_PhysicalDeviceState_T; + + +/* + Define the DAC960 V1 Firmware Get Device State Command reply structure. + The structure is padded by 2 bytes for compatibility with Version 2.xx + Firmware. +*/ + +typedef struct DAC960_V1_DeviceState +{ + bool Present:1; /* Byte 0 Bit 0 */ + unsigned char :7; /* Byte 0 Bits 1-7 */ + enum { + DAC960_V1_OtherType = 0x0, + DAC960_V1_DiskType = 0x1, + DAC960_V1_SequentialType = 0x2, + DAC960_V1_CDROM_or_WORM_Type = 0x3 + } __attribute__ ((packed)) DeviceType:2; /* Byte 1 Bits 0-1 */ + bool :1; /* Byte 1 Bit 2 */ + bool Fast20:1; /* Byte 1 Bit 3 */ + bool Sync:1; /* Byte 1 Bit 4 */ + bool Fast:1; /* Byte 1 Bit 5 */ + bool Wide:1; /* Byte 1 Bit 6 */ + bool TaggedQueuingSupported:1; /* Byte 1 Bit 7 */ + DAC960_V1_PhysicalDeviceState_T DeviceState; /* Byte 2 */ + unsigned char :8; /* Byte 3 */ + unsigned char SynchronousMultiplier; /* Byte 4 */ + unsigned char SynchronousOffset:5; /* Byte 5 Bits 0-4 */ + unsigned char :3; /* Byte 5 Bits 5-7 */ + unsigned int DiskSize __attribute__ ((packed)); /* Bytes 6-9 */ + unsigned short :16; /* Bytes 10-11 */ +} +DAC960_V1_DeviceState_T; + + +/* + Define the DAC960 V1 Firmware Get Rebuild Progress Command reply structure. +*/ + +typedef struct DAC960_V1_RebuildProgress +{ + unsigned int LogicalDriveNumber; /* Bytes 0-3 */ + unsigned int LogicalDriveSize; /* Bytes 4-7 */ + unsigned int RemainingBlocks; /* Bytes 8-11 */ +} +DAC960_V1_RebuildProgress_T; + + +/* + Define the DAC960 V1 Firmware Background Initialization Status Command + reply structure. +*/ + +typedef struct DAC960_V1_BackgroundInitializationStatus +{ + unsigned int LogicalDriveSize; /* Bytes 0-3 */ + unsigned int BlocksCompleted; /* Bytes 4-7 */ + unsigned char Reserved1[12]; /* Bytes 8-19 */ + unsigned int LogicalDriveNumber; /* Bytes 20-23 */ + unsigned char RAIDLevel; /* Byte 24 */ + enum { + DAC960_V1_BackgroundInitializationInvalid = 0x00, + DAC960_V1_BackgroundInitializationStarted = 0x02, + DAC960_V1_BackgroundInitializationInProgress = 0x04, + DAC960_V1_BackgroundInitializationSuspended = 0x05, + DAC960_V1_BackgroundInitializationCancelled = 0x06 + } __attribute__ ((packed)) Status; /* Byte 25 */ + unsigned char Reserved2[6]; /* Bytes 26-31 */ +} +DAC960_V1_BackgroundInitializationStatus_T; + + +/* + Define the DAC960 V1 Firmware Error Table Entry structure. +*/ + +typedef struct DAC960_V1_ErrorTableEntry +{ + unsigned char ParityErrorCount; /* Byte 0 */ + unsigned char SoftErrorCount; /* Byte 1 */ + unsigned char HardErrorCount; /* Byte 2 */ + unsigned char MiscErrorCount; /* Byte 3 */ +} +DAC960_V1_ErrorTableEntry_T; + + +/* + Define the DAC960 V1 Firmware Get Error Table Command reply structure. +*/ + +typedef struct DAC960_V1_ErrorTable +{ + DAC960_V1_ErrorTableEntry_T + ErrorTableEntries[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets]; +} +DAC960_V1_ErrorTable_T; + + +/* + Define the DAC960 V1 Firmware Read Config2 Command reply structure. +*/ + +typedef struct DAC960_V1_Config2 +{ + unsigned char :1; /* Byte 0 Bit 0 */ + bool ActiveNegationEnabled:1; /* Byte 0 Bit 1 */ + unsigned char :5; /* Byte 0 Bits 2-6 */ + bool NoRescanIfResetReceivedDuringScan:1; /* Byte 0 Bit 7 */ + bool StorageWorksSupportEnabled:1; /* Byte 1 Bit 0 */ + bool HewlettPackardSupportEnabled:1; /* Byte 1 Bit 1 */ + bool NoDisconnectOnFirstCommand:1; /* Byte 1 Bit 2 */ + unsigned char :2; /* Byte 1 Bits 3-4 */ + bool AEMI_ARM:1; /* Byte 1 Bit 5 */ + bool AEMI_OFM:1; /* Byte 1 Bit 6 */ + unsigned char :1; /* Byte 1 Bit 7 */ + enum { + DAC960_V1_OEMID_Mylex = 0x00, + DAC960_V1_OEMID_IBM = 0x08, + DAC960_V1_OEMID_HP = 0x0A, + DAC960_V1_OEMID_DEC = 0x0C, + DAC960_V1_OEMID_Siemens = 0x10, + DAC960_V1_OEMID_Intel = 0x12 + } __attribute__ ((packed)) OEMID; /* Byte 2 */ + unsigned char OEMModelNumber; /* Byte 3 */ + unsigned char PhysicalSector; /* Byte 4 */ + unsigned char LogicalSector; /* Byte 5 */ + unsigned char BlockFactor; /* Byte 6 */ + bool ReadAheadEnabled:1; /* Byte 7 Bit 0 */ + bool LowBIOSDelay:1; /* Byte 7 Bit 1 */ + unsigned char :2; /* Byte 7 Bits 2-3 */ + bool ReassignRestrictedToOneSector:1; /* Byte 7 Bit 4 */ + unsigned char :1; /* Byte 7 Bit 5 */ + bool ForceUnitAccessDuringWriteRecovery:1; /* Byte 7 Bit 6 */ + bool EnableLeftSymmetricRAID5Algorithm:1; /* Byte 7 Bit 7 */ + unsigned char DefaultRebuildRate; /* Byte 8 */ + unsigned char :8; /* Byte 9 */ + unsigned char BlocksPerCacheLine; /* Byte 10 */ + unsigned char BlocksPerStripe; /* Byte 11 */ + struct { + enum { + DAC960_V1_Async = 0x0, + DAC960_V1_Sync_8MHz = 0x1, + DAC960_V1_Sync_5MHz = 0x2, + DAC960_V1_Sync_10or20MHz = 0x3 /* Byte 11 Bits 0-1 */ + } __attribute__ ((packed)) Speed:2; + bool Force8Bit:1; /* Byte 11 Bit 2 */ + bool DisableFast20:1; /* Byte 11 Bit 3 */ + unsigned char :3; /* Byte 11 Bits 4-6 */ + bool EnableTaggedQueuing:1; /* Byte 11 Bit 7 */ + } __attribute__ ((packed)) ChannelParameters[6]; /* Bytes 12-17 */ + unsigned char SCSIInitiatorID; /* Byte 18 */ + unsigned char :8; /* Byte 19 */ + enum { + DAC960_V1_StartupMode_ControllerSpinUp = 0x00, + DAC960_V1_StartupMode_PowerOnSpinUp = 0x01 + } __attribute__ ((packed)) StartupMode; /* Byte 20 */ + unsigned char SimultaneousDeviceSpinUpCount; /* Byte 21 */ + unsigned char SecondsDelayBetweenSpinUps; /* Byte 22 */ + unsigned char Reserved1[29]; /* Bytes 23-51 */ + bool BIOSDisabled:1; /* Byte 52 Bit 0 */ + bool CDROMBootEnabled:1; /* Byte 52 Bit 1 */ + unsigned char :3; /* Byte 52 Bits 2-4 */ + enum { + DAC960_V1_Geometry_128_32 = 0x0, + DAC960_V1_Geometry_255_63 = 0x1, + DAC960_V1_Geometry_Reserved1 = 0x2, + DAC960_V1_Geometry_Reserved2 = 0x3 + } __attribute__ ((packed)) DriveGeometry:2; /* Byte 52 Bits 5-6 */ + unsigned char :1; /* Byte 52 Bit 7 */ + unsigned char Reserved2[9]; /* Bytes 53-61 */ + unsigned short Checksum; /* Bytes 62-63 */ +} +DAC960_V1_Config2_T; + + +/* + Define the DAC960 V1 Firmware DCDB request structure. +*/ + +typedef struct DAC960_V1_DCDB +{ + unsigned char TargetID:4; /* Byte 0 Bits 0-3 */ + unsigned char Channel:4; /* Byte 0 Bits 4-7 */ + enum { + DAC960_V1_DCDB_NoDataTransfer = 0, + DAC960_V1_DCDB_DataTransferDeviceToSystem = 1, + DAC960_V1_DCDB_DataTransferSystemToDevice = 2, + DAC960_V1_DCDB_IllegalDataTransfer = 3 + } __attribute__ ((packed)) Direction:2; /* Byte 1 Bits 0-1 */ + bool EarlyStatus:1; /* Byte 1 Bit 2 */ + unsigned char :1; /* Byte 1 Bit 3 */ + enum { + DAC960_V1_DCDB_Timeout_24_hours = 0, + DAC960_V1_DCDB_Timeout_10_seconds = 1, + DAC960_V1_DCDB_Timeout_60_seconds = 2, + DAC960_V1_DCDB_Timeout_10_minutes = 3 + } __attribute__ ((packed)) Timeout:2; /* Byte 1 Bits 4-5 */ + bool NoAutomaticRequestSense:1; /* Byte 1 Bit 6 */ + bool DisconnectPermitted:1; /* Byte 1 Bit 7 */ + unsigned short TransferLength; /* Bytes 2-3 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 4-7 */ + unsigned char CDBLength:4; /* Byte 8 Bits 0-3 */ + unsigned char TransferLengthHigh4:4; /* Byte 8 Bits 4-7 */ + unsigned char SenseLength; /* Byte 9 */ + unsigned char CDB[12]; /* Bytes 10-21 */ + unsigned char SenseData[64]; /* Bytes 22-85 */ + unsigned char Status; /* Byte 86 */ + unsigned char :8; /* Byte 87 */ +} +DAC960_V1_DCDB_T; + + +/* + Define the DAC960 V1 Firmware Scatter/Gather List Type 1 32 Bit Address + 32 Bit Byte Count structure. +*/ + +typedef struct DAC960_V1_ScatterGatherSegment +{ + DAC960_BusAddress32_T SegmentDataPointer; /* Bytes 0-3 */ + DAC960_ByteCount32_T SegmentByteCount; /* Bytes 4-7 */ +} +DAC960_V1_ScatterGatherSegment_T; + + +/* + Define the 13 Byte DAC960 V1 Firmware Command Mailbox structure. Bytes 13-15 + are not used. The Command Mailbox structure is padded to 16 bytes for + efficient access. +*/ + +typedef union DAC960_V1_CommandMailbox +{ + unsigned int Words[4]; /* Words 0-3 */ + unsigned char Bytes[16]; /* Bytes 0-15 */ + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char Dummy[14]; /* Bytes 2-15 */ + } __attribute__ ((packed)) Common; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char Dummy1[6]; /* Bytes 2-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char Dummy2[4]; /* Bytes 12-15 */ + } __attribute__ ((packed)) Type3; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char CommandOpcode2; /* Byte 2 */ + unsigned char Dummy1[5]; /* Bytes 3-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char Dummy2[4]; /* Bytes 12-15 */ + } __attribute__ ((packed)) Type3B; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char Dummy1[5]; /* Bytes 2-6 */ + unsigned char LogicalDriveNumber:6; /* Byte 7 Bits 0-6 */ + bool AutoRestore:1; /* Byte 7 Bit 7 */ + unsigned char Dummy2[8]; /* Bytes 8-15 */ + } __attribute__ ((packed)) Type3C; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char Channel; /* Byte 2 */ + unsigned char TargetID; /* Byte 3 */ + DAC960_V1_PhysicalDeviceState_T DeviceState:5; /* Byte 4 Bits 0-4 */ + unsigned char Modifier:3; /* Byte 4 Bits 5-7 */ + unsigned char Dummy1[3]; /* Bytes 5-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char Dummy2[4]; /* Bytes 12-15 */ + } __attribute__ ((packed)) Type3D; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + DAC960_V1_PerformEventLogOpType_T OperationType; /* Byte 2 */ + unsigned char OperationQualifier; /* Byte 3 */ + unsigned short SequenceNumber; /* Bytes 4-5 */ + unsigned char Dummy1[2]; /* Bytes 6-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char Dummy2[4]; /* Bytes 12-15 */ + } __attribute__ ((packed)) Type3E; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char Dummy1[2]; /* Bytes 2-3 */ + unsigned char RebuildRateConstant; /* Byte 4 */ + unsigned char Dummy2[3]; /* Bytes 5-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char Dummy3[4]; /* Bytes 12-15 */ + } __attribute__ ((packed)) Type3R; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned short TransferLength; /* Bytes 2-3 */ + unsigned int LogicalBlockAddress; /* Bytes 4-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char LogicalDriveNumber; /* Byte 12 */ + unsigned char Dummy[3]; /* Bytes 13-15 */ + } __attribute__ ((packed)) Type4; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + struct { + unsigned short TransferLength:11; /* Bytes 2-3 */ + unsigned char LogicalDriveNumber:5; /* Byte 3 Bits 3-7 */ + } __attribute__ ((packed)) LD; + unsigned int LogicalBlockAddress; /* Bytes 4-7 */ + DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */ + unsigned char ScatterGatherCount:6; /* Byte 12 Bits 0-5 */ + enum { + DAC960_V1_ScatterGather_32BitAddress_32BitByteCount = 0x0, + DAC960_V1_ScatterGather_32BitAddress_16BitByteCount = 0x1, + DAC960_V1_ScatterGather_32BitByteCount_32BitAddress = 0x2, + DAC960_V1_ScatterGather_16BitByteCount_32BitAddress = 0x3 + } __attribute__ ((packed)) ScatterGatherType:2; /* Byte 12 Bits 6-7 */ + unsigned char Dummy[3]; /* Bytes 13-15 */ + } __attribute__ ((packed)) Type5; + struct { + DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */ + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */ + unsigned char CommandOpcode2; /* Byte 2 */ + unsigned char :8; /* Byte 3 */ + DAC960_BusAddress32_T CommandMailboxesBusAddress; /* Bytes 4-7 */ + DAC960_BusAddress32_T StatusMailboxesBusAddress; /* Bytes 8-11 */ + unsigned char Dummy[4]; /* Bytes 12-15 */ + } __attribute__ ((packed)) TypeX; +} +DAC960_V1_CommandMailbox_T; + + +/* + Define the DAC960 V2 Firmware Command Opcodes. +*/ + +typedef enum +{ + DAC960_V2_MemCopy = 0x01, + DAC960_V2_SCSI_10_Passthru = 0x02, + DAC960_V2_SCSI_255_Passthru = 0x03, + DAC960_V2_SCSI_10 = 0x04, + DAC960_V2_SCSI_256 = 0x05, + DAC960_V2_IOCTL = 0x20 +} +__attribute__ ((packed)) +DAC960_V2_CommandOpcode_T; + + +/* + Define the DAC960 V2 Firmware IOCTL Opcodes. +*/ + +typedef enum +{ + DAC960_V2_GetControllerInfo = 0x01, + DAC960_V2_GetLogicalDeviceInfoValid = 0x03, + DAC960_V2_GetPhysicalDeviceInfoValid = 0x05, + DAC960_V2_GetHealthStatus = 0x11, + DAC960_V2_GetEvent = 0x15, + DAC960_V2_StartDiscovery = 0x81, + DAC960_V2_SetDeviceState = 0x82, + DAC960_V2_RebuildDeviceStart = 0x88, + DAC960_V2_RebuildDeviceStop = 0x89, + DAC960_V2_ConsistencyCheckStart = 0x8C, + DAC960_V2_ConsistencyCheckStop = 0x8D, + DAC960_V2_SetMemoryMailbox = 0x8E, + DAC960_V2_PauseDevice = 0x92, + DAC960_V2_TranslatePhysicalToLogicalDevice = 0xC5 +} +__attribute__ ((packed)) +DAC960_V2_IOCTL_Opcode_T; + + +/* + Define the DAC960 V2 Firmware Command Identifier type. +*/ + +typedef unsigned short DAC960_V2_CommandIdentifier_T; + + +/* + Define the DAC960 V2 Firmware Command Status Codes. +*/ + +#define DAC960_V2_NormalCompletion 0x00 +#define DAC960_V2_AbormalCompletion 0x02 +#define DAC960_V2_DeviceBusy 0x08 +#define DAC960_V2_DeviceNonresponsive 0x0E +#define DAC960_V2_DeviceNonresponsive2 0x0F +#define DAC960_V2_DeviceRevervationConflict 0x18 + +typedef unsigned char DAC960_V2_CommandStatus_T; + + +/* + Define the DAC960 V2 Firmware Memory Type structure. +*/ + +typedef struct DAC960_V2_MemoryType +{ + enum { + DAC960_V2_MemoryType_Reserved = 0x00, + DAC960_V2_MemoryType_DRAM = 0x01, + DAC960_V2_MemoryType_EDRAM = 0x02, + DAC960_V2_MemoryType_EDO = 0x03, + DAC960_V2_MemoryType_SDRAM = 0x04, + DAC960_V2_MemoryType_Last = 0x1F + } __attribute__ ((packed)) MemoryType:5; /* Byte 0 Bits 0-4 */ + bool :1; /* Byte 0 Bit 5 */ + bool MemoryParity:1; /* Byte 0 Bit 6 */ + bool MemoryECC:1; /* Byte 0 Bit 7 */ +} +DAC960_V2_MemoryType_T; + + +/* + Define the DAC960 V2 Firmware Processor Type structure. +*/ + +typedef enum +{ + DAC960_V2_ProcessorType_i960CA = 0x01, + DAC960_V2_ProcessorType_i960RD = 0x02, + DAC960_V2_ProcessorType_i960RN = 0x03, + DAC960_V2_ProcessorType_i960RP = 0x04, + DAC960_V2_ProcessorType_NorthBay = 0x05, + DAC960_V2_ProcessorType_StrongArm = 0x06, + DAC960_V2_ProcessorType_i960RM = 0x07 +} +__attribute__ ((packed)) +DAC960_V2_ProcessorType_T; + + +/* + Define the DAC960 V2 Firmware Get Controller Info reply structure. +*/ + +typedef struct DAC960_V2_ControllerInfo +{ + unsigned char :8; /* Byte 0 */ + enum { + DAC960_V2_SCSI_Bus = 0x00, + DAC960_V2_Fibre_Bus = 0x01, + DAC960_V2_PCI_Bus = 0x03 + } __attribute__ ((packed)) BusInterfaceType; /* Byte 1 */ + enum { + DAC960_V2_DAC960E = 0x01, + DAC960_V2_DAC960M = 0x08, + DAC960_V2_DAC960PD = 0x10, + DAC960_V2_DAC960PL = 0x11, + DAC960_V2_DAC960PU = 0x12, + DAC960_V2_DAC960PE = 0x13, + DAC960_V2_DAC960PG = 0x14, + DAC960_V2_DAC960PJ = 0x15, + DAC960_V2_DAC960PTL0 = 0x16, + DAC960_V2_DAC960PR = 0x17, + DAC960_V2_DAC960PRL = 0x18, + DAC960_V2_DAC960PT = 0x19, + DAC960_V2_DAC1164P = 0x1A, + DAC960_V2_DAC960PTL1 = 0x1B, + DAC960_V2_EXR2000P = 0x1C, + DAC960_V2_EXR3000P = 0x1D, + DAC960_V2_AcceleRAID352 = 0x1E, + DAC960_V2_AcceleRAID170 = 0x1F, + DAC960_V2_AcceleRAID160 = 0x20, + DAC960_V2_DAC960S = 0x60, + DAC960_V2_DAC960SU = 0x61, + DAC960_V2_DAC960SX = 0x62, + DAC960_V2_DAC960SF = 0x63, + DAC960_V2_DAC960SS = 0x64, + DAC960_V2_DAC960FL = 0x65, + DAC960_V2_DAC960LL = 0x66, + DAC960_V2_DAC960FF = 0x67, + DAC960_V2_DAC960HP = 0x68, + DAC960_V2_RAIDBRICK = 0x69, + DAC960_V2_METEOR_FL = 0x6A, + DAC960_V2_METEOR_FF = 0x6B + } __attribute__ ((packed)) ControllerType; /* Byte 2 */ + unsigned char :8; /* Byte 3 */ + unsigned short BusInterfaceSpeedMHz; /* Bytes 4-5 */ + unsigned char BusWidthBits; /* Byte 6 */ + unsigned char FlashCodeTypeOrProductID; /* Byte 7 */ + unsigned char NumberOfHostPortsPresent; /* Byte 8 */ + unsigned char Reserved1[7]; /* Bytes 9-15 */ + unsigned char BusInterfaceName[16]; /* Bytes 16-31 */ + unsigned char ControllerName[16]; /* Bytes 32-47 */ + unsigned char Reserved2[16]; /* Bytes 48-63 */ + /* Firmware Release Information */ + unsigned char FirmwareMajorVersion; /* Byte 64 */ + unsigned char FirmwareMinorVersion; /* Byte 65 */ + unsigned char FirmwareTurnNumber; /* Byte 66 */ + unsigned char FirmwareBuildNumber; /* Byte 67 */ + unsigned char FirmwareReleaseDay; /* Byte 68 */ + unsigned char FirmwareReleaseMonth; /* Byte 69 */ + unsigned char FirmwareReleaseYearHigh2Digits; /* Byte 70 */ + unsigned char FirmwareReleaseYearLow2Digits; /* Byte 71 */ + /* Hardware Release Information */ + unsigned char HardwareRevision; /* Byte 72 */ + unsigned int :24; /* Bytes 73-75 */ + unsigned char HardwareReleaseDay; /* Byte 76 */ + unsigned char HardwareReleaseMonth; /* Byte 77 */ + unsigned char HardwareReleaseYearHigh2Digits; /* Byte 78 */ + unsigned char HardwareReleaseYearLow2Digits; /* Byte 79 */ + /* Hardware Manufacturing Information */ + unsigned char ManufacturingBatchNumber; /* Byte 80 */ + unsigned char :8; /* Byte 81 */ + unsigned char ManufacturingPlantNumber; /* Byte 82 */ + unsigned char :8; /* Byte 83 */ + unsigned char HardwareManufacturingDay; /* Byte 84 */ + unsigned char HardwareManufacturingMonth; /* Byte 85 */ + unsigned char HardwareManufacturingYearHigh2Digits; /* Byte 86 */ + unsigned char HardwareManufacturingYearLow2Digits; /* Byte 87 */ + unsigned char MaximumNumberOfPDDperXLD; /* Byte 88 */ + unsigned char MaximumNumberOfILDperXLD; /* Byte 89 */ + unsigned short NonvolatileMemorySizeKB; /* Bytes 90-91 */ + unsigned char MaximumNumberOfXLD; /* Byte 92 */ + unsigned int :24; /* Bytes 93-95 */ + /* Unique Information per Controller */ + unsigned char ControllerSerialNumber[16]; /* Bytes 96-111 */ + unsigned char Reserved3[16]; /* Bytes 112-127 */ + /* Vendor Information */ + unsigned int :24; /* Bytes 128-130 */ + unsigned char OEM_Code; /* Byte 131 */ + unsigned char VendorName[16]; /* Bytes 132-147 */ + /* Other Physical/Controller/Operation Information */ + bool BBU_Present:1; /* Byte 148 Bit 0 */ + bool ActiveActiveClusteringMode:1; /* Byte 148 Bit 1 */ + unsigned char :6; /* Byte 148 Bits 2-7 */ + unsigned char :8; /* Byte 149 */ + unsigned short :16; /* Bytes 150-151 */ + /* Physical Device Scan Information */ + bool PhysicalScanActive:1; /* Byte 152 Bit 0 */ + unsigned char :7; /* Byte 152 Bits 1-7 */ + unsigned char PhysicalDeviceChannelNumber; /* Byte 153 */ + unsigned char PhysicalDeviceTargetID; /* Byte 154 */ + unsigned char PhysicalDeviceLogicalUnit; /* Byte 155 */ + /* Maximum Command Data Transfer Sizes */ + unsigned short MaximumDataTransferSizeInBlocks; /* Bytes 156-157 */ + unsigned short MaximumScatterGatherEntries; /* Bytes 158-159 */ + /* Logical/Physical Device Counts */ + unsigned short LogicalDevicesPresent; /* Bytes 160-161 */ + unsigned short LogicalDevicesCritical; /* Bytes 162-163 */ + unsigned short LogicalDevicesOffline; /* Bytes 164-165 */ + unsigned short PhysicalDevicesPresent; /* Bytes 166-167 */ + unsigned short PhysicalDisksPresent; /* Bytes 168-169 */ + unsigned short PhysicalDisksCritical; /* Bytes 170-171 */ + unsigned short PhysicalDisksOffline; /* Bytes 172-173 */ + unsigned short MaximumParallelCommands; /* Bytes 174-175 */ + /* Channel and Target ID Information */ + unsigned char NumberOfPhysicalChannelsPresent; /* Byte 176 */ + unsigned char NumberOfVirtualChannelsPresent; /* Byte 177 */ + unsigned char NumberOfPhysicalChannelsPossible; /* Byte 178 */ + unsigned char NumberOfVirtualChannelsPossible; /* Byte 179 */ + unsigned char MaximumTargetsPerChannel[16]; /* Bytes 180-195 */ + unsigned char Reserved4[12]; /* Bytes 196-207 */ + /* Memory/Cache Information */ + unsigned short MemorySizeMB; /* Bytes 208-209 */ + unsigned short CacheSizeMB; /* Bytes 210-211 */ + unsigned int ValidCacheSizeInBytes; /* Bytes 212-215 */ + unsigned int DirtyCacheSizeInBytes; /* Bytes 216-219 */ + unsigned short MemorySpeedMHz; /* Bytes 220-221 */ + unsigned char MemoryDataWidthBits; /* Byte 222 */ + DAC960_V2_MemoryType_T MemoryType; /* Byte 223 */ + unsigned char CacheMemoryTypeName[16]; /* Bytes 224-239 */ + /* Execution Memory Information */ + unsigned short ExecutionMemorySizeMB; /* Bytes 240-241 */ + unsigned short ExecutionL2CacheSizeMB; /* Bytes 242-243 */ + unsigned char Reserved5[8]; /* Bytes 244-251 */ + unsigned short ExecutionMemorySpeedMHz; /* Bytes 252-253 */ + unsigned char ExecutionMemoryDataWidthBits; /* Byte 254 */ + DAC960_V2_MemoryType_T ExecutionMemoryType; /* Byte 255 */ + unsigned char ExecutionMemoryTypeName[16]; /* Bytes 256-271 */ + /* First CPU Type Information */ + unsigned short FirstProcessorSpeedMHz; /* Bytes 272-273 */ + DAC960_V2_ProcessorType_T FirstProcessorType; /* Byte 274 */ + unsigned char FirstProcessorCount; /* Byte 275 */ + unsigned char Reserved6[12]; /* Bytes 276-287 */ + unsigned char FirstProcessorName[16]; /* Bytes 288-303 */ + /* Second CPU Type Information */ + unsigned short SecondProcessorSpeedMHz; /* Bytes 304-305 */ + DAC960_V2_ProcessorType_T SecondProcessorType; /* Byte 306 */ + unsigned char SecondProcessorCount; /* Byte 307 */ + unsigned char Reserved7[12]; /* Bytes 308-319 */ + unsigned char SecondProcessorName[16]; /* Bytes 320-335 */ + /* Debugging/Profiling/Command Time Tracing Information */ + unsigned short CurrentProfilingDataPageNumber; /* Bytes 336-337 */ + unsigned short ProgramsAwaitingProfilingData; /* Bytes 338-339 */ + unsigned short CurrentCommandTimeTraceDataPageNumber; /* Bytes 340-341 */ + unsigned short ProgramsAwaitingCommandTimeTraceData; /* Bytes 342-343 */ + unsigned char Reserved8[8]; /* Bytes 344-351 */ + /* Error Counters on Physical Devices */ + unsigned short PhysicalDeviceBusResets; /* Bytes 352-353 */ + unsigned short PhysicalDeviceParityErrors; /* Bytes 355-355 */ + unsigned short PhysicalDeviceSoftErrors; /* Bytes 356-357 */ + unsigned short PhysicalDeviceCommandsFailed; /* Bytes 358-359 */ + unsigned short PhysicalDeviceMiscellaneousErrors; /* Bytes 360-361 */ + unsigned short PhysicalDeviceCommandTimeouts; /* Bytes 362-363 */ + unsigned short PhysicalDeviceSelectionTimeouts; /* Bytes 364-365 */ + unsigned short PhysicalDeviceRetriesDone; /* Bytes 366-367 */ + unsigned short PhysicalDeviceAbortsDone; /* Bytes 368-369 */ + unsigned short PhysicalDeviceHostCommandAbortsDone; /* Bytes 370-371 */ + unsigned short PhysicalDevicePredictedFailuresDetected; /* Bytes 372-373 */ + unsigned short PhysicalDeviceHostCommandsFailed; /* Bytes 374-375 */ + unsigned short PhysicalDeviceHardErrors; /* Bytes 376-377 */ + unsigned char Reserved9[6]; /* Bytes 378-383 */ + /* Error Counters on Logical Devices */ + unsigned short LogicalDeviceSoftErrors; /* Bytes 384-385 */ + unsigned short LogicalDeviceCommandsFailed; /* Bytes 386-387 */ + unsigned short LogicalDeviceHostCommandAbortsDone; /* Bytes 388-389 */ + unsigned short :16; /* Bytes 390-391 */ + /* Error Counters on Controller */ + unsigned short ControllerMemoryErrors; /* Bytes 392-393 */ + unsigned short ControllerHostCommandAbortsDone; /* Bytes 394-395 */ + unsigned int :32; /* Bytes 396-399 */ + /* Long Duration Activity Information */ + unsigned short BackgroundInitializationsActive; /* Bytes 400-401 */ + unsigned short LogicalDeviceInitializationsActive; /* Bytes 402-403 */ + unsigned short PhysicalDeviceInitializationsActive; /* Bytes 404-405 */ + unsigned short ConsistencyChecksActive; /* Bytes 406-407 */ + unsigned short RebuildsActive; /* Bytes 408-409 */ + unsigned short OnlineExpansionsActive; /* Bytes 410-411 */ + unsigned short PatrolActivitiesActive; /* Bytes 412-413 */ + unsigned short :16; /* Bytes 414-415 */ + /* Flash ROM Information */ + unsigned char FlashType; /* Byte 416 */ + unsigned char :8; /* Byte 417 */ + unsigned short FlashSizeMB; /* Bytes 418-419 */ + unsigned int FlashLimit; /* Bytes 420-423 */ + unsigned int FlashCount; /* Bytes 424-427 */ + unsigned int :32; /* Bytes 428-431 */ + unsigned char FlashTypeName[16]; /* Bytes 432-447 */ + /* Firmware Run Time Information */ + unsigned char RebuildRate; /* Byte 448 */ + unsigned char BackgroundInitializationRate; /* Byte 449 */ + unsigned char ForegroundInitializationRate; /* Byte 450 */ + unsigned char ConsistencyCheckRate; /* Byte 451 */ + unsigned int :32; /* Bytes 452-455 */ + unsigned int MaximumDP; /* Bytes 456-459 */ + unsigned int FreeDP; /* Bytes 460-463 */ + unsigned int MaximumIOP; /* Bytes 464-467 */ + unsigned int FreeIOP; /* Bytes 468-471 */ + unsigned short MaximumCombLengthInBlocks; /* Bytes 472-473 */ + unsigned short NumberOfConfigurationGroups; /* Bytes 474-475 */ + bool InstallationAbortStatus:1; /* Byte 476 Bit 0 */ + bool MaintenanceModeStatus:1; /* Byte 476 Bit 1 */ + unsigned int :24; /* Bytes 476-479 */ + unsigned char Reserved10[32]; /* Bytes 480-511 */ + unsigned char Reserved11[512]; /* Bytes 512-1023 */ +} +DAC960_V2_ControllerInfo_T; + + +/* + Define the DAC960 V2 Firmware Logical Device State type. +*/ + +typedef enum +{ + DAC960_V2_LogicalDevice_Online = 0x01, + DAC960_V2_LogicalDevice_Offline = 0x08, + DAC960_V2_LogicalDevice_Critical = 0x09 +} +__attribute__ ((packed)) +DAC960_V2_LogicalDeviceState_T; + + +/* + Define the DAC960 V2 Firmware Get Logical Device Info reply structure. +*/ + +typedef struct DAC960_V2_LogicalDeviceInfo +{ + unsigned char :8; /* Byte 0 */ + unsigned char Channel; /* Byte 1 */ + unsigned char TargetID; /* Byte 2 */ + unsigned char LogicalUnit; /* Byte 3 */ + DAC960_V2_LogicalDeviceState_T LogicalDeviceState; /* Byte 4 */ + unsigned char RAIDLevel; /* Byte 5 */ + unsigned char StripeSize; /* Byte 6 */ + unsigned char CacheLineSize; /* Byte 7 */ + struct { + enum { + DAC960_V2_ReadCacheDisabled = 0x0, + DAC960_V2_ReadCacheEnabled = 0x1, + DAC960_V2_ReadAheadEnabled = 0x2, + DAC960_V2_IntelligentReadAheadEnabled = 0x3, + DAC960_V2_ReadCache_Last = 0x7 + } __attribute__ ((packed)) ReadCache:3; /* Byte 8 Bits 0-2 */ + enum { + DAC960_V2_WriteCacheDisabled = 0x0, + DAC960_V2_LogicalDeviceReadOnly = 0x1, + DAC960_V2_WriteCacheEnabled = 0x2, + DAC960_V2_IntelligentWriteCacheEnabled = 0x3, + DAC960_V2_WriteCache_Last = 0x7 + } __attribute__ ((packed)) WriteCache:3; /* Byte 8 Bits 3-5 */ + bool :1; /* Byte 8 Bit 6 */ + bool LogicalDeviceInitialized:1; /* Byte 8 Bit 7 */ + } LogicalDeviceControl; /* Byte 8 */ + /* Logical Device Operations Status */ + bool ConsistencyCheckInProgress:1; /* Byte 9 Bit 0 */ + bool RebuildInProgress:1; /* Byte 9 Bit 1 */ + bool BackgroundInitializationInProgress:1; /* Byte 9 Bit 2 */ + bool ForegroundInitializationInProgress:1; /* Byte 9 Bit 3 */ + bool DataMigrationInProgress:1; /* Byte 9 Bit 4 */ + bool PatrolOperationInProgress:1; /* Byte 9 Bit 5 */ + unsigned char :2; /* Byte 9 Bits 6-7 */ + unsigned char RAID5WriteUpdate; /* Byte 10 */ + unsigned char RAID5Algorithm; /* Byte 11 */ + unsigned short LogicalDeviceNumber; /* Bytes 12-13 */ + /* BIOS Info */ + bool BIOSDisabled:1; /* Byte 14 Bit 0 */ + bool CDROMBootEnabled:1; /* Byte 14 Bit 1 */ + bool DriveCoercionEnabled:1; /* Byte 14 Bit 2 */ + bool WriteSameDisabled:1; /* Byte 14 Bit 3 */ + bool HBA_ModeEnabled:1; /* Byte 14 Bit 4 */ + enum { + DAC960_V2_Geometry_128_32 = 0x0, + DAC960_V2_Geometry_255_63 = 0x1, + DAC960_V2_Geometry_Reserved1 = 0x2, + DAC960_V2_Geometry_Reserved2 = 0x3 + } __attribute__ ((packed)) DriveGeometry:2; /* Byte 14 Bits 5-6 */ + bool SuperReadAheadEnabled:1; /* Byte 14 Bit 7 */ + unsigned char :8; /* Byte 15 */ + /* Error Counters */ + unsigned short SoftErrors; /* Bytes 16-17 */ + unsigned short CommandsFailed; /* Bytes 18-19 */ + unsigned short HostCommandAbortsDone; /* Bytes 20-21 */ + unsigned short DeferredWriteErrors; /* Bytes 22-23 */ + unsigned int :32; /* Bytes 24-27 */ + unsigned int :32; /* Bytes 28-31 */ + /* Device Size Information */ + unsigned short :16; /* Bytes 32-33 */ + unsigned short DeviceBlockSizeInBytes; /* Bytes 34-35 */ + unsigned int OriginalDeviceSize; /* Bytes 36-39 */ + unsigned int ConfigurableDeviceSize; /* Bytes 40-43 */ + unsigned int :32; /* Bytes 44-47 */ + unsigned char LogicalDeviceName[32]; /* Bytes 48-79 */ + unsigned char SCSI_InquiryData[36]; /* Bytes 80-115 */ + unsigned char Reserved1[12]; /* Bytes 116-127 */ + DAC960_ByteCount64_T LastReadBlockNumber; /* Bytes 128-135 */ + DAC960_ByteCount64_T LastWrittenBlockNumber; /* Bytes 136-143 */ + DAC960_ByteCount64_T ConsistencyCheckBlockNumber; /* Bytes 144-151 */ + DAC960_ByteCount64_T RebuildBlockNumber; /* Bytes 152-159 */ + DAC960_ByteCount64_T BackgroundInitializationBlockNumber; /* Bytes 160-167 */ + DAC960_ByteCount64_T ForegroundInitializationBlockNumber; /* Bytes 168-175 */ + DAC960_ByteCount64_T DataMigrationBlockNumber; /* Bytes 176-183 */ + DAC960_ByteCount64_T PatrolOperationBlockNumber; /* Bytes 184-191 */ + unsigned char Reserved2[64]; /* Bytes 192-255 */ +} +DAC960_V2_LogicalDeviceInfo_T; + + +/* + Define the DAC960 V2 Firmware Physical Device State type. +*/ + +typedef enum +{ + DAC960_V2_Device_Unconfigured = 0x00, + DAC960_V2_Device_Online = 0x01, + DAC960_V2_Device_Rebuild = 0x03, + DAC960_V2_Device_Missing = 0x04, + DAC960_V2_Device_Critical = 0x05, + DAC960_V2_Device_Dead = 0x08, + DAC960_V2_Device_SuspectedDead = 0x0C, + DAC960_V2_Device_CommandedOffline = 0x10, + DAC960_V2_Device_Standby = 0x21, + DAC960_V2_Device_InvalidState = 0xFF +} +__attribute__ ((packed)) +DAC960_V2_PhysicalDeviceState_T; + + +/* + Define the DAC960 V2 Firmware Get Physical Device Info reply structure. +*/ + +typedef struct DAC960_V2_PhysicalDeviceInfo +{ + unsigned char :8; /* Byte 0 */ + unsigned char Channel; /* Byte 1 */ + unsigned char TargetID; /* Byte 2 */ + unsigned char LogicalUnit; /* Byte 3 */ + /* Configuration Status Bits */ + bool PhysicalDeviceFaultTolerant:1; /* Byte 4 Bit 0 */ + bool PhysicalDeviceConnected:1; /* Byte 4 Bit 1 */ + bool PhysicalDeviceLocalToController:1; /* Byte 4 Bit 2 */ + unsigned char :5; /* Byte 4 Bits 3-7 */ + /* Multiple Host/Controller Status Bits */ + bool RemoteHostSystemDead:1; /* Byte 5 Bit 0 */ + bool RemoteControllerDead:1; /* Byte 5 Bit 1 */ + unsigned char :6; /* Byte 5 Bits 2-7 */ + DAC960_V2_PhysicalDeviceState_T PhysicalDeviceState; /* Byte 6 */ + unsigned char NegotiatedDataWidthBits; /* Byte 7 */ + unsigned short NegotiatedSynchronousMegaTransfers; /* Bytes 8-9 */ + /* Multiported Physical Device Information */ + unsigned char NumberOfPortConnections; /* Byte 10 */ + unsigned char DriveAccessibilityBitmap; /* Byte 11 */ + unsigned int :32; /* Bytes 12-15 */ + unsigned char NetworkAddress[16]; /* Bytes 16-31 */ + unsigned short MaximumTags; /* Bytes 32-33 */ + /* Physical Device Operations Status */ + bool ConsistencyCheckInProgress:1; /* Byte 34 Bit 0 */ + bool RebuildInProgress:1; /* Byte 34 Bit 1 */ + bool MakingDataConsistentInProgress:1; /* Byte 34 Bit 2 */ + bool PhysicalDeviceInitializationInProgress:1; /* Byte 34 Bit 3 */ + bool DataMigrationInProgress:1; /* Byte 34 Bit 4 */ + bool PatrolOperationInProgress:1; /* Byte 34 Bit 5 */ + unsigned char :2; /* Byte 34 Bits 6-7 */ + unsigned char LongOperationStatus; /* Byte 35 */ + unsigned char ParityErrors; /* Byte 36 */ + unsigned char SoftErrors; /* Byte 37 */ + unsigned char HardErrors; /* Byte 38 */ + unsigned char MiscellaneousErrors; /* Byte 39 */ + unsigned char CommandTimeouts; /* Byte 40 */ + unsigned char Retries; /* Byte 41 */ + unsigned char Aborts; /* Byte 42 */ + unsigned char PredictedFailuresDetected; /* Byte 43 */ + unsigned int :32; /* Bytes 44-47 */ + unsigned short :16; /* Bytes 48-49 */ + unsigned short DeviceBlockSizeInBytes; /* Bytes 50-51 */ + unsigned int OriginalDeviceSize; /* Bytes 52-55 */ + unsigned int ConfigurableDeviceSize; /* Bytes 56-59 */ + unsigned int :32; /* Bytes 60-63 */ + unsigned char PhysicalDeviceName[16]; /* Bytes 64-79 */ + unsigned char Reserved1[16]; /* Bytes 80-95 */ + unsigned char Reserved2[32]; /* Bytes 96-127 */ + unsigned char SCSI_InquiryData[36]; /* Bytes 128-163 */ + unsigned char Reserved3[20]; /* Bytes 164-183 */ + unsigned char Reserved4[8]; /* Bytes 184-191 */ + DAC960_ByteCount64_T LastReadBlockNumber; /* Bytes 192-199 */ + DAC960_ByteCount64_T LastWrittenBlockNumber; /* Bytes 200-207 */ + DAC960_ByteCount64_T ConsistencyCheckBlockNumber; /* Bytes 208-215 */ + DAC960_ByteCount64_T RebuildBlockNumber; /* Bytes 216-223 */ + DAC960_ByteCount64_T MakingDataConsistentBlockNumber; /* Bytes 224-231 */ + DAC960_ByteCount64_T DeviceInitializationBlockNumber; /* Bytes 232-239 */ + DAC960_ByteCount64_T DataMigrationBlockNumber; /* Bytes 240-247 */ + DAC960_ByteCount64_T PatrolOperationBlockNumber; /* Bytes 248-255 */ + unsigned char Reserved5[256]; /* Bytes 256-511 */ +} +DAC960_V2_PhysicalDeviceInfo_T; + + +/* + Define the DAC960 V2 Firmware Health Status Buffer structure. +*/ + +typedef struct DAC960_V2_HealthStatusBuffer +{ + unsigned int MicrosecondsFromControllerStartTime; /* Bytes 0-3 */ + unsigned int MillisecondsFromControllerStartTime; /* Bytes 4-7 */ + unsigned int SecondsFrom1January1970; /* Bytes 8-11 */ + unsigned int :32; /* Bytes 12-15 */ + unsigned int StatusChangeCounter; /* Bytes 16-19 */ + unsigned int :32; /* Bytes 20-23 */ + unsigned int DebugOutputMessageBufferIndex; /* Bytes 24-27 */ + unsigned int CodedMessageBufferIndex; /* Bytes 28-31 */ + unsigned int CurrentTimeTracePageNumber; /* Bytes 32-35 */ + unsigned int CurrentProfilerPageNumber; /* Bytes 36-39 */ + unsigned int NextEventSequenceNumber; /* Bytes 40-43 */ + unsigned int :32; /* Bytes 44-47 */ + unsigned char Reserved1[16]; /* Bytes 48-63 */ + unsigned char Reserved2[64]; /* Bytes 64-127 */ +} +DAC960_V2_HealthStatusBuffer_T; + + +/* + Define the DAC960 V2 Firmware Get Event reply structure. +*/ + +typedef struct DAC960_V2_Event +{ + unsigned int EventSequenceNumber; /* Bytes 0-3 */ + unsigned int EventTime; /* Bytes 4-7 */ + unsigned int EventCode; /* Bytes 8-11 */ + unsigned char :8; /* Byte 12 */ + unsigned char Channel; /* Byte 13 */ + unsigned char TargetID; /* Byte 14 */ + unsigned char LogicalUnit; /* Byte 15 */ + unsigned int :32; /* Bytes 16-19 */ + unsigned int EventSpecificParameter; /* Bytes 20-23 */ + unsigned char RequestSenseData[40]; /* Bytes 24-63 */ +} +DAC960_V2_Event_T; + + +/* + Define the DAC960 V2 Firmware Command Control Bits structure. +*/ + +typedef struct DAC960_V2_CommandControlBits +{ + bool ForceUnitAccess:1; /* Byte 0 Bit 0 */ + bool DisablePageOut:1; /* Byte 0 Bit 1 */ + bool :1; /* Byte 0 Bit 2 */ + bool AdditionalScatterGatherListMemory:1; /* Byte 0 Bit 3 */ + bool DataTransferControllerToHost:1; /* Byte 0 Bit 4 */ + bool :1; /* Byte 0 Bit 5 */ + bool NoAutoRequestSense:1; /* Byte 0 Bit 6 */ + bool DisconnectProhibited:1; /* Byte 0 Bit 7 */ +} +DAC960_V2_CommandControlBits_T; + + +/* + Define the DAC960 V2 Firmware Command Timeout structure. +*/ + +typedef struct DAC960_V2_CommandTimeout +{ + unsigned char TimeoutValue:6; /* Byte 0 Bits 0-5 */ + enum { + DAC960_V2_TimeoutScale_Seconds = 0, + DAC960_V2_TimeoutScale_Minutes = 1, + DAC960_V2_TimeoutScale_Hours = 2, + DAC960_V2_TimeoutScale_Reserved = 3 + } __attribute__ ((packed)) TimeoutScale:2; /* Byte 0 Bits 6-7 */ +} +DAC960_V2_CommandTimeout_T; + + +/* + Define the DAC960 V2 Firmware Physical Device structure. +*/ + +typedef struct DAC960_V2_PhysicalDevice +{ + unsigned char LogicalUnit; /* Byte 0 */ + unsigned char TargetID; /* Byte 1 */ + unsigned char Channel:3; /* Byte 2 Bits 0-2 */ + unsigned char Controller:5; /* Byte 2 Bits 3-7 */ +} +__attribute__ ((packed)) +DAC960_V2_PhysicalDevice_T; + + +/* + Define the DAC960 V2 Firmware Logical Device structure. +*/ + +typedef struct DAC960_V2_LogicalDevice +{ + unsigned short LogicalDeviceNumber; /* Bytes 0-1 */ + unsigned char :3; /* Byte 2 Bits 0-2 */ + unsigned char Controller:5; /* Byte 2 Bits 3-7 */ +} +__attribute__ ((packed)) +DAC960_V2_LogicalDevice_T; + + +/* + Define the DAC960 V2 Firmware Operation Device type. +*/ + +typedef enum +{ + DAC960_V2_Physical_Device = 0x00, + DAC960_V2_RAID_Device = 0x01, + DAC960_V2_Physical_Channel = 0x02, + DAC960_V2_RAID_Channel = 0x03, + DAC960_V2_Physical_Controller = 0x04, + DAC960_V2_RAID_Controller = 0x05, + DAC960_V2_Configuration_Group = 0x10, + DAC960_V2_Enclosure = 0x11 +} +__attribute__ ((packed)) +DAC960_V2_OperationDevice_T; + + +/* + Define the DAC960 V2 Firmware Translate Physical To Logical Device structure. +*/ + +typedef struct DAC960_V2_PhysicalToLogicalDevice +{ + unsigned short LogicalDeviceNumber; /* Bytes 0-1 */ + unsigned short :16; /* Bytes 2-3 */ + unsigned char PreviousBootController; /* Byte 4 */ + unsigned char PreviousBootChannel; /* Byte 5 */ + unsigned char PreviousBootTargetID; /* Byte 6 */ + unsigned char PreviousBootLogicalUnit; /* Byte 7 */ +} +DAC960_V2_PhysicalToLogicalDevice_T; + + + +/* + Define the DAC960 V2 Firmware Scatter/Gather List Entry structure. +*/ + +typedef struct DAC960_V2_ScatterGatherSegment +{ + DAC960_BusAddress64_T SegmentDataPointer; /* Bytes 0-7 */ + DAC960_ByteCount64_T SegmentByteCount; /* Bytes 8-15 */ +} +DAC960_V2_ScatterGatherSegment_T; + + +/* + Define the DAC960 V2 Firmware Data Transfer Memory Address structure. +*/ + +typedef union DAC960_V2_DataTransferMemoryAddress +{ + DAC960_V2_ScatterGatherSegment_T ScatterGatherSegments[2]; /* Bytes 0-31 */ + struct { + unsigned short ScatterGatherList0Length; /* Bytes 0-1 */ + unsigned short ScatterGatherList1Length; /* Bytes 2-3 */ + unsigned short ScatterGatherList2Length; /* Bytes 4-5 */ + unsigned short :16; /* Bytes 6-7 */ + DAC960_BusAddress64_T ScatterGatherList0Address; /* Bytes 8-15 */ + DAC960_BusAddress64_T ScatterGatherList1Address; /* Bytes 16-23 */ + DAC960_BusAddress64_T ScatterGatherList2Address; /* Bytes 24-31 */ + } ExtendedScatterGather; +} +DAC960_V2_DataTransferMemoryAddress_T; + + +/* + Define the 64 Byte DAC960 V2 Firmware Command Mailbox structure. +*/ + +typedef union DAC960_V2_CommandMailbox +{ + unsigned int Words[16]; /* Words 0-15 */ + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + unsigned int :24; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + unsigned char Reserved[10]; /* Bytes 22-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } Common; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize; /* Bytes 4-7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char CDBLength; /* Byte 21 */ + unsigned char SCSI_CDB[10]; /* Bytes 22-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } SCSI_10; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize; /* Bytes 4-7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char CDBLength; /* Byte 21 */ + unsigned short :16; /* Bytes 22-23 */ + DAC960_BusAddress64_T SCSI_CDB_BusAddress; /* Bytes 24-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } SCSI_255; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + unsigned short :16; /* Bytes 16-17 */ + unsigned char ControllerNumber; /* Byte 18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + unsigned char Reserved[10]; /* Bytes 22-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } ControllerInfo; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_LogicalDevice_T LogicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + unsigned char Reserved[10]; /* Bytes 22-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } LogicalDeviceInfo; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + unsigned char Reserved[10]; /* Bytes 22-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } PhysicalDeviceInfo; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + unsigned short EventSequenceNumberHigh16; /* Bytes 16-17 */ + unsigned char ControllerNumber; /* Byte 18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + unsigned short EventSequenceNumberLow16; /* Bytes 22-23 */ + unsigned char Reserved[8]; /* Bytes 24-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } GetEvent; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_LogicalDevice_T LogicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + union { + DAC960_V2_LogicalDeviceState_T LogicalDeviceState; + DAC960_V2_PhysicalDeviceState_T PhysicalDeviceState; + } DeviceState; /* Byte 22 */ + unsigned char Reserved[9]; /* Bytes 23-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } SetDeviceState; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_LogicalDevice_T LogicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + bool RestoreConsistency:1; /* Byte 22 Bit 0 */ + bool InitializedAreaOnly:1; /* Byte 22 Bit 1 */ + unsigned char :6; /* Byte 22 Bits 2-7 */ + unsigned char Reserved[9]; /* Bytes 23-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } ConsistencyCheck; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + unsigned char FirstCommandMailboxSizeKB; /* Byte 4 */ + unsigned char FirstStatusMailboxSizeKB; /* Byte 5 */ + unsigned char SecondCommandMailboxSizeKB; /* Byte 6 */ + unsigned char SecondStatusMailboxSizeKB; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + unsigned int :24; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + unsigned char HealthStatusBufferSizeKB; /* Byte 22 */ + unsigned char :8; /* Byte 23 */ + DAC960_BusAddress64_T HealthStatusBufferBusAddress; /* Bytes 24-31 */ + DAC960_BusAddress64_T FirstCommandMailboxBusAddress; /* Bytes 32-39 */ + DAC960_BusAddress64_T FirstStatusMailboxBusAddress; /* Bytes 40-47 */ + DAC960_BusAddress64_T SecondCommandMailboxBusAddress; /* Bytes 48-55 */ + DAC960_BusAddress64_T SecondStatusMailboxBusAddress; /* Bytes 56-63 */ + } SetMemoryMailbox; + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */ + DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */ + DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */ + unsigned char DataTransferPageNumber; /* Byte 7 */ + DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */ + DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */ + DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */ + unsigned char RequestSenseSize; /* Byte 20 */ + unsigned char IOCTL_Opcode; /* Byte 21 */ + DAC960_V2_OperationDevice_T OperationDevice; /* Byte 22 */ + unsigned char Reserved[9]; /* Bytes 23-31 */ + DAC960_V2_DataTransferMemoryAddress_T + DataTransferMemoryAddress; /* Bytes 32-63 */ + } DeviceOperation; +} +DAC960_V2_CommandMailbox_T; + + +/* + Define the DAC960 Driver IOCTL requests. +*/ + +#define DAC960_IOCTL_GET_CONTROLLER_COUNT 0xDAC001 +#define DAC960_IOCTL_GET_CONTROLLER_INFO 0xDAC002 +#define DAC960_IOCTL_V1_EXECUTE_COMMAND 0xDAC003 +#define DAC960_IOCTL_V2_EXECUTE_COMMAND 0xDAC004 +#define DAC960_IOCTL_V2_GET_HEALTH_STATUS 0xDAC005 + + +/* + Define the DAC960_IOCTL_GET_CONTROLLER_INFO reply structure. +*/ + +typedef struct DAC960_ControllerInfo +{ + unsigned char ControllerNumber; + unsigned char FirmwareType; + unsigned char Channels; + unsigned char Targets; + unsigned char PCI_Bus; + unsigned char PCI_Device; + unsigned char PCI_Function; + unsigned char IRQ_Channel; + DAC960_PCI_Address_T PCI_Address; + unsigned char ModelName[20]; + unsigned char FirmwareVersion[12]; +} +DAC960_ControllerInfo_T; + + +/* + Define the User Mode DAC960_IOCTL_V1_EXECUTE_COMMAND request structure. +*/ + +typedef struct DAC960_V1_UserCommand +{ + unsigned char ControllerNumber; + DAC960_V1_CommandMailbox_T CommandMailbox; + int DataTransferLength; + void __user *DataTransferBuffer; + DAC960_V1_DCDB_T __user *DCDB; +} +DAC960_V1_UserCommand_T; + + +/* + Define the Kernel Mode DAC960_IOCTL_V1_EXECUTE_COMMAND request structure. +*/ + +typedef struct DAC960_V1_KernelCommand +{ + unsigned char ControllerNumber; + DAC960_V1_CommandMailbox_T CommandMailbox; + int DataTransferLength; + void *DataTransferBuffer; + DAC960_V1_DCDB_T *DCDB; + DAC960_V1_CommandStatus_T CommandStatus; + void (*CompletionFunction)(struct DAC960_V1_KernelCommand *); + void *CompletionData; +} +DAC960_V1_KernelCommand_T; + + +/* + Define the User Mode DAC960_IOCTL_V2_EXECUTE_COMMAND request structure. +*/ + +typedef struct DAC960_V2_UserCommand +{ + unsigned char ControllerNumber; + DAC960_V2_CommandMailbox_T CommandMailbox; + int DataTransferLength; + int RequestSenseLength; + void __user *DataTransferBuffer; + void __user *RequestSenseBuffer; +} +DAC960_V2_UserCommand_T; + + +/* + Define the Kernel Mode DAC960_IOCTL_V2_EXECUTE_COMMAND request structure. +*/ + +typedef struct DAC960_V2_KernelCommand +{ + unsigned char ControllerNumber; + DAC960_V2_CommandMailbox_T CommandMailbox; + int DataTransferLength; + int RequestSenseLength; + void *DataTransferBuffer; + void *RequestSenseBuffer; + DAC960_V2_CommandStatus_T CommandStatus; + void (*CompletionFunction)(struct DAC960_V2_KernelCommand *); + void *CompletionData; +} +DAC960_V2_KernelCommand_T; + + +/* + Define the User Mode DAC960_IOCTL_V2_GET_HEALTH_STATUS request structure. +*/ + +typedef struct DAC960_V2_GetHealthStatus +{ + unsigned char ControllerNumber; + DAC960_V2_HealthStatusBuffer_T __user *HealthStatusBuffer; +} +DAC960_V2_GetHealthStatus_T; + + +/* + Import the Kernel Mode IOCTL interface. +*/ + +extern int DAC960_KernelIOCTL(unsigned int Request, void *Argument); + + +/* + DAC960_DriverVersion protects the private portion of this file. +*/ + +#ifdef DAC960_DriverVersion + + +/* + Define the maximum Driver Queue Depth and Controller Queue Depth supported + by DAC960 V1 and V2 Firmware Controllers. +*/ + +#define DAC960_MaxDriverQueueDepth 511 +#define DAC960_MaxControllerQueueDepth 512 + + +/* + Define the maximum number of Scatter/Gather Segments supported for any + DAC960 V1 and V2 Firmware controller. +*/ + +#define DAC960_V1_ScatterGatherLimit 33 +#define DAC960_V2_ScatterGatherLimit 128 + + +/* + Define the number of Command Mailboxes and Status Mailboxes used by the + DAC960 V1 and V2 Firmware Memory Mailbox Interface. +*/ + +#define DAC960_V1_CommandMailboxCount 256 +#define DAC960_V1_StatusMailboxCount 1024 +#define DAC960_V2_CommandMailboxCount 512 +#define DAC960_V2_StatusMailboxCount 512 + + +/* + Define the DAC960 Controller Monitoring Timer Interval. +*/ + +#define DAC960_MonitoringTimerInterval (10 * HZ) + + +/* + Define the DAC960 Controller Secondary Monitoring Interval. +*/ + +#define DAC960_SecondaryMonitoringInterval (60 * HZ) + + +/* + Define the DAC960 Controller Health Status Monitoring Interval. +*/ + +#define DAC960_HealthStatusMonitoringInterval (1 * HZ) + + +/* + Define the DAC960 Controller Progress Reporting Interval. +*/ + +#define DAC960_ProgressReportingInterval (60 * HZ) + + +/* + Define the maximum number of Partitions allowed for each Logical Drive. +*/ + +#define DAC960_MaxPartitions 8 +#define DAC960_MaxPartitionsBits 3 + +/* + Define the DAC960 Controller fixed Block Size and Block Size Bits. +*/ + +#define DAC960_BlockSize 512 +#define DAC960_BlockSizeBits 9 + + +/* + Define the number of Command structures that should be allocated as a + group to optimize kernel memory allocation. +*/ + +#define DAC960_V1_CommandAllocationGroupSize 11 +#define DAC960_V2_CommandAllocationGroupSize 29 + + +/* + Define the Controller Line Buffer, Progress Buffer, User Message, and + Initial Status Buffer sizes. +*/ + +#define DAC960_LineBufferSize 100 +#define DAC960_ProgressBufferSize 200 +#define DAC960_UserMessageSize 200 +#define DAC960_InitialStatusBufferSize (8192-32) + + +/* + Define the DAC960 Controller Firmware Types. +*/ + +typedef enum +{ + DAC960_V1_Controller = 1, + DAC960_V2_Controller = 2 +} +DAC960_FirmwareType_T; + + +/* + Define the DAC960 Controller Hardware Types. +*/ + +typedef enum +{ + DAC960_BA_Controller = 1, /* eXtremeRAID 2000 */ + DAC960_LP_Controller = 2, /* AcceleRAID 352 */ + DAC960_LA_Controller = 3, /* DAC1164P */ + DAC960_PG_Controller = 4, /* DAC960PTL/PJ/PG */ + DAC960_PD_Controller = 5, /* DAC960PU/PD/PL/P */ + DAC960_P_Controller = 6, /* DAC960PU/PD/PL/P */ + DAC960_GEM_Controller = 7, /* AcceleRAID 4/5/600 */ +} +DAC960_HardwareType_T; + + +/* + Define the Driver Message Levels. +*/ + +typedef enum DAC960_MessageLevel +{ + DAC960_AnnounceLevel = 0, + DAC960_InfoLevel = 1, + DAC960_NoticeLevel = 2, + DAC960_WarningLevel = 3, + DAC960_ErrorLevel = 4, + DAC960_ProgressLevel = 5, + DAC960_CriticalLevel = 6, + DAC960_UserCriticalLevel = 7 +} +DAC960_MessageLevel_T; + +static char + *DAC960_MessageLevelMap[] = + { KERN_NOTICE, KERN_NOTICE, KERN_NOTICE, KERN_WARNING, + KERN_ERR, KERN_CRIT, KERN_CRIT, KERN_CRIT }; + + +/* + Define Driver Message macros. +*/ + +#define DAC960_Announce(Format, Arguments...) \ + DAC960_Message(DAC960_AnnounceLevel, Format, ##Arguments) + +#define DAC960_Info(Format, Arguments...) \ + DAC960_Message(DAC960_InfoLevel, Format, ##Arguments) + +#define DAC960_Notice(Format, Arguments...) \ + DAC960_Message(DAC960_NoticeLevel, Format, ##Arguments) + +#define DAC960_Warning(Format, Arguments...) \ + DAC960_Message(DAC960_WarningLevel, Format, ##Arguments) + +#define DAC960_Error(Format, Arguments...) \ + DAC960_Message(DAC960_ErrorLevel, Format, ##Arguments) + +#define DAC960_Progress(Format, Arguments...) \ + DAC960_Message(DAC960_ProgressLevel, Format, ##Arguments) + +#define DAC960_Critical(Format, Arguments...) \ + DAC960_Message(DAC960_CriticalLevel, Format, ##Arguments) + +#define DAC960_UserCritical(Format, Arguments...) \ + DAC960_Message(DAC960_UserCriticalLevel, Format, ##Arguments) + + +struct DAC960_privdata { + DAC960_HardwareType_T HardwareType; + DAC960_FirmwareType_T FirmwareType; + irq_handler_t InterruptHandler; + unsigned int MemoryWindowSize; +}; + + +/* + Define the DAC960 V1 Firmware Controller Status Mailbox structure. +*/ + +typedef union DAC960_V1_StatusMailbox +{ + unsigned int Word; /* Word 0 */ + struct { + DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 0 */ + unsigned char :7; /* Byte 1 Bits 0-6 */ + bool Valid:1; /* Byte 1 Bit 7 */ + DAC960_V1_CommandStatus_T CommandStatus; /* Bytes 2-3 */ + } Fields; +} +DAC960_V1_StatusMailbox_T; + + +/* + Define the DAC960 V2 Firmware Controller Status Mailbox structure. +*/ + +typedef union DAC960_V2_StatusMailbox +{ + unsigned int Words[2]; /* Words 0-1 */ + struct { + DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */ + DAC960_V2_CommandStatus_T CommandStatus; /* Byte 2 */ + unsigned char RequestSenseLength; /* Byte 3 */ + int DataTransferResidue; /* Bytes 4-7 */ + } Fields; +} +DAC960_V2_StatusMailbox_T; + + +/* + Define the DAC960 Driver Command Types. +*/ + +typedef enum +{ + DAC960_ReadCommand = 1, + DAC960_WriteCommand = 2, + DAC960_ReadRetryCommand = 3, + DAC960_WriteRetryCommand = 4, + DAC960_MonitoringCommand = 5, + DAC960_ImmediateCommand = 6, + DAC960_QueuedCommand = 7 +} +DAC960_CommandType_T; + + +/* + Define the DAC960 Driver Command structure. +*/ + +typedef struct DAC960_Command +{ + int CommandIdentifier; + DAC960_CommandType_T CommandType; + struct DAC960_Controller *Controller; + struct DAC960_Command *Next; + struct completion *Completion; + unsigned int LogicalDriveNumber; + unsigned int BlockNumber; + unsigned int BlockCount; + unsigned int SegmentCount; + int DmaDirection; + struct scatterlist *cmd_sglist; + struct request *Request; + union { + struct { + DAC960_V1_CommandMailbox_T CommandMailbox; + DAC960_V1_KernelCommand_T *KernelCommand; + DAC960_V1_CommandStatus_T CommandStatus; + DAC960_V1_ScatterGatherSegment_T *ScatterGatherList; + dma_addr_t ScatterGatherListDMA; + struct scatterlist ScatterList[DAC960_V1_ScatterGatherLimit]; + unsigned int EndMarker[0]; + } V1; + struct { + DAC960_V2_CommandMailbox_T CommandMailbox; + DAC960_V2_KernelCommand_T *KernelCommand; + DAC960_V2_CommandStatus_T CommandStatus; + unsigned char RequestSenseLength; + int DataTransferResidue; + DAC960_V2_ScatterGatherSegment_T *ScatterGatherList; + dma_addr_t ScatterGatherListDMA; + DAC960_SCSI_RequestSense_T *RequestSense; + dma_addr_t RequestSenseDMA; + struct scatterlist ScatterList[DAC960_V2_ScatterGatherLimit]; + unsigned int EndMarker[0]; + } V2; + } FW; +} +DAC960_Command_T; + + +/* + Define the DAC960 Driver Controller structure. +*/ + +typedef struct DAC960_Controller +{ + void __iomem *BaseAddress; + void __iomem *MemoryMappedAddress; + DAC960_FirmwareType_T FirmwareType; + DAC960_HardwareType_T HardwareType; + DAC960_IO_Address_T IO_Address; + DAC960_PCI_Address_T PCI_Address; + struct pci_dev *PCIDevice; + unsigned char ControllerNumber; + unsigned char ControllerName[4]; + unsigned char ModelName[20]; + unsigned char FullModelName[28]; + unsigned char FirmwareVersion[12]; + unsigned char Bus; + unsigned char Device; + unsigned char Function; + unsigned char IRQ_Channel; + unsigned char Channels; + unsigned char Targets; + unsigned char MemorySize; + unsigned char LogicalDriveCount; + unsigned short CommandAllocationGroupSize; + unsigned short ControllerQueueDepth; + unsigned short DriverQueueDepth; + unsigned short MaxBlocksPerCommand; + unsigned short ControllerScatterGatherLimit; + unsigned short DriverScatterGatherLimit; + u64 BounceBufferLimit; + unsigned int CombinedStatusBufferLength; + unsigned int InitialStatusLength; + unsigned int CurrentStatusLength; + unsigned int ProgressBufferLength; + unsigned int UserStatusLength; + struct dma_loaf DmaPages; + unsigned long MonitoringTimerCount; + unsigned long PrimaryMonitoringTime; + unsigned long SecondaryMonitoringTime; + unsigned long ShutdownMonitoringTimer; + unsigned long LastProgressReportTime; + unsigned long LastCurrentStatusTime; + bool ControllerInitialized; + bool MonitoringCommandDeferred; + bool EphemeralProgressMessage; + bool DriveSpinUpMessageDisplayed; + bool MonitoringAlertMode; + bool SuppressEnclosureMessages; + struct timer_list MonitoringTimer; + struct gendisk *disks[DAC960_MaxLogicalDrives]; + struct pci_pool *ScatterGatherPool; + DAC960_Command_T *FreeCommands; + unsigned char *CombinedStatusBuffer; + unsigned char *CurrentStatusBuffer; + struct request_queue *RequestQueue[DAC960_MaxLogicalDrives]; + int req_q_index; + spinlock_t queue_lock; + wait_queue_head_t CommandWaitQueue; + wait_queue_head_t HealthStatusWaitQueue; + DAC960_Command_T InitialCommand; + DAC960_Command_T *Commands[DAC960_MaxDriverQueueDepth]; + struct proc_dir_entry *ControllerProcEntry; + bool LogicalDriveInitiallyAccessible[DAC960_MaxLogicalDrives]; + void (*QueueCommand)(DAC960_Command_T *Command); + bool (*ReadControllerConfiguration)(struct DAC960_Controller *); + bool (*ReadDeviceConfiguration)(struct DAC960_Controller *); + bool (*ReportDeviceConfiguration)(struct DAC960_Controller *); + void (*QueueReadWriteCommand)(DAC960_Command_T *Command); + union { + struct { + unsigned char GeometryTranslationHeads; + unsigned char GeometryTranslationSectors; + unsigned char PendingRebuildFlag; + unsigned short StripeSize; + unsigned short SegmentSize; + unsigned short NewEventLogSequenceNumber; + unsigned short OldEventLogSequenceNumber; + unsigned short DeviceStateChannel; + unsigned short DeviceStateTargetID; + bool DualModeMemoryMailboxInterface; + bool BackgroundInitializationStatusSupported; + bool SAFTE_EnclosureManagementEnabled; + bool NeedLogicalDriveInformation; + bool NeedErrorTableInformation; + bool NeedDeviceStateInformation; + bool NeedDeviceInquiryInformation; + bool NeedDeviceSerialNumberInformation; + bool NeedRebuildProgress; + bool NeedConsistencyCheckProgress; + bool NeedBackgroundInitializationStatus; + bool StartDeviceStateScan; + bool RebuildProgressFirst; + bool RebuildFlagPending; + bool RebuildStatusPending; + + dma_addr_t FirstCommandMailboxDMA; + DAC960_V1_CommandMailbox_T *FirstCommandMailbox; + DAC960_V1_CommandMailbox_T *LastCommandMailbox; + DAC960_V1_CommandMailbox_T *NextCommandMailbox; + DAC960_V1_CommandMailbox_T *PreviousCommandMailbox1; + DAC960_V1_CommandMailbox_T *PreviousCommandMailbox2; + + dma_addr_t FirstStatusMailboxDMA; + DAC960_V1_StatusMailbox_T *FirstStatusMailbox; + DAC960_V1_StatusMailbox_T *LastStatusMailbox; + DAC960_V1_StatusMailbox_T *NextStatusMailbox; + + DAC960_V1_DCDB_T *MonitoringDCDB; + dma_addr_t MonitoringDCDB_DMA; + + DAC960_V1_Enquiry_T Enquiry; + DAC960_V1_Enquiry_T *NewEnquiry; + dma_addr_t NewEnquiryDMA; + + DAC960_V1_ErrorTable_T ErrorTable; + DAC960_V1_ErrorTable_T *NewErrorTable; + dma_addr_t NewErrorTableDMA; + + DAC960_V1_EventLogEntry_T *EventLogEntry; + dma_addr_t EventLogEntryDMA; + + DAC960_V1_RebuildProgress_T *RebuildProgress; + dma_addr_t RebuildProgressDMA; + DAC960_V1_CommandStatus_T LastRebuildStatus; + DAC960_V1_CommandStatus_T PendingRebuildStatus; + + DAC960_V1_LogicalDriveInformationArray_T LogicalDriveInformation; + DAC960_V1_LogicalDriveInformationArray_T *NewLogicalDriveInformation; + dma_addr_t NewLogicalDriveInformationDMA; + + DAC960_V1_BackgroundInitializationStatus_T + *BackgroundInitializationStatus; + dma_addr_t BackgroundInitializationStatusDMA; + DAC960_V1_BackgroundInitializationStatus_T + LastBackgroundInitializationStatus; + + DAC960_V1_DeviceState_T + DeviceState[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets]; + DAC960_V1_DeviceState_T *NewDeviceState; + dma_addr_t NewDeviceStateDMA; + + DAC960_SCSI_Inquiry_T + InquiryStandardData[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets]; + DAC960_SCSI_Inquiry_T *NewInquiryStandardData; + dma_addr_t NewInquiryStandardDataDMA; + + DAC960_SCSI_Inquiry_UnitSerialNumber_T + InquiryUnitSerialNumber[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets]; + DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber; + dma_addr_t NewInquiryUnitSerialNumberDMA; + + int DeviceResetCount[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets]; + bool DirectCommandActive[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets]; + } V1; + struct { + unsigned int StatusChangeCounter; + unsigned int NextEventSequenceNumber; + unsigned int PhysicalDeviceIndex; + bool NeedLogicalDeviceInformation; + bool NeedPhysicalDeviceInformation; + bool NeedDeviceSerialNumberInformation; + bool StartLogicalDeviceInformationScan; + bool StartPhysicalDeviceInformationScan; + struct pci_pool *RequestSensePool; + + dma_addr_t FirstCommandMailboxDMA; + DAC960_V2_CommandMailbox_T *FirstCommandMailbox; + DAC960_V2_CommandMailbox_T *LastCommandMailbox; + DAC960_V2_CommandMailbox_T *NextCommandMailbox; + DAC960_V2_CommandMailbox_T *PreviousCommandMailbox1; + DAC960_V2_CommandMailbox_T *PreviousCommandMailbox2; + + dma_addr_t FirstStatusMailboxDMA; + DAC960_V2_StatusMailbox_T *FirstStatusMailbox; + DAC960_V2_StatusMailbox_T *LastStatusMailbox; + DAC960_V2_StatusMailbox_T *NextStatusMailbox; + + dma_addr_t HealthStatusBufferDMA; + DAC960_V2_HealthStatusBuffer_T *HealthStatusBuffer; + + DAC960_V2_ControllerInfo_T ControllerInformation; + DAC960_V2_ControllerInfo_T *NewControllerInformation; + dma_addr_t NewControllerInformationDMA; + + DAC960_V2_LogicalDeviceInfo_T + *LogicalDeviceInformation[DAC960_MaxLogicalDrives]; + DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInformation; + dma_addr_t NewLogicalDeviceInformationDMA; + + DAC960_V2_PhysicalDeviceInfo_T + *PhysicalDeviceInformation[DAC960_V2_MaxPhysicalDevices]; + DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInformation; + dma_addr_t NewPhysicalDeviceInformationDMA; + + DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber; + dma_addr_t NewInquiryUnitSerialNumberDMA; + DAC960_SCSI_Inquiry_UnitSerialNumber_T + *InquiryUnitSerialNumber[DAC960_V2_MaxPhysicalDevices]; + + DAC960_V2_Event_T *Event; + dma_addr_t EventDMA; + + DAC960_V2_PhysicalToLogicalDevice_T *PhysicalToLogicalDevice; + dma_addr_t PhysicalToLogicalDeviceDMA; + + DAC960_V2_PhysicalDevice_T + LogicalDriveToVirtualDevice[DAC960_MaxLogicalDrives]; + bool LogicalDriveFoundDuringScan[DAC960_MaxLogicalDrives]; + } V2; + } FW; + unsigned char ProgressBuffer[DAC960_ProgressBufferSize]; + unsigned char UserStatusBuffer[DAC960_UserMessageSize]; +} +DAC960_Controller_T; + + +/* + Simplify access to Firmware Version Dependent Data Structure Components + and Functions. +*/ + +#define V1 FW.V1 +#define V2 FW.V2 +#define DAC960_QueueCommand(Command) \ + (Controller->QueueCommand)(Command) +#define DAC960_ReadControllerConfiguration(Controller) \ + (Controller->ReadControllerConfiguration)(Controller) +#define DAC960_ReadDeviceConfiguration(Controller) \ + (Controller->ReadDeviceConfiguration)(Controller) +#define DAC960_ReportDeviceConfiguration(Controller) \ + (Controller->ReportDeviceConfiguration)(Controller) +#define DAC960_QueueReadWriteCommand(Command) \ + (Controller->QueueReadWriteCommand)(Command) + +/* + * dma_addr_writeql is provided to write dma_addr_t types + * to a 64-bit pci address space register. The controller + * will accept having the register written as two 32-bit + * values. + * + * In HIGHMEM kernels, dma_addr_t is a 64-bit value. + * without HIGHMEM, dma_addr_t is a 32-bit value. + * + * The compiler should always fix up the assignment + * to u.wq appropriately, depending upon the size of + * dma_addr_t. + */ +static inline +void dma_addr_writeql(dma_addr_t addr, void __iomem *write_address) +{ + union { + u64 wq; + uint wl[2]; + } u; + + u.wq = addr; + + writel(u.wl[0], write_address); + writel(u.wl[1], write_address + 4); +} + +/* + Define the DAC960 GEM Series Controller Interface Register Offsets. + */ + +#define DAC960_GEM_RegisterWindowSize 0x600 + +typedef enum +{ + DAC960_GEM_InboundDoorBellRegisterReadSetOffset = 0x214, + DAC960_GEM_InboundDoorBellRegisterClearOffset = 0x218, + DAC960_GEM_OutboundDoorBellRegisterReadSetOffset = 0x224, + DAC960_GEM_OutboundDoorBellRegisterClearOffset = 0x228, + DAC960_GEM_InterruptStatusRegisterOffset = 0x208, + DAC960_GEM_InterruptMaskRegisterReadSetOffset = 0x22C, + DAC960_GEM_InterruptMaskRegisterClearOffset = 0x230, + DAC960_GEM_CommandMailboxBusAddressOffset = 0x510, + DAC960_GEM_CommandStatusOffset = 0x518, + DAC960_GEM_ErrorStatusRegisterReadSetOffset = 0x224, + DAC960_GEM_ErrorStatusRegisterClearOffset = 0x228, +} +DAC960_GEM_RegisterOffsets_T; + +/* + Define the structure of the DAC960 GEM Series Inbound Door Bell + */ + +typedef union DAC960_GEM_InboundDoorBellRegister +{ + unsigned int All; + struct { + unsigned int :24; + bool HardwareMailboxNewCommand:1; + bool AcknowledgeHardwareMailboxStatus:1; + bool GenerateInterrupt:1; + bool ControllerReset:1; + bool MemoryMailboxNewCommand:1; + unsigned int :3; + } Write; + struct { + unsigned int :24; + bool HardwareMailboxFull:1; + bool InitializationInProgress:1; + unsigned int :6; + } Read; +} +DAC960_GEM_InboundDoorBellRegister_T; + +/* + Define the structure of the DAC960 GEM Series Outbound Door Bell Register. + */ +typedef union DAC960_GEM_OutboundDoorBellRegister +{ + unsigned int All; + struct { + unsigned int :24; + bool AcknowledgeHardwareMailboxInterrupt:1; + bool AcknowledgeMemoryMailboxInterrupt:1; + unsigned int :6; + } Write; + struct { + unsigned int :24; + bool HardwareMailboxStatusAvailable:1; + bool MemoryMailboxStatusAvailable:1; + unsigned int :6; + } Read; +} +DAC960_GEM_OutboundDoorBellRegister_T; + +/* + Define the structure of the DAC960 GEM Series Interrupt Mask Register. + */ +typedef union DAC960_GEM_InterruptMaskRegister +{ + unsigned int All; + struct { + unsigned int :16; + unsigned int :8; + unsigned int HardwareMailboxInterrupt:1; + unsigned int MemoryMailboxInterrupt:1; + unsigned int :6; + } Bits; +} +DAC960_GEM_InterruptMaskRegister_T; + +/* + Define the structure of the DAC960 GEM Series Error Status Register. + */ + +typedef union DAC960_GEM_ErrorStatusRegister +{ + unsigned int All; + struct { + unsigned int :24; + unsigned int :5; + bool ErrorStatusPending:1; + unsigned int :2; + } Bits; +} +DAC960_GEM_ErrorStatusRegister_T; + +/* + Define inline functions to provide an abstraction for reading and writing the + DAC960 GEM Series Controller Interface Registers. +*/ + +static inline +void DAC960_GEM_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset); +} + +static inline +void DAC960_GEM_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterClearOffset); +} + +static inline +void DAC960_GEM_GenerateInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.GenerateInterrupt = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset); +} + +static inline +void DAC960_GEM_ControllerReset(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.ControllerReset = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset); +} + +static inline +void DAC960_GEM_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset); +} + +static inline +bool DAC960_GEM_HardwareMailboxFullP(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readl(ControllerBaseAddress + + DAC960_GEM_InboundDoorBellRegisterReadSetOffset); + return InboundDoorBellRegister.Read.HardwareMailboxFull; +} + +static inline +bool DAC960_GEM_InitializationInProgressP(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readl(ControllerBaseAddress + + DAC960_GEM_InboundDoorBellRegisterReadSetOffset); + return InboundDoorBellRegister.Read.InitializationInProgress; +} + +static inline +void DAC960_GEM_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + writel(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_OutboundDoorBellRegisterClearOffset); +} + +static inline +void DAC960_GEM_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writel(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_OutboundDoorBellRegisterClearOffset); +} + +static inline +void DAC960_GEM_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writel(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_GEM_OutboundDoorBellRegisterClearOffset); +} + +static inline +bool DAC960_GEM_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readl(ControllerBaseAddress + + DAC960_GEM_OutboundDoorBellRegisterReadSetOffset); + return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable; +} + +static inline +bool DAC960_GEM_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readl(ControllerBaseAddress + + DAC960_GEM_OutboundDoorBellRegisterReadSetOffset); + return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable; +} + +static inline +void DAC960_GEM_EnableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0; + InterruptMaskRegister.Bits.HardwareMailboxInterrupt = true; + InterruptMaskRegister.Bits.MemoryMailboxInterrupt = true; + writel(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_GEM_InterruptMaskRegisterClearOffset); +} + +static inline +void DAC960_GEM_DisableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0; + InterruptMaskRegister.Bits.HardwareMailboxInterrupt = true; + InterruptMaskRegister.Bits.MemoryMailboxInterrupt = true; + writel(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_GEM_InterruptMaskRegisterReadSetOffset); +} + +static inline +bool DAC960_GEM_InterruptsEnabledP(void __iomem *ControllerBaseAddress) +{ + DAC960_GEM_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = + readl(ControllerBaseAddress + + DAC960_GEM_InterruptMaskRegisterReadSetOffset); + return !(InterruptMaskRegister.Bits.HardwareMailboxInterrupt || + InterruptMaskRegister.Bits.MemoryMailboxInterrupt); +} + +static inline +void DAC960_GEM_WriteCommandMailbox(DAC960_V2_CommandMailbox_T + *MemoryCommandMailbox, + DAC960_V2_CommandMailbox_T + *CommandMailbox) +{ + memcpy(&MemoryCommandMailbox->Words[1], &CommandMailbox->Words[1], + sizeof(DAC960_V2_CommandMailbox_T) - sizeof(unsigned int)); + wmb(); + MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0]; + mb(); +} + +static inline +void DAC960_GEM_WriteHardwareMailbox(void __iomem *ControllerBaseAddress, + dma_addr_t CommandMailboxDMA) +{ + dma_addr_writeql(CommandMailboxDMA, + ControllerBaseAddress + + DAC960_GEM_CommandMailboxBusAddressOffset); +} + +static inline DAC960_V2_CommandIdentifier_T +DAC960_GEM_ReadCommandIdentifier(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_GEM_CommandStatusOffset); +} + +static inline DAC960_V2_CommandStatus_T +DAC960_GEM_ReadCommandStatus(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_GEM_CommandStatusOffset + 2); +} + +static inline bool +DAC960_GEM_ReadErrorStatus(void __iomem *ControllerBaseAddress, + unsigned char *ErrorStatus, + unsigned char *Parameter0, + unsigned char *Parameter1) +{ + DAC960_GEM_ErrorStatusRegister_T ErrorStatusRegister; + ErrorStatusRegister.All = + readl(ControllerBaseAddress + DAC960_GEM_ErrorStatusRegisterReadSetOffset); + if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false; + ErrorStatusRegister.Bits.ErrorStatusPending = false; + *ErrorStatus = ErrorStatusRegister.All; + *Parameter0 = + readb(ControllerBaseAddress + DAC960_GEM_CommandMailboxBusAddressOffset + 0); + *Parameter1 = + readb(ControllerBaseAddress + DAC960_GEM_CommandMailboxBusAddressOffset + 1); + writel(0x03000000, ControllerBaseAddress + + DAC960_GEM_ErrorStatusRegisterClearOffset); + return true; +} + +/* + Define the DAC960 BA Series Controller Interface Register Offsets. +*/ + +#define DAC960_BA_RegisterWindowSize 0x80 + +typedef enum +{ + DAC960_BA_InboundDoorBellRegisterOffset = 0x60, + DAC960_BA_OutboundDoorBellRegisterOffset = 0x61, + DAC960_BA_InterruptStatusRegisterOffset = 0x30, + DAC960_BA_InterruptMaskRegisterOffset = 0x34, + DAC960_BA_CommandMailboxBusAddressOffset = 0x50, + DAC960_BA_CommandStatusOffset = 0x58, + DAC960_BA_ErrorStatusRegisterOffset = 0x63 +} +DAC960_BA_RegisterOffsets_T; + + +/* + Define the structure of the DAC960 BA Series Inbound Door Bell Register. +*/ + +typedef union DAC960_BA_InboundDoorBellRegister +{ + unsigned char All; + struct { + bool HardwareMailboxNewCommand:1; /* Bit 0 */ + bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */ + bool GenerateInterrupt:1; /* Bit 2 */ + bool ControllerReset:1; /* Bit 3 */ + bool MemoryMailboxNewCommand:1; /* Bit 4 */ + unsigned char :3; /* Bits 5-7 */ + } Write; + struct { + bool HardwareMailboxEmpty:1; /* Bit 0 */ + bool InitializationNotInProgress:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_BA_InboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 BA Series Outbound Door Bell Register. +*/ + +typedef union DAC960_BA_OutboundDoorBellRegister +{ + unsigned char All; + struct { + bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */ + bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Write; + struct { + bool HardwareMailboxStatusAvailable:1; /* Bit 0 */ + bool MemoryMailboxStatusAvailable:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_BA_OutboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 BA Series Interrupt Mask Register. +*/ + +typedef union DAC960_BA_InterruptMaskRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool DisableInterrupts:1; /* Bit 2 */ + bool DisableInterruptsI2O:1; /* Bit 3 */ + unsigned int :4; /* Bits 4-7 */ + } Bits; +} +DAC960_BA_InterruptMaskRegister_T; + + +/* + Define the structure of the DAC960 BA Series Error Status Register. +*/ + +typedef union DAC960_BA_ErrorStatusRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool ErrorStatusPending:1; /* Bit 2 */ + unsigned int :5; /* Bits 3-7 */ + } Bits; +} +DAC960_BA_ErrorStatusRegister_T; + + +/* + Define inline functions to provide an abstraction for reading and writing the + DAC960 BA Series Controller Interface Registers. +*/ + +static inline +void DAC960_BA_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_BA_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_BA_GenerateInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.GenerateInterrupt = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_BA_ControllerReset(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.ControllerReset = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_BA_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_BA_HardwareMailboxFullP(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); + return !InboundDoorBellRegister.Read.HardwareMailboxEmpty; +} + +static inline +bool DAC960_BA_InitializationInProgressP(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset); + return !InboundDoorBellRegister.Read.InitializationNotInProgress; +} + +static inline +void DAC960_BA_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_BA_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_BA_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_BA_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable; +} + +static inline +bool DAC960_BA_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable; +} + +static inline +void DAC960_BA_EnableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0xFF; + InterruptMaskRegister.Bits.DisableInterrupts = false; + InterruptMaskRegister.Bits.DisableInterruptsI2O = true; + writeb(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_BA_InterruptMaskRegisterOffset); +} + +static inline +void DAC960_BA_DisableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0xFF; + InterruptMaskRegister.Bits.DisableInterrupts = true; + InterruptMaskRegister.Bits.DisableInterruptsI2O = true; + writeb(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_BA_InterruptMaskRegisterOffset); +} + +static inline +bool DAC960_BA_InterruptsEnabledP(void __iomem *ControllerBaseAddress) +{ + DAC960_BA_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = + readb(ControllerBaseAddress + DAC960_BA_InterruptMaskRegisterOffset); + return !InterruptMaskRegister.Bits.DisableInterrupts; +} + +static inline +void DAC960_BA_WriteCommandMailbox(DAC960_V2_CommandMailbox_T + *MemoryCommandMailbox, + DAC960_V2_CommandMailbox_T + *CommandMailbox) +{ + memcpy(&MemoryCommandMailbox->Words[1], &CommandMailbox->Words[1], + sizeof(DAC960_V2_CommandMailbox_T) - sizeof(unsigned int)); + wmb(); + MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0]; + mb(); +} + + +static inline +void DAC960_BA_WriteHardwareMailbox(void __iomem *ControllerBaseAddress, + dma_addr_t CommandMailboxDMA) +{ + dma_addr_writeql(CommandMailboxDMA, + ControllerBaseAddress + + DAC960_BA_CommandMailboxBusAddressOffset); +} + +static inline DAC960_V2_CommandIdentifier_T +DAC960_BA_ReadCommandIdentifier(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_BA_CommandStatusOffset); +} + +static inline DAC960_V2_CommandStatus_T +DAC960_BA_ReadCommandStatus(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_BA_CommandStatusOffset + 2); +} + +static inline bool +DAC960_BA_ReadErrorStatus(void __iomem *ControllerBaseAddress, + unsigned char *ErrorStatus, + unsigned char *Parameter0, + unsigned char *Parameter1) +{ + DAC960_BA_ErrorStatusRegister_T ErrorStatusRegister; + ErrorStatusRegister.All = + readb(ControllerBaseAddress + DAC960_BA_ErrorStatusRegisterOffset); + if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false; + ErrorStatusRegister.Bits.ErrorStatusPending = false; + *ErrorStatus = ErrorStatusRegister.All; + *Parameter0 = + readb(ControllerBaseAddress + DAC960_BA_CommandMailboxBusAddressOffset + 0); + *Parameter1 = + readb(ControllerBaseAddress + DAC960_BA_CommandMailboxBusAddressOffset + 1); + writeb(0xFF, ControllerBaseAddress + DAC960_BA_ErrorStatusRegisterOffset); + return true; +} + + +/* + Define the DAC960 LP Series Controller Interface Register Offsets. +*/ + +#define DAC960_LP_RegisterWindowSize 0x80 + +typedef enum +{ + DAC960_LP_InboundDoorBellRegisterOffset = 0x20, + DAC960_LP_OutboundDoorBellRegisterOffset = 0x2C, + DAC960_LP_InterruptStatusRegisterOffset = 0x30, + DAC960_LP_InterruptMaskRegisterOffset = 0x34, + DAC960_LP_CommandMailboxBusAddressOffset = 0x10, + DAC960_LP_CommandStatusOffset = 0x18, + DAC960_LP_ErrorStatusRegisterOffset = 0x2E +} +DAC960_LP_RegisterOffsets_T; + + +/* + Define the structure of the DAC960 LP Series Inbound Door Bell Register. +*/ + +typedef union DAC960_LP_InboundDoorBellRegister +{ + unsigned char All; + struct { + bool HardwareMailboxNewCommand:1; /* Bit 0 */ + bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */ + bool GenerateInterrupt:1; /* Bit 2 */ + bool ControllerReset:1; /* Bit 3 */ + bool MemoryMailboxNewCommand:1; /* Bit 4 */ + unsigned char :3; /* Bits 5-7 */ + } Write; + struct { + bool HardwareMailboxFull:1; /* Bit 0 */ + bool InitializationInProgress:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_LP_InboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 LP Series Outbound Door Bell Register. +*/ + +typedef union DAC960_LP_OutboundDoorBellRegister +{ + unsigned char All; + struct { + bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */ + bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Write; + struct { + bool HardwareMailboxStatusAvailable:1; /* Bit 0 */ + bool MemoryMailboxStatusAvailable:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_LP_OutboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 LP Series Interrupt Mask Register. +*/ + +typedef union DAC960_LP_InterruptMaskRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool DisableInterrupts:1; /* Bit 2 */ + unsigned int :5; /* Bits 3-7 */ + } Bits; +} +DAC960_LP_InterruptMaskRegister_T; + + +/* + Define the structure of the DAC960 LP Series Error Status Register. +*/ + +typedef union DAC960_LP_ErrorStatusRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool ErrorStatusPending:1; /* Bit 2 */ + unsigned int :5; /* Bits 3-7 */ + } Bits; +} +DAC960_LP_ErrorStatusRegister_T; + + +/* + Define inline functions to provide an abstraction for reading and writing the + DAC960 LP Series Controller Interface Registers. +*/ + +static inline +void DAC960_LP_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LP_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LP_GenerateInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.GenerateInterrupt = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LP_ControllerReset(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.ControllerReset = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LP_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_LP_HardwareMailboxFullP(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); + return InboundDoorBellRegister.Read.HardwareMailboxFull; +} + +static inline +bool DAC960_LP_InitializationInProgressP(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset); + return InboundDoorBellRegister.Read.InitializationInProgress; +} + +static inline +void DAC960_LP_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LP_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LP_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_LP_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable; +} + +static inline +bool DAC960_LP_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable; +} + +static inline +void DAC960_LP_EnableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0xFF; + InterruptMaskRegister.Bits.DisableInterrupts = false; + writeb(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_LP_InterruptMaskRegisterOffset); +} + +static inline +void DAC960_LP_DisableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0xFF; + InterruptMaskRegister.Bits.DisableInterrupts = true; + writeb(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_LP_InterruptMaskRegisterOffset); +} + +static inline +bool DAC960_LP_InterruptsEnabledP(void __iomem *ControllerBaseAddress) +{ + DAC960_LP_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = + readb(ControllerBaseAddress + DAC960_LP_InterruptMaskRegisterOffset); + return !InterruptMaskRegister.Bits.DisableInterrupts; +} + +static inline +void DAC960_LP_WriteCommandMailbox(DAC960_V2_CommandMailbox_T + *MemoryCommandMailbox, + DAC960_V2_CommandMailbox_T + *CommandMailbox) +{ + memcpy(&MemoryCommandMailbox->Words[1], &CommandMailbox->Words[1], + sizeof(DAC960_V2_CommandMailbox_T) - sizeof(unsigned int)); + wmb(); + MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0]; + mb(); +} + +static inline +void DAC960_LP_WriteHardwareMailbox(void __iomem *ControllerBaseAddress, + dma_addr_t CommandMailboxDMA) +{ + dma_addr_writeql(CommandMailboxDMA, + ControllerBaseAddress + + DAC960_LP_CommandMailboxBusAddressOffset); +} + +static inline DAC960_V2_CommandIdentifier_T +DAC960_LP_ReadCommandIdentifier(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_LP_CommandStatusOffset); +} + +static inline DAC960_V2_CommandStatus_T +DAC960_LP_ReadCommandStatus(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_LP_CommandStatusOffset + 2); +} + +static inline bool +DAC960_LP_ReadErrorStatus(void __iomem *ControllerBaseAddress, + unsigned char *ErrorStatus, + unsigned char *Parameter0, + unsigned char *Parameter1) +{ + DAC960_LP_ErrorStatusRegister_T ErrorStatusRegister; + ErrorStatusRegister.All = + readb(ControllerBaseAddress + DAC960_LP_ErrorStatusRegisterOffset); + if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false; + ErrorStatusRegister.Bits.ErrorStatusPending = false; + *ErrorStatus = ErrorStatusRegister.All; + *Parameter0 = + readb(ControllerBaseAddress + DAC960_LP_CommandMailboxBusAddressOffset + 0); + *Parameter1 = + readb(ControllerBaseAddress + DAC960_LP_CommandMailboxBusAddressOffset + 1); + writeb(0xFF, ControllerBaseAddress + DAC960_LP_ErrorStatusRegisterOffset); + return true; +} + + +/* + Define the DAC960 LA Series Controller Interface Register Offsets. +*/ + +#define DAC960_LA_RegisterWindowSize 0x80 + +typedef enum +{ + DAC960_LA_InboundDoorBellRegisterOffset = 0x60, + DAC960_LA_OutboundDoorBellRegisterOffset = 0x61, + DAC960_LA_InterruptMaskRegisterOffset = 0x34, + DAC960_LA_CommandOpcodeRegisterOffset = 0x50, + DAC960_LA_CommandIdentifierRegisterOffset = 0x51, + DAC960_LA_MailboxRegister2Offset = 0x52, + DAC960_LA_MailboxRegister3Offset = 0x53, + DAC960_LA_MailboxRegister4Offset = 0x54, + DAC960_LA_MailboxRegister5Offset = 0x55, + DAC960_LA_MailboxRegister6Offset = 0x56, + DAC960_LA_MailboxRegister7Offset = 0x57, + DAC960_LA_MailboxRegister8Offset = 0x58, + DAC960_LA_MailboxRegister9Offset = 0x59, + DAC960_LA_MailboxRegister10Offset = 0x5A, + DAC960_LA_MailboxRegister11Offset = 0x5B, + DAC960_LA_MailboxRegister12Offset = 0x5C, + DAC960_LA_StatusCommandIdentifierRegOffset = 0x5D, + DAC960_LA_StatusRegisterOffset = 0x5E, + DAC960_LA_ErrorStatusRegisterOffset = 0x63 +} +DAC960_LA_RegisterOffsets_T; + + +/* + Define the structure of the DAC960 LA Series Inbound Door Bell Register. +*/ + +typedef union DAC960_LA_InboundDoorBellRegister +{ + unsigned char All; + struct { + bool HardwareMailboxNewCommand:1; /* Bit 0 */ + bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */ + bool GenerateInterrupt:1; /* Bit 2 */ + bool ControllerReset:1; /* Bit 3 */ + bool MemoryMailboxNewCommand:1; /* Bit 4 */ + unsigned char :3; /* Bits 5-7 */ + } Write; + struct { + bool HardwareMailboxEmpty:1; /* Bit 0 */ + bool InitializationNotInProgress:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_LA_InboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 LA Series Outbound Door Bell Register. +*/ + +typedef union DAC960_LA_OutboundDoorBellRegister +{ + unsigned char All; + struct { + bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */ + bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Write; + struct { + bool HardwareMailboxStatusAvailable:1; /* Bit 0 */ + bool MemoryMailboxStatusAvailable:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_LA_OutboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 LA Series Interrupt Mask Register. +*/ + +typedef union DAC960_LA_InterruptMaskRegister +{ + unsigned char All; + struct { + unsigned char :2; /* Bits 0-1 */ + bool DisableInterrupts:1; /* Bit 2 */ + unsigned char :5; /* Bits 3-7 */ + } Bits; +} +DAC960_LA_InterruptMaskRegister_T; + + +/* + Define the structure of the DAC960 LA Series Error Status Register. +*/ + +typedef union DAC960_LA_ErrorStatusRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool ErrorStatusPending:1; /* Bit 2 */ + unsigned int :5; /* Bits 3-7 */ + } Bits; +} +DAC960_LA_ErrorStatusRegister_T; + + +/* + Define inline functions to provide an abstraction for reading and writing the + DAC960 LA Series Controller Interface Registers. +*/ + +static inline +void DAC960_LA_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LA_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LA_GenerateInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.GenerateInterrupt = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LA_ControllerReset(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.ControllerReset = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LA_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_LA_HardwareMailboxFullP(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); + return !InboundDoorBellRegister.Read.HardwareMailboxEmpty; +} + +static inline +bool DAC960_LA_InitializationInProgressP(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset); + return !InboundDoorBellRegister.Read.InitializationNotInProgress; +} + +static inline +void DAC960_LA_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LA_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_LA_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_LA_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable; +} + +static inline +bool DAC960_LA_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable; +} + +static inline +void DAC960_LA_EnableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0xFF; + InterruptMaskRegister.Bits.DisableInterrupts = false; + writeb(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_LA_InterruptMaskRegisterOffset); +} + +static inline +void DAC960_LA_DisableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0xFF; + InterruptMaskRegister.Bits.DisableInterrupts = true; + writeb(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_LA_InterruptMaskRegisterOffset); +} + +static inline +bool DAC960_LA_InterruptsEnabledP(void __iomem *ControllerBaseAddress) +{ + DAC960_LA_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = + readb(ControllerBaseAddress + DAC960_LA_InterruptMaskRegisterOffset); + return !InterruptMaskRegister.Bits.DisableInterrupts; +} + +static inline +void DAC960_LA_WriteCommandMailbox(DAC960_V1_CommandMailbox_T + *MemoryCommandMailbox, + DAC960_V1_CommandMailbox_T + *CommandMailbox) +{ + MemoryCommandMailbox->Words[1] = CommandMailbox->Words[1]; + MemoryCommandMailbox->Words[2] = CommandMailbox->Words[2]; + MemoryCommandMailbox->Words[3] = CommandMailbox->Words[3]; + wmb(); + MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0]; + mb(); +} + +static inline +void DAC960_LA_WriteHardwareMailbox(void __iomem *ControllerBaseAddress, + DAC960_V1_CommandMailbox_T *CommandMailbox) +{ + writel(CommandMailbox->Words[0], + ControllerBaseAddress + DAC960_LA_CommandOpcodeRegisterOffset); + writel(CommandMailbox->Words[1], + ControllerBaseAddress + DAC960_LA_MailboxRegister4Offset); + writel(CommandMailbox->Words[2], + ControllerBaseAddress + DAC960_LA_MailboxRegister8Offset); + writeb(CommandMailbox->Bytes[12], + ControllerBaseAddress + DAC960_LA_MailboxRegister12Offset); +} + +static inline DAC960_V1_CommandIdentifier_T +DAC960_LA_ReadStatusCommandIdentifier(void __iomem *ControllerBaseAddress) +{ + return readb(ControllerBaseAddress + + DAC960_LA_StatusCommandIdentifierRegOffset); +} + +static inline DAC960_V1_CommandStatus_T +DAC960_LA_ReadStatusRegister(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_LA_StatusRegisterOffset); +} + +static inline bool +DAC960_LA_ReadErrorStatus(void __iomem *ControllerBaseAddress, + unsigned char *ErrorStatus, + unsigned char *Parameter0, + unsigned char *Parameter1) +{ + DAC960_LA_ErrorStatusRegister_T ErrorStatusRegister; + ErrorStatusRegister.All = + readb(ControllerBaseAddress + DAC960_LA_ErrorStatusRegisterOffset); + if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false; + ErrorStatusRegister.Bits.ErrorStatusPending = false; + *ErrorStatus = ErrorStatusRegister.All; + *Parameter0 = + readb(ControllerBaseAddress + DAC960_LA_CommandOpcodeRegisterOffset); + *Parameter1 = + readb(ControllerBaseAddress + DAC960_LA_CommandIdentifierRegisterOffset); + writeb(0xFF, ControllerBaseAddress + DAC960_LA_ErrorStatusRegisterOffset); + return true; +} + +/* + Define the DAC960 PG Series Controller Interface Register Offsets. +*/ + +#define DAC960_PG_RegisterWindowSize 0x2000 + +typedef enum +{ + DAC960_PG_InboundDoorBellRegisterOffset = 0x0020, + DAC960_PG_OutboundDoorBellRegisterOffset = 0x002C, + DAC960_PG_InterruptMaskRegisterOffset = 0x0034, + DAC960_PG_CommandOpcodeRegisterOffset = 0x1000, + DAC960_PG_CommandIdentifierRegisterOffset = 0x1001, + DAC960_PG_MailboxRegister2Offset = 0x1002, + DAC960_PG_MailboxRegister3Offset = 0x1003, + DAC960_PG_MailboxRegister4Offset = 0x1004, + DAC960_PG_MailboxRegister5Offset = 0x1005, + DAC960_PG_MailboxRegister6Offset = 0x1006, + DAC960_PG_MailboxRegister7Offset = 0x1007, + DAC960_PG_MailboxRegister8Offset = 0x1008, + DAC960_PG_MailboxRegister9Offset = 0x1009, + DAC960_PG_MailboxRegister10Offset = 0x100A, + DAC960_PG_MailboxRegister11Offset = 0x100B, + DAC960_PG_MailboxRegister12Offset = 0x100C, + DAC960_PG_StatusCommandIdentifierRegOffset = 0x1018, + DAC960_PG_StatusRegisterOffset = 0x101A, + DAC960_PG_ErrorStatusRegisterOffset = 0x103F +} +DAC960_PG_RegisterOffsets_T; + + +/* + Define the structure of the DAC960 PG Series Inbound Door Bell Register. +*/ + +typedef union DAC960_PG_InboundDoorBellRegister +{ + unsigned int All; + struct { + bool HardwareMailboxNewCommand:1; /* Bit 0 */ + bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */ + bool GenerateInterrupt:1; /* Bit 2 */ + bool ControllerReset:1; /* Bit 3 */ + bool MemoryMailboxNewCommand:1; /* Bit 4 */ + unsigned int :27; /* Bits 5-31 */ + } Write; + struct { + bool HardwareMailboxFull:1; /* Bit 0 */ + bool InitializationInProgress:1; /* Bit 1 */ + unsigned int :30; /* Bits 2-31 */ + } Read; +} +DAC960_PG_InboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 PG Series Outbound Door Bell Register. +*/ + +typedef union DAC960_PG_OutboundDoorBellRegister +{ + unsigned int All; + struct { + bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */ + bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */ + unsigned int :30; /* Bits 2-31 */ + } Write; + struct { + bool HardwareMailboxStatusAvailable:1; /* Bit 0 */ + bool MemoryMailboxStatusAvailable:1; /* Bit 1 */ + unsigned int :30; /* Bits 2-31 */ + } Read; +} +DAC960_PG_OutboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 PG Series Interrupt Mask Register. +*/ + +typedef union DAC960_PG_InterruptMaskRegister +{ + unsigned int All; + struct { + unsigned int MessageUnitInterruptMask1:2; /* Bits 0-1 */ + bool DisableInterrupts:1; /* Bit 2 */ + unsigned int MessageUnitInterruptMask2:5; /* Bits 3-7 */ + unsigned int Reserved0:24; /* Bits 8-31 */ + } Bits; +} +DAC960_PG_InterruptMaskRegister_T; + + +/* + Define the structure of the DAC960 PG Series Error Status Register. +*/ + +typedef union DAC960_PG_ErrorStatusRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool ErrorStatusPending:1; /* Bit 2 */ + unsigned int :5; /* Bits 3-7 */ + } Bits; +} +DAC960_PG_ErrorStatusRegister_T; + + +/* + Define inline functions to provide an abstraction for reading and writing the + DAC960 PG Series Controller Interface Registers. +*/ + +static inline +void DAC960_PG_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PG_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PG_GenerateInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.GenerateInterrupt = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PG_ControllerReset(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.ControllerReset = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PG_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true; + writel(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_PG_HardwareMailboxFullP(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readl(ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); + return InboundDoorBellRegister.Read.HardwareMailboxFull; +} + +static inline +bool DAC960_PG_InitializationInProgressP(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readl(ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset); + return InboundDoorBellRegister.Read.InitializationInProgress; +} + +static inline +void DAC960_PG_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + writel(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PG_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writel(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PG_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true; + OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true; + writel(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_PG_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readl(ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable; +} + +static inline +bool DAC960_PG_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readl(ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable; +} + +static inline +void DAC960_PG_EnableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0; + InterruptMaskRegister.Bits.MessageUnitInterruptMask1 = 0x3; + InterruptMaskRegister.Bits.DisableInterrupts = false; + InterruptMaskRegister.Bits.MessageUnitInterruptMask2 = 0x1F; + writel(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_PG_InterruptMaskRegisterOffset); +} + +static inline +void DAC960_PG_DisableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = 0; + InterruptMaskRegister.Bits.MessageUnitInterruptMask1 = 0x3; + InterruptMaskRegister.Bits.DisableInterrupts = true; + InterruptMaskRegister.Bits.MessageUnitInterruptMask2 = 0x1F; + writel(InterruptMaskRegister.All, + ControllerBaseAddress + DAC960_PG_InterruptMaskRegisterOffset); +} + +static inline +bool DAC960_PG_InterruptsEnabledP(void __iomem *ControllerBaseAddress) +{ + DAC960_PG_InterruptMaskRegister_T InterruptMaskRegister; + InterruptMaskRegister.All = + readl(ControllerBaseAddress + DAC960_PG_InterruptMaskRegisterOffset); + return !InterruptMaskRegister.Bits.DisableInterrupts; +} + +static inline +void DAC960_PG_WriteCommandMailbox(DAC960_V1_CommandMailbox_T + *MemoryCommandMailbox, + DAC960_V1_CommandMailbox_T + *CommandMailbox) +{ + MemoryCommandMailbox->Words[1] = CommandMailbox->Words[1]; + MemoryCommandMailbox->Words[2] = CommandMailbox->Words[2]; + MemoryCommandMailbox->Words[3] = CommandMailbox->Words[3]; + wmb(); + MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0]; + mb(); +} + +static inline +void DAC960_PG_WriteHardwareMailbox(void __iomem *ControllerBaseAddress, + DAC960_V1_CommandMailbox_T *CommandMailbox) +{ + writel(CommandMailbox->Words[0], + ControllerBaseAddress + DAC960_PG_CommandOpcodeRegisterOffset); + writel(CommandMailbox->Words[1], + ControllerBaseAddress + DAC960_PG_MailboxRegister4Offset); + writel(CommandMailbox->Words[2], + ControllerBaseAddress + DAC960_PG_MailboxRegister8Offset); + writeb(CommandMailbox->Bytes[12], + ControllerBaseAddress + DAC960_PG_MailboxRegister12Offset); +} + +static inline DAC960_V1_CommandIdentifier_T +DAC960_PG_ReadStatusCommandIdentifier(void __iomem *ControllerBaseAddress) +{ + return readb(ControllerBaseAddress + + DAC960_PG_StatusCommandIdentifierRegOffset); +} + +static inline DAC960_V1_CommandStatus_T +DAC960_PG_ReadStatusRegister(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_PG_StatusRegisterOffset); +} + +static inline bool +DAC960_PG_ReadErrorStatus(void __iomem *ControllerBaseAddress, + unsigned char *ErrorStatus, + unsigned char *Parameter0, + unsigned char *Parameter1) +{ + DAC960_PG_ErrorStatusRegister_T ErrorStatusRegister; + ErrorStatusRegister.All = + readb(ControllerBaseAddress + DAC960_PG_ErrorStatusRegisterOffset); + if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false; + ErrorStatusRegister.Bits.ErrorStatusPending = false; + *ErrorStatus = ErrorStatusRegister.All; + *Parameter0 = + readb(ControllerBaseAddress + DAC960_PG_CommandOpcodeRegisterOffset); + *Parameter1 = + readb(ControllerBaseAddress + DAC960_PG_CommandIdentifierRegisterOffset); + writeb(0, ControllerBaseAddress + DAC960_PG_ErrorStatusRegisterOffset); + return true; +} + +/* + Define the DAC960 PD Series Controller Interface Register Offsets. +*/ + +#define DAC960_PD_RegisterWindowSize 0x80 + +typedef enum +{ + DAC960_PD_CommandOpcodeRegisterOffset = 0x00, + DAC960_PD_CommandIdentifierRegisterOffset = 0x01, + DAC960_PD_MailboxRegister2Offset = 0x02, + DAC960_PD_MailboxRegister3Offset = 0x03, + DAC960_PD_MailboxRegister4Offset = 0x04, + DAC960_PD_MailboxRegister5Offset = 0x05, + DAC960_PD_MailboxRegister6Offset = 0x06, + DAC960_PD_MailboxRegister7Offset = 0x07, + DAC960_PD_MailboxRegister8Offset = 0x08, + DAC960_PD_MailboxRegister9Offset = 0x09, + DAC960_PD_MailboxRegister10Offset = 0x0A, + DAC960_PD_MailboxRegister11Offset = 0x0B, + DAC960_PD_MailboxRegister12Offset = 0x0C, + DAC960_PD_StatusCommandIdentifierRegOffset = 0x0D, + DAC960_PD_StatusRegisterOffset = 0x0E, + DAC960_PD_ErrorStatusRegisterOffset = 0x3F, + DAC960_PD_InboundDoorBellRegisterOffset = 0x40, + DAC960_PD_OutboundDoorBellRegisterOffset = 0x41, + DAC960_PD_InterruptEnableRegisterOffset = 0x43 +} +DAC960_PD_RegisterOffsets_T; + + +/* + Define the structure of the DAC960 PD Series Inbound Door Bell Register. +*/ + +typedef union DAC960_PD_InboundDoorBellRegister +{ + unsigned char All; + struct { + bool NewCommand:1; /* Bit 0 */ + bool AcknowledgeStatus:1; /* Bit 1 */ + bool GenerateInterrupt:1; /* Bit 2 */ + bool ControllerReset:1; /* Bit 3 */ + unsigned char :4; /* Bits 4-7 */ + } Write; + struct { + bool MailboxFull:1; /* Bit 0 */ + bool InitializationInProgress:1; /* Bit 1 */ + unsigned char :6; /* Bits 2-7 */ + } Read; +} +DAC960_PD_InboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 PD Series Outbound Door Bell Register. +*/ + +typedef union DAC960_PD_OutboundDoorBellRegister +{ + unsigned char All; + struct { + bool AcknowledgeInterrupt:1; /* Bit 0 */ + unsigned char :7; /* Bits 1-7 */ + } Write; + struct { + bool StatusAvailable:1; /* Bit 0 */ + unsigned char :7; /* Bits 1-7 */ + } Read; +} +DAC960_PD_OutboundDoorBellRegister_T; + + +/* + Define the structure of the DAC960 PD Series Interrupt Enable Register. +*/ + +typedef union DAC960_PD_InterruptEnableRegister +{ + unsigned char All; + struct { + bool EnableInterrupts:1; /* Bit 0 */ + unsigned char :7; /* Bits 1-7 */ + } Bits; +} +DAC960_PD_InterruptEnableRegister_T; + + +/* + Define the structure of the DAC960 PD Series Error Status Register. +*/ + +typedef union DAC960_PD_ErrorStatusRegister +{ + unsigned char All; + struct { + unsigned int :2; /* Bits 0-1 */ + bool ErrorStatusPending:1; /* Bit 2 */ + unsigned int :5; /* Bits 3-7 */ + } Bits; +} +DAC960_PD_ErrorStatusRegister_T; + + +/* + Define inline functions to provide an abstraction for reading and writing the + DAC960 PD Series Controller Interface Registers. +*/ + +static inline +void DAC960_PD_NewCommand(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.NewCommand = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PD_AcknowledgeStatus(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.AcknowledgeStatus = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PD_GenerateInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.GenerateInterrupt = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset); +} + +static inline +void DAC960_PD_ControllerReset(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = 0; + InboundDoorBellRegister.Write.ControllerReset = true; + writeb(InboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_PD_MailboxFullP(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset); + return InboundDoorBellRegister.Read.MailboxFull; +} + +static inline +bool DAC960_PD_InitializationInProgressP(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister; + InboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset); + return InboundDoorBellRegister.Read.InitializationInProgress; +} + +static inline +void DAC960_PD_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = 0; + OutboundDoorBellRegister.Write.AcknowledgeInterrupt = true; + writeb(OutboundDoorBellRegister.All, + ControllerBaseAddress + DAC960_PD_OutboundDoorBellRegisterOffset); +} + +static inline +bool DAC960_PD_StatusAvailableP(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_OutboundDoorBellRegister_T OutboundDoorBellRegister; + OutboundDoorBellRegister.All = + readb(ControllerBaseAddress + DAC960_PD_OutboundDoorBellRegisterOffset); + return OutboundDoorBellRegister.Read.StatusAvailable; +} + +static inline +void DAC960_PD_EnableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InterruptEnableRegister_T InterruptEnableRegister; + InterruptEnableRegister.All = 0; + InterruptEnableRegister.Bits.EnableInterrupts = true; + writeb(InterruptEnableRegister.All, + ControllerBaseAddress + DAC960_PD_InterruptEnableRegisterOffset); +} + +static inline +void DAC960_PD_DisableInterrupts(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InterruptEnableRegister_T InterruptEnableRegister; + InterruptEnableRegister.All = 0; + InterruptEnableRegister.Bits.EnableInterrupts = false; + writeb(InterruptEnableRegister.All, + ControllerBaseAddress + DAC960_PD_InterruptEnableRegisterOffset); +} + +static inline +bool DAC960_PD_InterruptsEnabledP(void __iomem *ControllerBaseAddress) +{ + DAC960_PD_InterruptEnableRegister_T InterruptEnableRegister; + InterruptEnableRegister.All = + readb(ControllerBaseAddress + DAC960_PD_InterruptEnableRegisterOffset); + return InterruptEnableRegister.Bits.EnableInterrupts; +} + +static inline +void DAC960_PD_WriteCommandMailbox(void __iomem *ControllerBaseAddress, + DAC960_V1_CommandMailbox_T *CommandMailbox) +{ + writel(CommandMailbox->Words[0], + ControllerBaseAddress + DAC960_PD_CommandOpcodeRegisterOffset); + writel(CommandMailbox->Words[1], + ControllerBaseAddress + DAC960_PD_MailboxRegister4Offset); + writel(CommandMailbox->Words[2], + ControllerBaseAddress + DAC960_PD_MailboxRegister8Offset); + writeb(CommandMailbox->Bytes[12], + ControllerBaseAddress + DAC960_PD_MailboxRegister12Offset); +} + +static inline DAC960_V1_CommandIdentifier_T +DAC960_PD_ReadStatusCommandIdentifier(void __iomem *ControllerBaseAddress) +{ + return readb(ControllerBaseAddress + + DAC960_PD_StatusCommandIdentifierRegOffset); +} + +static inline DAC960_V1_CommandStatus_T +DAC960_PD_ReadStatusRegister(void __iomem *ControllerBaseAddress) +{ + return readw(ControllerBaseAddress + DAC960_PD_StatusRegisterOffset); +} + +static inline bool +DAC960_PD_ReadErrorStatus(void __iomem *ControllerBaseAddress, + unsigned char *ErrorStatus, + unsigned char *Parameter0, + unsigned char *Parameter1) +{ + DAC960_PD_ErrorStatusRegister_T ErrorStatusRegister; + ErrorStatusRegister.All = + readb(ControllerBaseAddress + DAC960_PD_ErrorStatusRegisterOffset); + if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false; + ErrorStatusRegister.Bits.ErrorStatusPending = false; + *ErrorStatus = ErrorStatusRegister.All; + *Parameter0 = + readb(ControllerBaseAddress + DAC960_PD_CommandOpcodeRegisterOffset); + *Parameter1 = + readb(ControllerBaseAddress + DAC960_PD_CommandIdentifierRegisterOffset); + writeb(0, ControllerBaseAddress + DAC960_PD_ErrorStatusRegisterOffset); + return true; +} + +static inline void DAC960_P_To_PD_TranslateEnquiry(void *Enquiry) +{ + memcpy(Enquiry + 132, Enquiry + 36, 64); + memset(Enquiry + 36, 0, 96); +} + +static inline void DAC960_P_To_PD_TranslateDeviceState(void *DeviceState) +{ + memcpy(DeviceState + 2, DeviceState + 3, 1); + memmove(DeviceState + 4, DeviceState + 5, 2); + memmove(DeviceState + 6, DeviceState + 8, 4); +} + +static inline +void DAC960_PD_To_P_TranslateReadWriteCommand(DAC960_V1_CommandMailbox_T + *CommandMailbox) +{ + int LogicalDriveNumber = CommandMailbox->Type5.LD.LogicalDriveNumber; + CommandMailbox->Bytes[3] &= 0x7; + CommandMailbox->Bytes[3] |= CommandMailbox->Bytes[7] << 6; + CommandMailbox->Bytes[7] = LogicalDriveNumber; +} + +static inline +void DAC960_P_To_PD_TranslateReadWriteCommand(DAC960_V1_CommandMailbox_T + *CommandMailbox) +{ + int LogicalDriveNumber = CommandMailbox->Bytes[7]; + CommandMailbox->Bytes[7] = CommandMailbox->Bytes[3] >> 6; + CommandMailbox->Bytes[3] &= 0x7; + CommandMailbox->Bytes[3] |= LogicalDriveNumber << 3; +} + + +/* + Define prototypes for the forward referenced DAC960 Driver Internal Functions. +*/ + +static void DAC960_FinalizeController(DAC960_Controller_T *); +static void DAC960_V1_QueueReadWriteCommand(DAC960_Command_T *); +static void DAC960_V2_QueueReadWriteCommand(DAC960_Command_T *); +static void DAC960_RequestFunction(struct request_queue *); +static irqreturn_t DAC960_BA_InterruptHandler(int, void *); +static irqreturn_t DAC960_LP_InterruptHandler(int, void *); +static irqreturn_t DAC960_LA_InterruptHandler(int, void *); +static irqreturn_t DAC960_PG_InterruptHandler(int, void *); +static irqreturn_t DAC960_PD_InterruptHandler(int, void *); +static irqreturn_t DAC960_P_InterruptHandler(int, void *); +static void DAC960_V1_QueueMonitoringCommand(DAC960_Command_T *); +static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *); +static void DAC960_MonitoringTimerFunction(unsigned long); +static void DAC960_Message(DAC960_MessageLevel_T, unsigned char *, + DAC960_Controller_T *, ...); +static void DAC960_CreateProcEntries(DAC960_Controller_T *); +static void DAC960_DestroyProcEntries(DAC960_Controller_T *); + +#endif /* DAC960_DriverVersion */ diff --git a/kernel/drivers/block/Kconfig b/kernel/drivers/block/Kconfig new file mode 100644 index 000000000..3ccef9eba --- /dev/null +++ b/kernel/drivers/block/Kconfig @@ -0,0 +1,573 @@ +# +# Block device driver configuration +# + +menuconfig BLK_DEV + bool "Block devices" + depends on BLOCK + default y + ---help--- + Say Y here to get to see options for various different block device + drivers. This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and disabled; + only do this if you know what you are doing. + +if BLK_DEV + +config BLK_DEV_NULL_BLK + tristate "Null test block driver" + +config BLK_DEV_FD + tristate "Normal floppy disk support" + depends on ARCH_MAY_HAVE_PC_FDC + ---help--- + If you want to use the floppy disk drive(s) of your PC under Linux, + say Y. Information about this driver, especially important for IBM + Thinkpad users, is contained in + . + That file also contains the location of the Floppy driver FAQ as + well as location of the fdutils package used to configure additional + parameters of the driver at run time. + + To compile this driver as a module, choose M here: the + module will be called floppy. + +config AMIGA_FLOPPY + tristate "Amiga floppy support" + depends on AMIGA + +config ATARI_FLOPPY + tristate "Atari floppy support" + depends on ATARI + +config MAC_FLOPPY + tristate "Support for PowerMac floppy" + depends on PPC_PMAC && !PPC_PMAC64 + help + If you have a SWIM-3 (Super Woz Integrated Machine 3; from Apple) + floppy controller, say Y here. Most commonly found in PowerMacs. + +config BLK_DEV_SWIM + tristate "Support for SWIM Macintosh floppy" + depends on M68K && MAC + help + You should select this option if you want floppy support + and you don't have a II, IIfx, Q900, Q950 or AV series. + +config AMIGA_Z2RAM + tristate "Amiga Zorro II ramdisk support" + depends on ZORRO + help + This enables support for using Chip RAM and Zorro II RAM as a + ramdisk or as a swap partition. Say Y if you want to include this + driver in the kernel. + + To compile this driver as a module, choose M here: the + module will be called z2ram. + +config GDROM + tristate "SEGA Dreamcast GD-ROM drive" + depends on SH_DREAMCAST + help + A standard SEGA Dreamcast comes with a modified CD ROM drive called a + "GD-ROM" by SEGA to signify it is capable of reading special disks + with up to 1 GB of data. This drive will also read standard CD ROM + disks. Select this option to access any disks in your GD ROM drive. + Most users will want to say "Y" here. + You can also build this as a module which will be called gdrom. + +config PARIDE + tristate "Parallel port IDE device support" + depends on PARPORT_PC + ---help--- + There are many external CD-ROM and disk devices that connect through + your computer's parallel port. Most of them are actually IDE devices + using a parallel port IDE adapter. This option enables the PARIDE + subsystem which contains drivers for many of these external drives. + Read for more information. + + If you have said Y to the "Parallel-port support" configuration + option, you may share a single port between your printer and other + parallel port devices. Answer Y to build PARIDE support into your + kernel, or M if you would like to build it as a loadable module. If + your parallel port support is in a loadable module, you must build + PARIDE as a module. If you built PARIDE support into your kernel, + you may still build the individual protocol modules and high-level + drivers as loadable modules. If you build this support as a module, + it will be called paride. + + To use the PARIDE support, you must say Y or M here and also to at + least one high-level driver (e.g. "Parallel port IDE disks", + "Parallel port ATAPI CD-ROMs", "Parallel port ATAPI disks" etc.) and + to at least one protocol driver (e.g. "ATEN EH-100 protocol", + "MicroSolutions backpack protocol", "DataStor Commuter protocol" + etc.). + +source "drivers/block/paride/Kconfig" + +source "drivers/block/mtip32xx/Kconfig" + +source "drivers/block/zram/Kconfig" + +config BLK_CPQ_DA + tristate "Compaq SMART2 support" + depends on PCI && VIRT_TO_BUS && 0 + help + This is the driver for Compaq Smart Array controllers. Everyone + using these boards should say Y here. See the file + for the current list of + boards supported by this driver, and for further information on the + use of this driver. + +config BLK_CPQ_CISS_DA + tristate "Compaq Smart Array 5xxx support" + depends on PCI + select CHECK_SIGNATURE + help + This is the driver for Compaq Smart Array 5xxx controllers. + Everyone using these boards should say Y here. + See for the current list of + boards supported by this driver, and for further information + on the use of this driver. + +config CISS_SCSI_TAPE + bool "SCSI tape drive support for Smart Array 5xxx" + depends on BLK_CPQ_CISS_DA && PROC_FS + depends on SCSI=y || SCSI=BLK_CPQ_CISS_DA + help + When enabled (Y), this option allows SCSI tape drives and SCSI medium + changers (tape robots) to be accessed via a Compaq 5xxx array + controller. (See for more details.) + + "SCSI support" and "SCSI tape support" must also be enabled for this + option to work. + + When this option is disabled (N), the SCSI portion of the driver + is not compiled. + +config BLK_DEV_DAC960 + tristate "Mylex DAC960/DAC1100 PCI RAID Controller support" + depends on PCI + help + This driver adds support for the Mylex DAC960, AcceleRAID, and + eXtremeRAID PCI RAID controllers. See the file + for further information + about this driver. + + To compile this driver as a module, choose M here: the + module will be called DAC960. + +config BLK_DEV_UMEM + tristate "Micro Memory MM5415 Battery Backed RAM support" + depends on PCI + ---help--- + Saying Y here will include support for the MM5415 family of + battery backed (Non-volatile) RAM cards. + + + The cards appear as block devices that can be partitioned into + as many as 15 partitions. + + To compile this driver as a module, choose M here: the + module will be called umem. + + The umem driver has not yet been allocated a MAJOR number, so + one is chosen dynamically. + +config BLK_DEV_UBD + bool "Virtual block device" + depends on UML + ---help--- + The User-Mode Linux port includes a driver called UBD which will let + you access arbitrary files on the host computer as block devices. + Unless you know that you do not need such virtual block devices say + Y here. + +config BLK_DEV_UBD_SYNC + bool "Always do synchronous disk IO for UBD" + depends on BLK_DEV_UBD + ---help--- + Writes to the virtual block device are not immediately written to the + host's disk; this may cause problems if, for example, the User-Mode + Linux 'Virtual Machine' uses a journalling filesystem and the host + computer crashes. + + Synchronous operation (i.e. always writing data to the host's disk + immediately) is configurable on a per-UBD basis by using a special + kernel command line option. Alternatively, you can say Y here to + turn on synchronous operation by default for all block devices. + + If you're running a journalling file system (like reiserfs, for + example) in your virtual machine, you will want to say Y here. If + you care for the safety of the data in your virtual machine, Y is a + wise choice too. In all other cases (for example, if you're just + playing around with User-Mode Linux) you can choose N. + +config BLK_DEV_COW_COMMON + bool + default BLK_DEV_UBD + +config BLK_DEV_LOOP + tristate "Loopback device support" + ---help--- + Saying Y here will allow you to use a regular file as a block + device; you can then create a file system on that block device and + mount it just as you would mount other block devices such as hard + drive partitions, CD-ROM drives or floppy drives. The loop devices + are block special device files with major number 7 and typically + called /dev/loop0, /dev/loop1 etc. + + This is useful if you want to check an ISO 9660 file system before + burning the CD, or if you want to use floppy images without first + writing them to floppy. Furthermore, some Linux distributions avoid + the need for a dedicated Linux partition by keeping their complete + root file system inside a DOS FAT file using this loop device + driver. + + To use the loop device, you need the losetup utility, found in the + util-linux package, see + . + + The loop device driver can also be used to "hide" a file system in + a disk partition, floppy, or regular file, either using encryption + (scrambling the data) or steganography (hiding the data in the low + bits of, say, a sound file). This is also safe if the file resides + on a remote file server. + + There are several ways of encrypting disks. Some of these require + kernel patches. The vanilla kernel offers the cryptoloop option + and a Device Mapper target (which is superior, as it supports all + file systems). If you want to use the cryptoloop, say Y to both + LOOP and CRYPTOLOOP, and make sure you have a recent (version 2.12 + or later) version of util-linux. Additionally, be aware that + the cryptoloop is not safe for storing journaled filesystems. + + Note that this loop device has nothing to do with the loopback + device used for network connections from the machine to itself. + + To compile this driver as a module, choose M here: the + module will be called loop. + + Most users will answer N here. + +config BLK_DEV_LOOP_MIN_COUNT + int "Number of loop devices to pre-create at init time" + depends on BLK_DEV_LOOP + default 8 + help + Static number of loop devices to be unconditionally pre-created + at init time. + + This default value can be overwritten on the kernel command + line or with module-parameter loop.max_loop. + + The historic default is 8. If a late 2011 version of losetup(8) + is used, it can be set to 0, since needed loop devices can be + dynamically allocated with the /dev/loop-control interface. + +config BLK_DEV_CRYPTOLOOP + tristate "Cryptoloop Support" + select CRYPTO + select CRYPTO_CBC + depends on BLK_DEV_LOOP + ---help--- + Say Y here if you want to be able to use the ciphers that are + provided by the CryptoAPI as loop transformation. This might be + used as hard disk encryption. + + WARNING: This device is not safe for journaled file systems like + ext3 or Reiserfs. Please use the Device Mapper crypto module + instead, which can be configured to be on-disk compatible with the + cryptoloop device. + +source "drivers/block/drbd/Kconfig" + +config BLK_DEV_NBD + tristate "Network block device support" + depends on NET + ---help--- + Saying Y here will allow your computer to be a client for network + block devices, i.e. it will be able to use block devices exported by + servers (mount file systems on them etc.). Communication between + client and server works over TCP/IP networking, but to the client + program this is hidden: it looks like a regular local file access to + a block device special file such as /dev/nd0. + + Network block devices also allows you to run a block-device in + userland (making server and client physically the same computer, + communicating using the loopback network device). + + Read for more information, + especially about where to find the server code, which runs in user + space and does not need special kernel support. + + Note that this has nothing to do with the network file systems NFS + or Coda; you can say N here even if you intend to use NFS or Coda. + + To compile this driver as a module, choose M here: the + module will be called nbd. + + If unsure, say N. + +config BLK_DEV_NVME + tristate "NVM Express block device" + depends on PCI + ---help--- + The NVM Express driver is for solid state drives directly + connected to the PCI or PCI Express bus. If you know you + don't have one of these, it is safe to answer N. + + To compile this driver as a module, choose M here: the + module will be called nvme. + +config BLK_DEV_SKD + tristate "STEC S1120 Block Driver" + depends on PCI + depends on 64BIT + ---help--- + Saying Y or M here will enable support for the + STEC, Inc. S1120 PCIe SSD. + + Use device /dev/skd$N amd /dev/skd$Np$M. + +config BLK_DEV_OSD + tristate "OSD object-as-blkdev support" + depends on SCSI_OSD_ULD + ---help--- + Saying Y or M here will allow the exporting of a single SCSI + OSD (object-based storage) object as a Linux block device. + + For example, if you create a 2G object on an OSD device, + you can then use this module to present that 2G object as + a Linux block device. + + To compile this driver as a module, choose M here: the + module will be called osdblk. + + If unsure, say N. + +config BLK_DEV_SX8 + tristate "Promise SATA SX8 support" + depends on PCI + ---help--- + Saying Y or M here will enable support for the + Promise SATA SX8 controllers. + + Use devices /dev/sx8/$N and /dev/sx8/$Np$M. + +config BLK_DEV_RAM + tristate "RAM block device support" + ---help--- + Saying Y here will allow you to use a portion of your RAM memory as + a block device, so that you can make file systems on it, read and + write to it and do all the other things that you can do with normal + block devices (such as hard drives). It is usually used to load and + store a copy of a minimal root file system off of a floppy into RAM + during the initial install of Linux. + + Note that the kernel command line option "ramdisk=XX" is now obsolete. + For details, read . + + To compile this driver as a module, choose M here: the + module will be called brd. An alias "rd" has been defined + for historical reasons. + + Most normal users won't need the RAM disk functionality, and can + thus say N here. + +config BLK_DEV_RAM_COUNT + int "Default number of RAM disks" + default "16" + depends on BLK_DEV_RAM + help + The default value is 16 RAM disks. Change this if you know what you + are doing. If you boot from a filesystem that needs to be extracted + in memory, you will need at least one RAM disk (e.g. root on cramfs). + +config BLK_DEV_RAM_SIZE + int "Default RAM disk size (kbytes)" + depends on BLK_DEV_RAM + default "4096" + help + The default value is 4096 kilobytes. Only change this if you know + what you are doing. + +config BLK_DEV_RAM_DAX + bool "Support Direct Access (DAX) to RAM block devices" + depends on BLK_DEV_RAM && FS_DAX + default n + help + Support filesystems using DAX to access RAM block devices. This + avoids double-buffering data in the page cache before copying it + to the block device. Answering Y will slightly enlarge the kernel, + and will prevent RAM block device backing store memory from being + allocated from highmem (only a problem for highmem systems). + +config BLK_DEV_PMEM + tristate "Persistent memory block device support" + depends on HAS_IOMEM + help + Saying Y here will allow you to use a contiguous range of reserved + memory as one or more persistent block devices. + + To compile this driver as a module, choose M here: the module will be + called 'pmem'. + + If unsure, say N. + +config CDROM_PKTCDVD + tristate "Packet writing on CD/DVD media" + depends on !UML + help + If you have a CDROM/DVD drive that supports packet writing, say + Y to include support. It should work with any MMC/Mt Fuji + compliant ATAPI or SCSI drive, which is just about any newer + DVD/CD writer. + + Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs + is possible. + DVD-RW disks must be in restricted overwrite mode. + + See the file + for further information on the use of this driver. + + To compile this driver as a module, choose M here: the + module will be called pktcdvd. + +config CDROM_PKTCDVD_BUFFERS + int "Free buffers for data gathering" + depends on CDROM_PKTCDVD + default "8" + help + This controls the maximum number of active concurrent packets. More + concurrent packets can increase write performance, but also require + more memory. Each concurrent packet will require approximately 64Kb + of non-swappable kernel memory, memory which will be allocated when + a disc is opened for writing. + +config CDROM_PKTCDVD_WCACHE + bool "Enable write caching" + depends on CDROM_PKTCDVD + help + If enabled, write caching will be set for the CD-R/W device. For now + this option is dangerous unless the CD-RW media is known good, as we + don't do deferred write error handling yet. + +config ATA_OVER_ETH + tristate "ATA over Ethernet support" + depends on NET + help + This driver provides Support for ATA over Ethernet block + devices like the Coraid EtherDrive (R) Storage Blade. + +config MG_DISK + tristate "mGine mflash, gflash support" + depends on ARM && GPIOLIB + help + mGine mFlash(gFlash) block device driver + +config MG_DISK_RES + int "Size of reserved area before MBR" + depends on MG_DISK + default 0 + help + Define size of reserved area that usually used for boot. Unit is KB. + All of the block device operation will be taken this value as start + offset + Examples: + 1024 => 1 MB + +config SUNVDC + tristate "Sun Virtual Disk Client support" + depends on SUN_LDOMS + help + Support for virtual disk devices as a client under Sun + Logical Domains. + +source "drivers/s390/block/Kconfig" + +config XILINX_SYSACE + tristate "Xilinx SystemACE support" + depends on 4xx || MICROBLAZE + help + Include support for the Xilinx SystemACE CompactFlash interface + +config XEN_BLKDEV_FRONTEND + tristate "Xen virtual block device support" + depends on XEN + default y + select XEN_XENBUS_FRONTEND + help + This driver implements the front-end of the Xen virtual + block device driver. It communicates with a back-end driver + in another domain which drives the actual block device. + +config XEN_BLKDEV_BACKEND + tristate "Xen block-device backend driver" + depends on XEN_BACKEND + help + The block-device backend driver allows the kernel to export its + block devices to other guests via a high-performance shared-memory + interface. + + The corresponding Linux frontend driver is enabled by the + CONFIG_XEN_BLKDEV_FRONTEND configuration option. + + The backend driver attaches itself to a any block device specified + in the XenBus configuration. There are no limits to what the block + device as long as it has a major and minor. + + If you are compiling a kernel to run in a Xen block backend driver + domain (often this is domain 0) you should say Y here. To + compile this driver as a module, chose M here: the module + will be called xen-blkback. + + +config VIRTIO_BLK + tristate "Virtio block driver" + depends on VIRTIO + ---help--- + This is the virtual block driver for virtio. It can be used with + lguest or QEMU based VMMs (like KVM or Xen). Say Y or M. + +config BLK_DEV_HD + bool "Very old hard disk (MFM/RLL/IDE) driver" + depends on HAVE_IDE + depends on !ARM || ARCH_RPC || BROKEN + help + This is a very old hard disk driver that lacks the enhanced + functionality of the newer ones. + + It is required for systems with ancient MFM/RLL/ESDI drives. + + If unsure, say N. + +config BLK_DEV_RBD + tristate "Rados block device (RBD)" + depends on INET && BLOCK + select CEPH_LIB + select LIBCRC32C + select CRYPTO_AES + select CRYPTO + default n + help + Say Y here if you want include the Rados block device, which stripes + a block device over objects stored in the Ceph distributed object + store. + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config BLK_DEV_RSXX + tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver" + depends on PCI + help + Device driver for IBM's high speed PCIe SSD + storage device: Flash Adapter 900GB Full Height. + + To compile this driver as a module, choose M here: the + module will be called rsxx. + +endif # BLK_DEV diff --git a/kernel/drivers/block/Makefile b/kernel/drivers/block/Makefile new file mode 100644 index 000000000..9cc6c18a1 --- /dev/null +++ b/kernel/drivers/block/Makefile @@ -0,0 +1,50 @@ +# +# Makefile for the kernel block device drivers. +# +# 12 June 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. +# + +obj-$(CONFIG_MAC_FLOPPY) += swim3.o +obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o +obj-$(CONFIG_BLK_DEV_FD) += floppy.o +obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o +obj-$(CONFIG_PS3_DISK) += ps3disk.o +obj-$(CONFIG_PS3_VRAM) += ps3vram.o +obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o +obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o +obj-$(CONFIG_BLK_DEV_RAM) += brd.o +obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o +obj-$(CONFIG_BLK_DEV_LOOP) += loop.o +obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o +obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o +obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o +obj-$(CONFIG_XILINX_SYSACE) += xsysace.o +obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o +obj-$(CONFIG_MG_DISK) += mg_disk.o +obj-$(CONFIG_SUNVDC) += sunvdc.o +obj-$(CONFIG_BLK_DEV_NVME) += nvme.o +obj-$(CONFIG_BLK_DEV_SKD) += skd.o +obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o + +obj-$(CONFIG_BLK_DEV_UMEM) += umem.o +obj-$(CONFIG_BLK_DEV_NBD) += nbd.o +obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o +obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o + +obj-$(CONFIG_BLK_DEV_SX8) += sx8.o +obj-$(CONFIG_BLK_DEV_HD) += hd.o + +obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o +obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ +obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ +obj-$(CONFIG_BLK_DEV_RBD) += rbd.o +obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ + +obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o +obj-$(CONFIG_ZRAM) += zram/ + +nvme-y := nvme-core.o nvme-scsi.o +skd-y := skd_main.o +swim_mod-y := swim.o swim_asm.o diff --git a/kernel/drivers/block/amiflop.c b/kernel/drivers/block/amiflop.c new file mode 100644 index 000000000..5fd50a284 --- /dev/null +++ b/kernel/drivers/block/amiflop.c @@ -0,0 +1,1893 @@ +/* + * linux/amiga/amiflop.c + * + * Copyright (C) 1993 Greg Harp + * Portions of this driver are based on code contributed by Brad Pepers + * + * revised 28.5.95 by Joerg Dorchain + * - now no bugs(?) any more for both HD & DD + * - added support for 40 Track 5.25" drives, 80-track hopefully behaves + * like 3.5" dd (no way to test - are there any 5.25" drives out there + * that work on an A4000?) + * - wrote formatting routine (maybe dirty, but works) + * + * june/july 1995 added ms-dos support by Joerg Dorchain + * (portions based on messydos.device and various contributors) + * - currently only 9 and 18 sector disks + * + * - fixed a bug with the internal trackbuffer when using multiple + * disks the same time + * - made formatting a bit safer + * - added command line and machine based default for "silent" df0 + * + * december 1995 adapted for 1.2.13pl4 by Joerg Dorchain + * - works but I think it's inefficient. (look in redo_fd_request) + * But the changes were very efficient. (only three and a half lines) + * + * january 1996 added special ioctl for tracking down read/write problems + * - usage ioctl(d, RAW_TRACK, ptr); the raw track buffer (MFM-encoded data + * is copied to area. (area should be large enough since no checking is + * done - 30K is currently sufficient). return the actual size of the + * trackbuffer + * - replaced udelays() by a timer (CIAA timer B) for the waits + * needed for the disk mechanic. + * + * february 1996 fixed error recovery and multiple disk access + * - both got broken the first time I tampered with the driver :-( + * - still not safe, but better than before + * + * revised Marts 3rd, 1996 by Jes Sorensen for use in the 1.3.28 kernel. + * - Minor changes to accept the kdev_t. + * - Replaced some more udelays with ms_delays. Udelay is just a loop, + * and so the delay will be different depending on the given + * processor :-( + * - The driver could use a major cleanup because of the new + * major/minor handling that came with kdev_t. It seems to work for + * the time being, but I can't guarantee that it will stay like + * that when we start using 16 (24?) bit minors. + * + * restructured jan 1997 by Joerg Dorchain + * - Fixed Bug accessing multiple disks + * - some code cleanup + * - added trackbuffer for each drive to speed things up + * - fixed some race conditions (who finds the next may send it to me ;-) + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#undef DEBUG /* print _LOTS_ of infos */ + +#define RAW_IOCTL +#ifdef RAW_IOCTL +#define IOCTL_RAW_TRACK 0x5254524B /* 'RTRK' */ +#endif + +/* + * Defines + */ + +/* + * Error codes + */ +#define FD_OK 0 /* operation succeeded */ +#define FD_ERROR -1 /* general error (seek, read, write, etc) */ +#define FD_NOUNIT 1 /* unit does not exist */ +#define FD_UNITBUSY 2 /* unit already active */ +#define FD_NOTACTIVE 3 /* unit is not active */ +#define FD_NOTREADY 4 /* unit is not ready (motor not on/no disk) */ + +#define MFM_NOSYNC 1 +#define MFM_HEADER 2 +#define MFM_DATA 3 +#define MFM_TRACK 4 + +/* + * Floppy ID values + */ +#define FD_NODRIVE 0x00000000 /* response when no unit is present */ +#define FD_DD_3 0xffffffff /* double-density 3.5" (880K) drive */ +#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */ +#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */ + +static DEFINE_MUTEX(amiflop_mutex); +static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */ + +module_param(fd_def_df0, ulong, 0); +MODULE_LICENSE("GPL"); + +/* + * Macros + */ +#define MOTOR_ON (ciab.prb &= ~DSKMOTOR) +#define MOTOR_OFF (ciab.prb |= DSKMOTOR) +#define SELECT(mask) (ciab.prb &= ~mask) +#define DESELECT(mask) (ciab.prb |= mask) +#define SELMASK(drive) (1 << (3 + (drive & 3))) + +static struct fd_drive_type drive_types[] = { +/* code name tr he rdsz wrsz sm pc1 pc2 sd st st*/ +/* warning: times are now in milliseconds (ms) */ +{ FD_DD_3, "DD 3.5", 80, 2, 14716, 13630, 1, 80,161, 3, 18, 1}, +{ FD_HD_3, "HD 3.5", 80, 2, 28344, 27258, 2, 80,161, 3, 18, 1}, +{ FD_DD_5, "DD 5.25", 40, 2, 14716, 13630, 1, 40, 81, 6, 30, 2}, +{ FD_NODRIVE, "No Drive", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} +}; +static int num_dr_types = ARRAY_SIZE(drive_types); + +static int amiga_read(int), dos_read(int); +static void amiga_write(int), dos_write(int); +static struct fd_data_type data_types[] = { + { "Amiga", 11 , amiga_read, amiga_write}, + { "MS-Dos", 9, dos_read, dos_write} +}; + +/* current info on each unit */ +static struct amiga_floppy_struct unit[FD_MAX_UNITS]; + +static struct timer_list flush_track_timer[FD_MAX_UNITS]; +static struct timer_list post_write_timer; +static struct timer_list motor_on_timer; +static struct timer_list motor_off_timer[FD_MAX_UNITS]; +static int on_attempts; + +/* Synchronization of FDC access */ +/* request loop (trackbuffer) */ +static volatile int fdc_busy = -1; +static volatile int fdc_nested; +static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); + +static DECLARE_COMPLETION(motor_on_completion); + +static volatile int selected = -1; /* currently selected drive */ + +static int writepending; +static int writefromint; +static char *raw_buf; +static int fdc_queue; + +static DEFINE_SPINLOCK(amiflop_lock); + +#define RAW_BUF_SIZE 30000 /* size of raw disk data */ + +/* + * These are global variables, as that's the easiest way to give + * information to interrupts. They are the data used for the current + * request. + */ +static volatile char block_flag; +static DECLARE_WAIT_QUEUE_HEAD(wait_fd_block); + +/* MS-Dos MFM Coding tables (should go quick and easy) */ +static unsigned char mfmencode[16]={ + 0x2a, 0x29, 0x24, 0x25, 0x12, 0x11, 0x14, 0x15, + 0x4a, 0x49, 0x44, 0x45, 0x52, 0x51, 0x54, 0x55 +}; +static unsigned char mfmdecode[128]; + +/* floppy internal millisecond timer stuff */ +static DECLARE_COMPLETION(ms_wait_completion); +#define MS_TICKS ((amiga_eclock+50)/1000) + +/* + * Note that MAX_ERRORS=X doesn't imply that we retry every bad read + * max X times - some types of errors increase the errorcount by 2 or + * even 3, so we might actually retry only X/2 times before giving up. + */ +#define MAX_ERRORS 12 + +#define custom amiga_custom + +/* Prevent "aliased" accesses. */ +static int fd_ref[4] = { 0,0,0,0 }; +static int fd_device[4] = { 0, 0, 0, 0 }; + +/* + * Here come the actual hardware access and helper functions. + * They are not reentrant and single threaded because all drives + * share the same hardware and the same trackbuffer. + */ + +/* Milliseconds timer */ + +static irqreturn_t ms_isr(int irq, void *dummy) +{ + complete(&ms_wait_completion); + return IRQ_HANDLED; +} + +/* all waits are queued up + A more generic routine would do a schedule a la timer.device */ +static void ms_delay(int ms) +{ + int ticks; + static DEFINE_MUTEX(mutex); + + if (ms > 0) { + mutex_lock(&mutex); + ticks = MS_TICKS*ms-1; + ciaa.tblo=ticks%256; + ciaa.tbhi=ticks/256; + ciaa.crb=0x19; /*count eclock, force load, one-shoot, start */ + wait_for_completion(&ms_wait_completion); + mutex_unlock(&mutex); + } +} + +/* Hardware semaphore */ + +/* returns true when we would get the semaphore */ +static inline int try_fdc(int drive) +{ + drive &= 3; + return ((fdc_busy < 0) || (fdc_busy == drive)); +} + +static void get_fdc(int drive) +{ + unsigned long flags; + + drive &= 3; +#ifdef DEBUG + printk("get_fdc: drive %d fdc_busy %d fdc_nested %d\n",drive,fdc_busy,fdc_nested); +#endif + local_irq_save(flags); + wait_event(fdc_wait, try_fdc(drive)); + fdc_busy = drive; + fdc_nested++; + local_irq_restore(flags); +} + +static inline void rel_fdc(void) +{ +#ifdef DEBUG + if (fdc_nested == 0) + printk("fd: unmatched rel_fdc\n"); + printk("rel_fdc: fdc_busy %d fdc_nested %d\n",fdc_busy,fdc_nested); +#endif + fdc_nested--; + if (fdc_nested == 0) { + fdc_busy = -1; + wake_up(&fdc_wait); + } +} + +static void fd_select (int drive) +{ + unsigned char prb = ~0; + + drive&=3; +#ifdef DEBUG + printk("selecting %d\n",drive); +#endif + if (drive == selected) + return; + get_fdc(drive); + selected = drive; + + if (unit[drive].track % 2 != 0) + prb &= ~DSKSIDE; + if (unit[drive].motor == 1) + prb &= ~DSKMOTOR; + ciab.prb |= (SELMASK(0)|SELMASK(1)|SELMASK(2)|SELMASK(3)); + ciab.prb = prb; + prb &= ~SELMASK(drive); + ciab.prb = prb; + rel_fdc(); +} + +static void fd_deselect (int drive) +{ + unsigned char prb; + unsigned long flags; + + drive&=3; +#ifdef DEBUG + printk("deselecting %d\n",drive); +#endif + if (drive != selected) { + printk(KERN_WARNING "Deselecting drive %d while %d was selected!\n",drive,selected); + return; + } + + get_fdc(drive); + local_irq_save(flags); + + selected = -1; + + prb = ciab.prb; + prb |= (SELMASK(0)|SELMASK(1)|SELMASK(2)|SELMASK(3)); + ciab.prb = prb; + + local_irq_restore (flags); + rel_fdc(); + +} + +static void motor_on_callback(unsigned long nr) +{ + if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) { + complete_all(&motor_on_completion); + } else { + motor_on_timer.expires = jiffies + HZ/10; + add_timer(&motor_on_timer); + } +} + +static int fd_motor_on(int nr) +{ + nr &= 3; + + del_timer(motor_off_timer + nr); + + if (!unit[nr].motor) { + unit[nr].motor = 1; + fd_select(nr); + + reinit_completion(&motor_on_completion); + motor_on_timer.data = nr; + mod_timer(&motor_on_timer, jiffies + HZ/2); + + on_attempts = 10; + wait_for_completion(&motor_on_completion); + fd_deselect(nr); + } + + if (on_attempts == 0) { + on_attempts = -1; +#if 0 + printk (KERN_ERR "motor_on failed, turning motor off\n"); + fd_motor_off (nr); + return 0; +#else + printk (KERN_WARNING "DSKRDY not set after 1.5 seconds - assuming drive is spinning notwithstanding\n"); +#endif + } + + return 1; +} + +static void fd_motor_off(unsigned long drive) +{ + long calledfromint; +#ifdef MODULE + long decusecount; + + decusecount = drive & 0x40000000; +#endif + calledfromint = drive & 0x80000000; + drive&=3; + if (calledfromint && !try_fdc(drive)) { + /* We would be blocked in an interrupt, so try again later */ + motor_off_timer[drive].expires = jiffies + 1; + add_timer(motor_off_timer + drive); + return; + } + unit[drive].motor = 0; + fd_select(drive); + udelay (1); + fd_deselect(drive); +} + +static void floppy_off (unsigned int nr) +{ + int drive; + + drive = nr & 3; + /* called this way it is always from interrupt */ + motor_off_timer[drive].data = nr | 0x80000000; + mod_timer(motor_off_timer + drive, jiffies + 3*HZ); +} + +static int fd_calibrate(int drive) +{ + unsigned char prb; + int n; + + drive &= 3; + get_fdc(drive); + if (!fd_motor_on (drive)) + return 0; + fd_select (drive); + prb = ciab.prb; + prb |= DSKSIDE; + prb &= ~DSKDIREC; + ciab.prb = prb; + for (n = unit[drive].type->tracks/2; n != 0; --n) { + if (ciaa.pra & DSKTRACK0) + break; + prb &= ~DSKSTEP; + ciab.prb = prb; + prb |= DSKSTEP; + udelay (2); + ciab.prb = prb; + ms_delay(unit[drive].type->step_delay); + } + ms_delay (unit[drive].type->settle_time); + prb |= DSKDIREC; + n = unit[drive].type->tracks + 20; + for (;;) { + prb &= ~DSKSTEP; + ciab.prb = prb; + prb |= DSKSTEP; + udelay (2); + ciab.prb = prb; + ms_delay(unit[drive].type->step_delay + 1); + if ((ciaa.pra & DSKTRACK0) == 0) + break; + if (--n == 0) { + printk (KERN_ERR "fd%d: calibrate failed, turning motor off\n", drive); + fd_motor_off (drive); + unit[drive].track = -1; + rel_fdc(); + return 0; + } + } + unit[drive].track = 0; + ms_delay(unit[drive].type->settle_time); + + rel_fdc(); + fd_deselect(drive); + return 1; +} + +static int fd_seek(int drive, int track) +{ + unsigned char prb; + int cnt; + +#ifdef DEBUG + printk("seeking drive %d to track %d\n",drive,track); +#endif + drive &= 3; + get_fdc(drive); + if (unit[drive].track == track) { + rel_fdc(); + return 1; + } + if (!fd_motor_on(drive)) { + rel_fdc(); + return 0; + } + if (unit[drive].track < 0 && !fd_calibrate(drive)) { + rel_fdc(); + return 0; + } + + fd_select (drive); + cnt = unit[drive].track/2 - track/2; + prb = ciab.prb; + prb |= DSKSIDE | DSKDIREC; + if (track % 2 != 0) + prb &= ~DSKSIDE; + if (cnt < 0) { + cnt = - cnt; + prb &= ~DSKDIREC; + } + ciab.prb = prb; + if (track % 2 != unit[drive].track % 2) + ms_delay (unit[drive].type->side_time); + unit[drive].track = track; + if (cnt == 0) { + rel_fdc(); + fd_deselect(drive); + return 1; + } + do { + prb &= ~DSKSTEP; + ciab.prb = prb; + prb |= DSKSTEP; + udelay (1); + ciab.prb = prb; + ms_delay (unit[drive].type->step_delay); + } while (--cnt != 0); + ms_delay (unit[drive].type->settle_time); + + rel_fdc(); + fd_deselect(drive); + return 1; +} + +static unsigned long fd_get_drive_id(int drive) +{ + int i; + ulong id = 0; + + drive&=3; + get_fdc(drive); + /* set up for ID */ + MOTOR_ON; + udelay(2); + SELECT(SELMASK(drive)); + udelay(2); + DESELECT(SELMASK(drive)); + udelay(2); + MOTOR_OFF; + udelay(2); + SELECT(SELMASK(drive)); + udelay(2); + DESELECT(SELMASK(drive)); + udelay(2); + + /* loop and read disk ID */ + for (i=0; i<32; i++) { + SELECT(SELMASK(drive)); + udelay(2); + + /* read and store value of DSKRDY */ + id <<= 1; + id |= (ciaa.pra & DSKRDY) ? 0 : 1; /* cia regs are low-active! */ + + DESELECT(SELMASK(drive)); + } + + rel_fdc(); + + /* + * RB: At least A500/A2000's df0: don't identify themselves. + * As every (real) Amiga has at least a 3.5" DD drive as df0: + * we default to that if df0: doesn't identify as a certain + * type. + */ + if(drive == 0 && id == FD_NODRIVE) + { + id = fd_def_df0; + printk(KERN_NOTICE "fd: drive 0 didn't identify, setting default %08lx\n", (ulong)fd_def_df0); + } + /* return the ID value */ + return (id); +} + +static irqreturn_t fd_block_done(int irq, void *dummy) +{ + if (block_flag) + custom.dsklen = 0x4000; + + if (block_flag == 2) { /* writing */ + writepending = 2; + post_write_timer.expires = jiffies + 1; /* at least 2 ms */ + post_write_timer.data = selected; + add_timer(&post_write_timer); + } + else { /* reading */ + block_flag = 0; + wake_up (&wait_fd_block); + } + return IRQ_HANDLED; +} + +static void raw_read(int drive) +{ + drive&=3; + get_fdc(drive); + wait_event(wait_fd_block, !block_flag); + fd_select(drive); + /* setup adkcon bits correctly */ + custom.adkcon = ADK_MSBSYNC; + custom.adkcon = ADK_SETCLR|ADK_WORDSYNC|ADK_FAST; + + custom.dsksync = MFM_SYNC; + + custom.dsklen = 0; + custom.dskptr = (u_char *)ZTWO_PADDR((u_char *)raw_buf); + custom.dsklen = unit[drive].type->read_size/sizeof(short) | DSKLEN_DMAEN; + custom.dsklen = unit[drive].type->read_size/sizeof(short) | DSKLEN_DMAEN; + + block_flag = 1; + + wait_event(wait_fd_block, !block_flag); + + custom.dsklen = 0; + fd_deselect(drive); + rel_fdc(); +} + +static int raw_write(int drive) +{ + ushort adk; + + drive&=3; + get_fdc(drive); /* corresponds to rel_fdc() in post_write() */ + if ((ciaa.pra & DSKPROT) == 0) { + rel_fdc(); + return 0; + } + wait_event(wait_fd_block, !block_flag); + fd_select(drive); + /* clear adkcon bits */ + custom.adkcon = ADK_PRECOMP1|ADK_PRECOMP0|ADK_WORDSYNC|ADK_MSBSYNC; + /* set appropriate adkcon bits */ + adk = ADK_SETCLR|ADK_FAST; + if ((ulong)unit[drive].track >= unit[drive].type->precomp2) + adk |= ADK_PRECOMP1; + else if ((ulong)unit[drive].track >= unit[drive].type->precomp1) + adk |= ADK_PRECOMP0; + custom.adkcon = adk; + + custom.dsklen = DSKLEN_WRITE; + custom.dskptr = (u_char *)ZTWO_PADDR((u_char *)raw_buf); + custom.dsklen = unit[drive].type->write_size/sizeof(short) | DSKLEN_DMAEN|DSKLEN_WRITE; + custom.dsklen = unit[drive].type->write_size/sizeof(short) | DSKLEN_DMAEN|DSKLEN_WRITE; + + block_flag = 2; + return 1; +} + +/* + * to be called at least 2ms after the write has finished but before any + * other access to the hardware. + */ +static void post_write (unsigned long drive) +{ +#ifdef DEBUG + printk("post_write for drive %ld\n",drive); +#endif + drive &= 3; + custom.dsklen = 0; + block_flag = 0; + writepending = 0; + writefromint = 0; + unit[drive].dirty = 0; + wake_up(&wait_fd_block); + fd_deselect(drive); + rel_fdc(); /* corresponds to get_fdc() in raw_write */ +} + + +/* + * The following functions are to convert the block contents into raw data + * written to disk and vice versa. + * (Add other formats here ;-)) + */ + +static unsigned long scan_sync(unsigned long raw, unsigned long end) +{ + ushort *ptr = (ushort *)raw, *endp = (ushort *)end; + + while (ptr < endp && *ptr++ != 0x4489) + ; + if (ptr < endp) { + while (*ptr == 0x4489 && ptr < endp) + ptr++; + return (ulong)ptr; + } + return 0; +} + +static inline unsigned long checksum(unsigned long *addr, int len) +{ + unsigned long csum = 0; + + len /= sizeof(*addr); + while (len-- > 0) + csum ^= *addr++; + csum = ((csum>>1) & 0x55555555) ^ (csum & 0x55555555); + + return csum; +} + +static unsigned long decode (unsigned long *data, unsigned long *raw, + int len) +{ + ulong *odd, *even; + + /* convert length from bytes to longwords */ + len >>= 2; + odd = raw; + even = odd + len; + + /* prepare return pointer */ + raw += len * 2; + + do { + *data++ = ((*odd++ & 0x55555555) << 1) | (*even++ & 0x55555555); + } while (--len != 0); + + return (ulong)raw; +} + +struct header { + unsigned char magic; + unsigned char track; + unsigned char sect; + unsigned char ord; + unsigned char labels[16]; + unsigned long hdrchk; + unsigned long datachk; +}; + +static int amiga_read(int drive) +{ + unsigned long raw; + unsigned long end; + int scnt; + unsigned long csum; + struct header hdr; + + drive&=3; + raw = (long) raw_buf; + end = raw + unit[drive].type->read_size; + + for (scnt = 0;scnt < unit[drive].dtype->sects * unit[drive].type->sect_mult; scnt++) { + if (!(raw = scan_sync(raw, end))) { + printk (KERN_INFO "can't find sync for sector %d\n", scnt); + return MFM_NOSYNC; + } + + raw = decode ((ulong *)&hdr.magic, (ulong *)raw, 4); + raw = decode ((ulong *)&hdr.labels, (ulong *)raw, 16); + raw = decode ((ulong *)&hdr.hdrchk, (ulong *)raw, 4); + raw = decode ((ulong *)&hdr.datachk, (ulong *)raw, 4); + csum = checksum((ulong *)&hdr, + (char *)&hdr.hdrchk-(char *)&hdr); + +#ifdef DEBUG + printk ("(%x,%d,%d,%d) (%lx,%lx,%lx,%lx) %lx %lx\n", + hdr.magic, hdr.track, hdr.sect, hdr.ord, + *(ulong *)&hdr.labels[0], *(ulong *)&hdr.labels[4], + *(ulong *)&hdr.labels[8], *(ulong *)&hdr.labels[12], + hdr.hdrchk, hdr.datachk); +#endif + + if (hdr.hdrchk != csum) { + printk(KERN_INFO "MFM_HEADER: %08lx,%08lx\n", hdr.hdrchk, csum); + return MFM_HEADER; + } + + /* verify track */ + if (hdr.track != unit[drive].track) { + printk(KERN_INFO "MFM_TRACK: %d, %d\n", hdr.track, unit[drive].track); + return MFM_TRACK; + } + + raw = decode ((ulong *)(unit[drive].trackbuf + hdr.sect*512), + (ulong *)raw, 512); + csum = checksum((ulong *)(unit[drive].trackbuf + hdr.sect*512), 512); + + if (hdr.datachk != csum) { + printk(KERN_INFO "MFM_DATA: (%x:%d:%d:%d) sc=%d %lx, %lx\n", + hdr.magic, hdr.track, hdr.sect, hdr.ord, scnt, + hdr.datachk, csum); + printk (KERN_INFO "data=(%lx,%lx,%lx,%lx)\n", + ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[0], + ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[1], + ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[2], + ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[3]); + return MFM_DATA; + } + } + + return 0; +} + +static void encode(unsigned long data, unsigned long *dest) +{ + unsigned long data2; + + data &= 0x55555555; + data2 = data ^ 0x55555555; + data |= ((data2 >> 1) | 0x80000000) & (data2 << 1); + + if (*(dest - 1) & 0x00000001) + data &= 0x7FFFFFFF; + + *dest = data; +} + +static void encode_block(unsigned long *dest, unsigned long *src, int len) +{ + int cnt, to_cnt = 0; + unsigned long data; + + /* odd bits */ + for (cnt = 0; cnt < len / 4; cnt++) { + data = src[cnt] >> 1; + encode(data, dest + to_cnt++); + } + + /* even bits */ + for (cnt = 0; cnt < len / 4; cnt++) { + data = src[cnt]; + encode(data, dest + to_cnt++); + } +} + +static unsigned long *putsec(int disk, unsigned long *raw, int cnt) +{ + struct header hdr; + int i; + + disk&=3; + *raw = (raw[-1]&1) ? 0x2AAAAAAA : 0xAAAAAAAA; + raw++; + *raw++ = 0x44894489; + + hdr.magic = 0xFF; + hdr.track = unit[disk].track; + hdr.sect = cnt; + hdr.ord = unit[disk].dtype->sects * unit[disk].type->sect_mult - cnt; + for (i = 0; i < 16; i++) + hdr.labels[i] = 0; + hdr.hdrchk = checksum((ulong *)&hdr, + (char *)&hdr.hdrchk-(char *)&hdr); + hdr.datachk = checksum((ulong *)(unit[disk].trackbuf+cnt*512), 512); + + encode_block(raw, (ulong *)&hdr.magic, 4); + raw += 2; + encode_block(raw, (ulong *)&hdr.labels, 16); + raw += 8; + encode_block(raw, (ulong *)&hdr.hdrchk, 4); + raw += 2; + encode_block(raw, (ulong *)&hdr.datachk, 4); + raw += 2; + encode_block(raw, (ulong *)(unit[disk].trackbuf+cnt*512), 512); + raw += 256; + + return raw; +} + +static void amiga_write(int disk) +{ + unsigned int cnt; + unsigned long *ptr = (unsigned long *)raw_buf; + + disk&=3; + /* gap space */ + for (cnt = 0; cnt < 415 * unit[disk].type->sect_mult; cnt++) + *ptr++ = 0xaaaaaaaa; + + /* sectors */ + for (cnt = 0; cnt < unit[disk].dtype->sects * unit[disk].type->sect_mult; cnt++) + ptr = putsec (disk, ptr, cnt); + *(ushort *)ptr = (ptr[-1]&1) ? 0x2AA8 : 0xAAA8; +} + + +struct dos_header { + unsigned char track, /* 0-80 */ + side, /* 0-1 */ + sec, /* 0-...*/ + len_desc;/* 2 */ + unsigned short crc; /* on 68000 we got an alignment problem, + but this compiler solves it by adding silently + adding a pad byte so data won't fit + and this took about 3h to discover.... */ + unsigned char gap1[22]; /* for longword-alignedness (0x4e) */ +}; + +/* crc routines are borrowed from the messydos-handler */ + +/* excerpt from the messydos-device +; The CRC is computed not only over the actual data, but including +; the SYNC mark (3 * $a1) and the 'ID/DATA - Address Mark' ($fe/$fb). +; As we don't read or encode these fields into our buffers, we have to +; preload the registers containing the CRC with the values they would have +; after stepping over these fields. +; +; How CRCs "really" work: +; +; First, you should regard a bitstring as a series of coefficients of +; polynomials. We calculate with these polynomials in modulo-2 +; arithmetic, in which both add and subtract are done the same as +; exclusive-or. Now, we modify our data (a very long polynomial) in +; such a way that it becomes divisible by the CCITT-standard 16-bit +; 16 12 5 +; polynomial: x + x + x + 1, represented by $11021. The easiest +; way to do this would be to multiply (using proper arithmetic) our +; datablock with $11021. So we have: +; data * $11021 = +; data * ($10000 + $1021) = +; data * $10000 + data * $1021 +; The left part of this is simple: Just add two 0 bytes. But then +; the right part (data $1021) remains difficult and even could have +; a carry into the left part. The solution is to use a modified +; multiplication, which has a result that is not correct, but with +; a difference of any multiple of $11021. We then only need to keep +; the 16 least significant bits of the result. +; +; The following algorithm does this for us: +; +; unsigned char *data, c, crclo, crchi; +; while (not done) { +; c = *data++ + crchi; +; crchi = (@ c) >> 8 + crclo; +; crclo = @ c; +; } +; +; Remember, + is done with EOR, the @ operator is in two tables (high +; and low byte separately), which is calculated as +; +; $1021 * (c & $F0) +; xor $1021 * (c & $0F) +; xor $1021 * (c >> 4) (* is regular multiplication) +; +; +; Anyway, the end result is the same as the remainder of the division of +; the data by $11021. I am afraid I need to study theory a bit more... + + +my only works was to code this from manx to C.... + +*/ + +static ushort dos_crc(void * data_a3, int data_d0, int data_d1, int data_d3) +{ + static unsigned char CRCTable1[] = { + 0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70,0x81,0x91,0xa1,0xb1,0xc1,0xd1,0xe1,0xf1, + 0x12,0x02,0x32,0x22,0x52,0x42,0x72,0x62,0x93,0x83,0xb3,0xa3,0xd3,0xc3,0xf3,0xe3, + 0x24,0x34,0x04,0x14,0x64,0x74,0x44,0x54,0xa5,0xb5,0x85,0x95,0xe5,0xf5,0xc5,0xd5, + 0x36,0x26,0x16,0x06,0x76,0x66,0x56,0x46,0xb7,0xa7,0x97,0x87,0xf7,0xe7,0xd7,0xc7, + 0x48,0x58,0x68,0x78,0x08,0x18,0x28,0x38,0xc9,0xd9,0xe9,0xf9,0x89,0x99,0xa9,0xb9, + 0x5a,0x4a,0x7a,0x6a,0x1a,0x0a,0x3a,0x2a,0xdb,0xcb,0xfb,0xeb,0x9b,0x8b,0xbb,0xab, + 0x6c,0x7c,0x4c,0x5c,0x2c,0x3c,0x0c,0x1c,0xed,0xfd,0xcd,0xdd,0xad,0xbd,0x8d,0x9d, + 0x7e,0x6e,0x5e,0x4e,0x3e,0x2e,0x1e,0x0e,0xff,0xef,0xdf,0xcf,0xbf,0xaf,0x9f,0x8f, + 0x91,0x81,0xb1,0xa1,0xd1,0xc1,0xf1,0xe1,0x10,0x00,0x30,0x20,0x50,0x40,0x70,0x60, + 0x83,0x93,0xa3,0xb3,0xc3,0xd3,0xe3,0xf3,0x02,0x12,0x22,0x32,0x42,0x52,0x62,0x72, + 0xb5,0xa5,0x95,0x85,0xf5,0xe5,0xd5,0xc5,0x34,0x24,0x14,0x04,0x74,0x64,0x54,0x44, + 0xa7,0xb7,0x87,0x97,0xe7,0xf7,0xc7,0xd7,0x26,0x36,0x06,0x16,0x66,0x76,0x46,0x56, + 0xd9,0xc9,0xf9,0xe9,0x99,0x89,0xb9,0xa9,0x58,0x48,0x78,0x68,0x18,0x08,0x38,0x28, + 0xcb,0xdb,0xeb,0xfb,0x8b,0x9b,0xab,0xbb,0x4a,0x5a,0x6a,0x7a,0x0a,0x1a,0x2a,0x3a, + 0xfd,0xed,0xdd,0xcd,0xbd,0xad,0x9d,0x8d,0x7c,0x6c,0x5c,0x4c,0x3c,0x2c,0x1c,0x0c, + 0xef,0xff,0xcf,0xdf,0xaf,0xbf,0x8f,0x9f,0x6e,0x7e,0x4e,0x5e,0x2e,0x3e,0x0e,0x1e + }; + + static unsigned char CRCTable2[] = { + 0x00,0x21,0x42,0x63,0x84,0xa5,0xc6,0xe7,0x08,0x29,0x4a,0x6b,0x8c,0xad,0xce,0xef, + 0x31,0x10,0x73,0x52,0xb5,0x94,0xf7,0xd6,0x39,0x18,0x7b,0x5a,0xbd,0x9c,0xff,0xde, + 0x62,0x43,0x20,0x01,0xe6,0xc7,0xa4,0x85,0x6a,0x4b,0x28,0x09,0xee,0xcf,0xac,0x8d, + 0x53,0x72,0x11,0x30,0xd7,0xf6,0x95,0xb4,0x5b,0x7a,0x19,0x38,0xdf,0xfe,0x9d,0xbc, + 0xc4,0xe5,0x86,0xa7,0x40,0x61,0x02,0x23,0xcc,0xed,0x8e,0xaf,0x48,0x69,0x0a,0x2b, + 0xf5,0xd4,0xb7,0x96,0x71,0x50,0x33,0x12,0xfd,0xdc,0xbf,0x9e,0x79,0x58,0x3b,0x1a, + 0xa6,0x87,0xe4,0xc5,0x22,0x03,0x60,0x41,0xae,0x8f,0xec,0xcd,0x2a,0x0b,0x68,0x49, + 0x97,0xb6,0xd5,0xf4,0x13,0x32,0x51,0x70,0x9f,0xbe,0xdd,0xfc,0x1b,0x3a,0x59,0x78, + 0x88,0xa9,0xca,0xeb,0x0c,0x2d,0x4e,0x6f,0x80,0xa1,0xc2,0xe3,0x04,0x25,0x46,0x67, + 0xb9,0x98,0xfb,0xda,0x3d,0x1c,0x7f,0x5e,0xb1,0x90,0xf3,0xd2,0x35,0x14,0x77,0x56, + 0xea,0xcb,0xa8,0x89,0x6e,0x4f,0x2c,0x0d,0xe2,0xc3,0xa0,0x81,0x66,0x47,0x24,0x05, + 0xdb,0xfa,0x99,0xb8,0x5f,0x7e,0x1d,0x3c,0xd3,0xf2,0x91,0xb0,0x57,0x76,0x15,0x34, + 0x4c,0x6d,0x0e,0x2f,0xc8,0xe9,0x8a,0xab,0x44,0x65,0x06,0x27,0xc0,0xe1,0x82,0xa3, + 0x7d,0x5c,0x3f,0x1e,0xf9,0xd8,0xbb,0x9a,0x75,0x54,0x37,0x16,0xf1,0xd0,0xb3,0x92, + 0x2e,0x0f,0x6c,0x4d,0xaa,0x8b,0xe8,0xc9,0x26,0x07,0x64,0x45,0xa2,0x83,0xe0,0xc1, + 0x1f,0x3e,0x5d,0x7c,0x9b,0xba,0xd9,0xf8,0x17,0x36,0x55,0x74,0x93,0xb2,0xd1,0xf0 + }; + +/* look at the asm-code - what looks in C a bit strange is almost as good as handmade */ + register int i; + register unsigned char *CRCT1, *CRCT2, *data, c, crch, crcl; + + CRCT1=CRCTable1; + CRCT2=CRCTable2; + data=data_a3; + crcl=data_d1; + crch=data_d0; + for (i=data_d3; i>=0; i--) { + c = (*data++) ^ crch; + crch = CRCT1[c] ^ crcl; + crcl = CRCT2[c]; + } + return (crch<<8)|crcl; +} + +static inline ushort dos_hdr_crc (struct dos_header *hdr) +{ + return dos_crc(&(hdr->track), 0xb2, 0x30, 3); /* precomputed magic */ +} + +static inline ushort dos_data_crc(unsigned char *data) +{ + return dos_crc(data, 0xe2, 0x95 ,511); /* precomputed magic */ +} + +static inline unsigned char dos_decode_byte(ushort word) +{ + register ushort w2; + register unsigned char byte; + register unsigned char *dec = mfmdecode; + + w2=word; + w2>>=8; + w2&=127; + byte = dec[w2]; + byte <<= 4; + w2 = word & 127; + byte |= dec[w2]; + return byte; +} + +static unsigned long dos_decode(unsigned char *data, unsigned short *raw, int len) +{ + int i; + + for (i = 0; i < len; i++) + *data++=dos_decode_byte(*raw++); + return ((ulong)raw); +} + +#ifdef DEBUG +static void dbg(unsigned long ptr) +{ + printk("raw data @%08lx: %08lx, %08lx ,%08lx, %08lx\n", ptr, + ((ulong *)ptr)[0], ((ulong *)ptr)[1], + ((ulong *)ptr)[2], ((ulong *)ptr)[3]); +} +#endif + +static int dos_read(int drive) +{ + unsigned long end; + unsigned long raw; + int scnt; + unsigned short crc,data_crc[2]; + struct dos_header hdr; + + drive&=3; + raw = (long) raw_buf; + end = raw + unit[drive].type->read_size; + + for (scnt=0; scnt < unit[drive].dtype->sects * unit[drive].type->sect_mult; scnt++) { + do { /* search for the right sync of each sec-hdr */ + if (!(raw = scan_sync (raw, end))) { + printk(KERN_INFO "dos_read: no hdr sync on " + "track %d, unit %d for sector %d\n", + unit[drive].track,drive,scnt); + return MFM_NOSYNC; + } +#ifdef DEBUG + dbg(raw); +#endif + } while (*((ushort *)raw)!=0x5554); /* loop usually only once done */ + raw+=2; /* skip over headermark */ + raw = dos_decode((unsigned char *)&hdr,(ushort *) raw,8); + crc = dos_hdr_crc(&hdr); + +#ifdef DEBUG + printk("(%3d,%d,%2d,%d) %x\n", hdr.track, hdr.side, + hdr.sec, hdr.len_desc, hdr.crc); +#endif + + if (crc != hdr.crc) { + printk(KERN_INFO "dos_read: MFM_HEADER %04x,%04x\n", + hdr.crc, crc); + return MFM_HEADER; + } + if (hdr.track != unit[drive].track/unit[drive].type->heads) { + printk(KERN_INFO "dos_read: MFM_TRACK %d, %d\n", + hdr.track, + unit[drive].track/unit[drive].type->heads); + return MFM_TRACK; + } + + if (hdr.side != unit[drive].track%unit[drive].type->heads) { + printk(KERN_INFO "dos_read: MFM_SIDE %d, %d\n", + hdr.side, + unit[drive].track%unit[drive].type->heads); + return MFM_TRACK; + } + + if (hdr.len_desc != 2) { + printk(KERN_INFO "dos_read: unknown sector len " + "descriptor %d\n", hdr.len_desc); + return MFM_DATA; + } +#ifdef DEBUG + printk("hdr accepted\n"); +#endif + if (!(raw = scan_sync (raw, end))) { + printk(KERN_INFO "dos_read: no data sync on track " + "%d, unit %d for sector%d, disk sector %d\n", + unit[drive].track, drive, scnt, hdr.sec); + return MFM_NOSYNC; + } +#ifdef DEBUG + dbg(raw); +#endif + + if (*((ushort *)raw)!=0x5545) { + printk(KERN_INFO "dos_read: no data mark after " + "sync (%d,%d,%d,%d) sc=%d\n", + hdr.track,hdr.side,hdr.sec,hdr.len_desc,scnt); + return MFM_NOSYNC; + } + + raw+=2; /* skip data mark (included in checksum) */ + raw = dos_decode((unsigned char *)(unit[drive].trackbuf + (hdr.sec - 1) * 512), (ushort *) raw, 512); + raw = dos_decode((unsigned char *)data_crc,(ushort *) raw,4); + crc = dos_data_crc(unit[drive].trackbuf + (hdr.sec - 1) * 512); + + if (crc != data_crc[0]) { + printk(KERN_INFO "dos_read: MFM_DATA (%d,%d,%d,%d) " + "sc=%d, %x %x\n", hdr.track, hdr.side, + hdr.sec, hdr.len_desc, scnt,data_crc[0], crc); + printk(KERN_INFO "data=(%lx,%lx,%lx,%lx,...)\n", + ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[0], + ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[1], + ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[2], + ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[3]); + return MFM_DATA; + } + } + return 0; +} + +static inline ushort dos_encode_byte(unsigned char byte) +{ + register unsigned char *enc, b2, b1; + register ushort word; + + enc=mfmencode; + b1=byte; + b2=b1>>4; + b1&=15; + word=enc[b2] <<8 | enc [b1]; + return (word|((word&(256|64)) ? 0: 128)); +} + +static void dos_encode_block(ushort *dest, unsigned char *src, int len) +{ + int i; + + for (i = 0; i < len; i++) { + *dest=dos_encode_byte(*src++); + *dest|=((dest[-1]&1)||(*dest&0x4000))? 0: 0x8000; + dest++; + } +} + +static unsigned long *ms_putsec(int drive, unsigned long *raw, int cnt) +{ + static struct dos_header hdr={0,0,0,2,0, + {78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78}}; + int i; + static ushort crc[2]={0,0x4e4e}; + + drive&=3; +/* id gap 1 */ +/* the MFM word before is always 9254 */ + for(i=0;i<6;i++) + *raw++=0xaaaaaaaa; +/* 3 sync + 1 headermark */ + *raw++=0x44894489; + *raw++=0x44895554; + +/* fill in the variable parts of the header */ + hdr.track=unit[drive].track/unit[drive].type->heads; + hdr.side=unit[drive].track%unit[drive].type->heads; + hdr.sec=cnt+1; + hdr.crc=dos_hdr_crc(&hdr); + +/* header (without "magic") and id gap 2*/ + dos_encode_block((ushort *)raw,(unsigned char *) &hdr.track,28); + raw+=14; + +/*id gap 3 */ + for(i=0;i<6;i++) + *raw++=0xaaaaaaaa; + +/* 3 syncs and 1 datamark */ + *raw++=0x44894489; + *raw++=0x44895545; + +/* data */ + dos_encode_block((ushort *)raw, + (unsigned char *)unit[drive].trackbuf+cnt*512,512); + raw+=256; + +/*data crc + jd's special gap (long words :-/) */ + crc[0]=dos_data_crc(unit[drive].trackbuf+cnt*512); + dos_encode_block((ushort *) raw,(unsigned char *)crc,4); + raw+=2; + +/* data gap */ + for(i=0;i<38;i++) + *raw++=0x92549254; + + return raw; /* wrote 652 MFM words */ +} + +static void dos_write(int disk) +{ + int cnt; + unsigned long raw = (unsigned long) raw_buf; + unsigned long *ptr=(unsigned long *)raw; + + disk&=3; +/* really gap4 + indexgap , but we write it first and round it up */ + for (cnt=0;cnt<425;cnt++) + *ptr++=0x92549254; + +/* the following is just guessed */ + if (unit[disk].type->sect_mult==2) /* check for HD-Disks */ + for(cnt=0;cnt<473;cnt++) + *ptr++=0x92549254; + +/* now the index marks...*/ + for (cnt=0;cnt<20;cnt++) + *ptr++=0x92549254; + for (cnt=0;cnt<6;cnt++) + *ptr++=0xaaaaaaaa; + *ptr++=0x52245224; + *ptr++=0x52245552; + for (cnt=0;cnt<20;cnt++) + *ptr++=0x92549254; + +/* sectors */ + for(cnt = 0; cnt < unit[disk].dtype->sects * unit[disk].type->sect_mult; cnt++) + ptr=ms_putsec(disk,ptr,cnt); + + *(ushort *)ptr = 0xaaa8; /* MFM word before is always 0x9254 */ +} + +/* + * Here comes the high level stuff (i.e. the filesystem interface) + * and helper functions. + * Normally this should be the only part that has to be adapted to + * different kernel versions. + */ + +/* FIXME: this assumes the drive is still spinning - + * which is only true if we complete writing a track within three seconds + */ +static void flush_track_callback(unsigned long nr) +{ + nr&=3; + writefromint = 1; + if (!try_fdc(nr)) { + /* we might block in an interrupt, so try again later */ + flush_track_timer[nr].expires = jiffies + 1; + add_timer(flush_track_timer + nr); + return; + } + get_fdc(nr); + (*unit[nr].dtype->write_fkt)(nr); + if (!raw_write(nr)) { + printk (KERN_NOTICE "floppy disk write protected\n"); + writefromint = 0; + writepending = 0; + } + rel_fdc(); +} + +static int non_int_flush_track (unsigned long nr) +{ + unsigned long flags; + + nr&=3; + writefromint = 0; + del_timer(&post_write_timer); + get_fdc(nr); + if (!fd_motor_on(nr)) { + writepending = 0; + rel_fdc(); + return 0; + } + local_irq_save(flags); + if (writepending != 2) { + local_irq_restore(flags); + (*unit[nr].dtype->write_fkt)(nr); + if (!raw_write(nr)) { + printk (KERN_NOTICE "floppy disk write protected " + "in write!\n"); + writepending = 0; + return 0; + } + wait_event(wait_fd_block, block_flag != 2); + } + else { + local_irq_restore(flags); + ms_delay(2); /* 2 ms post_write delay */ + post_write(nr); + } + rel_fdc(); + return 1; +} + +static int get_track(int drive, int track) +{ + int error, errcnt; + + drive&=3; + if (unit[drive].track == track) + return 0; + get_fdc(drive); + if (!fd_motor_on(drive)) { + rel_fdc(); + return -1; + } + + if (unit[drive].dirty == 1) { + del_timer (flush_track_timer + drive); + non_int_flush_track (drive); + } + errcnt = 0; + while (errcnt < MAX_ERRORS) { + if (!fd_seek(drive, track)) + return -1; + raw_read(drive); + error = (*unit[drive].dtype->read_fkt)(drive); + if (error == 0) { + rel_fdc(); + return 0; + } + /* Read Error Handling: recalibrate and try again */ + unit[drive].track = -1; + errcnt++; + } + rel_fdc(); + return -1; +} + +/* + * Round-robin between our available drives, doing one request from each + */ +static struct request *set_next_request(void) +{ + struct request_queue *q; + int cnt = FD_MAX_UNITS; + struct request *rq = NULL; + + /* Find next queue we can dispatch from */ + fdc_queue = fdc_queue + 1; + if (fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + + for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) { + + if (unit[fdc_queue].type->code == FD_NODRIVE) { + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + continue; + } + + q = unit[fdc_queue].gendisk->queue; + if (q) { + rq = blk_fetch_request(q); + if (rq) + break; + } + + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + } + + return rq; +} + +static void redo_fd_request(void) +{ + struct request *rq; + unsigned int cnt, block, track, sector; + int drive; + struct amiga_floppy_struct *floppy; + char *data; + unsigned long flags; + int err; + +next_req: + rq = set_next_request(); + if (!rq) { + /* Nothing left to do */ + return; + } + + floppy = rq->rq_disk->private_data; + drive = floppy - unit; + +next_segment: + /* Here someone could investigate to be more efficient */ + for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) { +#ifdef DEBUG + printk("fd: sector %ld + %d requested for %s\n", + blk_rq_pos(rq), cnt, + (rq_data_dir(rq) == READ) ? "read" : "write"); +#endif + block = blk_rq_pos(rq) + cnt; + if ((int)block > floppy->blocks) { + err = -EIO; + break; + } + + track = block / (floppy->dtype->sects * floppy->type->sect_mult); + sector = block % (floppy->dtype->sects * floppy->type->sect_mult); + data = bio_data(rq->bio) + 512 * cnt; +#ifdef DEBUG + printk("access to track %d, sector %d, with buffer at " + "0x%08lx\n", track, sector, data); +#endif + + if (get_track(drive, track) == -1) { + err = -EIO; + break; + } + + if (rq_data_dir(rq) == READ) { + memcpy(data, floppy->trackbuf + sector * 512, 512); + } else { + memcpy(floppy->trackbuf + sector * 512, data, 512); + + /* keep the drive spinning while writes are scheduled */ + if (!fd_motor_on(drive)) { + err = -EIO; + break; + } + /* + * setup a callback to write the track buffer + * after a short (1 tick) delay. + */ + local_irq_save(flags); + + floppy->dirty = 1; + /* reset the timer */ + mod_timer (flush_track_timer + drive, jiffies + 1); + local_irq_restore(flags); + } + } + + if (__blk_end_request_cur(rq, err)) + goto next_segment; + goto next_req; +} + +static void do_fd_request(struct request_queue * q) +{ + redo_fd_request(); +} + +static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + int drive = MINOR(bdev->bd_dev) & 3; + + geo->heads = unit[drive].type->heads; + geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult; + geo->cylinders = unit[drive].type->tracks; + return 0; +} + +static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + struct amiga_floppy_struct *p = bdev->bd_disk->private_data; + int drive = p - unit; + static struct floppy_struct getprm; + void __user *argp = (void __user *)param; + + switch(cmd){ + case FDFMTBEG: + get_fdc(drive); + if (fd_ref[drive] > 1) { + rel_fdc(); + return -EBUSY; + } + fsync_bdev(bdev); + if (fd_motor_on(drive) == 0) { + rel_fdc(); + return -ENODEV; + } + if (fd_calibrate(drive) == 0) { + rel_fdc(); + return -ENXIO; + } + floppy_off(drive); + rel_fdc(); + break; + case FDFMTTRK: + if (param < p->type->tracks * p->type->heads) + { + get_fdc(drive); + if (fd_seek(drive,param) != 0){ + memset(p->trackbuf, FD_FILL_BYTE, + p->dtype->sects * p->type->sect_mult * 512); + non_int_flush_track(drive); + } + floppy_off(drive); + rel_fdc(); + } + else + return -EINVAL; + break; + case FDFMTEND: + floppy_off(drive); + invalidate_bdev(bdev); + break; + case FDGETPRM: + memset((void *)&getprm, 0, sizeof (getprm)); + getprm.track=p->type->tracks; + getprm.head=p->type->heads; + getprm.sect=p->dtype->sects * p->type->sect_mult; + getprm.size=p->blocks; + if (copy_to_user(argp, &getprm, sizeof(struct floppy_struct))) + return -EFAULT; + break; + case FDSETPRM: + case FDDEFPRM: + return -EINVAL; + case FDFLUSH: /* unconditionally, even if not needed */ + del_timer (flush_track_timer + drive); + non_int_flush_track(drive); + break; +#ifdef RAW_IOCTL + case IOCTL_RAW_TRACK: + if (copy_to_user(argp, raw_buf, p->type->read_size)) + return -EFAULT; + else + return p->type->read_size; +#endif + default: + printk(KERN_DEBUG "fd_ioctl: unknown cmd %d for drive %d.", + cmd, drive); + return -ENOSYS; + } + return 0; +} + +static int fd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + int ret; + + mutex_lock(&amiflop_mutex); + ret = fd_locked_ioctl(bdev, mode, cmd, param); + mutex_unlock(&amiflop_mutex); + + return ret; +} + +static void fd_probe(int dev) +{ + unsigned long code; + int type; + int drive; + + drive = dev & 3; + code = fd_get_drive_id(drive); + + /* get drive type */ + for (type = 0; type < num_dr_types; type++) + if (drive_types[type].code == code) + break; + + if (type >= num_dr_types) { + printk(KERN_WARNING "fd_probe: unsupported drive type " + "%08lx found\n", code); + unit[drive].type = &drive_types[num_dr_types-1]; /* FD_NODRIVE */ + return; + } + + unit[drive].type = drive_types + type; + unit[drive].track = -1; + + unit[drive].disk = -1; + unit[drive].motor = 0; + unit[drive].busy = 0; + unit[drive].status = -1; +} + +/* + * floppy_open check for aliasing (/dev/fd0 can be the same as + * /dev/PS0 etc), and disallows simultaneous access to the same + * drive with different device numbers. + */ +static int floppy_open(struct block_device *bdev, fmode_t mode) +{ + int drive = MINOR(bdev->bd_dev) & 3; + int system = (MINOR(bdev->bd_dev) & 4) >> 2; + int old_dev; + unsigned long flags; + + mutex_lock(&amiflop_mutex); + old_dev = fd_device[drive]; + + if (fd_ref[drive] && old_dev != system) { + mutex_unlock(&amiflop_mutex); + return -EBUSY; + } + + if (mode & (FMODE_READ|FMODE_WRITE)) { + check_disk_change(bdev); + if (mode & FMODE_WRITE) { + int wrprot; + + get_fdc(drive); + fd_select (drive); + wrprot = !(ciaa.pra & DSKPROT); + fd_deselect (drive); + rel_fdc(); + + if (wrprot) { + mutex_unlock(&amiflop_mutex); + return -EROFS; + } + } + } + + local_irq_save(flags); + fd_ref[drive]++; + fd_device[drive] = system; + local_irq_restore(flags); + + unit[drive].dtype=&data_types[system]; + unit[drive].blocks=unit[drive].type->heads*unit[drive].type->tracks* + data_types[system].sects*unit[drive].type->sect_mult; + set_capacity(unit[drive].gendisk, unit[drive].blocks); + + printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive, + unit[drive].type->name, data_types[system].name); + + mutex_unlock(&amiflop_mutex); + return 0; +} + +static void floppy_release(struct gendisk *disk, fmode_t mode) +{ + struct amiga_floppy_struct *p = disk->private_data; + int drive = p - unit; + + mutex_lock(&amiflop_mutex); + if (unit[drive].dirty == 1) { + del_timer (flush_track_timer + drive); + non_int_flush_track (drive); + } + + if (!fd_ref[drive]--) { + printk(KERN_CRIT "floppy_release with fd_ref == 0"); + fd_ref[drive] = 0; + } +#ifdef MODULE +/* the mod_use counter is handled this way */ + floppy_off (drive | 0x40000000); +#endif + mutex_unlock(&amiflop_mutex); +} + +/* + * check_events is never called from an interrupt, so we can relax a bit + * here, sleep etc. Note that floppy-on tries to set current_DOR to point + * to the desired drive, but it will probably not survive the sleep if + * several floppies are used at the same time: thus the loop. + */ +static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing) +{ + struct amiga_floppy_struct *p = disk->private_data; + int drive = p - unit; + int changed; + static int first_time = 1; + + if (first_time) + changed = first_time--; + else { + get_fdc(drive); + fd_select (drive); + changed = !(ciaa.pra & DSKCHANGE); + fd_deselect (drive); + rel_fdc(); + } + + if (changed) { + fd_probe(drive); + p->track = -1; + p->dirty = 0; + writepending = 0; /* if this was true before, too bad! */ + writefromint = 0; + return DISK_EVENT_MEDIA_CHANGE; + } + return 0; +} + +static const struct block_device_operations floppy_fops = { + .owner = THIS_MODULE, + .open = floppy_open, + .release = floppy_release, + .ioctl = fd_ioctl, + .getgeo = fd_getgeo, + .check_events = amiga_check_events, +}; + +static int __init fd_probe_drives(void) +{ + int drive,drives,nomem; + + printk(KERN_INFO "FD: probing units\nfound "); + drives=0; + nomem=0; + for(drive=0;drivecode == FD_NODRIVE) + continue; + disk = alloc_disk(1); + if (!disk) { + unit[drive].type->code = FD_NODRIVE; + continue; + } + unit[drive].gendisk = disk; + + disk->queue = blk_init_queue(do_fd_request, &amiflop_lock); + if (!disk->queue) { + unit[drive].type->code = FD_NODRIVE; + continue; + } + + drives++; + if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) { + printk("no mem for "); + unit[drive].type = &drive_types[num_dr_types - 1]; /* FD_NODRIVE */ + drives--; + nomem = 1; + } + printk("fd%d ",drive); + disk->major = FLOPPY_MAJOR; + disk->first_minor = drive; + disk->fops = &floppy_fops; + sprintf(disk->disk_name, "fd%d", drive); + disk->private_data = &unit[drive]; + set_capacity(disk, 880*2); + add_disk(disk); + } + if ((drives > 0) || (nomem == 0)) { + if (drives == 0) + printk("no drives"); + printk("\n"); + return drives; + } + printk("\n"); + return -ENOMEM; +} + +static struct kobject *floppy_find(dev_t dev, int *part, void *data) +{ + int drive = *part & 3; + if (unit[drive].type->code == FD_NODRIVE) + return NULL; + *part = 0; + return get_disk(unit[drive].gendisk); +} + +static int __init amiga_floppy_probe(struct platform_device *pdev) +{ + int i, ret; + + if (register_blkdev(FLOPPY_MAJOR,"fd")) + return -EBUSY; + + ret = -ENOMEM; + raw_buf = amiga_chip_alloc(RAW_BUF_SIZE, "Floppy"); + if (!raw_buf) { + printk("fd: cannot get chip mem buffer\n"); + goto out_blkdev; + } + + ret = -EBUSY; + if (request_irq(IRQ_AMIGA_DSKBLK, fd_block_done, 0, "floppy_dma", NULL)) { + printk("fd: cannot get irq for dma\n"); + goto out_irq; + } + + if (request_irq(IRQ_AMIGA_CIAA_TB, ms_isr, 0, "floppy_timer", NULL)) { + printk("fd: cannot get irq for timer\n"); + goto out_irq2; + } + + ret = -ENODEV; + if (fd_probe_drives() < 1) /* No usable drives */ + goto out_probe; + + blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, + floppy_find, NULL, NULL); + + /* initialize variables */ + init_timer(&motor_on_timer); + motor_on_timer.expires = 0; + motor_on_timer.data = 0; + motor_on_timer.function = motor_on_callback; + for (i = 0; i < FD_MAX_UNITS; i++) { + init_timer(&motor_off_timer[i]); + motor_off_timer[i].expires = 0; + motor_off_timer[i].data = i|0x80000000; + motor_off_timer[i].function = fd_motor_off; + init_timer(&flush_track_timer[i]); + flush_track_timer[i].expires = 0; + flush_track_timer[i].data = i; + flush_track_timer[i].function = flush_track_callback; + + unit[i].track = -1; + } + + init_timer(&post_write_timer); + post_write_timer.expires = 0; + post_write_timer.data = 0; + post_write_timer.function = post_write; + + for (i = 0; i < 128; i++) + mfmdecode[i]=255; + for (i = 0; i < 16; i++) + mfmdecode[mfmencode[i]]=i; + + /* make sure that disk DMA is enabled */ + custom.dmacon = DMAF_SETCLR | DMAF_DISK; + + /* init ms timer */ + ciaa.crb = 8; /* one-shot, stop */ + return 0; + +out_probe: + free_irq(IRQ_AMIGA_CIAA_TB, NULL); +out_irq2: + free_irq(IRQ_AMIGA_DSKBLK, NULL); +out_irq: + amiga_chip_free(raw_buf); +out_blkdev: + unregister_blkdev(FLOPPY_MAJOR,"fd"); + return ret; +} + +#if 0 /* not safe to unload */ +static int __exit amiga_floppy_remove(struct platform_device *pdev) +{ + int i; + + for( i = 0; i < FD_MAX_UNITS; i++) { + if (unit[i].type->code != FD_NODRIVE) { + struct request_queue *q = unit[i].gendisk->queue; + del_gendisk(unit[i].gendisk); + put_disk(unit[i].gendisk); + kfree(unit[i].trackbuf); + if (q) + blk_cleanup_queue(q); + } + } + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + free_irq(IRQ_AMIGA_CIAA_TB, NULL); + free_irq(IRQ_AMIGA_DSKBLK, NULL); + custom.dmacon = DMAF_DISK; /* disable DMA */ + amiga_chip_free(raw_buf); + unregister_blkdev(FLOPPY_MAJOR, "fd"); +} +#endif + +static struct platform_driver amiga_floppy_driver = { + .driver = { + .name = "amiga-floppy", + }, +}; + +static int __init amiga_floppy_init(void) +{ + return platform_driver_probe(&amiga_floppy_driver, amiga_floppy_probe); +} + +module_init(amiga_floppy_init); + +#ifndef MODULE +static int __init amiga_floppy_setup (char *str) +{ + int n; + if (!MACH_IS_AMIGA) + return 0; + if (!get_option(&str, &n)) + return 0; + printk (KERN_INFO "amiflop: Setting default df0 to %x\n", n); + fd_def_df0 = n; + return 1; +} + +__setup("floppy=", amiga_floppy_setup); +#endif + +MODULE_ALIAS("platform:amiga-floppy"); diff --git a/kernel/drivers/block/aoe/Makefile b/kernel/drivers/block/aoe/Makefile new file mode 100644 index 000000000..06ea82cdf --- /dev/null +++ b/kernel/drivers/block/aoe/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for ATA over Ethernet +# + +obj-$(CONFIG_ATA_OVER_ETH) += aoe.o +aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o diff --git a/kernel/drivers/block/aoe/aoe.h b/kernel/drivers/block/aoe/aoe.h new file mode 100644 index 000000000..9220f8e83 --- /dev/null +++ b/kernel/drivers/block/aoe/aoe.h @@ -0,0 +1,240 @@ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ +#define VERSION "85" +#define AOE_MAJOR 152 +#define DEVICE_NAME "aoe" + +/* set AOE_PARTITIONS to 1 to use whole-disks only + * default is 16, which is 15 partitions plus the whole disk + */ +#ifndef AOE_PARTITIONS +#define AOE_PARTITIONS (16) +#endif + +#define WHITESPACE " \t\v\f\n," + +enum { + AOECMD_ATA, + AOECMD_CFG, + AOECMD_VEND_MIN = 0xf0, + + AOEFL_RSP = (1<<3), + AOEFL_ERR = (1<<2), + + AOEAFL_EXT = (1<<6), + AOEAFL_DEV = (1<<4), + AOEAFL_ASYNC = (1<<1), + AOEAFL_WRITE = (1<<0), + + AOECCMD_READ = 0, + AOECCMD_TEST, + AOECCMD_PTEST, + AOECCMD_SET, + AOECCMD_FSET, + + AOE_HVER = 0x10, +}; + +struct aoe_hdr { + unsigned char dst[6]; + unsigned char src[6]; + __be16 type; + unsigned char verfl; + unsigned char err; + __be16 major; + unsigned char minor; + unsigned char cmd; + __be32 tag; +}; + +struct aoe_atahdr { + unsigned char aflags; + unsigned char errfeat; + unsigned char scnt; + unsigned char cmdstat; + unsigned char lba0; + unsigned char lba1; + unsigned char lba2; + unsigned char lba3; + unsigned char lba4; + unsigned char lba5; + unsigned char res[2]; +}; + +struct aoe_cfghdr { + __be16 bufcnt; + __be16 fwver; + unsigned char scnt; + unsigned char aoeccmd; + unsigned char cslen[2]; +}; + +enum { + DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */ + DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */ + DEVFL_EXT = (1<<2), /* device accepts lba48 commands */ + DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */ + DEVFL_GD_NOW = (1<<4), /* allocating gendisk */ + DEVFL_KICKME = (1<<5), /* slow polling network card catch */ + DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ + DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */ + DEVFL_FREED = (1<<8), /* device has been cleaned up */ +}; + +enum { + DEFAULTBCNT = 2 * 512, /* 2 sectors */ + MIN_BUFS = 16, + NTARGETS = 4, + NAOEIFS = 8, + NSKBPOOLMAX = 256, + NFACTIVE = 61, + + TIMERTICK = HZ / 10, + RTTSCALE = 8, + RTTDSCALE = 3, + RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE, + RTTDEV_INIT = RTTAVG_INIT / 4, + + HARD_SCORN_SECS = 10, /* try another remote port after this */ + MAX_TAINT = 1000, /* cap on aoetgt taint */ +}; + +struct buf { + ulong nframesout; + struct bio *bio; + struct bvec_iter iter; + struct request *rq; +}; + +enum frame_flags { + FFL_PROBE = 1, +}; + +struct frame { + struct list_head head; + u32 tag; + struct timeval sent; /* high-res time packet was sent */ + u32 sent_jiffs; /* low-res jiffies-based sent time */ + ulong waited; + ulong waited_total; + struct aoetgt *t; /* parent target I belong to */ + struct sk_buff *skb; /* command skb freed on module exit */ + struct sk_buff *r_skb; /* response skb for async processing */ + struct buf *buf; + struct bvec_iter iter; + char flags; +}; + +struct aoeif { + struct net_device *nd; + ulong lost; + int bcnt; +}; + +struct aoetgt { + unsigned char addr[6]; + ushort nframes; /* cap on frames to use */ + struct aoedev *d; /* parent device I belong to */ + struct list_head ffree; /* list of free frames */ + struct aoeif ifs[NAOEIFS]; + struct aoeif *ifp; /* current aoeif in use */ + ushort nout; /* number of AoE commands outstanding */ + ushort maxout; /* current value for max outstanding */ + ushort next_cwnd; /* incr maxout after decrementing to zero */ + ushort ssthresh; /* slow start threshold */ + ulong falloc; /* number of allocated frames */ + int taint; /* how much we want to avoid this aoetgt */ + int minbcnt; + int wpkts, rpkts; + char nout_probes; +}; + +struct aoedev { + struct aoedev *next; + ulong sysminor; + ulong aoemajor; + u32 rttavg; /* scaled AoE round trip time average */ + u32 rttdev; /* scaled round trip time mean deviation */ + u16 aoeminor; + u16 flags; + u16 nopen; /* (bd_openers isn't available without sleeping) */ + u16 fw_ver; /* version of blade's firmware */ + u16 lasttag; /* last tag sent */ + u16 useme; + ulong ref; + struct work_struct work;/* disk create work struct */ + struct gendisk *gd; + struct dentry *debugfs; + struct request_queue *blkq; + struct hd_geometry geo; + sector_t ssize; + struct timer_list timer; + spinlock_t lock; + struct sk_buff_head skbpool; + mempool_t *bufpool; /* for deadlock-free Buf allocation */ + struct { /* pointers to work in progress */ + struct buf *buf; + struct bio *nxbio; + struct request *rq; + } ip; + ulong maxbcnt; + struct list_head factive[NFACTIVE]; /* hash of active frames */ + struct list_head rexmitq; /* deferred retransmissions */ + struct aoetgt **targets; + ulong ntargets; /* number of allocated aoetgt pointers */ + struct aoetgt **tgt; /* target in use when working */ + ulong kicked; + char ident[512]; +}; + +/* kthread tracking */ +struct ktstate { + struct completion rendez; + struct task_struct *task; + wait_queue_head_t *waitq; + int (*fn) (int); + char name[12]; + spinlock_t *lock; + int id; + int active; +}; + +int aoeblk_init(void); +void aoeblk_exit(void); +void aoeblk_gdalloc(void *); +void aoedisk_rm_debugfs(struct aoedev *d); +void aoedisk_rm_sysfs(struct aoedev *d); + +int aoechr_init(void); +void aoechr_exit(void); +void aoechr_error(char *); + +void aoecmd_work(struct aoedev *d); +void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor); +struct sk_buff *aoecmd_ata_rsp(struct sk_buff *); +void aoecmd_cfg_rsp(struct sk_buff *); +void aoecmd_sleepwork(struct work_struct *); +void aoecmd_wreset(struct aoetgt *t); +void aoecmd_cleanslate(struct aoedev *); +void aoecmd_exit(void); +int aoecmd_init(void); +struct sk_buff *aoecmd_ata_id(struct aoedev *); +void aoe_freetframe(struct frame *); +void aoe_flush_iocq(void); +void aoe_flush_iocq_by_index(int); +void aoe_end_request(struct aoedev *, struct request *, int); +int aoe_ktstart(struct ktstate *k); +void aoe_ktstop(struct ktstate *k); + +int aoedev_init(void); +void aoedev_exit(void); +struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc); +void aoedev_downdev(struct aoedev *d); +int aoedev_flush(const char __user *str, size_t size); +void aoe_failbuf(struct aoedev *, struct buf *); +void aoedev_put(struct aoedev *); + +int aoenet_init(void); +void aoenet_exit(void); +void aoenet_xmit(struct sk_buff_head *); +int is_aoe_netif(struct net_device *ifp); +int set_aoe_iflist(const char __user *str, size_t size); diff --git a/kernel/drivers/block/aoe/aoeblk.c b/kernel/drivers/block/aoe/aoeblk.c new file mode 100644 index 000000000..46c282fff --- /dev/null +++ b/kernel/drivers/block/aoe/aoeblk.c @@ -0,0 +1,464 @@ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoeblk.c + * block device routines + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "aoe.h" + +static DEFINE_MUTEX(aoeblk_mutex); +static struct kmem_cache *buf_pool_cache; +static struct dentry *aoe_debugfs_dir; + +/* GPFS needs a larger value than the default. */ +static int aoe_maxsectors; +module_param(aoe_maxsectors, int, 0644); +MODULE_PARM_DESC(aoe_maxsectors, + "When nonzero, set the maximum number of sectors per I/O request"); + +static ssize_t aoedisk_show_state(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + + return snprintf(page, PAGE_SIZE, + "%s%s\n", + (d->flags & DEVFL_UP) ? "up" : "down", + (d->flags & DEVFL_KICKME) ? ",kickme" : + (d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : ""); + /* I'd rather see nopen exported so we can ditch closewait */ +} +static ssize_t aoedisk_show_mac(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + struct aoetgt *t = d->targets[0]; + + if (t == NULL) + return snprintf(page, PAGE_SIZE, "none\n"); + return snprintf(page, PAGE_SIZE, "%pm\n", t->addr); +} +static ssize_t aoedisk_show_netif(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + struct net_device *nds[8], **nd, **nnd, **ne; + struct aoetgt **t, **te; + struct aoeif *ifp, *e; + char *p; + + memset(nds, 0, sizeof nds); + nd = nds; + ne = nd + ARRAY_SIZE(nds); + t = d->targets; + te = t + d->ntargets; + for (; t < te && *t; t++) { + ifp = (*t)->ifs; + e = ifp + NAOEIFS; + for (; ifp < e && ifp->nd; ifp++) { + for (nnd = nds; nnd < nd; nnd++) + if (*nnd == ifp->nd) + break; + if (nnd == nd && nd != ne) + *nd++ = ifp->nd; + } + } + + ne = nd; + nd = nds; + if (*nd == NULL) + return snprintf(page, PAGE_SIZE, "none\n"); + for (p = page; nd < ne; nd++) + p += snprintf(p, PAGE_SIZE - (p-page), "%s%s", + p == page ? "" : ",", (*nd)->name); + p += snprintf(p, PAGE_SIZE - (p-page), "\n"); + return p-page; +} +/* firmware version */ +static ssize_t aoedisk_show_fwver(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + + return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver); +} +static ssize_t aoedisk_show_payload(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + struct aoedev *d = disk->private_data; + + return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); +} + +static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +{ + struct aoedev *d; + struct aoetgt **t, **te; + struct aoeif *ifp, *ife; + unsigned long flags; + char c; + + d = s->private; + seq_printf(s, "rttavg: %d rttdev: %d\n", + d->rttavg >> RTTSCALE, + d->rttdev >> RTTDSCALE); + seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool)); + seq_printf(s, "kicked: %ld\n", d->kicked); + seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt); + seq_printf(s, "ref: %ld\n", d->ref); + + spin_lock_irqsave(&d->lock, flags); + t = d->targets; + te = t + d->ntargets; + for (; t < te && *t; t++) { + c = '\t'; + seq_printf(s, "falloc: %ld\n", (*t)->falloc); + seq_printf(s, "ffree: %p\n", + list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next); + seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout, + (*t)->maxout, (*t)->nframes); + seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh); + seq_printf(s, "\ttaint:%d\n", (*t)->taint); + seq_printf(s, "\tr:%d\n", (*t)->rpkts); + seq_printf(s, "\tw:%d\n", (*t)->wpkts); + ifp = (*t)->ifs; + ife = ifp + ARRAY_SIZE((*t)->ifs); + for (; ifp->nd && ifp < ife; ifp++) { + seq_printf(s, "%c%s", c, ifp->nd->name); + c = ','; + } + seq_puts(s, "\n"); + } + spin_unlock_irqrestore(&d->lock, flags); + + return 0; +} + +static int aoe_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, aoedisk_debugfs_show, inode->i_private); +} + +static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); +static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); +static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); +static struct device_attribute dev_attr_firmware_version = { + .attr = { .name = "firmware-version", .mode = S_IRUGO }, + .show = aoedisk_show_fwver, +}; +static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL); + +static struct attribute *aoe_attrs[] = { + &dev_attr_state.attr, + &dev_attr_mac.attr, + &dev_attr_netif.attr, + &dev_attr_firmware_version.attr, + &dev_attr_payload.attr, + NULL, +}; + +static const struct attribute_group attr_group = { + .attrs = aoe_attrs, +}; + +static const struct file_operations aoe_debugfs_fops = { + .open = aoe_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void +aoedisk_add_debugfs(struct aoedev *d) +{ + struct dentry *entry; + char *p; + + if (aoe_debugfs_dir == NULL) + return; + p = strchr(d->gd->disk_name, '/'); + if (p == NULL) + p = d->gd->disk_name; + else + p++; + BUG_ON(*p == '\0'); + entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, + &aoe_debugfs_fops); + if (IS_ERR_OR_NULL(entry)) { + pr_info("aoe: cannot create debugfs file for %s\n", + d->gd->disk_name); + return; + } + BUG_ON(d->debugfs); + d->debugfs = entry; +} +void +aoedisk_rm_debugfs(struct aoedev *d) +{ + debugfs_remove(d->debugfs); + d->debugfs = NULL; +} + +static int +aoedisk_add_sysfs(struct aoedev *d) +{ + return sysfs_create_group(&disk_to_dev(d->gd)->kobj, &attr_group); +} +void +aoedisk_rm_sysfs(struct aoedev *d) +{ + sysfs_remove_group(&disk_to_dev(d->gd)->kobj, &attr_group); +} + +static int +aoeblk_open(struct block_device *bdev, fmode_t mode) +{ + struct aoedev *d = bdev->bd_disk->private_data; + ulong flags; + + if (!virt_addr_valid(d)) { + pr_crit("aoe: invalid device pointer in %s\n", + __func__); + WARN_ON(1); + return -ENODEV; + } + if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL) + return -ENODEV; + + mutex_lock(&aoeblk_mutex); + spin_lock_irqsave(&d->lock, flags); + if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) { + d->nopen++; + spin_unlock_irqrestore(&d->lock, flags); + mutex_unlock(&aoeblk_mutex); + return 0; + } + spin_unlock_irqrestore(&d->lock, flags); + mutex_unlock(&aoeblk_mutex); + return -ENODEV; +} + +static void +aoeblk_release(struct gendisk *disk, fmode_t mode) +{ + struct aoedev *d = disk->private_data; + ulong flags; + + spin_lock_irqsave(&d->lock, flags); + + if (--d->nopen == 0) { + spin_unlock_irqrestore(&d->lock, flags); + aoecmd_cfg(d->aoemajor, d->aoeminor); + return; + } + spin_unlock_irqrestore(&d->lock, flags); +} + +static void +aoeblk_request(struct request_queue *q) +{ + struct aoedev *d; + struct request *rq; + + d = q->queuedata; + if ((d->flags & DEVFL_UP) == 0) { + pr_info_ratelimited("aoe: device %ld.%d is not up\n", + d->aoemajor, d->aoeminor); + while ((rq = blk_peek_request(q))) { + blk_start_request(rq); + aoe_end_request(d, rq, 1); + } + return; + } + aoecmd_work(d); +} + +static int +aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct aoedev *d = bdev->bd_disk->private_data; + + if ((d->flags & DEVFL_UP) == 0) { + printk(KERN_ERR "aoe: disk not up\n"); + return -ENODEV; + } + + geo->cylinders = d->geo.cylinders; + geo->heads = d->geo.heads; + geo->sectors = d->geo.sectors; + return 0; +} + +static int +aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg) +{ + struct aoedev *d; + + if (!arg) + return -EINVAL; + + d = bdev->bd_disk->private_data; + if ((d->flags & DEVFL_UP) == 0) { + pr_err("aoe: disk not up\n"); + return -ENODEV; + } + + if (cmd == HDIO_GET_IDENTITY) { + if (!copy_to_user((void __user *) arg, &d->ident, + sizeof(d->ident))) + return 0; + return -EFAULT; + } + + /* udev calls scsi_id, which uses SG_IO, resulting in noise */ + if (cmd != SG_IO) + pr_info("aoe: unknown ioctl 0x%x\n", cmd); + + return -ENOTTY; +} + +static const struct block_device_operations aoe_bdops = { + .open = aoeblk_open, + .release = aoeblk_release, + .ioctl = aoeblk_ioctl, + .getgeo = aoeblk_getgeo, + .owner = THIS_MODULE, +}; + +/* alloc_disk and add_disk can sleep */ +void +aoeblk_gdalloc(void *vp) +{ + struct aoedev *d = vp; + struct gendisk *gd; + mempool_t *mp; + struct request_queue *q; + enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, }; + ulong flags; + int late = 0; + + spin_lock_irqsave(&d->lock, flags); + if (d->flags & DEVFL_GDALLOC + && !(d->flags & DEVFL_TKILL) + && !(d->flags & DEVFL_GD_NOW)) + d->flags |= DEVFL_GD_NOW; + else + late = 1; + spin_unlock_irqrestore(&d->lock, flags); + if (late) + return; + + gd = alloc_disk(AOE_PARTITIONS); + if (gd == NULL) { + pr_err("aoe: cannot allocate disk structure for %ld.%d\n", + d->aoemajor, d->aoeminor); + goto err; + } + + mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab, + buf_pool_cache); + if (mp == NULL) { + printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", + d->aoemajor, d->aoeminor); + goto err_disk; + } + q = blk_init_queue(aoeblk_request, &d->lock); + if (q == NULL) { + pr_err("aoe: cannot allocate block queue for %ld.%d\n", + d->aoemajor, d->aoeminor); + goto err_mempool; + } + + spin_lock_irqsave(&d->lock, flags); + WARN_ON(!(d->flags & DEVFL_GD_NOW)); + WARN_ON(!(d->flags & DEVFL_GDALLOC)); + WARN_ON(d->flags & DEVFL_TKILL); + WARN_ON(d->gd); + WARN_ON(d->flags & DEVFL_UP); + blk_queue_max_hw_sectors(q, 1024); + q->backing_dev_info.name = "aoe"; + q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; + d->bufpool = mp; + d->blkq = gd->queue = q; + q->queuedata = d; + d->gd = gd; + if (aoe_maxsectors) + blk_queue_max_hw_sectors(q, aoe_maxsectors); + gd->major = AOE_MAJOR; + gd->first_minor = d->sysminor; + gd->fops = &aoe_bdops; + gd->private_data = d; + set_capacity(gd, d->ssize); + snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d", + d->aoemajor, d->aoeminor); + + d->flags &= ~DEVFL_GDALLOC; + d->flags |= DEVFL_UP; + + spin_unlock_irqrestore(&d->lock, flags); + + add_disk(gd); + aoedisk_add_sysfs(d); + aoedisk_add_debugfs(d); + + spin_lock_irqsave(&d->lock, flags); + WARN_ON(!(d->flags & DEVFL_GD_NOW)); + d->flags &= ~DEVFL_GD_NOW; + spin_unlock_irqrestore(&d->lock, flags); + return; + +err_mempool: + mempool_destroy(mp); +err_disk: + put_disk(gd); +err: + spin_lock_irqsave(&d->lock, flags); + d->flags &= ~DEVFL_GD_NOW; + schedule_work(&d->work); + spin_unlock_irqrestore(&d->lock, flags); +} + +void +aoeblk_exit(void) +{ + debugfs_remove_recursive(aoe_debugfs_dir); + aoe_debugfs_dir = NULL; + kmem_cache_destroy(buf_pool_cache); +} + +int __init +aoeblk_init(void) +{ + buf_pool_cache = kmem_cache_create("aoe_bufs", + sizeof(struct buf), + 0, 0, NULL); + if (buf_pool_cache == NULL) + return -ENOMEM; + aoe_debugfs_dir = debugfs_create_dir("aoe", NULL); + if (IS_ERR_OR_NULL(aoe_debugfs_dir)) { + pr_info("aoe: cannot create debugfs directory\n"); + aoe_debugfs_dir = NULL; + } + return 0; +} + diff --git a/kernel/drivers/block/aoe/aoechr.c b/kernel/drivers/block/aoe/aoechr.c new file mode 100644 index 000000000..ab41be625 --- /dev/null +++ b/kernel/drivers/block/aoe/aoechr.c @@ -0,0 +1,320 @@ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoechr.c + * AoE character device driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "aoe.h" + +enum { + //MINOR_STAT = 1, (moved to sysfs) + MINOR_ERR = 2, + MINOR_DISCOVER, + MINOR_INTERFACES, + MINOR_REVALIDATE, + MINOR_FLUSH, + MSGSZ = 2048, + NMSG = 100, /* message backlog to retain */ +}; + +struct aoe_chardev { + ulong minor; + char name[32]; +}; + +enum { EMFL_VALID = 1 }; + +struct ErrMsg { + short flags; + short len; + char *msg; +}; + +static DEFINE_MUTEX(aoechr_mutex); + +/* A ring buffer of error messages, to be read through + * "/dev/etherd/err". When no messages are present, + * readers will block waiting for messages to appear. + */ +static struct ErrMsg emsgs[NMSG]; +static int emsgs_head_idx, emsgs_tail_idx; +static struct completion emsgs_comp; +static spinlock_t emsgs_lock; +static int nblocked_emsgs_readers; +static struct class *aoe_class; +static struct aoe_chardev chardevs[] = { + { MINOR_ERR, "err" }, + { MINOR_DISCOVER, "discover" }, + { MINOR_INTERFACES, "interfaces" }, + { MINOR_REVALIDATE, "revalidate" }, + { MINOR_FLUSH, "flush" }, +}; + +static int +discover(void) +{ + aoecmd_cfg(0xffff, 0xff); + return 0; +} + +static int +interfaces(const char __user *str, size_t size) +{ + if (set_aoe_iflist(str, size)) { + printk(KERN_ERR + "aoe: could not set interface list: too many interfaces\n"); + return -EINVAL; + } + return 0; +} + +static int +revalidate(const char __user *str, size_t size) +{ + int major, minor, n; + ulong flags; + struct aoedev *d; + struct sk_buff *skb; + char buf[16]; + + if (size >= sizeof buf) + return -EINVAL; + buf[sizeof buf - 1] = '\0'; + if (copy_from_user(buf, str, size)) + return -EFAULT; + + n = sscanf(buf, "e%d.%d", &major, &minor); + if (n != 2) { + pr_err("aoe: invalid device specification %s\n", buf); + return -EINVAL; + } + d = aoedev_by_aoeaddr(major, minor, 0); + if (!d) + return -EINVAL; + spin_lock_irqsave(&d->lock, flags); + aoecmd_cleanslate(d); + aoecmd_cfg(major, minor); +loop: + skb = aoecmd_ata_id(d); + spin_unlock_irqrestore(&d->lock, flags); + /* try again if we are able to sleep a bit, + * otherwise give up this revalidation + */ + if (!skb && !msleep_interruptible(250)) { + spin_lock_irqsave(&d->lock, flags); + goto loop; + } + aoedev_put(d); + if (skb) { + struct sk_buff_head queue; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); + } + return 0; +} + +void +aoechr_error(char *msg) +{ + struct ErrMsg *em; + char *mp; + ulong flags, n; + + n = strlen(msg); + + spin_lock_irqsave(&emsgs_lock, flags); + + em = emsgs + emsgs_tail_idx; + if ((em->flags & EMFL_VALID)) { +bail: spin_unlock_irqrestore(&emsgs_lock, flags); + return; + } + + mp = kmemdup(msg, n, GFP_ATOMIC); + if (mp == NULL) { + printk(KERN_ERR "aoe: allocation failure, len=%ld\n", n); + goto bail; + } + + em->msg = mp; + em->flags |= EMFL_VALID; + em->len = n; + + emsgs_tail_idx++; + emsgs_tail_idx %= ARRAY_SIZE(emsgs); + + spin_unlock_irqrestore(&emsgs_lock, flags); + + if (nblocked_emsgs_readers) + complete(&emsgs_comp); +} + +static ssize_t +aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp) +{ + int ret = -EINVAL; + + switch ((unsigned long) filp->private_data) { + default: + printk(KERN_INFO "aoe: can't write to that file.\n"); + break; + case MINOR_DISCOVER: + ret = discover(); + break; + case MINOR_INTERFACES: + ret = interfaces(buf, cnt); + break; + case MINOR_REVALIDATE: + ret = revalidate(buf, cnt); + break; + case MINOR_FLUSH: + ret = aoedev_flush(buf, cnt); + break; + } + if (ret == 0) + ret = cnt; + return ret; +} + +static int +aoechr_open(struct inode *inode, struct file *filp) +{ + int n, i; + + mutex_lock(&aoechr_mutex); + n = iminor(inode); + filp->private_data = (void *) (unsigned long) n; + + for (i = 0; i < ARRAY_SIZE(chardevs); ++i) + if (chardevs[i].minor == n) { + mutex_unlock(&aoechr_mutex); + return 0; + } + mutex_unlock(&aoechr_mutex); + return -EINVAL; +} + +static int +aoechr_rel(struct inode *inode, struct file *filp) +{ + return 0; +} + +static ssize_t +aoechr_read(struct file *filp, char __user *buf, size_t cnt, loff_t *off) +{ + unsigned long n; + char *mp; + struct ErrMsg *em; + ssize_t len; + ulong flags; + + n = (unsigned long) filp->private_data; + if (n != MINOR_ERR) + return -EFAULT; + + spin_lock_irqsave(&emsgs_lock, flags); + + for (;;) { + em = emsgs + emsgs_head_idx; + if ((em->flags & EMFL_VALID) != 0) + break; + if (filp->f_flags & O_NDELAY) { + spin_unlock_irqrestore(&emsgs_lock, flags); + return -EAGAIN; + } + nblocked_emsgs_readers++; + + spin_unlock_irqrestore(&emsgs_lock, flags); + + n = wait_for_completion_interruptible(&emsgs_comp); + + spin_lock_irqsave(&emsgs_lock, flags); + + nblocked_emsgs_readers--; + + if (n) { + spin_unlock_irqrestore(&emsgs_lock, flags); + return -ERESTARTSYS; + } + } + if (em->len > cnt) { + spin_unlock_irqrestore(&emsgs_lock, flags); + return -EAGAIN; + } + mp = em->msg; + len = em->len; + em->msg = NULL; + em->flags &= ~EMFL_VALID; + + emsgs_head_idx++; + emsgs_head_idx %= ARRAY_SIZE(emsgs); + + spin_unlock_irqrestore(&emsgs_lock, flags); + + n = copy_to_user(buf, mp, len); + kfree(mp); + return n == 0 ? len : -EFAULT; +} + +static const struct file_operations aoe_fops = { + .write = aoechr_write, + .read = aoechr_read, + .open = aoechr_open, + .release = aoechr_rel, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static char *aoe_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev)); +} + +int __init +aoechr_init(void) +{ + int n, i; + + n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops); + if (n < 0) { + printk(KERN_ERR "aoe: can't register char device\n"); + return n; + } + init_completion(&emsgs_comp); + spin_lock_init(&emsgs_lock); + aoe_class = class_create(THIS_MODULE, "aoe"); + if (IS_ERR(aoe_class)) { + unregister_chrdev(AOE_MAJOR, "aoechr"); + return PTR_ERR(aoe_class); + } + aoe_class->devnode = aoe_devnode; + + for (i = 0; i < ARRAY_SIZE(chardevs); ++i) + device_create(aoe_class, NULL, + MKDEV(AOE_MAJOR, chardevs[i].minor), NULL, + chardevs[i].name); + + return 0; +} + +void +aoechr_exit(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(chardevs); ++i) + device_destroy(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor)); + class_destroy(aoe_class); + unregister_chrdev(AOE_MAJOR, "aoechr"); +} + diff --git a/kernel/drivers/block/aoe/aoecmd.c b/kernel/drivers/block/aoe/aoecmd.c new file mode 100644 index 000000000..422b7d84f --- /dev/null +++ b/kernel/drivers/block/aoe/aoecmd.c @@ -0,0 +1,1826 @@ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoecmd.c + * Filesystem request handling methods + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "aoe.h" + +#define MAXIOC (8192) /* default meant to avoid most soft lockups */ + +static void ktcomplete(struct frame *, struct sk_buff *); +static int count_targets(struct aoedev *d, int *untainted); + +static struct buf *nextbuf(struct aoedev *); + +static int aoe_deadsecs = 60 * 3; +module_param(aoe_deadsecs, int, 0644); +MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev."); + +static int aoe_maxout = 64; +module_param(aoe_maxout, int, 0644); +MODULE_PARM_DESC(aoe_maxout, + "Only aoe_maxout outstanding packets for every MAC on eX.Y."); + +/* The number of online cpus during module initialization gives us a + * convenient heuristic cap on the parallelism used for ktio threads + * doing I/O completion. It is not important that the cap equal the + * actual number of running CPUs at any given time, but because of CPU + * hotplug, we take care to use ncpus instead of using + * num_online_cpus() after module initialization. + */ +static int ncpus; + +/* mutex lock used for synchronization while thread spawning */ +static DEFINE_MUTEX(ktio_spawn_lock); + +static wait_queue_head_t *ktiowq; +static struct ktstate *kts; + +/* io completion queue */ +struct iocq_ktio { + struct list_head head; + spinlock_t lock; +}; +static struct iocq_ktio *iocq; + +static struct page *empty_page; + +static struct sk_buff * +new_skb(ulong len) +{ + struct sk_buff *skb; + + skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC); + if (skb) { + skb_reserve(skb, MAX_HEADER); + skb_reset_mac_header(skb); + skb_reset_network_header(skb); + skb->protocol = __constant_htons(ETH_P_AOE); + skb_checksum_none_assert(skb); + } + return skb; +} + +static struct frame * +getframe_deferred(struct aoedev *d, u32 tag) +{ + struct list_head *head, *pos, *nx; + struct frame *f; + + head = &d->rexmitq; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + if (f->tag == tag) { + list_del(pos); + return f; + } + } + return NULL; +} + +static struct frame * +getframe(struct aoedev *d, u32 tag) +{ + struct frame *f; + struct list_head *head, *pos, *nx; + u32 n; + + n = tag % NFACTIVE; + head = &d->factive[n]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + if (f->tag == tag) { + list_del(pos); + return f; + } + } + return NULL; +} + +/* + * Leave the top bit clear so we have tagspace for userland. + * The bottom 16 bits are the xmit tick for rexmit/rttavg processing. + * This driver reserves tag -1 to mean "unused frame." + */ +static int +newtag(struct aoedev *d) +{ + register ulong n; + + n = jiffies & 0xffff; + return n |= (++d->lasttag & 0x7fff) << 16; +} + +static u32 +aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h) +{ + u32 host_tag = newtag(d); + + memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); + memcpy(h->dst, t->addr, sizeof h->dst); + h->type = __constant_cpu_to_be16(ETH_P_AOE); + h->verfl = AOE_HVER; + h->major = cpu_to_be16(d->aoemajor); + h->minor = d->aoeminor; + h->cmd = AOECMD_ATA; + h->tag = cpu_to_be32(host_tag); + + return host_tag; +} + +static inline void +put_lba(struct aoe_atahdr *ah, sector_t lba) +{ + ah->lba0 = lba; + ah->lba1 = lba >>= 8; + ah->lba2 = lba >>= 8; + ah->lba3 = lba >>= 8; + ah->lba4 = lba >>= 8; + ah->lba5 = lba >>= 8; +} + +static struct aoeif * +ifrotate(struct aoetgt *t) +{ + struct aoeif *ifp; + + ifp = t->ifp; + ifp++; + if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL) + ifp = t->ifs; + if (ifp->nd == NULL) + return NULL; + return t->ifp = ifp; +} + +static void +skb_pool_put(struct aoedev *d, struct sk_buff *skb) +{ + __skb_queue_tail(&d->skbpool, skb); +} + +static struct sk_buff * +skb_pool_get(struct aoedev *d) +{ + struct sk_buff *skb = skb_peek(&d->skbpool); + + if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) { + __skb_unlink(skb, &d->skbpool); + return skb; + } + if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX && + (skb = new_skb(ETH_ZLEN))) + return skb; + + return NULL; +} + +void +aoe_freetframe(struct frame *f) +{ + struct aoetgt *t; + + t = f->t; + f->buf = NULL; + memset(&f->iter, 0, sizeof(f->iter)); + f->r_skb = NULL; + f->flags = 0; + list_add(&f->head, &t->ffree); +} + +static struct frame * +newtframe(struct aoedev *d, struct aoetgt *t) +{ + struct frame *f; + struct sk_buff *skb; + struct list_head *pos; + + if (list_empty(&t->ffree)) { + if (t->falloc >= NSKBPOOLMAX*2) + return NULL; + f = kcalloc(1, sizeof(*f), GFP_ATOMIC); + if (f == NULL) + return NULL; + t->falloc++; + f->t = t; + } else { + pos = t->ffree.next; + list_del(pos); + f = list_entry(pos, struct frame, head); + } + + skb = f->skb; + if (skb == NULL) { + f->skb = skb = new_skb(ETH_ZLEN); + if (!skb) { +bail: aoe_freetframe(f); + return NULL; + } + } + + if (atomic_read(&skb_shinfo(skb)->dataref) != 1) { + skb = skb_pool_get(d); + if (skb == NULL) + goto bail; + skb_pool_put(d, f->skb); + f->skb = skb; + } + + skb->truesize -= skb->data_len; + skb_shinfo(skb)->nr_frags = skb->data_len = 0; + skb_trim(skb, 0); + return f; +} + +static struct frame * +newframe(struct aoedev *d) +{ + struct frame *f; + struct aoetgt *t, **tt; + int totout = 0; + int use_tainted; + int has_untainted; + + if (!d->targets || !d->targets[0]) { + printk(KERN_ERR "aoe: NULL TARGETS!\n"); + return NULL; + } + tt = d->tgt; /* last used target */ + for (use_tainted = 0, has_untainted = 0;;) { + tt++; + if (tt >= &d->targets[d->ntargets] || !*tt) + tt = d->targets; + t = *tt; + if (!t->taint) { + has_untainted = 1; + totout += t->nout; + } + if (t->nout < t->maxout + && (use_tainted || !t->taint) + && t->ifp->nd) { + f = newtframe(d, t); + if (f) { + ifrotate(t); + d->tgt = tt; + return f; + } + } + if (tt == d->tgt) { /* we've looped and found nada */ + if (!use_tainted && !has_untainted) + use_tainted = 1; + else + break; + } + } + if (totout == 0) { + d->kicked++; + d->flags |= DEVFL_KICKME; + } + return NULL; +} + +static void +skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter) +{ + int frag = 0; + struct bio_vec bv; + + __bio_for_each_segment(bv, bio, iter, iter) + skb_fill_page_desc(skb, frag++, bv.bv_page, + bv.bv_offset, bv.bv_len); +} + +static void +fhash(struct frame *f) +{ + struct aoedev *d = f->t->d; + u32 n; + + n = f->tag % NFACTIVE; + list_add_tail(&f->head, &d->factive[n]); +} + +static void +ata_rw_frameinit(struct frame *f) +{ + struct aoetgt *t; + struct aoe_hdr *h; + struct aoe_atahdr *ah; + struct sk_buff *skb; + char writebit, extbit; + + skb = f->skb; + h = (struct aoe_hdr *) skb_mac_header(skb); + ah = (struct aoe_atahdr *) (h + 1); + skb_put(skb, sizeof(*h) + sizeof(*ah)); + memset(h, 0, skb->len); + + writebit = 0x10; + extbit = 0x4; + + t = f->t; + f->tag = aoehdr_atainit(t->d, t, h); + fhash(f); + t->nout++; + f->waited = 0; + f->waited_total = 0; + + /* set up ata header */ + ah->scnt = f->iter.bi_size >> 9; + put_lba(ah, f->iter.bi_sector); + if (t->d->flags & DEVFL_EXT) { + ah->aflags |= AOEAFL_EXT; + } else { + extbit = 0; + ah->lba3 &= 0x0f; + ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */ + } + if (f->buf && bio_data_dir(f->buf->bio) == WRITE) { + skb_fillup(skb, f->buf->bio, f->iter); + ah->aflags |= AOEAFL_WRITE; + skb->len += f->iter.bi_size; + skb->data_len = f->iter.bi_size; + skb->truesize += f->iter.bi_size; + t->wpkts++; + } else { + t->rpkts++; + writebit = 0; + } + + ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit; + skb->dev = t->ifp->nd; +} + +static int +aoecmd_ata_rw(struct aoedev *d) +{ + struct frame *f; + struct buf *buf; + struct sk_buff *skb; + struct sk_buff_head queue; + + buf = nextbuf(d); + if (buf == NULL) + return 0; + f = newframe(d); + if (f == NULL) + return 0; + + /* initialize the headers & frame */ + f->buf = buf; + f->iter = buf->iter; + f->iter.bi_size = min_t(unsigned long, + d->maxbcnt ?: DEFAULTBCNT, + f->iter.bi_size); + bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size); + + if (!buf->iter.bi_size) + d->ip.buf = NULL; + + /* mark all tracking fields and load out */ + buf->nframesout += 1; + + ata_rw_frameinit(f); + + skb = skb_clone(f->skb, GFP_ATOMIC); + if (skb) { + do_gettimeofday(&f->sent); + f->sent_jiffs = (u32) jiffies; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); + } + return 1; +} + +/* some callers cannot sleep, and they can call this function, + * transmitting the packets later, when interrupts are on + */ +static void +aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue) +{ + struct aoe_hdr *h; + struct aoe_cfghdr *ch; + struct sk_buff *skb; + struct net_device *ifp; + + rcu_read_lock(); + for_each_netdev_rcu(&init_net, ifp) { + dev_hold(ifp); + if (!is_aoe_netif(ifp)) + goto cont; + + skb = new_skb(sizeof *h + sizeof *ch); + if (skb == NULL) { + printk(KERN_INFO "aoe: skb alloc failure\n"); + goto cont; + } + skb_put(skb, sizeof *h + sizeof *ch); + skb->dev = ifp; + __skb_queue_tail(queue, skb); + h = (struct aoe_hdr *) skb_mac_header(skb); + memset(h, 0, sizeof *h + sizeof *ch); + + memset(h->dst, 0xff, sizeof h->dst); + memcpy(h->src, ifp->dev_addr, sizeof h->src); + h->type = __constant_cpu_to_be16(ETH_P_AOE); + h->verfl = AOE_HVER; + h->major = cpu_to_be16(aoemajor); + h->minor = aoeminor; + h->cmd = AOECMD_CFG; + +cont: + dev_put(ifp); + } + rcu_read_unlock(); +} + +static void +resend(struct aoedev *d, struct frame *f) +{ + struct sk_buff *skb; + struct sk_buff_head queue; + struct aoe_hdr *h; + struct aoetgt *t; + char buf[128]; + u32 n; + + t = f->t; + n = newtag(d); + skb = f->skb; + if (ifrotate(t) == NULL) { + /* probably can't happen, but set it up to fail anyway */ + pr_info("aoe: resend: no interfaces to rotate to.\n"); + ktcomplete(f, NULL); + return; + } + h = (struct aoe_hdr *) skb_mac_header(skb); + + if (!(f->flags & FFL_PROBE)) { + snprintf(buf, sizeof(buf), + "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n", + "retransmit", d->aoemajor, d->aoeminor, + f->tag, jiffies, n, + h->src, h->dst, t->nout); + aoechr_error(buf); + } + + f->tag = n; + fhash(f); + h->tag = cpu_to_be32(n); + memcpy(h->dst, t->addr, sizeof h->dst); + memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src); + + skb->dev = t->ifp->nd; + skb = skb_clone(skb, GFP_ATOMIC); + if (skb == NULL) + return; + do_gettimeofday(&f->sent); + f->sent_jiffs = (u32) jiffies; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); +} + +static int +tsince_hr(struct frame *f) +{ + struct timeval now; + int n; + + do_gettimeofday(&now); + n = now.tv_usec - f->sent.tv_usec; + n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC; + + if (n < 0) + n = -n; + + /* For relatively long periods, use jiffies to avoid + * discrepancies caused by updates to the system time. + * + * On system with HZ of 1000, 32-bits is over 49 days + * worth of jiffies, or over 71 minutes worth of usecs. + * + * Jiffies overflow is handled by subtraction of unsigned ints: + * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe + * $3 = 4 + * (gdb) + */ + if (n > USEC_PER_SEC / 4) { + n = ((u32) jiffies) - f->sent_jiffs; + n *= USEC_PER_SEC / HZ; + } + + return n; +} + +static int +tsince(u32 tag) +{ + int n; + + n = jiffies & 0xffff; + n -= tag & 0xffff; + if (n < 0) + n += 1<<16; + return jiffies_to_usecs(n + 1); +} + +static struct aoeif * +getif(struct aoetgt *t, struct net_device *nd) +{ + struct aoeif *p, *e; + + p = t->ifs; + e = p + NAOEIFS; + for (; p < e; p++) + if (p->nd == nd) + return p; + return NULL; +} + +static void +ejectif(struct aoetgt *t, struct aoeif *ifp) +{ + struct aoeif *e; + struct net_device *nd; + ulong n; + + nd = ifp->nd; + e = t->ifs + NAOEIFS - 1; + n = (e - ifp) * sizeof *ifp; + memmove(ifp, ifp+1, n); + e->nd = NULL; + dev_put(nd); +} + +static struct frame * +reassign_frame(struct frame *f) +{ + struct frame *nf; + struct sk_buff *skb; + + nf = newframe(f->t->d); + if (!nf) + return NULL; + if (nf->t == f->t) { + aoe_freetframe(nf); + return NULL; + } + + skb = nf->skb; + nf->skb = f->skb; + nf->buf = f->buf; + nf->iter = f->iter; + nf->waited = 0; + nf->waited_total = f->waited_total; + nf->sent = f->sent; + nf->sent_jiffs = f->sent_jiffs; + f->skb = skb; + + return nf; +} + +static void +probe(struct aoetgt *t) +{ + struct aoedev *d; + struct frame *f; + struct sk_buff *skb; + struct sk_buff_head queue; + size_t n, m; + int frag; + + d = t->d; + f = newtframe(d, t); + if (!f) { + pr_err("%s %pm for e%ld.%d: %s\n", + "aoe: cannot probe remote address", + t->addr, + (long) d->aoemajor, d->aoeminor, + "no frame available"); + return; + } + f->flags |= FFL_PROBE; + ifrotate(t); + f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT; + ata_rw_frameinit(f); + skb = f->skb; + for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) { + if (n < PAGE_SIZE) + m = n; + else + m = PAGE_SIZE; + skb_fill_page_desc(skb, frag, empty_page, 0, m); + } + skb->len += f->iter.bi_size; + skb->data_len = f->iter.bi_size; + skb->truesize += f->iter.bi_size; + + skb = skb_clone(f->skb, GFP_ATOMIC); + if (skb) { + do_gettimeofday(&f->sent); + f->sent_jiffs = (u32) jiffies; + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, skb); + aoenet_xmit(&queue); + } +} + +static long +rto(struct aoedev *d) +{ + long t; + + t = 2 * d->rttavg >> RTTSCALE; + t += 8 * d->rttdev >> RTTDSCALE; + if (t == 0) + t = 1; + + return t; +} + +static void +rexmit_deferred(struct aoedev *d) +{ + struct aoetgt *t; + struct frame *f; + struct frame *nf; + struct list_head *pos, *nx, *head; + int since; + int untainted; + + count_targets(d, &untainted); + + head = &d->rexmitq; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + t = f->t; + if (t->taint) { + if (!(f->flags & FFL_PROBE)) { + nf = reassign_frame(f); + if (nf) { + if (t->nout_probes == 0 + && untainted > 0) { + probe(t); + t->nout_probes++; + } + list_replace(&f->head, &nf->head); + pos = &nf->head; + aoe_freetframe(f); + f = nf; + t = f->t; + } + } else if (untainted < 1) { + /* don't probe w/o other untainted aoetgts */ + goto stop_probe; + } else if (tsince_hr(f) < t->taint * rto(d)) { + /* reprobe slowly when taint is high */ + continue; + } + } else if (f->flags & FFL_PROBE) { +stop_probe: /* don't probe untainted aoetgts */ + list_del(pos); + aoe_freetframe(f); + /* leaving d->kicked, because this is routine */ + f->t->d->flags |= DEVFL_KICKME; + continue; + } + if (t->nout >= t->maxout) + continue; + list_del(pos); + t->nout++; + if (f->flags & FFL_PROBE) + t->nout_probes++; + since = tsince_hr(f); + f->waited += since; + f->waited_total += since; + resend(d, f); + } +} + +/* An aoetgt accumulates demerits quickly, and successful + * probing redeems the aoetgt slowly. + */ +static void +scorn(struct aoetgt *t) +{ + int n; + + n = t->taint++; + t->taint += t->taint * 2; + if (n > t->taint) + t->taint = n; + if (t->taint > MAX_TAINT) + t->taint = MAX_TAINT; +} + +static int +count_targets(struct aoedev *d, int *untainted) +{ + int i, good; + + for (i = good = 0; i < d->ntargets && d->targets[i]; ++i) + if (d->targets[i]->taint == 0) + good++; + + if (untainted) + *untainted = good; + return i; +} + +static void +rexmit_timer(ulong vp) +{ + struct aoedev *d; + struct aoetgt *t; + struct aoeif *ifp; + struct frame *f; + struct list_head *head, *pos, *nx; + LIST_HEAD(flist); + register long timeout; + ulong flags, n; + int i; + int utgts; /* number of aoetgt descriptors (not slots) */ + int since; + + d = (struct aoedev *) vp; + + spin_lock_irqsave(&d->lock, flags); + + /* timeout based on observed timings and variations */ + timeout = rto(d); + + utgts = count_targets(d, NULL); + + if (d->flags & DEVFL_TKILL) { + spin_unlock_irqrestore(&d->lock, flags); + return; + } + + /* collect all frames to rexmit into flist */ + for (i = 0; i < NFACTIVE; i++) { + head = &d->factive[i]; + list_for_each_safe(pos, nx, head) { + f = list_entry(pos, struct frame, head); + if (tsince_hr(f) < timeout) + break; /* end of expired frames */ + /* move to flist for later processing */ + list_move_tail(pos, &flist); + } + } + + /* process expired frames */ + while (!list_empty(&flist)) { + pos = flist.next; + f = list_entry(pos, struct frame, head); + since = tsince_hr(f); + n = f->waited_total + since; + n /= USEC_PER_SEC; + if (aoe_deadsecs + && n > aoe_deadsecs + && !(f->flags & FFL_PROBE)) { + /* Waited too long. Device failure. + * Hang all frames on first hash bucket for downdev + * to clean up. + */ + list_splice(&flist, &d->factive[0]); + aoedev_downdev(d); + goto out; + } + + t = f->t; + n = f->waited + since; + n /= USEC_PER_SEC; + if (aoe_deadsecs && utgts > 0 + && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS)) + scorn(t); /* avoid this target */ + + if (t->maxout != 1) { + t->ssthresh = t->maxout / 2; + t->maxout = 1; + } + + if (f->flags & FFL_PROBE) { + t->nout_probes--; + } else { + ifp = getif(t, f->skb->dev); + if (ifp && ++ifp->lost > (t->nframes << 1) + && (ifp != t->ifs || t->ifs[1].nd)) { + ejectif(t, ifp); + ifp = NULL; + } + } + list_move_tail(pos, &d->rexmitq); + t->nout--; + } + rexmit_deferred(d); + +out: + if ((d->flags & DEVFL_KICKME) && d->blkq) { + d->flags &= ~DEVFL_KICKME; + d->blkq->request_fn(d->blkq); + } + + d->timer.expires = jiffies + TIMERTICK; + add_timer(&d->timer); + + spin_unlock_irqrestore(&d->lock, flags); +} + +static unsigned long +rqbiocnt(struct request *r) +{ + struct bio *bio; + unsigned long n = 0; + + __rq_for_each_bio(bio, r) + n++; + return n; +} + +/* This can be removed if we are certain that no users of the block + * layer will ever use zero-count pages in bios. Otherwise we have to + * protect against the put_page sometimes done by the network layer. + * + * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for + * discussion. + * + * We cannot use get_page in the workaround, because it insists on a + * positive page count as a precondition. So we use _count directly. + */ +static void +bio_pageinc(struct bio *bio) +{ + struct bio_vec bv; + struct page *page; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + /* Non-zero page count for non-head members of + * compound pages is no longer allowed by the kernel. + */ + page = compound_head(bv.bv_page); + atomic_inc(&page->_count); + } +} + +static void +bio_pagedec(struct bio *bio) +{ + struct page *page; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + page = compound_head(bv.bv_page); + atomic_dec(&page->_count); + } +} + +static void +bufinit(struct buf *buf, struct request *rq, struct bio *bio) +{ + memset(buf, 0, sizeof(*buf)); + buf->rq = rq; + buf->bio = bio; + buf->iter = bio->bi_iter; + bio_pageinc(bio); +} + +static struct buf * +nextbuf(struct aoedev *d) +{ + struct request *rq; + struct request_queue *q; + struct buf *buf; + struct bio *bio; + + q = d->blkq; + if (q == NULL) + return NULL; /* initializing */ + if (d->ip.buf) + return d->ip.buf; + rq = d->ip.rq; + if (rq == NULL) { + rq = blk_peek_request(q); + if (rq == NULL) + return NULL; + blk_start_request(rq); + d->ip.rq = rq; + d->ip.nxbio = rq->bio; + rq->special = (void *) rqbiocnt(rq); + } + buf = mempool_alloc(d->bufpool, GFP_ATOMIC); + if (buf == NULL) { + pr_err("aoe: nextbuf: unable to mempool_alloc!\n"); + return NULL; + } + bio = d->ip.nxbio; + bufinit(buf, rq, bio); + bio = bio->bi_next; + d->ip.nxbio = bio; + if (bio == NULL) + d->ip.rq = NULL; + return d->ip.buf = buf; +} + +/* enters with d->lock held */ +void +aoecmd_work(struct aoedev *d) +{ + rexmit_deferred(d); + while (aoecmd_ata_rw(d)) + ; +} + +/* this function performs work that has been deferred until sleeping is OK + */ +void +aoecmd_sleepwork(struct work_struct *work) +{ + struct aoedev *d = container_of(work, struct aoedev, work); + struct block_device *bd; + u64 ssize; + + if (d->flags & DEVFL_GDALLOC) + aoeblk_gdalloc(d); + + if (d->flags & DEVFL_NEWSIZE) { + ssize = get_capacity(d->gd); + bd = bdget_disk(d->gd, 0); + if (bd) { + mutex_lock(&bd->bd_inode->i_mutex); + i_size_write(bd->bd_inode, (loff_t)ssize<<9); + mutex_unlock(&bd->bd_inode->i_mutex); + bdput(bd); + } + spin_lock_irq(&d->lock); + d->flags |= DEVFL_UP; + d->flags &= ~DEVFL_NEWSIZE; + spin_unlock_irq(&d->lock); + } +} + +static void +ata_ident_fixstring(u16 *id, int ns) +{ + u16 s; + + while (ns-- > 0) { + s = *id; + *id++ = s >> 8 | s << 8; + } +} + +static void +ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id) +{ + u64 ssize; + u16 n; + + /* word 83: command set supported */ + n = get_unaligned_le16(&id[83 << 1]); + + /* word 86: command set/feature enabled */ + n |= get_unaligned_le16(&id[86 << 1]); + + if (n & (1<<10)) { /* bit 10: LBA 48 */ + d->flags |= DEVFL_EXT; + + /* word 100: number lba48 sectors */ + ssize = get_unaligned_le64(&id[100 << 1]); + + /* set as in ide-disk.c:init_idedisk_capacity */ + d->geo.cylinders = ssize; + d->geo.cylinders /= (255 * 63); + d->geo.heads = 255; + d->geo.sectors = 63; + } else { + d->flags &= ~DEVFL_EXT; + + /* number lba28 sectors */ + ssize = get_unaligned_le32(&id[60 << 1]); + + /* NOTE: obsolete in ATA 6 */ + d->geo.cylinders = get_unaligned_le16(&id[54 << 1]); + d->geo.heads = get_unaligned_le16(&id[55 << 1]); + d->geo.sectors = get_unaligned_le16(&id[56 << 1]); + } + + ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */ + ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */ + ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */ + memcpy(d->ident, id, sizeof(d->ident)); + + if (d->ssize != ssize) + printk(KERN_INFO + "aoe: %pm e%ld.%d v%04x has %llu sectors\n", + t->addr, + d->aoemajor, d->aoeminor, + d->fw_ver, (long long)ssize); + d->ssize = ssize; + d->geo.start = 0; + if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE)) + return; + if (d->gd != NULL) { + set_capacity(d->gd, ssize); + d->flags |= DEVFL_NEWSIZE; + } else + d->flags |= DEVFL_GDALLOC; + schedule_work(&d->work); +} + +static void +calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt) +{ + register long n; + + n = rtt; + + /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */ + n -= d->rttavg >> RTTSCALE; + d->rttavg += n; + if (n < 0) + n = -n; + n -= d->rttdev >> RTTDSCALE; + d->rttdev += n; + + if (!t || t->maxout >= t->nframes) + return; + if (t->maxout < t->ssthresh) + t->maxout += 1; + else if (t->nout == t->maxout && t->next_cwnd-- == 0) { + t->maxout += 1; + t->next_cwnd = t->maxout; + } +} + +static struct aoetgt * +gettgt(struct aoedev *d, char *addr) +{ + struct aoetgt **t, **e; + + t = d->targets; + e = t + d->ntargets; + for (; t < e && *t; t++) + if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0) + return *t; + return NULL; +} + +static void +bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt) +{ + int soff = 0; + struct bio_vec bv; + + iter.bi_size = cnt; + + __bio_for_each_segment(bv, bio, iter, iter) { + char *p = page_address(bv.bv_page) + bv.bv_offset; + skb_copy_bits(skb, soff, p, bv.bv_len); + soff += bv.bv_len; + } +} + +void +aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) +{ + struct bio *bio; + int bok; + struct request_queue *q; + + q = d->blkq; + if (rq == d->ip.rq) + d->ip.rq = NULL; + do { + bio = rq->bio; + bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags); + } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size)); + + /* cf. http://lkml.org/lkml/2006/10/31/28 */ + if (!fastfail) + __blk_run_queue(q); +} + +static void +aoe_end_buf(struct aoedev *d, struct buf *buf) +{ + struct request *rq; + unsigned long n; + + if (buf == d->ip.buf) + d->ip.buf = NULL; + rq = buf->rq; + bio_pagedec(buf->bio); + mempool_free(buf, d->bufpool); + n = (unsigned long) rq->special; + rq->special = (void *) --n; + if (n == 0) + aoe_end_request(d, rq, 0); +} + +static void +ktiocomplete(struct frame *f) +{ + struct aoe_hdr *hin, *hout; + struct aoe_atahdr *ahin, *ahout; + struct buf *buf; + struct sk_buff *skb; + struct aoetgt *t; + struct aoeif *ifp; + struct aoedev *d; + long n; + int untainted; + + if (f == NULL) + return; + + t = f->t; + d = t->d; + skb = f->r_skb; + buf = f->buf; + if (f->flags & FFL_PROBE) + goto out; + if (!skb) /* just fail the buf. */ + goto noskb; + + hout = (struct aoe_hdr *) skb_mac_header(f->skb); + ahout = (struct aoe_atahdr *) (hout+1); + + hin = (struct aoe_hdr *) skb->data; + skb_pull(skb, sizeof(*hin)); + ahin = (struct aoe_atahdr *) skb->data; + skb_pull(skb, sizeof(*ahin)); + if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */ + pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n", + ahout->cmdstat, ahin->cmdstat, + d->aoemajor, d->aoeminor); +noskb: if (buf) + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + goto out; + } + + n = ahout->scnt << 9; + switch (ahout->cmdstat) { + case ATA_CMD_PIO_READ: + case ATA_CMD_PIO_READ_EXT: + if (skb->len < n) { + pr_err("%s e%ld.%d. skb->len=%d need=%ld\n", + "aoe: runt data size in read from", + (long) d->aoemajor, d->aoeminor, + skb->len, n); + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + break; + } + if (n > f->iter.bi_size) { + pr_err_ratelimited("%s e%ld.%d. bytes=%ld need=%u\n", + "aoe: too-large data size in read from", + (long) d->aoemajor, d->aoeminor, + n, f->iter.bi_size); + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + break; + } + bvcpy(skb, f->buf->bio, f->iter, n); + case ATA_CMD_PIO_WRITE: + case ATA_CMD_PIO_WRITE_EXT: + spin_lock_irq(&d->lock); + ifp = getif(t, skb->dev); + if (ifp) + ifp->lost = 0; + spin_unlock_irq(&d->lock); + break; + case ATA_CMD_ID_ATA: + if (skb->len < 512) { + pr_info("%s e%ld.%d. skb->len=%d need=512\n", + "aoe: runt data size in ataid from", + (long) d->aoemajor, d->aoeminor, + skb->len); + break; + } + if (skb_linearize(skb)) + break; + spin_lock_irq(&d->lock); + ataid_complete(d, t, skb->data); + spin_unlock_irq(&d->lock); + break; + default: + pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n", + ahout->cmdstat, + be16_to_cpu(get_unaligned(&hin->major)), + hin->minor); + } +out: + spin_lock_irq(&d->lock); + if (t->taint > 0 + && --t->taint > 0 + && t->nout_probes == 0) { + count_targets(d, &untainted); + if (untainted > 0) { + probe(t); + t->nout_probes++; + } + } + + aoe_freetframe(f); + + if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0) + aoe_end_buf(d, buf); + + spin_unlock_irq(&d->lock); + aoedev_put(d); + dev_kfree_skb(skb); +} + +/* Enters with iocq.lock held. + * Returns true iff responses needing processing remain. + */ +static int +ktio(int id) +{ + struct frame *f; + struct list_head *pos; + int i; + int actual_id; + + for (i = 0; ; ++i) { + if (i == MAXIOC) + return 1; + if (list_empty(&iocq[id].head)) + return 0; + pos = iocq[id].head.next; + list_del(pos); + f = list_entry(pos, struct frame, head); + spin_unlock_irq(&iocq[id].lock); + ktiocomplete(f); + + /* Figure out if extra threads are required. */ + actual_id = f->t->d->aoeminor % ncpus; + + if (!kts[actual_id].active) { + BUG_ON(id != 0); + mutex_lock(&ktio_spawn_lock); + if (!kts[actual_id].active + && aoe_ktstart(&kts[actual_id]) == 0) + kts[actual_id].active = 1; + mutex_unlock(&ktio_spawn_lock); + } + spin_lock_irq(&iocq[id].lock); + } +} + +static int +kthread(void *vp) +{ + struct ktstate *k; + DECLARE_WAITQUEUE(wait, current); + int more; + + k = vp; + current->flags |= PF_NOFREEZE; + set_user_nice(current, -10); + complete(&k->rendez); /* tell spawner we're running */ + do { + spin_lock_irq(k->lock); + more = k->fn(k->id); + if (!more) { + add_wait_queue(k->waitq, &wait); + __set_current_state(TASK_INTERRUPTIBLE); + } + spin_unlock_irq(k->lock); + if (!more) { + schedule(); + remove_wait_queue(k->waitq, &wait); + } else + cond_resched(); + } while (!kthread_should_stop()); + complete(&k->rendez); /* tell spawner we're stopping */ + return 0; +} + +void +aoe_ktstop(struct ktstate *k) +{ + kthread_stop(k->task); + wait_for_completion(&k->rendez); +} + +int +aoe_ktstart(struct ktstate *k) +{ + struct task_struct *task; + + init_completion(&k->rendez); + task = kthread_run(kthread, k, "%s", k->name); + if (task == NULL || IS_ERR(task)) + return -ENOMEM; + k->task = task; + wait_for_completion(&k->rendez); /* allow kthread to start */ + init_completion(&k->rendez); /* for waiting for exit later */ + return 0; +} + +/* pass it off to kthreads for processing */ +static void +ktcomplete(struct frame *f, struct sk_buff *skb) +{ + int id; + ulong flags; + + f->r_skb = skb; + id = f->t->d->aoeminor % ncpus; + spin_lock_irqsave(&iocq[id].lock, flags); + if (!kts[id].active) { + spin_unlock_irqrestore(&iocq[id].lock, flags); + /* The thread with id has not been spawned yet, + * so delegate the work to the main thread and + * try spawning a new thread. + */ + id = 0; + spin_lock_irqsave(&iocq[id].lock, flags); + } + list_add_tail(&f->head, &iocq[id].head); + spin_unlock_irqrestore(&iocq[id].lock, flags); + wake_up(&ktiowq[id]); +} + +struct sk_buff * +aoecmd_ata_rsp(struct sk_buff *skb) +{ + struct aoedev *d; + struct aoe_hdr *h; + struct frame *f; + u32 n; + ulong flags; + char ebuf[128]; + u16 aoemajor; + + h = (struct aoe_hdr *) skb->data; + aoemajor = be16_to_cpu(get_unaligned(&h->major)); + d = aoedev_by_aoeaddr(aoemajor, h->minor, 0); + if (d == NULL) { + snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response " + "for unknown device %d.%d\n", + aoemajor, h->minor); + aoechr_error(ebuf); + return skb; + } + + spin_lock_irqsave(&d->lock, flags); + + n = be32_to_cpu(get_unaligned(&h->tag)); + f = getframe(d, n); + if (f) { + calc_rttavg(d, f->t, tsince_hr(f)); + f->t->nout--; + if (f->flags & FFL_PROBE) + f->t->nout_probes--; + } else { + f = getframe_deferred(d, n); + if (f) { + calc_rttavg(d, NULL, tsince_hr(f)); + } else { + calc_rttavg(d, NULL, tsince(n)); + spin_unlock_irqrestore(&d->lock, flags); + aoedev_put(d); + snprintf(ebuf, sizeof(ebuf), + "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n", + "unexpected rsp", + get_unaligned_be16(&h->major), + h->minor, + get_unaligned_be32(&h->tag), + jiffies, + h->src, + h->dst); + aoechr_error(ebuf); + return skb; + } + } + aoecmd_work(d); + + spin_unlock_irqrestore(&d->lock, flags); + + ktcomplete(f, skb); + + /* + * Note here that we do not perform an aoedev_put, as we are + * leaving this reference for the ktio to release. + */ + return NULL; +} + +void +aoecmd_cfg(ushort aoemajor, unsigned char aoeminor) +{ + struct sk_buff_head queue; + + __skb_queue_head_init(&queue); + aoecmd_cfg_pkts(aoemajor, aoeminor, &queue); + aoenet_xmit(&queue); +} + +struct sk_buff * +aoecmd_ata_id(struct aoedev *d) +{ + struct aoe_hdr *h; + struct aoe_atahdr *ah; + struct frame *f; + struct sk_buff *skb; + struct aoetgt *t; + + f = newframe(d); + if (f == NULL) + return NULL; + + t = *d->tgt; + + /* initialize the headers & frame */ + skb = f->skb; + h = (struct aoe_hdr *) skb_mac_header(skb); + ah = (struct aoe_atahdr *) (h+1); + skb_put(skb, sizeof *h + sizeof *ah); + memset(h, 0, skb->len); + f->tag = aoehdr_atainit(d, t, h); + fhash(f); + t->nout++; + f->waited = 0; + f->waited_total = 0; + + /* set up ata header */ + ah->scnt = 1; + ah->cmdstat = ATA_CMD_ID_ATA; + ah->lba3 = 0xa0; + + skb->dev = t->ifp->nd; + + d->rttavg = RTTAVG_INIT; + d->rttdev = RTTDEV_INIT; + d->timer.function = rexmit_timer; + + skb = skb_clone(skb, GFP_ATOMIC); + if (skb) { + do_gettimeofday(&f->sent); + f->sent_jiffs = (u32) jiffies; + } + + return skb; +} + +static struct aoetgt ** +grow_targets(struct aoedev *d) +{ + ulong oldn, newn; + struct aoetgt **tt; + + oldn = d->ntargets; + newn = oldn * 2; + tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC); + if (!tt) + return NULL; + memmove(tt, d->targets, sizeof(*d->targets) * oldn); + d->tgt = tt + (d->tgt - d->targets); + kfree(d->targets); + d->targets = tt; + d->ntargets = newn; + + return &d->targets[oldn]; +} + +static struct aoetgt * +addtgt(struct aoedev *d, char *addr, ulong nframes) +{ + struct aoetgt *t, **tt, **te; + + tt = d->targets; + te = tt + d->ntargets; + for (; tt < te && *tt; tt++) + ; + + if (tt == te) { + tt = grow_targets(d); + if (!tt) + goto nomem; + } + t = kzalloc(sizeof(*t), GFP_ATOMIC); + if (!t) + goto nomem; + t->nframes = nframes; + t->d = d; + memcpy(t->addr, addr, sizeof t->addr); + t->ifp = t->ifs; + aoecmd_wreset(t); + t->maxout = t->nframes / 2; + INIT_LIST_HEAD(&t->ffree); + return *tt = t; + + nomem: + pr_info("aoe: cannot allocate memory to add target\n"); + return NULL; +} + +static void +setdbcnt(struct aoedev *d) +{ + struct aoetgt **t, **e; + int bcnt = 0; + + t = d->targets; + e = t + d->ntargets; + for (; t < e && *t; t++) + if (bcnt == 0 || bcnt > (*t)->minbcnt) + bcnt = (*t)->minbcnt; + if (bcnt != d->maxbcnt) { + d->maxbcnt = bcnt; + pr_info("aoe: e%ld.%d: setting %d byte data frames\n", + d->aoemajor, d->aoeminor, bcnt); + } +} + +static void +setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt) +{ + struct aoedev *d; + struct aoeif *p, *e; + int minbcnt; + + d = t->d; + minbcnt = bcnt; + p = t->ifs; + e = p + NAOEIFS; + for (; p < e; p++) { + if (p->nd == NULL) + break; /* end of the valid interfaces */ + if (p->nd == nd) { + p->bcnt = bcnt; /* we're updating */ + nd = NULL; + } else if (minbcnt > p->bcnt) + minbcnt = p->bcnt; /* find the min interface */ + } + if (nd) { + if (p == e) { + pr_err("aoe: device setifbcnt failure; too many interfaces.\n"); + return; + } + dev_hold(nd); + p->nd = nd; + p->bcnt = bcnt; + } + t->minbcnt = minbcnt; + setdbcnt(d); +} + +void +aoecmd_cfg_rsp(struct sk_buff *skb) +{ + struct aoedev *d; + struct aoe_hdr *h; + struct aoe_cfghdr *ch; + struct aoetgt *t; + ulong flags, aoemajor; + struct sk_buff *sl; + struct sk_buff_head queue; + u16 n; + + sl = NULL; + h = (struct aoe_hdr *) skb_mac_header(skb); + ch = (struct aoe_cfghdr *) (h+1); + + /* + * Enough people have their dip switches set backwards to + * warrant a loud message for this special case. + */ + aoemajor = get_unaligned_be16(&h->major); + if (aoemajor == 0xfff) { + printk(KERN_ERR "aoe: Warning: shelf address is all ones. " + "Check shelf dip switches.\n"); + return; + } + if (aoemajor == 0xffff) { + pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n", + aoemajor, (int) h->minor); + return; + } + if (h->minor == 0xff) { + pr_info("aoe: e%ld.%d: broadcast slot number invalid\n", + aoemajor, (int) h->minor); + return; + } + + n = be16_to_cpu(ch->bufcnt); + if (n > aoe_maxout) /* keep it reasonable */ + n = aoe_maxout; + + d = aoedev_by_aoeaddr(aoemajor, h->minor, 1); + if (d == NULL) { + pr_info("aoe: device allocation failure\n"); + return; + } + + spin_lock_irqsave(&d->lock, flags); + + t = gettgt(d, h->src); + if (t) { + t->nframes = n; + if (n < t->maxout) + aoecmd_wreset(t); + } else { + t = addtgt(d, h->src, n); + if (!t) + goto bail; + } + n = skb->dev->mtu; + n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr); + n /= 512; + if (n > ch->scnt) + n = ch->scnt; + n = n ? n * 512 : DEFAULTBCNT; + setifbcnt(t, skb->dev, n); + + /* don't change users' perspective */ + if (d->nopen == 0) { + d->fw_ver = be16_to_cpu(ch->fwver); + sl = aoecmd_ata_id(d); + } +bail: + spin_unlock_irqrestore(&d->lock, flags); + aoedev_put(d); + if (sl) { + __skb_queue_head_init(&queue); + __skb_queue_tail(&queue, sl); + aoenet_xmit(&queue); + } +} + +void +aoecmd_wreset(struct aoetgt *t) +{ + t->maxout = 1; + t->ssthresh = t->nframes / 2; + t->next_cwnd = t->nframes; +} + +void +aoecmd_cleanslate(struct aoedev *d) +{ + struct aoetgt **t, **te; + + d->rttavg = RTTAVG_INIT; + d->rttdev = RTTDEV_INIT; + d->maxbcnt = 0; + + t = d->targets; + te = t + d->ntargets; + for (; t < te && *t; t++) + aoecmd_wreset(*t); +} + +void +aoe_failbuf(struct aoedev *d, struct buf *buf) +{ + if (buf == NULL) + return; + buf->iter.bi_size = 0; + clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + if (buf->nframesout == 0) + aoe_end_buf(d, buf); +} + +void +aoe_flush_iocq(void) +{ + int i; + + for (i = 0; i < ncpus; i++) { + if (kts[i].active) + aoe_flush_iocq_by_index(i); + } +} + +void +aoe_flush_iocq_by_index(int id) +{ + struct frame *f; + struct aoedev *d; + LIST_HEAD(flist); + struct list_head *pos; + struct sk_buff *skb; + ulong flags; + + spin_lock_irqsave(&iocq[id].lock, flags); + list_splice_init(&iocq[id].head, &flist); + spin_unlock_irqrestore(&iocq[id].lock, flags); + while (!list_empty(&flist)) { + pos = flist.next; + list_del(pos); + f = list_entry(pos, struct frame, head); + d = f->t->d; + skb = f->r_skb; + spin_lock_irqsave(&d->lock, flags); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(d, f->buf); + } + aoe_freetframe(f); + spin_unlock_irqrestore(&d->lock, flags); + dev_kfree_skb(skb); + aoedev_put(d); + } +} + +int __init +aoecmd_init(void) +{ + void *p; + int i; + int ret; + + /* get_zeroed_page returns page with ref count 1 */ + p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); + if (!p) + return -ENOMEM; + empty_page = virt_to_page(p); + + ncpus = num_online_cpus(); + + iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL); + if (!iocq) + return -ENOMEM; + + kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL); + if (!kts) { + ret = -ENOMEM; + goto kts_fail; + } + + ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL); + if (!ktiowq) { + ret = -ENOMEM; + goto ktiowq_fail; + } + + mutex_init(&ktio_spawn_lock); + + for (i = 0; i < ncpus; i++) { + INIT_LIST_HEAD(&iocq[i].head); + spin_lock_init(&iocq[i].lock); + init_waitqueue_head(&ktiowq[i]); + snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i); + kts[i].fn = ktio; + kts[i].waitq = &ktiowq[i]; + kts[i].lock = &iocq[i].lock; + kts[i].id = i; + kts[i].active = 0; + } + kts[0].active = 1; + if (aoe_ktstart(&kts[0])) { + ret = -ENOMEM; + goto ktstart_fail; + } + return 0; + +ktstart_fail: + kfree(ktiowq); +ktiowq_fail: + kfree(kts); +kts_fail: + kfree(iocq); + + return ret; +} + +void +aoecmd_exit(void) +{ + int i; + + for (i = 0; i < ncpus; i++) + if (kts[i].active) + aoe_ktstop(&kts[i]); + + aoe_flush_iocq(); + + /* Free up the iocq and thread speicific configuration + * allocated during startup. + */ + kfree(iocq); + kfree(kts); + kfree(ktiowq); + + free_page((unsigned long) page_address(empty_page)); + empty_page = NULL; +} diff --git a/kernel/drivers/block/aoe/aoedev.c b/kernel/drivers/block/aoe/aoedev.c new file mode 100644 index 000000000..e774c50b6 --- /dev/null +++ b/kernel/drivers/block/aoe/aoedev.c @@ -0,0 +1,526 @@ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoedev.c + * AoE device utility functions; maintains device list. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "aoe.h" + +static void dummy_timer(ulong); +static void freetgt(struct aoedev *d, struct aoetgt *t); +static void skbpoolfree(struct aoedev *d); + +static int aoe_dyndevs = 1; +module_param(aoe_dyndevs, int, 0644); +MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices."); + +static struct aoedev *devlist; +static DEFINE_SPINLOCK(devlist_lock); + +/* Because some systems will have one, many, or no + * - partitions, + * - slots per shelf, + * - or shelves, + * we need some flexibility in the way the minor numbers + * are allocated. So they are dynamic. + */ +#define N_DEVS ((1U<= NPERSHELF) { + pr_err("aoe: %s %d slots per shelf\n", + "static minor device numbers support only", + NPERSHELF); + error = -1; + goto out; + } + + n = aoemaj * NPERSHELF + aoemin; + if (n >= N_DEVS) { + pr_err("aoe: %s with e%ld.%d\n", + "cannot use static minor device numbers", + aoemaj, aoemin); + error = -1; + goto out; + } + + spin_lock_irqsave(&used_minors_lock, flags); + if (test_bit(n, used_minors)) { + pr_err("aoe: %s %lu\n", + "existing device already has static minor number", + n); + error = -1; + } else + set_bit(n, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); + *sysminor = n * AOE_PARTITIONS; +out: + return error; +} + +static int +minor_get(ulong *sysminor, ulong aoemaj, int aoemin) +{ + if (aoe_dyndevs) + return minor_get_dyn(sysminor); + else + return minor_get_static(sysminor, aoemaj, aoemin); +} + +static void +minor_free(ulong minor) +{ + ulong flags; + + minor /= AOE_PARTITIONS; + BUG_ON(minor >= N_DEVS); + + spin_lock_irqsave(&used_minors_lock, flags); + BUG_ON(!test_bit(minor, used_minors)); + clear_bit(minor, used_minors); + spin_unlock_irqrestore(&used_minors_lock, flags); +} + +/* + * Users who grab a pointer to the device with aoedev_by_aoeaddr + * automatically get a reference count and must be responsible + * for performing a aoedev_put. With the addition of async + * kthread processing I'm no longer confident that we can + * guarantee consistency in the face of device flushes. + * + * For the time being, we only bother to add extra references for + * frames sitting on the iocq. When the kthreads finish processing + * these frames, they will aoedev_put the device. + */ + +void +aoedev_put(struct aoedev *d) +{ + ulong flags; + + spin_lock_irqsave(&devlist_lock, flags); + d->ref--; + spin_unlock_irqrestore(&devlist_lock, flags); +} + +static void +dummy_timer(ulong vp) +{ + struct aoedev *d; + + d = (struct aoedev *)vp; + if (d->flags & DEVFL_TKILL) + return; + d->timer.expires = jiffies + HZ; + add_timer(&d->timer); +} + +static void +aoe_failip(struct aoedev *d) +{ + struct request *rq; + struct bio *bio; + unsigned long n; + + aoe_failbuf(d, d->ip.buf); + + rq = d->ip.rq; + if (rq == NULL) + return; + while ((bio = d->ip.nxbio)) { + clear_bit(BIO_UPTODATE, &bio->bi_flags); + d->ip.nxbio = bio->bi_next; + n = (unsigned long) rq->special; + rq->special = (void *) --n; + } + if ((unsigned long) rq->special == 0) + aoe_end_request(d, rq, 0); +} + +static void +downdev_frame(struct list_head *pos) +{ + struct frame *f; + + f = list_entry(pos, struct frame, head); + list_del(pos); + if (f->buf) { + f->buf->nframesout--; + aoe_failbuf(f->t->d, f->buf); + } + aoe_freetframe(f); +} + +void +aoedev_downdev(struct aoedev *d) +{ + struct aoetgt *t, **tt, **te; + struct list_head *head, *pos, *nx; + struct request *rq; + int i; + + d->flags &= ~DEVFL_UP; + + /* clean out active and to-be-retransmitted buffers */ + for (i = 0; i < NFACTIVE; i++) { + head = &d->factive[i]; + list_for_each_safe(pos, nx, head) + downdev_frame(pos); + } + head = &d->rexmitq; + list_for_each_safe(pos, nx, head) + downdev_frame(pos); + + /* reset window dressings */ + tt = d->targets; + te = tt + d->ntargets; + for (; tt < te && (t = *tt); tt++) { + aoecmd_wreset(t); + t->nout = 0; + } + + /* clean out the in-process request (if any) */ + aoe_failip(d); + + /* fast fail all pending I/O */ + if (d->blkq) { + while ((rq = blk_peek_request(d->blkq))) { + blk_start_request(rq); + aoe_end_request(d, rq, 1); + } + } + + if (d->gd) + set_capacity(d->gd, 0); +} + +/* return whether the user asked for this particular + * device to be flushed + */ +static int +user_req(char *s, size_t slen, struct aoedev *d) +{ + const char *p; + size_t lim; + + if (!d->gd) + return 0; + p = kbasename(d->gd->disk_name); + lim = sizeof(d->gd->disk_name); + lim -= p - d->gd->disk_name; + if (slen < lim) + lim = slen; + + return !strncmp(s, p, lim); +} + +static void +freedev(struct aoedev *d) +{ + struct aoetgt **t, **e; + int freeing = 0; + unsigned long flags; + + spin_lock_irqsave(&d->lock, flags); + if (d->flags & DEVFL_TKILL + && !(d->flags & DEVFL_FREEING)) { + d->flags |= DEVFL_FREEING; + freeing = 1; + } + spin_unlock_irqrestore(&d->lock, flags); + if (!freeing) + return; + + del_timer_sync(&d->timer); + if (d->gd) { + aoedisk_rm_debugfs(d); + aoedisk_rm_sysfs(d); + del_gendisk(d->gd); + put_disk(d->gd); + blk_cleanup_queue(d->blkq); + } + t = d->targets; + e = t + d->ntargets; + for (; t < e && *t; t++) + freetgt(d, *t); + if (d->bufpool) + mempool_destroy(d->bufpool); + skbpoolfree(d); + minor_free(d->sysminor); + + spin_lock_irqsave(&d->lock, flags); + d->flags |= DEVFL_FREED; + spin_unlock_irqrestore(&d->lock, flags); +} + +enum flush_parms { + NOT_EXITING = 0, + EXITING = 1, +}; + +static int +flush(const char __user *str, size_t cnt, int exiting) +{ + ulong flags; + struct aoedev *d, **dd; + char buf[16]; + int all = 0; + int specified = 0; /* flush a specific device */ + unsigned int skipflags; + + skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL; + + if (!exiting && cnt >= 3) { + if (cnt > sizeof buf) + cnt = sizeof buf; + if (copy_from_user(buf, str, cnt)) + return -EFAULT; + all = !strncmp(buf, "all", 3); + if (!all) + specified = 1; + } + + flush_scheduled_work(); + /* pass one: without sleeping, do aoedev_downdev */ + spin_lock_irqsave(&devlist_lock, flags); + for (d = devlist; d; d = d->next) { + spin_lock(&d->lock); + if (exiting) { + /* unconditionally take each device down */ + } else if (specified) { + if (!user_req(buf, cnt, d)) + goto cont; + } else if ((!all && (d->flags & DEVFL_UP)) + || d->flags & skipflags + || d->nopen + || d->ref) + goto cont; + + aoedev_downdev(d); + d->flags |= DEVFL_TKILL; +cont: + spin_unlock(&d->lock); + } + spin_unlock_irqrestore(&devlist_lock, flags); + + /* pass two: call freedev, which might sleep, + * for aoedevs marked with DEVFL_TKILL + */ +restart: + spin_lock_irqsave(&devlist_lock, flags); + for (d = devlist; d; d = d->next) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL + && !(d->flags & DEVFL_FREEING)) { + spin_unlock(&d->lock); + spin_unlock_irqrestore(&devlist_lock, flags); + freedev(d); + goto restart; + } + spin_unlock(&d->lock); + } + + /* pass three: remove aoedevs marked with DEVFL_FREED */ + for (dd = &devlist, d = *dd; d; d = *dd) { + struct aoedev *doomed = NULL; + + spin_lock(&d->lock); + if (d->flags & DEVFL_FREED) { + *dd = d->next; + doomed = d; + } else { + dd = &d->next; + } + spin_unlock(&d->lock); + if (doomed) + kfree(doomed->targets); + kfree(doomed); + } + spin_unlock_irqrestore(&devlist_lock, flags); + + return 0; +} + +int +aoedev_flush(const char __user *str, size_t cnt) +{ + return flush(str, cnt, NOT_EXITING); +} + +/* This has been confirmed to occur once with Tms=3*1000 due to the + * driver changing link and not processing its transmit ring. The + * problem is hard enough to solve by returning an error that I'm + * still punting on "solving" this. + */ +static void +skbfree(struct sk_buff *skb) +{ + enum { Sms = 250, Tms = 30 * 1000}; + int i = Tms / Sms; + + if (skb == NULL) + return; + while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0) + msleep(Sms); + if (i < 0) { + printk(KERN_ERR + "aoe: %s holds ref: %s\n", + skb->dev ? skb->dev->name : "netif", + "cannot free skb -- memory leaked."); + return; + } + skb->truesize -= skb->data_len; + skb_shinfo(skb)->nr_frags = skb->data_len = 0; + skb_trim(skb, 0); + dev_kfree_skb(skb); +} + +static void +skbpoolfree(struct aoedev *d) +{ + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&d->skbpool, skb, tmp) + skbfree(skb); + + __skb_queue_head_init(&d->skbpool); +} + +/* find it or allocate it */ +struct aoedev * +aoedev_by_aoeaddr(ulong maj, int min, int do_alloc) +{ + struct aoedev *d; + int i; + ulong flags; + ulong sysminor = 0; + + spin_lock_irqsave(&devlist_lock, flags); + + for (d=devlist; d; d=d->next) + if (d->aoemajor == maj && d->aoeminor == min) { + spin_lock(&d->lock); + if (d->flags & DEVFL_TKILL) { + spin_unlock(&d->lock); + d = NULL; + goto out; + } + d->ref++; + spin_unlock(&d->lock); + break; + } + if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0) + goto out; + d = kcalloc(1, sizeof *d, GFP_ATOMIC); + if (!d) + goto out; + d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC); + if (!d->targets) { + kfree(d); + d = NULL; + goto out; + } + d->ntargets = NTARGETS; + INIT_WORK(&d->work, aoecmd_sleepwork); + spin_lock_init(&d->lock); + skb_queue_head_init(&d->skbpool); + init_timer(&d->timer); + d->timer.data = (ulong) d; + d->timer.function = dummy_timer; + d->timer.expires = jiffies + HZ; + add_timer(&d->timer); + d->bufpool = NULL; /* defer to aoeblk_gdalloc */ + d->tgt = d->targets; + d->ref = 1; + for (i = 0; i < NFACTIVE; i++) + INIT_LIST_HEAD(&d->factive[i]); + INIT_LIST_HEAD(&d->rexmitq); + d->sysminor = sysminor; + d->aoemajor = maj; + d->aoeminor = min; + d->rttavg = RTTAVG_INIT; + d->rttdev = RTTDEV_INIT; + d->next = devlist; + devlist = d; + out: + spin_unlock_irqrestore(&devlist_lock, flags); + return d; +} + +static void +freetgt(struct aoedev *d, struct aoetgt *t) +{ + struct frame *f; + struct list_head *pos, *nx, *head; + struct aoeif *ifp; + + for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) { + if (!ifp->nd) + break; + dev_put(ifp->nd); + } + + head = &t->ffree; + list_for_each_safe(pos, nx, head) { + list_del(pos); + f = list_entry(pos, struct frame, head); + skbfree(f->skb); + kfree(f); + } + kfree(t); +} + +void +aoedev_exit(void) +{ + flush_scheduled_work(); + flush(NULL, 0, EXITING); +} + +int __init +aoedev_init(void) +{ + return 0; +} diff --git a/kernel/drivers/block/aoe/aoemain.c b/kernel/drivers/block/aoe/aoemain.c new file mode 100644 index 000000000..4b987c2fe --- /dev/null +++ b/kernel/drivers/block/aoe/aoemain.c @@ -0,0 +1,115 @@ +/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoemain.c + * Module initialization routines, discover timer + */ + +#include +#include +#include +#include +#include "aoe.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sam Hopkins "); +MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels"); +MODULE_VERSION(VERSION); + +enum { TINIT, TRUN, TKILL }; + +static void +discover_timer(ulong vp) +{ + static struct timer_list t; + static volatile ulong die; + static spinlock_t lock; + ulong flags; + enum { DTIMERTICK = HZ * 60 }; /* one minute */ + + switch (vp) { + case TINIT: + init_timer(&t); + spin_lock_init(&lock); + t.data = TRUN; + t.function = discover_timer; + die = 0; + case TRUN: + spin_lock_irqsave(&lock, flags); + if (!die) { + t.expires = jiffies + DTIMERTICK; + add_timer(&t); + } + spin_unlock_irqrestore(&lock, flags); + + aoecmd_cfg(0xffff, 0xff); + return; + case TKILL: + spin_lock_irqsave(&lock, flags); + die = 1; + spin_unlock_irqrestore(&lock, flags); + + del_timer_sync(&t); + default: + return; + } +} + +static void +aoe_exit(void) +{ + discover_timer(TKILL); + + aoenet_exit(); + unregister_blkdev(AOE_MAJOR, DEVICE_NAME); + aoecmd_exit(); + aoechr_exit(); + aoedev_exit(); + aoeblk_exit(); /* free cache after de-allocating bufs */ +} + +static int __init +aoe_init(void) +{ + int ret; + + ret = aoedev_init(); + if (ret) + return ret; + ret = aoechr_init(); + if (ret) + goto chr_fail; + ret = aoeblk_init(); + if (ret) + goto blk_fail; + ret = aoenet_init(); + if (ret) + goto net_fail; + ret = aoecmd_init(); + if (ret) + goto cmd_fail; + ret = register_blkdev(AOE_MAJOR, DEVICE_NAME); + if (ret < 0) { + printk(KERN_ERR "aoe: can't register major\n"); + goto blkreg_fail; + } + printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION); + discover_timer(TINIT); + return 0; + blkreg_fail: + aoecmd_exit(); + cmd_fail: + aoenet_exit(); + net_fail: + aoeblk_exit(); + blk_fail: + aoechr_exit(); + chr_fail: + aoedev_exit(); + + printk(KERN_INFO "aoe: initialisation failure.\n"); + return ret; +} + +module_init(aoe_init); +module_exit(aoe_exit); + diff --git a/kernel/drivers/block/aoe/aoenet.c b/kernel/drivers/block/aoe/aoenet.c new file mode 100644 index 000000000..63773a905 --- /dev/null +++ b/kernel/drivers/block/aoe/aoenet.c @@ -0,0 +1,223 @@ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ +/* + * aoenet.c + * Ethernet portion of AoE driver + */ + +#include +#include +#include +#include +#include +#include +#include +#include "aoe.h" + +#define NECODES 5 + +static char *aoe_errlist[] = +{ + "no such error", + "unrecognized command code", + "bad argument parameter", + "device unavailable", + "config string present", + "unsupported version" +}; + +enum { + IFLISTSZ = 1024, +}; + +static char aoe_iflist[IFLISTSZ]; +module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); +MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); + +static wait_queue_head_t txwq; +static struct ktstate kts; + +#ifndef MODULE +static int __init aoe_iflist_setup(char *str) +{ + strncpy(aoe_iflist, str, IFLISTSZ); + aoe_iflist[IFLISTSZ - 1] = '\0'; + return 1; +} + +__setup("aoe_iflist=", aoe_iflist_setup); +#endif + +static spinlock_t txlock; +static struct sk_buff_head skbtxq; + +/* enters with txlock held */ +static int +tx(int id) __must_hold(&txlock) +{ + struct sk_buff *skb; + struct net_device *ifp; + + while ((skb = skb_dequeue(&skbtxq))) { + spin_unlock_irq(&txlock); + ifp = skb->dev; + if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) + pr_warn("aoe: packet could not be sent on %s. %s\n", + ifp ? ifp->name : "netif", + "consider increasing tx_queue_len"); + spin_lock_irq(&txlock); + } + return 0; +} + +int +is_aoe_netif(struct net_device *ifp) +{ + register char *p, *q; + register int len; + + if (aoe_iflist[0] == '\0') + return 1; + + p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + if (q != p) + len = q - p; + else + len = strlen(p); /* last token in aoe_iflist */ + + if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) + return 1; + if (q == p) + break; + } + + return 0; +} + +int +set_aoe_iflist(const char __user *user_str, size_t size) +{ + if (size >= IFLISTSZ) + return -EINVAL; + + if (copy_from_user(aoe_iflist, user_str, size)) { + printk(KERN_INFO "aoe: copy from user failed\n"); + return -EFAULT; + } + aoe_iflist[size] = 0x00; + return 0; +} + +void +aoenet_xmit(struct sk_buff_head *queue) +{ + struct sk_buff *skb, *tmp; + ulong flags; + + skb_queue_walk_safe(queue, skb, tmp) { + __skb_unlink(skb, queue); + spin_lock_irqsave(&txlock, flags); + skb_queue_tail(&skbtxq, skb); + spin_unlock_irqrestore(&txlock, flags); + wake_up(&txwq); + } +} + +/* + * (1) len doesn't include the header by default. I want this. + */ +static int +aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) +{ + struct aoe_hdr *h; + struct aoe_atahdr *ah; + u32 n; + int sn; + + if (dev_net(ifp) != &init_net) + goto exit; + + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) + return 0; + if (!is_aoe_netif(ifp)) + goto exit; + skb_push(skb, ETH_HLEN); /* (1) */ + sn = sizeof(*h) + sizeof(*ah); + if (skb->len >= sn) { + sn -= skb_headlen(skb); + if (sn > 0 && !__pskb_pull_tail(skb, sn)) + goto exit; + } + h = (struct aoe_hdr *) skb->data; + n = get_unaligned_be32(&h->tag); + if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) + goto exit; + + if (h->verfl & AOEFL_ERR) { + n = h->err; + if (n > NECODES) + n = 0; + if (net_ratelimit()) + printk(KERN_ERR + "%s%d.%d@%s; ecode=%d '%s'\n", + "aoe: error packet from ", + get_unaligned_be16(&h->major), + h->minor, skb->dev->name, + h->err, aoe_errlist[n]); + goto exit; + } + + switch (h->cmd) { + case AOECMD_ATA: + /* ata_rsp may keep skb for later processing or give it back */ + skb = aoecmd_ata_rsp(skb); + break; + case AOECMD_CFG: + aoecmd_cfg_rsp(skb); + break; + default: + if (h->cmd >= AOECMD_VEND_MIN) + break; /* don't complain about vendor commands */ + pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); + break; + } + + if (!skb) + return 0; +exit: + dev_kfree_skb(skb); + return 0; +} + +static struct packet_type aoe_pt __read_mostly = { + .type = __constant_htons(ETH_P_AOE), + .func = aoenet_rcv, +}; + +int __init +aoenet_init(void) +{ + skb_queue_head_init(&skbtxq); + init_waitqueue_head(&txwq); + spin_lock_init(&txlock); + kts.lock = &txlock; + kts.fn = tx; + kts.waitq = &txwq; + kts.id = 0; + snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id); + if (aoe_ktstart(&kts)) + return -EAGAIN; + dev_add_pack(&aoe_pt); + return 0; +} + +void +aoenet_exit(void) +{ + aoe_ktstop(&kts); + skb_queue_purge(&skbtxq); + dev_remove_pack(&aoe_pt); +} + diff --git a/kernel/drivers/block/ataflop.c b/kernel/drivers/block/ataflop.c new file mode 100644 index 000000000..2104b1b4c --- /dev/null +++ b/kernel/drivers/block/ataflop.c @@ -0,0 +1,2056 @@ +/* + * drivers/block/ataflop.c + * + * Copyright (C) 1993 Greg Harp + * Atari Support by Bjoern Brauel, Roman Hodek + * + * Big cleanup Sep 11..14 1994 Roman Hodek: + * - Driver now works interrupt driven + * - Support for two drives; should work, but I cannot test that :-( + * - Reading is done in whole tracks and buffered to speed up things + * - Disk change detection and drive deselecting after motor-off + * similar to TOS + * - Autodetection of disk format (DD/HD); untested yet, because I + * don't have an HD drive :-( + * + * Fixes Nov 13 1994 Martin Schaller: + * - Autodetection works now + * - Support for 5 1/4'' disks + * - Removed drive type (unknown on atari) + * - Do seeks with 8 Mhz + * + * Changes by Andreas Schwab: + * - After errors in multiple read mode try again reading single sectors + * (Feb 1995): + * - Clean up error handling + * - Set blk_size for proper size checking + * - Initialize track register when testing presence of floppy + * - Implement some ioctl's + * + * Changes by Torsten Lang: + * - When probing the floppies we should add the FDCCMDADD_H flag since + * the FDC will otherwise wait forever when no disk is inserted... + * + * ++ Freddi Aschwanden (fa) 20.9.95 fixes for medusa: + * - MFPDELAY() after each FDC access -> atari + * - more/other disk formats + * - DMA to the block buffer directly if we have a 32bit DMA + * - for medusa, the step rate is always 3ms + * - on medusa, use only cache_push() + * Roman: + * - Make disk format numbering independent from minors + * - Let user set max. supported drive type (speeds up format + * detection, saves buffer space) + * + * Roman 10/15/95: + * - implement some more ioctls + * - disk formatting + * + * Andreas 95/12/12: + * - increase gap size at start of track for HD/ED disks + * + * Michael (MSch) 11/07/96: + * - implemented FDSETPRM and FDDEFPRM ioctl + * + * Andreas (97/03/19): + * - implemented missing BLK* ioctls + * + * Things left to do: + * - Formatting + * - Maybe a better strategy for disk change detection (does anyone + * know one?) + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define FD_MAX_UNITS 2 + +#undef DEBUG + +static DEFINE_MUTEX(ataflop_mutex); +static struct request *fd_request; +static int fdc_queue; + +/* Disk types: DD, HD, ED */ +static struct atari_disk_type { + const char *name; + unsigned spt; /* sectors per track */ + unsigned blocks; /* total number of blocks */ + unsigned fdc_speed; /* fdc_speed setting */ + unsigned stretch; /* track doubling ? */ +} atari_disk_type[] = { + { "d360", 9, 720, 0, 0}, /* 0: 360kB diskette */ + { "D360", 9, 720, 0, 1}, /* 1: 360kb in 720k or 1.2MB drive */ + { "D720", 9,1440, 0, 0}, /* 2: 720kb in 720k or 1.2MB drive */ + { "D820", 10,1640, 0, 0}, /* 3: DD disk with 82 tracks/10 sectors */ +/* formats above are probed for type DD */ +#define MAX_TYPE_DD 3 + { "h1200",15,2400, 3, 0}, /* 4: 1.2MB diskette */ + { "H1440",18,2880, 3, 0}, /* 5: 1.4 MB diskette (HD) */ + { "H1640",20,3280, 3, 0}, /* 6: 1.64MB diskette (fat HD) 82 tr 20 sec */ +/* formats above are probed for types DD and HD */ +#define MAX_TYPE_HD 6 + { "E2880",36,5760, 3, 0}, /* 7: 2.8 MB diskette (ED) */ + { "E3280",40,6560, 3, 0}, /* 8: 3.2 MB diskette (fat ED) 82 tr 40 sec */ +/* formats above are probed for types DD, HD and ED */ +#define MAX_TYPE_ED 8 +/* types below are never autoprobed */ + { "H1680",21,3360, 3, 0}, /* 9: 1.68MB diskette (fat HD) 80 tr 21 sec */ + { "h410",10,820, 0, 1}, /* 10: 410k diskette 41 tr 10 sec, stretch */ + { "h1476",18,2952, 3, 0}, /* 11: 1.48MB diskette 82 tr 18 sec */ + { "H1722",21,3444, 3, 0}, /* 12: 1.72MB diskette 82 tr 21 sec */ + { "h420",10,840, 0, 1}, /* 13: 420k diskette 42 tr 10 sec, stretch */ + { "H830",10,1660, 0, 0}, /* 14: 820k diskette 83 tr 10 sec */ + { "h1494",18,2952, 3, 0}, /* 15: 1.49MB diskette 83 tr 18 sec */ + { "H1743",21,3486, 3, 0}, /* 16: 1.74MB diskette 83 tr 21 sec */ + { "h880",11,1760, 0, 0}, /* 17: 880k diskette 80 tr 11 sec */ + { "D1040",13,2080, 0, 0}, /* 18: 1.04MB diskette 80 tr 13 sec */ + { "D1120",14,2240, 0, 0}, /* 19: 1.12MB diskette 80 tr 14 sec */ + { "h1600",20,3200, 3, 0}, /* 20: 1.60MB diskette 80 tr 20 sec */ + { "H1760",22,3520, 3, 0}, /* 21: 1.76MB diskette 80 tr 22 sec */ + { "H1920",24,3840, 3, 0}, /* 22: 1.92MB diskette 80 tr 24 sec */ + { "E3200",40,6400, 3, 0}, /* 23: 3.2MB diskette 80 tr 40 sec */ + { "E3520",44,7040, 3, 0}, /* 24: 3.52MB diskette 80 tr 44 sec */ + { "E3840",48,7680, 3, 0}, /* 25: 3.84MB diskette 80 tr 48 sec */ + { "H1840",23,3680, 3, 0}, /* 26: 1.84MB diskette 80 tr 23 sec */ + { "D800",10,1600, 0, 0}, /* 27: 800k diskette 80 tr 10 sec */ +}; + +static int StartDiskType[] = { + MAX_TYPE_DD, + MAX_TYPE_HD, + MAX_TYPE_ED +}; + +#define TYPE_DD 0 +#define TYPE_HD 1 +#define TYPE_ED 2 + +static int DriveType = TYPE_HD; + +static DEFINE_SPINLOCK(ataflop_lock); + +/* Array for translating minors into disk formats */ +static struct { + int index; + unsigned drive_types; +} minor2disktype[] = { + { 0, TYPE_DD }, /* 1: d360 */ + { 4, TYPE_HD }, /* 2: h1200 */ + { 1, TYPE_DD }, /* 3: D360 */ + { 2, TYPE_DD }, /* 4: D720 */ + { 1, TYPE_DD }, /* 5: h360 = D360 */ + { 2, TYPE_DD }, /* 6: h720 = D720 */ + { 5, TYPE_HD }, /* 7: H1440 */ + { 7, TYPE_ED }, /* 8: E2880 */ +/* some PC formats :-) */ + { 8, TYPE_ED }, /* 9: E3280 <- was "CompaQ" == E2880 for PC */ + { 5, TYPE_HD }, /* 10: h1440 = H1440 */ + { 9, TYPE_HD }, /* 11: H1680 */ + { 10, TYPE_DD }, /* 12: h410 */ + { 3, TYPE_DD }, /* 13: H820 <- == D820, 82x10 */ + { 11, TYPE_HD }, /* 14: h1476 */ + { 12, TYPE_HD }, /* 15: H1722 */ + { 13, TYPE_DD }, /* 16: h420 */ + { 14, TYPE_DD }, /* 17: H830 */ + { 15, TYPE_HD }, /* 18: h1494 */ + { 16, TYPE_HD }, /* 19: H1743 */ + { 17, TYPE_DD }, /* 20: h880 */ + { 18, TYPE_DD }, /* 21: D1040 */ + { 19, TYPE_DD }, /* 22: D1120 */ + { 20, TYPE_HD }, /* 23: h1600 */ + { 21, TYPE_HD }, /* 24: H1760 */ + { 22, TYPE_HD }, /* 25: H1920 */ + { 23, TYPE_ED }, /* 26: E3200 */ + { 24, TYPE_ED }, /* 27: E3520 */ + { 25, TYPE_ED }, /* 28: E3840 */ + { 26, TYPE_HD }, /* 29: H1840 */ + { 27, TYPE_DD }, /* 30: D800 */ + { 6, TYPE_HD }, /* 31: H1640 <- was H1600 == h1600 for PC */ +}; + +#define NUM_DISK_MINORS ARRAY_SIZE(minor2disktype) + +/* + * Maximum disk size (in kilobytes). This default is used whenever the + * current disk size is unknown. + */ +#define MAX_DISK_SIZE 3280 + +/* + * MSch: User-provided type information. 'drive' points to + * the respective entry of this array. Set by FDSETPRM ioctls. + */ +static struct atari_disk_type user_params[FD_MAX_UNITS]; + +/* + * User-provided permanent type information. 'drive' points to + * the respective entry of this array. Set by FDDEFPRM ioctls, + * restored upon disk change by floppy_revalidate() if valid (as seen by + * default_params[].blocks > 0 - a bit in unit[].flags might be used for this?) + */ +static struct atari_disk_type default_params[FD_MAX_UNITS]; + +/* current info on each unit */ +static struct atari_floppy_struct { + int connected; /* !=0 : drive is connected */ + int autoprobe; /* !=0 : do autoprobe */ + + struct atari_disk_type *disktype; /* current type of disk */ + + int track; /* current head position or -1 if + unknown */ + unsigned int steprate; /* steprate setting */ + unsigned int wpstat; /* current state of WP signal (for + disk change detection) */ + int flags; /* flags */ + struct gendisk *disk; + int ref; + int type; +} unit[FD_MAX_UNITS]; + +#define UD unit[drive] +#define UDT unit[drive].disktype +#define SUD unit[SelectedDrive] +#define SUDT unit[SelectedDrive].disktype + + +#define FDC_READ(reg) ({ \ + /* unsigned long __flags; */ \ + unsigned short __val; \ + /* local_irq_save(__flags); */ \ + dma_wd.dma_mode_status = 0x80 | (reg); \ + udelay(25); \ + __val = dma_wd.fdc_acces_seccount; \ + MFPDELAY(); \ + /* local_irq_restore(__flags); */ \ + __val & 0xff; \ +}) + +#define FDC_WRITE(reg,val) \ + do { \ + /* unsigned long __flags; */ \ + /* local_irq_save(__flags); */ \ + dma_wd.dma_mode_status = 0x80 | (reg); \ + udelay(25); \ + dma_wd.fdc_acces_seccount = (val); \ + MFPDELAY(); \ + /* local_irq_restore(__flags); */ \ + } while(0) + + +/* Buffering variables: + * First, there is a DMA buffer in ST-RAM that is used for floppy DMA + * operations. Second, a track buffer is used to cache a whole track + * of the disk to save read operations. These are two separate buffers + * because that allows write operations without clearing the track buffer. + */ + +static int MaxSectors[] = { + 11, 22, 44 +}; +static int BufferSize[] = { + 15*512, 30*512, 60*512 +}; + +#define BUFFER_SIZE (BufferSize[DriveType]) + +unsigned char *DMABuffer; /* buffer for writes */ +static unsigned long PhysDMABuffer; /* physical address */ + +static int UseTrackbuffer = -1; /* Do track buffering? */ +module_param(UseTrackbuffer, int, 0); + +unsigned char *TrackBuffer; /* buffer for reads */ +static unsigned long PhysTrackBuffer; /* physical address */ +static int BufferDrive, BufferSide, BufferTrack; +static int read_track; /* non-zero if we are reading whole tracks */ + +#define SECTOR_BUFFER(sec) (TrackBuffer + ((sec)-1)*512) +#define IS_BUFFERED(drive,side,track) \ + (BufferDrive == (drive) && BufferSide == (side) && BufferTrack == (track)) + +/* + * These are global variables, as that's the easiest way to give + * information to interrupts. They are the data used for the current + * request. + */ +static int SelectedDrive = 0; +static int ReqCmd, ReqBlock; +static int ReqSide, ReqTrack, ReqSector, ReqCnt; +static int HeadSettleFlag = 0; +static unsigned char *ReqData, *ReqBuffer; +static int MotorOn = 0, MotorOffTrys; +static int IsFormatting = 0, FormatError; + +static int UserSteprate[FD_MAX_UNITS] = { -1, -1 }; +module_param_array(UserSteprate, int, NULL, 0); + +/* Synchronization of FDC access. */ +static volatile int fdc_busy = 0; +static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); +static DECLARE_COMPLETION(format_wait); + +static unsigned long changed_floppies = 0xff, fake_change = 0; +#define CHECK_CHANGE_DELAY HZ/2 + +#define FD_MOTOR_OFF_DELAY (3*HZ) +#define FD_MOTOR_OFF_MAXTRY (10*20) + +#define FLOPPY_TIMEOUT (6*HZ) +#define RECALIBRATE_ERRORS 4 /* After this many errors the drive + * will be recalibrated. */ +#define MAX_ERRORS 8 /* After this many errors the driver + * will give up. */ + + +/* + * The driver is trying to determine the correct media format + * while Probing is set. fd_rwsec_done() clears it after a + * successful access. + */ +static int Probing = 0; + +/* This flag is set when a dummy seek is necessary to make the WP + * status bit accessible. + */ +static int NeedSeek = 0; + + +#ifdef DEBUG +#define DPRINT(a) printk a +#else +#define DPRINT(a) +#endif + +/***************************** Prototypes *****************************/ + +static void fd_select_side( int side ); +static void fd_select_drive( int drive ); +static void fd_deselect( void ); +static void fd_motor_off_timer( unsigned long dummy ); +static void check_change( unsigned long dummy ); +static irqreturn_t floppy_irq (int irq, void *dummy); +static void fd_error( void ); +static int do_format(int drive, int type, struct atari_format_descr *desc); +static void do_fd_action( int drive ); +static void fd_calibrate( void ); +static void fd_calibrate_done( int status ); +static void fd_seek( void ); +static void fd_seek_done( int status ); +static void fd_rwsec( void ); +static void fd_readtrack_check( unsigned long dummy ); +static void fd_rwsec_done( int status ); +static void fd_rwsec_done1(int status); +static void fd_writetrack( void ); +static void fd_writetrack_done( int status ); +static void fd_times_out( unsigned long dummy ); +static void finish_fdc( void ); +static void finish_fdc_done( int dummy ); +static void setup_req_params( int drive ); +static void redo_fd_request( void); +static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int + cmd, unsigned long param); +static void fd_probe( int drive ); +static int fd_test_drive_present( int drive ); +static void config_types( void ); +static int floppy_open(struct block_device *bdev, fmode_t mode); +static void floppy_release(struct gendisk *disk, fmode_t mode); + +/************************* End of Prototypes **************************/ + +static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer, 0, 0); +static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0); +static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0); +static DEFINE_TIMER(fd_timer, check_change, 0, 0); + +static void fd_end_request_cur(int err) +{ + if (!__blk_end_request_cur(fd_request, err)) + fd_request = NULL; +} + +static inline void start_motor_off_timer(void) +{ + mod_timer(&motor_off_timer, jiffies + FD_MOTOR_OFF_DELAY); + MotorOffTrys = 0; +} + +static inline void start_check_change_timer( void ) +{ + mod_timer(&fd_timer, jiffies + CHECK_CHANGE_DELAY); +} + +static inline void start_timeout(void) +{ + mod_timer(&timeout_timer, jiffies + FLOPPY_TIMEOUT); +} + +static inline void stop_timeout(void) +{ + del_timer(&timeout_timer); +} + +/* Select the side to use. */ + +static void fd_select_side( int side ) +{ + unsigned long flags; + + /* protect against various other ints mucking around with the PSG */ + local_irq_save(flags); + + sound_ym.rd_data_reg_sel = 14; /* Select PSG Port A */ + sound_ym.wd_data = (side == 0) ? sound_ym.rd_data_reg_sel | 0x01 : + sound_ym.rd_data_reg_sel & 0xfe; + + local_irq_restore(flags); +} + + +/* Select a drive, update the FDC's track register and set the correct + * clock speed for this disk's type. + */ + +static void fd_select_drive( int drive ) +{ + unsigned long flags; + unsigned char tmp; + + if (drive == SelectedDrive) + return; + + /* protect against various other ints mucking around with the PSG */ + local_irq_save(flags); + sound_ym.rd_data_reg_sel = 14; /* Select PSG Port A */ + tmp = sound_ym.rd_data_reg_sel; + sound_ym.wd_data = (tmp | DSKDRVNONE) & ~(drive == 0 ? DSKDRV0 : DSKDRV1); + atari_dont_touch_floppy_select = 1; + local_irq_restore(flags); + + /* restore track register to saved value */ + FDC_WRITE( FDCREG_TRACK, UD.track ); + udelay(25); + + /* select 8/16 MHz */ + if (UDT) + if (ATARIHW_PRESENT(FDCSPEED)) + dma_wd.fdc_speed = UDT->fdc_speed; + + SelectedDrive = drive; +} + + +/* Deselect both drives. */ + +static void fd_deselect( void ) +{ + unsigned long flags; + + /* protect against various other ints mucking around with the PSG */ + local_irq_save(flags); + atari_dont_touch_floppy_select = 0; + sound_ym.rd_data_reg_sel=14; /* Select PSG Port A */ + sound_ym.wd_data = (sound_ym.rd_data_reg_sel | + (MACH_IS_FALCON ? 3 : 7)); /* no drives selected */ + /* On Falcon, the drive B select line is used on the printer port, so + * leave it alone... */ + SelectedDrive = -1; + local_irq_restore(flags); +} + + +/* This timer function deselects the drives when the FDC switched the + * motor off. The deselection cannot happen earlier because the FDC + * counts the index signals, which arrive only if one drive is selected. + */ + +static void fd_motor_off_timer( unsigned long dummy ) +{ + unsigned char status; + + if (SelectedDrive < 0) + /* no drive selected, needn't deselect anyone */ + return; + + if (stdma_islocked()) + goto retry; + + status = FDC_READ( FDCREG_STATUS ); + + if (!(status & 0x80)) { + /* motor already turned off by FDC -> deselect drives */ + MotorOn = 0; + fd_deselect(); + return; + } + /* not yet off, try again */ + + retry: + /* Test again later; if tested too often, it seems there is no disk + * in the drive and the FDC will leave the motor on forever (or, + * at least until a disk is inserted). So we'll test only twice + * per second from then on... + */ + mod_timer(&motor_off_timer, + jiffies + (MotorOffTrys++ < FD_MOTOR_OFF_MAXTRY ? HZ/20 : HZ/2)); +} + + +/* This function is repeatedly called to detect disk changes (as good + * as possible) and keep track of the current state of the write protection. + */ + +static void check_change( unsigned long dummy ) +{ + static int drive = 0; + + unsigned long flags; + unsigned char old_porta; + int stat; + + if (++drive > 1 || !UD.connected) + drive = 0; + + /* protect against various other ints mucking around with the PSG */ + local_irq_save(flags); + + if (!stdma_islocked()) { + sound_ym.rd_data_reg_sel = 14; + old_porta = sound_ym.rd_data_reg_sel; + sound_ym.wd_data = (old_porta | DSKDRVNONE) & + ~(drive == 0 ? DSKDRV0 : DSKDRV1); + stat = !!(FDC_READ( FDCREG_STATUS ) & FDCSTAT_WPROT); + sound_ym.wd_data = old_porta; + + if (stat != UD.wpstat) { + DPRINT(( "wpstat[%d] = %d\n", drive, stat )); + UD.wpstat = stat; + set_bit (drive, &changed_floppies); + } + } + local_irq_restore(flags); + + start_check_change_timer(); +} + + +/* Handling of the Head Settling Flag: This flag should be set after each + * seek operation, because we don't use seeks with verify. + */ + +static inline void set_head_settle_flag(void) +{ + HeadSettleFlag = FDCCMDADD_E; +} + +static inline int get_head_settle_flag(void) +{ + int tmp = HeadSettleFlag; + HeadSettleFlag = 0; + return( tmp ); +} + +static inline void copy_buffer(void *from, void *to) +{ + ulong *p1 = (ulong *)from, *p2 = (ulong *)to; + int cnt; + + for (cnt = 512/4; cnt; cnt--) + *p2++ = *p1++; +} + + + + +/* General Interrupt Handling */ + +static void (*FloppyIRQHandler)( int status ) = NULL; + +static irqreturn_t floppy_irq (int irq, void *dummy) +{ + unsigned char status; + void (*handler)( int ); + + handler = xchg(&FloppyIRQHandler, NULL); + + if (handler) { + nop(); + status = FDC_READ( FDCREG_STATUS ); + DPRINT(("FDC irq, status = %02x handler = %08lx\n",status,(unsigned long)handler)); + handler( status ); + } + else { + DPRINT(("FDC irq, no handler\n")); + } + return IRQ_HANDLED; +} + + +/* Error handling: If some error happened, retry some times, then + * recalibrate, then try again, and fail after MAX_ERRORS. + */ + +static void fd_error( void ) +{ + if (IsFormatting) { + IsFormatting = 0; + FormatError = 1; + complete(&format_wait); + return; + } + + if (!fd_request) + return; + + fd_request->errors++; + if (fd_request->errors >= MAX_ERRORS) { + printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); + fd_end_request_cur(-EIO); + } + else if (fd_request->errors == RECALIBRATE_ERRORS) { + printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); + if (SelectedDrive != -1) + SUD.track = -1; + } + redo_fd_request(); +} + + + +#define SET_IRQ_HANDLER(proc) do { FloppyIRQHandler = (proc); } while(0) + + +/* ---------- Formatting ---------- */ + +#define FILL(n,val) \ + do { \ + memset( p, val, n ); \ + p += n; \ + } while(0) + +static int do_format(int drive, int type, struct atari_format_descr *desc) +{ + unsigned char *p; + int sect, nsect; + unsigned long flags; + + DPRINT(("do_format( dr=%d tr=%d he=%d offs=%d )\n", + drive, desc->track, desc->head, desc->sect_offset )); + + wait_event(fdc_wait, cmpxchg(&fdc_busy, 0, 1) == 0); + local_irq_save(flags); + stdma_lock(floppy_irq, NULL); + atari_turnon_irq( IRQ_MFP_FDC ); /* should be already, just to be sure */ + local_irq_restore(flags); + + if (type) { + if (--type >= NUM_DISK_MINORS || + minor2disktype[type].drive_types > DriveType) { + redo_fd_request(); + return -EINVAL; + } + type = minor2disktype[type].index; + UDT = &atari_disk_type[type]; + } + + if (!UDT || desc->track >= UDT->blocks/UDT->spt/2 || desc->head >= 2) { + redo_fd_request(); + return -EINVAL; + } + + nsect = UDT->spt; + p = TrackBuffer; + /* The track buffer is used for the raw track data, so its + contents become invalid! */ + BufferDrive = -1; + /* stop deselect timer */ + del_timer( &motor_off_timer ); + + FILL( 60 * (nsect / 9), 0x4e ); + for( sect = 0; sect < nsect; ++sect ) { + FILL( 12, 0 ); + FILL( 3, 0xf5 ); + *p++ = 0xfe; + *p++ = desc->track; + *p++ = desc->head; + *p++ = (nsect + sect - desc->sect_offset) % nsect + 1; + *p++ = 2; + *p++ = 0xf7; + FILL( 22, 0x4e ); + FILL( 12, 0 ); + FILL( 3, 0xf5 ); + *p++ = 0xfb; + FILL( 512, 0xe5 ); + *p++ = 0xf7; + FILL( 40, 0x4e ); + } + FILL( TrackBuffer+BUFFER_SIZE-p, 0x4e ); + + IsFormatting = 1; + FormatError = 0; + ReqTrack = desc->track; + ReqSide = desc->head; + do_fd_action( drive ); + + wait_for_completion(&format_wait); + + redo_fd_request(); + return( FormatError ? -EIO : 0 ); +} + + +/* do_fd_action() is the general procedure for a fd request: All + * required parameter settings (drive select, side select, track + * position) are checked and set if needed. For each of these + * parameters and the actual reading or writing exist two functions: + * one that starts the setting (or skips it if possible) and one + * callback for the "done" interrupt. Each done func calls the next + * set function to propagate the request down to fd_rwsec_done(). + */ + +static void do_fd_action( int drive ) +{ + DPRINT(("do_fd_action\n")); + + if (UseTrackbuffer && !IsFormatting) { + repeat: + if (IS_BUFFERED( drive, ReqSide, ReqTrack )) { + if (ReqCmd == READ) { + copy_buffer( SECTOR_BUFFER(ReqSector), ReqData ); + if (++ReqCnt < blk_rq_cur_sectors(fd_request)) { + /* read next sector */ + setup_req_params( drive ); + goto repeat; + } + else { + /* all sectors finished */ + fd_end_request_cur(0); + redo_fd_request(); + return; + } + } + else { + /* cmd == WRITE, pay attention to track buffer + * consistency! */ + copy_buffer( ReqData, SECTOR_BUFFER(ReqSector) ); + } + } + } + + if (SelectedDrive != drive) + fd_select_drive( drive ); + + if (UD.track == -1) + fd_calibrate(); + else if (UD.track != ReqTrack << UDT->stretch) + fd_seek(); + else if (IsFormatting) + fd_writetrack(); + else + fd_rwsec(); +} + + +/* Seek to track 0 if the current track is unknown */ + +static void fd_calibrate( void ) +{ + if (SUD.track >= 0) { + fd_calibrate_done( 0 ); + return; + } + + if (ATARIHW_PRESENT(FDCSPEED)) + dma_wd.fdc_speed = 0; /* always seek with 8 Mhz */; + DPRINT(("fd_calibrate\n")); + SET_IRQ_HANDLER( fd_calibrate_done ); + /* we can't verify, since the speed may be incorrect */ + FDC_WRITE( FDCREG_CMD, FDCCMD_RESTORE | SUD.steprate ); + + NeedSeek = 1; + MotorOn = 1; + start_timeout(); + /* wait for IRQ */ +} + + +static void fd_calibrate_done( int status ) +{ + DPRINT(("fd_calibrate_done()\n")); + stop_timeout(); + + /* set the correct speed now */ + if (ATARIHW_PRESENT(FDCSPEED)) + dma_wd.fdc_speed = SUDT->fdc_speed; + if (status & FDCSTAT_RECNF) { + printk(KERN_ERR "fd%d: restore failed\n", SelectedDrive ); + fd_error(); + } + else { + SUD.track = 0; + fd_seek(); + } +} + + +/* Seek the drive to the requested track. The drive must have been + * calibrated at some point before this. + */ + +static void fd_seek( void ) +{ + if (SUD.track == ReqTrack << SUDT->stretch) { + fd_seek_done( 0 ); + return; + } + + if (ATARIHW_PRESENT(FDCSPEED)) { + dma_wd.fdc_speed = 0; /* always seek witch 8 Mhz */ + MFPDELAY(); + } + + DPRINT(("fd_seek() to track %d\n",ReqTrack)); + FDC_WRITE( FDCREG_DATA, ReqTrack << SUDT->stretch); + udelay(25); + SET_IRQ_HANDLER( fd_seek_done ); + FDC_WRITE( FDCREG_CMD, FDCCMD_SEEK | SUD.steprate ); + + MotorOn = 1; + set_head_settle_flag(); + start_timeout(); + /* wait for IRQ */ +} + + +static void fd_seek_done( int status ) +{ + DPRINT(("fd_seek_done()\n")); + stop_timeout(); + + /* set the correct speed */ + if (ATARIHW_PRESENT(FDCSPEED)) + dma_wd.fdc_speed = SUDT->fdc_speed; + if (status & FDCSTAT_RECNF) { + printk(KERN_ERR "fd%d: seek error (to track %d)\n", + SelectedDrive, ReqTrack ); + /* we don't know exactly which track we are on now! */ + SUD.track = -1; + fd_error(); + } + else { + SUD.track = ReqTrack << SUDT->stretch; + NeedSeek = 0; + if (IsFormatting) + fd_writetrack(); + else + fd_rwsec(); + } +} + + +/* This does the actual reading/writing after positioning the head + * over the correct track. + */ + +static int MultReadInProgress = 0; + + +static void fd_rwsec( void ) +{ + unsigned long paddr, flags; + unsigned int rwflag, old_motoron; + unsigned int track; + + DPRINT(("fd_rwsec(), Sec=%d, Access=%c\n",ReqSector, ReqCmd == WRITE ? 'w' : 'r' )); + if (ReqCmd == WRITE) { + if (ATARIHW_PRESENT(EXTD_DMA)) { + paddr = virt_to_phys(ReqData); + } + else { + copy_buffer( ReqData, DMABuffer ); + paddr = PhysDMABuffer; + } + dma_cache_maintenance( paddr, 512, 1 ); + rwflag = 0x100; + } + else { + if (read_track) + paddr = PhysTrackBuffer; + else + paddr = ATARIHW_PRESENT(EXTD_DMA) ? + virt_to_phys(ReqData) : PhysDMABuffer; + rwflag = 0; + } + + fd_select_side( ReqSide ); + + /* Start sector of this operation */ + FDC_WRITE( FDCREG_SECTOR, read_track ? 1 : ReqSector ); + MFPDELAY(); + /* Cheat for track if stretch != 0 */ + if (SUDT->stretch) { + track = FDC_READ( FDCREG_TRACK); + MFPDELAY(); + FDC_WRITE( FDCREG_TRACK, track >> SUDT->stretch); + } + udelay(25); + + /* Setup DMA */ + local_irq_save(flags); + dma_wd.dma_lo = (unsigned char)paddr; + MFPDELAY(); + paddr >>= 8; + dma_wd.dma_md = (unsigned char)paddr; + MFPDELAY(); + paddr >>= 8; + if (ATARIHW_PRESENT(EXTD_DMA)) + st_dma_ext_dmahi = (unsigned short)paddr; + else + dma_wd.dma_hi = (unsigned char)paddr; + MFPDELAY(); + local_irq_restore(flags); + + /* Clear FIFO and switch DMA to correct mode */ + dma_wd.dma_mode_status = 0x90 | rwflag; + MFPDELAY(); + dma_wd.dma_mode_status = 0x90 | (rwflag ^ 0x100); + MFPDELAY(); + dma_wd.dma_mode_status = 0x90 | rwflag; + MFPDELAY(); + + /* How many sectors for DMA */ + dma_wd.fdc_acces_seccount = read_track ? SUDT->spt : 1; + + udelay(25); + + /* Start operation */ + dma_wd.dma_mode_status = FDCSELREG_STP | rwflag; + udelay(25); + SET_IRQ_HANDLER( fd_rwsec_done ); + dma_wd.fdc_acces_seccount = + (get_head_settle_flag() | + (rwflag ? FDCCMD_WRSEC : (FDCCMD_RDSEC | (read_track ? FDCCMDADD_M : 0)))); + + old_motoron = MotorOn; + MotorOn = 1; + NeedSeek = 1; + /* wait for interrupt */ + + if (read_track) { + /* If reading a whole track, wait about one disk rotation and + * then check if all sectors are read. The FDC will even + * search for the first non-existent sector and need 1 sec to + * recognise that it isn't present :-( + */ + MultReadInProgress = 1; + mod_timer(&readtrack_timer, + /* 1 rot. + 5 rot.s if motor was off */ + jiffies + HZ/5 + (old_motoron ? 0 : HZ)); + } + start_timeout(); +} + + +static void fd_readtrack_check( unsigned long dummy ) +{ + unsigned long flags, addr, addr2; + + local_irq_save(flags); + + if (!MultReadInProgress) { + /* This prevents a race condition that could arise if the + * interrupt is triggered while the calling of this timer + * callback function takes place. The IRQ function then has + * already cleared 'MultReadInProgress' when flow of control + * gets here. + */ + local_irq_restore(flags); + return; + } + + /* get the current DMA address */ + /* ++ f.a. read twice to avoid being fooled by switcher */ + addr = 0; + do { + addr2 = addr; + addr = dma_wd.dma_lo & 0xff; + MFPDELAY(); + addr |= (dma_wd.dma_md & 0xff) << 8; + MFPDELAY(); + if (ATARIHW_PRESENT( EXTD_DMA )) + addr |= (st_dma_ext_dmahi & 0xffff) << 16; + else + addr |= (dma_wd.dma_hi & 0xff) << 16; + MFPDELAY(); + } while(addr != addr2); + + if (addr >= PhysTrackBuffer + SUDT->spt*512) { + /* already read enough data, force an FDC interrupt to stop + * the read operation + */ + SET_IRQ_HANDLER( NULL ); + MultReadInProgress = 0; + local_irq_restore(flags); + DPRINT(("fd_readtrack_check(): done\n")); + FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI ); + udelay(25); + + /* No error until now -- the FDC would have interrupted + * otherwise! + */ + fd_rwsec_done1(0); + } + else { + /* not yet finished, wait another tenth rotation */ + local_irq_restore(flags); + DPRINT(("fd_readtrack_check(): not yet finished\n")); + mod_timer(&readtrack_timer, jiffies + HZ/5/10); + } +} + + +static void fd_rwsec_done( int status ) +{ + DPRINT(("fd_rwsec_done()\n")); + + if (read_track) { + del_timer(&readtrack_timer); + if (!MultReadInProgress) + return; + MultReadInProgress = 0; + } + fd_rwsec_done1(status); +} + +static void fd_rwsec_done1(int status) +{ + unsigned int track; + + stop_timeout(); + + /* Correct the track if stretch != 0 */ + if (SUDT->stretch) { + track = FDC_READ( FDCREG_TRACK); + MFPDELAY(); + FDC_WRITE( FDCREG_TRACK, track << SUDT->stretch); + } + + if (!UseTrackbuffer) { + dma_wd.dma_mode_status = 0x90; + MFPDELAY(); + if (!(dma_wd.dma_mode_status & 0x01)) { + printk(KERN_ERR "fd%d: DMA error\n", SelectedDrive ); + goto err_end; + } + } + MFPDELAY(); + + if (ReqCmd == WRITE && (status & FDCSTAT_WPROT)) { + printk(KERN_NOTICE "fd%d: is write protected\n", SelectedDrive ); + goto err_end; + } + if ((status & FDCSTAT_RECNF) && + /* RECNF is no error after a multiple read when the FDC + searched for a non-existent sector! */ + !(read_track && FDC_READ(FDCREG_SECTOR) > SUDT->spt)) { + if (Probing) { + if (SUDT > atari_disk_type) { + if (SUDT[-1].blocks > ReqBlock) { + /* try another disk type */ + SUDT--; + set_capacity(unit[SelectedDrive].disk, + SUDT->blocks); + } else + Probing = 0; + } + else { + if (SUD.flags & FTD_MSG) + printk(KERN_INFO "fd%d: Auto-detected floppy type %s\n", + SelectedDrive, SUDT->name ); + Probing=0; + } + } else { +/* record not found, but not probing. Maybe stretch wrong ? Restart probing */ + if (SUD.autoprobe) { + SUDT = atari_disk_type + StartDiskType[DriveType]; + set_capacity(unit[SelectedDrive].disk, + SUDT->blocks); + Probing = 1; + } + } + if (Probing) { + if (ATARIHW_PRESENT(FDCSPEED)) { + dma_wd.fdc_speed = SUDT->fdc_speed; + MFPDELAY(); + } + setup_req_params( SelectedDrive ); + BufferDrive = -1; + do_fd_action( SelectedDrive ); + return; + } + + printk(KERN_ERR "fd%d: sector %d not found (side %d, track %d)\n", + SelectedDrive, FDC_READ (FDCREG_SECTOR), ReqSide, ReqTrack ); + goto err_end; + } + if (status & FDCSTAT_CRC) { + printk(KERN_ERR "fd%d: CRC error (side %d, track %d, sector %d)\n", + SelectedDrive, ReqSide, ReqTrack, FDC_READ (FDCREG_SECTOR) ); + goto err_end; + } + if (status & FDCSTAT_LOST) { + printk(KERN_ERR "fd%d: lost data (side %d, track %d, sector %d)\n", + SelectedDrive, ReqSide, ReqTrack, FDC_READ (FDCREG_SECTOR) ); + goto err_end; + } + + Probing = 0; + + if (ReqCmd == READ) { + if (!read_track) { + void *addr; + addr = ATARIHW_PRESENT( EXTD_DMA ) ? ReqData : DMABuffer; + dma_cache_maintenance( virt_to_phys(addr), 512, 0 ); + if (!ATARIHW_PRESENT( EXTD_DMA )) + copy_buffer (addr, ReqData); + } else { + dma_cache_maintenance( PhysTrackBuffer, MaxSectors[DriveType] * 512, 0 ); + BufferDrive = SelectedDrive; + BufferSide = ReqSide; + BufferTrack = ReqTrack; + copy_buffer (SECTOR_BUFFER (ReqSector), ReqData); + } + } + + if (++ReqCnt < blk_rq_cur_sectors(fd_request)) { + /* read next sector */ + setup_req_params( SelectedDrive ); + do_fd_action( SelectedDrive ); + } + else { + /* all sectors finished */ + fd_end_request_cur(0); + redo_fd_request(); + } + return; + + err_end: + BufferDrive = -1; + fd_error(); +} + + +static void fd_writetrack( void ) +{ + unsigned long paddr, flags; + unsigned int track; + + DPRINT(("fd_writetrack() Tr=%d Si=%d\n", ReqTrack, ReqSide )); + + paddr = PhysTrackBuffer; + dma_cache_maintenance( paddr, BUFFER_SIZE, 1 ); + + fd_select_side( ReqSide ); + + /* Cheat for track if stretch != 0 */ + if (SUDT->stretch) { + track = FDC_READ( FDCREG_TRACK); + MFPDELAY(); + FDC_WRITE(FDCREG_TRACK,track >> SUDT->stretch); + } + udelay(40); + + /* Setup DMA */ + local_irq_save(flags); + dma_wd.dma_lo = (unsigned char)paddr; + MFPDELAY(); + paddr >>= 8; + dma_wd.dma_md = (unsigned char)paddr; + MFPDELAY(); + paddr >>= 8; + if (ATARIHW_PRESENT( EXTD_DMA )) + st_dma_ext_dmahi = (unsigned short)paddr; + else + dma_wd.dma_hi = (unsigned char)paddr; + MFPDELAY(); + local_irq_restore(flags); + + /* Clear FIFO and switch DMA to correct mode */ + dma_wd.dma_mode_status = 0x190; + MFPDELAY(); + dma_wd.dma_mode_status = 0x90; + MFPDELAY(); + dma_wd.dma_mode_status = 0x190; + MFPDELAY(); + + /* How many sectors for DMA */ + dma_wd.fdc_acces_seccount = BUFFER_SIZE/512; + udelay(40); + + /* Start operation */ + dma_wd.dma_mode_status = FDCSELREG_STP | 0x100; + udelay(40); + SET_IRQ_HANDLER( fd_writetrack_done ); + dma_wd.fdc_acces_seccount = FDCCMD_WRTRA | get_head_settle_flag(); + + MotorOn = 1; + start_timeout(); + /* wait for interrupt */ +} + + +static void fd_writetrack_done( int status ) +{ + DPRINT(("fd_writetrack_done()\n")); + + stop_timeout(); + + if (status & FDCSTAT_WPROT) { + printk(KERN_NOTICE "fd%d: is write protected\n", SelectedDrive ); + goto err_end; + } + if (status & FDCSTAT_LOST) { + printk(KERN_ERR "fd%d: lost data (side %d, track %d)\n", + SelectedDrive, ReqSide, ReqTrack ); + goto err_end; + } + + complete(&format_wait); + return; + + err_end: + fd_error(); +} + +static void fd_times_out( unsigned long dummy ) +{ + atari_disable_irq( IRQ_MFP_FDC ); + if (!FloppyIRQHandler) goto end; /* int occurred after timer was fired, but + * before we came here... */ + + SET_IRQ_HANDLER( NULL ); + /* If the timeout occurred while the readtrack_check timer was + * active, we need to cancel it, else bad things will happen */ + if (UseTrackbuffer) + del_timer( &readtrack_timer ); + FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI ); + udelay( 25 ); + + printk(KERN_ERR "floppy timeout\n" ); + fd_error(); + end: + atari_enable_irq( IRQ_MFP_FDC ); +} + + +/* The (noop) seek operation here is needed to make the WP bit in the + * FDC status register accessible for check_change. If the last disk + * operation would have been a RDSEC, this bit would always read as 0 + * no matter what :-( To save time, the seek goes to the track we're + * already on. + */ + +static void finish_fdc( void ) +{ + if (!NeedSeek) { + finish_fdc_done( 0 ); + } + else { + DPRINT(("finish_fdc: dummy seek started\n")); + FDC_WRITE (FDCREG_DATA, SUD.track); + SET_IRQ_HANDLER( finish_fdc_done ); + FDC_WRITE (FDCREG_CMD, FDCCMD_SEEK); + MotorOn = 1; + start_timeout(); + /* we must wait for the IRQ here, because the ST-DMA + is released immediately afterwards and the interrupt + may be delivered to the wrong driver. */ + } +} + + +static void finish_fdc_done( int dummy ) +{ + unsigned long flags; + + DPRINT(("finish_fdc_done entered\n")); + stop_timeout(); + NeedSeek = 0; + + if (timer_pending(&fd_timer) && time_before(fd_timer.expires, jiffies + 5)) + /* If the check for a disk change is done too early after this + * last seek command, the WP bit still reads wrong :-(( + */ + mod_timer(&fd_timer, jiffies + 5); + else + start_check_change_timer(); + start_motor_off_timer(); + + local_irq_save(flags); + stdma_release(); + fdc_busy = 0; + wake_up( &fdc_wait ); + local_irq_restore(flags); + + DPRINT(("finish_fdc() finished\n")); +} + +/* The detection of disk changes is a dark chapter in Atari history :-( + * Because the "Drive ready" signal isn't present in the Atari + * hardware, one has to rely on the "Write Protect". This works fine, + * as long as no write protected disks are used. TOS solves this + * problem by introducing tri-state logic ("maybe changed") and + * looking at the serial number in block 0. This isn't possible for + * Linux, since the floppy driver can't make assumptions about the + * filesystem used on the disk and thus the contents of block 0. I've + * chosen the method to always say "The disk was changed" if it is + * unsure whether it was. This implies that every open or mount + * invalidates the disk buffers if you work with write protected + * disks. But at least this is better than working with incorrect data + * due to unrecognised disk changes. + */ + +static unsigned int floppy_check_events(struct gendisk *disk, + unsigned int clearing) +{ + struct atari_floppy_struct *p = disk->private_data; + unsigned int drive = p - unit; + if (test_bit (drive, &fake_change)) { + /* simulated change (e.g. after formatting) */ + return DISK_EVENT_MEDIA_CHANGE; + } + if (test_bit (drive, &changed_floppies)) { + /* surely changed (the WP signal changed at least once) */ + return DISK_EVENT_MEDIA_CHANGE; + } + if (UD.wpstat) { + /* WP is on -> could be changed: to be sure, buffers should be + * invalidated... + */ + return DISK_EVENT_MEDIA_CHANGE; + } + + return 0; +} + +static int floppy_revalidate(struct gendisk *disk) +{ + struct atari_floppy_struct *p = disk->private_data; + unsigned int drive = p - unit; + + if (test_bit(drive, &changed_floppies) || + test_bit(drive, &fake_change) || + p->disktype == 0) { + if (UD.flags & FTD_MSG) + printk(KERN_ERR "floppy: clear format %p!\n", UDT); + BufferDrive = -1; + clear_bit(drive, &fake_change); + clear_bit(drive, &changed_floppies); + /* MSch: clearing geometry makes sense only for autoprobe + formats, for 'permanent user-defined' parameter: + restore default_params[] here if flagged valid! */ + if (default_params[drive].blocks == 0) + UDT = NULL; + else + UDT = &default_params[drive]; + } + return 0; +} + + +/* This sets up the global variables describing the current request. */ + +static void setup_req_params( int drive ) +{ + int block = ReqBlock + ReqCnt; + + ReqTrack = block / UDT->spt; + ReqSector = block - ReqTrack * UDT->spt + 1; + ReqSide = ReqTrack & 1; + ReqTrack >>= 1; + ReqData = ReqBuffer + 512 * ReqCnt; + + if (UseTrackbuffer) + read_track = (ReqCmd == READ && fd_request->errors == 0); + else + read_track = 0; + + DPRINT(("Request params: Si=%d Tr=%d Se=%d Data=%08lx\n",ReqSide, + ReqTrack, ReqSector, (unsigned long)ReqData )); +} + +/* + * Round-robin between our available drives, doing one request from each + */ +static struct request *set_next_request(void) +{ + struct request_queue *q; + int old_pos = fdc_queue; + struct request *rq = NULL; + + do { + q = unit[fdc_queue].disk->queue; + if (++fdc_queue == FD_MAX_UNITS) + fdc_queue = 0; + if (q) { + rq = blk_fetch_request(q); + if (rq) + break; + } + } while (fdc_queue != old_pos); + + return rq; +} + + +static void redo_fd_request(void) +{ + int drive, type; + struct atari_floppy_struct *floppy; + + DPRINT(("redo_fd_request: fd_request=%p dev=%s fd_request->sector=%ld\n", + fd_request, fd_request ? fd_request->rq_disk->disk_name : "", + fd_request ? blk_rq_pos(fd_request) : 0 )); + + IsFormatting = 0; + +repeat: + if (!fd_request) { + fd_request = set_next_request(); + if (!fd_request) + goto the_end; + } + + floppy = fd_request->rq_disk->private_data; + drive = floppy - unit; + type = floppy->type; + + if (!UD.connected) { + /* drive not connected */ + printk(KERN_ERR "Unknown Device: fd%d\n", drive ); + fd_end_request_cur(-EIO); + goto repeat; + } + + if (type == 0) { + if (!UDT) { + Probing = 1; + UDT = atari_disk_type + StartDiskType[DriveType]; + set_capacity(floppy->disk, UDT->blocks); + UD.autoprobe = 1; + } + } + else { + /* user supplied disk type */ + if (--type >= NUM_DISK_MINORS) { + printk(KERN_WARNING "fd%d: invalid disk format", drive ); + fd_end_request_cur(-EIO); + goto repeat; + } + if (minor2disktype[type].drive_types > DriveType) { + printk(KERN_WARNING "fd%d: unsupported disk format", drive ); + fd_end_request_cur(-EIO); + goto repeat; + } + type = minor2disktype[type].index; + UDT = &atari_disk_type[type]; + set_capacity(floppy->disk, UDT->blocks); + UD.autoprobe = 0; + } + + if (blk_rq_pos(fd_request) + 1 > UDT->blocks) { + fd_end_request_cur(-EIO); + goto repeat; + } + + /* stop deselect timer */ + del_timer( &motor_off_timer ); + + ReqCnt = 0; + ReqCmd = rq_data_dir(fd_request); + ReqBlock = blk_rq_pos(fd_request); + ReqBuffer = bio_data(fd_request->bio); + setup_req_params( drive ); + do_fd_action( drive ); + + return; + + the_end: + finish_fdc(); +} + + +void do_fd_request(struct request_queue * q) +{ + DPRINT(("do_fd_request for pid %d\n",current->pid)); + wait_event(fdc_wait, cmpxchg(&fdc_busy, 0, 1) == 0); + stdma_lock(floppy_irq, NULL); + + atari_disable_irq( IRQ_MFP_FDC ); + redo_fd_request(); + atari_enable_irq( IRQ_MFP_FDC ); +} + +static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + struct gendisk *disk = bdev->bd_disk; + struct atari_floppy_struct *floppy = disk->private_data; + int drive = floppy - unit; + int type = floppy->type; + struct atari_format_descr fmt_desc; + struct atari_disk_type *dtp; + struct floppy_struct getprm; + int settype; + struct floppy_struct setprm; + void __user *argp = (void __user *)param; + + switch (cmd) { + case FDGETPRM: + if (type) { + if (--type >= NUM_DISK_MINORS) + return -ENODEV; + if (minor2disktype[type].drive_types > DriveType) + return -ENODEV; + type = minor2disktype[type].index; + dtp = &atari_disk_type[type]; + if (UD.flags & FTD_MSG) + printk (KERN_ERR "floppy%d: found dtp %p name %s!\n", + drive, dtp, dtp->name); + } + else { + if (!UDT) + return -ENXIO; + else + dtp = UDT; + } + memset((void *)&getprm, 0, sizeof(getprm)); + getprm.size = dtp->blocks; + getprm.sect = dtp->spt; + getprm.head = 2; + getprm.track = dtp->blocks/dtp->spt/2; + getprm.stretch = dtp->stretch; + if (copy_to_user(argp, &getprm, sizeof(getprm))) + return -EFAULT; + return 0; + } + switch (cmd) { + case FDSETPRM: + case FDDEFPRM: + /* + * MSch 7/96: simple 'set geometry' case: just set the + * 'default' device params (minor == 0). + * Currently, the drive geometry is cleared after each + * disk change and subsequent revalidate()! simple + * implementation of FDDEFPRM: save geometry from a + * FDDEFPRM call and restore it in floppy_revalidate() ! + */ + + /* get the parameters from user space */ + if (floppy->ref != 1 && floppy->ref != -1) + return -EBUSY; + if (copy_from_user(&setprm, argp, sizeof(setprm))) + return -EFAULT; + /* + * first of all: check for floppy change and revalidate, + * or the next access will revalidate - and clear UDT :-( + */ + + if (floppy_check_events(disk, 0)) + floppy_revalidate(disk); + + if (UD.flags & FTD_MSG) + printk (KERN_INFO "floppy%d: setting size %d spt %d str %d!\n", + drive, setprm.size, setprm.sect, setprm.stretch); + + /* what if type > 0 here? Overwrite specified entry ? */ + if (type) { + /* refuse to re-set a predefined type for now */ + redo_fd_request(); + return -EINVAL; + } + + /* + * type == 0: first look for a matching entry in the type list, + * and set the UD.disktype field to use the perdefined entry. + * TODO: add user-defined format to head of autoprobe list ? + * Useful to include the user-type for future autodetection! + */ + + for (settype = 0; settype < NUM_DISK_MINORS; settype++) { + int setidx = 0; + if (minor2disktype[settype].drive_types > DriveType) { + /* skip this one, invalid for drive ... */ + continue; + } + setidx = minor2disktype[settype].index; + dtp = &atari_disk_type[setidx]; + + /* found matching entry ?? */ + if ( dtp->blocks == setprm.size + && dtp->spt == setprm.sect + && dtp->stretch == setprm.stretch ) { + if (UD.flags & FTD_MSG) + printk (KERN_INFO "floppy%d: setting %s %p!\n", + drive, dtp->name, dtp); + UDT = dtp; + set_capacity(floppy->disk, UDT->blocks); + + if (cmd == FDDEFPRM) { + /* save settings as permanent default type */ + default_params[drive].name = dtp->name; + default_params[drive].spt = dtp->spt; + default_params[drive].blocks = dtp->blocks; + default_params[drive].fdc_speed = dtp->fdc_speed; + default_params[drive].stretch = dtp->stretch; + } + + return 0; + } + + } + + /* no matching disk type found above - setting user_params */ + + if (cmd == FDDEFPRM) { + /* set permanent type */ + dtp = &default_params[drive]; + } else + /* set user type (reset by disk change!) */ + dtp = &user_params[drive]; + + dtp->name = "user format"; + dtp->blocks = setprm.size; + dtp->spt = setprm.sect; + if (setprm.sect > 14) + dtp->fdc_speed = 3; + else + dtp->fdc_speed = 0; + dtp->stretch = setprm.stretch; + + if (UD.flags & FTD_MSG) + printk (KERN_INFO "floppy%d: blk %d spt %d str %d!\n", + drive, dtp->blocks, dtp->spt, dtp->stretch); + + /* sanity check */ + if (setprm.track != dtp->blocks/dtp->spt/2 || + setprm.head != 2) { + redo_fd_request(); + return -EINVAL; + } + + UDT = dtp; + set_capacity(floppy->disk, UDT->blocks); + + return 0; + case FDMSGON: + UD.flags |= FTD_MSG; + return 0; + case FDMSGOFF: + UD.flags &= ~FTD_MSG; + return 0; + case FDSETEMSGTRESH: + return -EINVAL; + case FDFMTBEG: + return 0; + case FDFMTTRK: + if (floppy->ref != 1 && floppy->ref != -1) + return -EBUSY; + if (copy_from_user(&fmt_desc, argp, sizeof(fmt_desc))) + return -EFAULT; + return do_format(drive, type, &fmt_desc); + case FDCLRPRM: + UDT = NULL; + /* MSch: invalidate default_params */ + default_params[drive].blocks = 0; + set_capacity(floppy->disk, MAX_DISK_SIZE * 2); + case FDFMTEND: + case FDFLUSH: + /* invalidate the buffer track to force a reread */ + BufferDrive = -1; + set_bit(drive, &fake_change); + check_disk_change(bdev); + return 0; + default: + return -EINVAL; + } +} + +static int fd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + int ret; + + mutex_lock(&ataflop_mutex); + ret = fd_locked_ioctl(bdev, mode, cmd, arg); + mutex_unlock(&ataflop_mutex); + + return ret; +} + +/* Initialize the 'unit' variable for drive 'drive' */ + +static void __init fd_probe( int drive ) +{ + UD.connected = 0; + UDT = NULL; + + if (!fd_test_drive_present( drive )) + return; + + UD.connected = 1; + UD.track = 0; + switch( UserSteprate[drive] ) { + case 2: + UD.steprate = FDCSTEP_2; + break; + case 3: + UD.steprate = FDCSTEP_3; + break; + case 6: + UD.steprate = FDCSTEP_6; + break; + case 12: + UD.steprate = FDCSTEP_12; + break; + default: /* should be -1 for "not set by user" */ + if (ATARIHW_PRESENT( FDCSPEED ) || MACH_IS_MEDUSA) + UD.steprate = FDCSTEP_3; + else + UD.steprate = FDCSTEP_6; + break; + } + MotorOn = 1; /* from probe restore operation! */ +} + + +/* This function tests the physical presence of a floppy drive (not + * whether a disk is inserted). This is done by issuing a restore + * command, waiting max. 2 seconds (that should be enough to move the + * head across the whole disk) and looking at the state of the "TR00" + * signal. This should now be raised if there is a drive connected + * (and there is no hardware failure :-) Otherwise, the drive is + * declared absent. + */ + +static int __init fd_test_drive_present( int drive ) +{ + unsigned long timeout; + unsigned char status; + int ok; + + if (drive >= (MACH_IS_FALCON ? 1 : 2)) return( 0 ); + fd_select_drive( drive ); + + /* disable interrupt temporarily */ + atari_turnoff_irq( IRQ_MFP_FDC ); + FDC_WRITE (FDCREG_TRACK, 0xff00); + FDC_WRITE( FDCREG_CMD, FDCCMD_RESTORE | FDCCMDADD_H | FDCSTEP_6 ); + + timeout = jiffies + 2*HZ+HZ/2; + while (time_before(jiffies, timeout)) + if (!(st_mfp.par_dt_reg & 0x20)) + break; + + status = FDC_READ( FDCREG_STATUS ); + ok = (status & FDCSTAT_TR00) != 0; + + /* force interrupt to abort restore operation (FDC would try + * about 50 seconds!) */ + FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI ); + udelay(500); + status = FDC_READ( FDCREG_STATUS ); + udelay(20); + + if (ok) { + /* dummy seek command to make WP bit accessible */ + FDC_WRITE( FDCREG_DATA, 0 ); + FDC_WRITE( FDCREG_CMD, FDCCMD_SEEK ); + while( st_mfp.par_dt_reg & 0x20 ) + ; + status = FDC_READ( FDCREG_STATUS ); + } + + atari_turnon_irq( IRQ_MFP_FDC ); + return( ok ); +} + + +/* Look how many and which kind of drives are connected. If there are + * floppies, additionally start the disk-change and motor-off timers. + */ + +static void __init config_types( void ) +{ + int drive, cnt = 0; + + /* for probing drives, set the FDC speed to 8 MHz */ + if (ATARIHW_PRESENT(FDCSPEED)) + dma_wd.fdc_speed = 0; + + printk(KERN_INFO "Probing floppy drive(s):\n"); + for( drive = 0; drive < FD_MAX_UNITS; drive++ ) { + fd_probe( drive ); + if (UD.connected) { + printk(KERN_INFO "fd%d\n", drive); + ++cnt; + } + } + + if (FDC_READ( FDCREG_STATUS ) & FDCSTAT_BUSY) { + /* If FDC is still busy from probing, give it another FORCI + * command to abort the operation. If this isn't done, the FDC + * will interrupt later and its IRQ line stays low, because + * the status register isn't read. And this will block any + * interrupts on this IRQ line :-( + */ + FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI ); + udelay(500); + FDC_READ( FDCREG_STATUS ); + udelay(20); + } + + if (cnt > 0) { + start_motor_off_timer(); + if (cnt == 1) fd_select_drive( 0 ); + start_check_change_timer(); + } +} + +/* + * floppy_open check for aliasing (/dev/fd0 can be the same as + * /dev/PS0 etc), and disallows simultaneous access to the same + * drive with different device numbers. + */ + +static int floppy_open(struct block_device *bdev, fmode_t mode) +{ + struct atari_floppy_struct *p = bdev->bd_disk->private_data; + int type = MINOR(bdev->bd_dev) >> 2; + + DPRINT(("fd_open: type=%d\n",type)); + if (p->ref && p->type != type) + return -EBUSY; + + if (p->ref == -1 || (p->ref && mode & FMODE_EXCL)) + return -EBUSY; + + if (mode & FMODE_EXCL) + p->ref = -1; + else + p->ref++; + + p->type = type; + + if (mode & FMODE_NDELAY) + return 0; + + if (mode & (FMODE_READ|FMODE_WRITE)) { + check_disk_change(bdev); + if (mode & FMODE_WRITE) { + if (p->wpstat) { + if (p->ref < 0) + p->ref = 0; + else + p->ref--; + return -EROFS; + } + } + } + return 0; +} + +static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + mutex_lock(&ataflop_mutex); + ret = floppy_open(bdev, mode); + mutex_unlock(&ataflop_mutex); + + return ret; +} + +static void floppy_release(struct gendisk *disk, fmode_t mode) +{ + struct atari_floppy_struct *p = disk->private_data; + mutex_lock(&ataflop_mutex); + if (p->ref < 0) + p->ref = 0; + else if (!p->ref--) { + printk(KERN_ERR "floppy_release with fd_ref == 0"); + p->ref = 0; + } + mutex_unlock(&ataflop_mutex); +} + +static const struct block_device_operations floppy_fops = { + .owner = THIS_MODULE, + .open = floppy_unlocked_open, + .release = floppy_release, + .ioctl = fd_ioctl, + .check_events = floppy_check_events, + .revalidate_disk= floppy_revalidate, +}; + +static struct kobject *floppy_find(dev_t dev, int *part, void *data) +{ + int drive = *part & 3; + int type = *part >> 2; + if (drive >= FD_MAX_UNITS || type > NUM_DISK_MINORS) + return NULL; + *part = 0; + return get_disk(unit[drive].disk); +} + +static int __init atari_floppy_init (void) +{ + int i; + + if (!MACH_IS_ATARI) + /* Amiga, Mac, ... don't have Atari-compatible floppy :-) */ + return -ENODEV; + + if (register_blkdev(FLOPPY_MAJOR,"fd")) + return -EBUSY; + + for (i = 0; i < FD_MAX_UNITS; i++) { + unit[i].disk = alloc_disk(1); + if (!unit[i].disk) + goto Enomem; + } + + if (UseTrackbuffer < 0) + /* not set by user -> use default: for now, we turn + track buffering off for all Medusas, though it + could be used with ones that have a counter + card. But the test is too hard :-( */ + UseTrackbuffer = !MACH_IS_MEDUSA; + + /* initialize variables */ + SelectedDrive = -1; + BufferDrive = -1; + + DMABuffer = atari_stram_alloc(BUFFER_SIZE+512, "ataflop"); + if (!DMABuffer) { + printk(KERN_ERR "atari_floppy_init: cannot get dma buffer\n"); + goto Enomem; + } + TrackBuffer = DMABuffer + 512; + PhysDMABuffer = atari_stram_to_phys(DMABuffer); + PhysTrackBuffer = virt_to_phys(TrackBuffer); + BufferDrive = BufferSide = BufferTrack = -1; + + for (i = 0; i < FD_MAX_UNITS; i++) { + unit[i].track = -1; + unit[i].flags = 0; + unit[i].disk->major = FLOPPY_MAJOR; + unit[i].disk->first_minor = i; + sprintf(unit[i].disk->disk_name, "fd%d", i); + unit[i].disk->fops = &floppy_fops; + unit[i].disk->private_data = &unit[i]; + unit[i].disk->queue = blk_init_queue(do_fd_request, + &ataflop_lock); + if (!unit[i].disk->queue) + goto Enomem; + set_capacity(unit[i].disk, MAX_DISK_SIZE * 2); + add_disk(unit[i].disk); + } + + blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, + floppy_find, NULL, NULL); + + printk(KERN_INFO "Atari floppy driver: max. %cD, %strack buffering\n", + DriveType == 0 ? 'D' : DriveType == 1 ? 'H' : 'E', + UseTrackbuffer ? "" : "no "); + config_types(); + + return 0; +Enomem: + while (i--) { + struct request_queue *q = unit[i].disk->queue; + + put_disk(unit[i].disk); + if (q) + blk_cleanup_queue(q); + } + + unregister_blkdev(FLOPPY_MAJOR, "fd"); + return -ENOMEM; +} + +#ifndef MODULE +static int __init atari_floppy_setup(char *str) +{ + int ints[3 + FD_MAX_UNITS]; + int i; + + if (!MACH_IS_ATARI) + return 0; + + str = get_options(str, 3 + FD_MAX_UNITS, ints); + + if (ints[0] < 1) { + printk(KERN_ERR "ataflop_setup: no arguments!\n" ); + return 0; + } + else if (ints[0] > 2+FD_MAX_UNITS) { + printk(KERN_ERR "ataflop_setup: too many arguments\n" ); + } + + if (ints[1] < 0 || ints[1] > 2) + printk(KERN_ERR "ataflop_setup: bad drive type\n" ); + else + DriveType = ints[1]; + + if (ints[0] >= 2) + UseTrackbuffer = (ints[2] > 0); + + for( i = 3; i <= ints[0] && i-3 < FD_MAX_UNITS; ++i ) { + if (ints[i] != 2 && ints[i] != 3 && ints[i] != 6 && ints[i] != 12) + printk(KERN_ERR "ataflop_setup: bad steprate\n" ); + else + UserSteprate[i-3] = ints[i]; + } + return 1; +} + +__setup("floppy=", atari_floppy_setup); +#endif + +static void __exit atari_floppy_exit(void) +{ + int i; + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + for (i = 0; i < FD_MAX_UNITS; i++) { + struct request_queue *q = unit[i].disk->queue; + + del_gendisk(unit[i].disk); + put_disk(unit[i].disk); + blk_cleanup_queue(q); + } + unregister_blkdev(FLOPPY_MAJOR, "fd"); + + del_timer_sync(&fd_timer); + atari_stram_free( DMABuffer ); +} + +module_init(atari_floppy_init) +module_exit(atari_floppy_exit) + +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/brd.c b/kernel/drivers/block/brd.c new file mode 100644 index 000000000..64ab4951e --- /dev/null +++ b/kernel/drivers/block/brd.c @@ -0,0 +1,651 @@ +/* + * Ram backed block device driver. + * + * Copyright (C) 2007 Nick Piggin + * Copyright (C) 2007 Novell Inc. + * + * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright + * of their respective owners. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SECTOR_SHIFT 9 +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) + +/* + * Each block ramdisk device has a radix_tree brd_pages of pages that stores + * the pages containing the block device's contents. A brd page's ->index is + * its offset in PAGE_SIZE units. This is similar to, but in no way connected + * with, the kernel's pagecache or buffer cache (which sit above our block + * device). + */ +struct brd_device { + int brd_number; + + struct request_queue *brd_queue; + struct gendisk *brd_disk; + struct list_head brd_list; + + /* + * Backing store of pages and lock to protect it. This is the contents + * of the block device. + */ + spinlock_t brd_lock; + struct radix_tree_root brd_pages; +}; + +/* + * Look up and return a brd's page for a given sector. + */ +static DEFINE_MUTEX(brd_mutex); +static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) +{ + pgoff_t idx; + struct page *page; + + /* + * The page lifetime is protected by the fact that we have opened the + * device node -- brd pages will never be deleted under us, so we + * don't need any further locking or refcounting. + * + * This is strictly true for the radix-tree nodes as well (ie. we + * don't actually need the rcu_read_lock()), however that is not a + * documented feature of the radix-tree API so it is better to be + * safe here (we don't have total exclusion from radix tree updates + * here, only deletes). + */ + rcu_read_lock(); + idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ + page = radix_tree_lookup(&brd->brd_pages, idx); + rcu_read_unlock(); + + BUG_ON(page && page->index != idx); + + return page; +} + +/* + * Look up and return a brd's page for a given sector. + * If one does not exist, allocate an empty page, and insert that. Then + * return it. + */ +static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) +{ + pgoff_t idx; + struct page *page; + gfp_t gfp_flags; + + page = brd_lookup_page(brd, sector); + if (page) + return page; + + /* + * Must use NOIO because we don't want to recurse back into the + * block or filesystem layers from page reclaim. + * + * Cannot support DAX and highmem, because our ->direct_access + * routine for DAX must return memory that is always addressable. + * If DAX was reworked to use pfns and kmap throughout, this + * restriction might be able to be lifted. + */ + gfp_flags = GFP_NOIO | __GFP_ZERO; +#ifndef CONFIG_BLK_DEV_RAM_DAX + gfp_flags |= __GFP_HIGHMEM; +#endif + page = alloc_page(gfp_flags); + if (!page) + return NULL; + + if (radix_tree_preload(GFP_NOIO)) { + __free_page(page); + return NULL; + } + + spin_lock(&brd->brd_lock); + idx = sector >> PAGE_SECTORS_SHIFT; + page->index = idx; + if (radix_tree_insert(&brd->brd_pages, idx, page)) { + __free_page(page); + page = radix_tree_lookup(&brd->brd_pages, idx); + BUG_ON(!page); + BUG_ON(page->index != idx); + } + spin_unlock(&brd->brd_lock); + + radix_tree_preload_end(); + + return page; +} + +static void brd_free_page(struct brd_device *brd, sector_t sector) +{ + struct page *page; + pgoff_t idx; + + spin_lock(&brd->brd_lock); + idx = sector >> PAGE_SECTORS_SHIFT; + page = radix_tree_delete(&brd->brd_pages, idx); + spin_unlock(&brd->brd_lock); + if (page) + __free_page(page); +} + +static void brd_zero_page(struct brd_device *brd, sector_t sector) +{ + struct page *page; + + page = brd_lookup_page(brd, sector); + if (page) + clear_highpage(page); +} + +/* + * Free all backing store pages and radix tree. This must only be called when + * there are no other users of the device. + */ +#define FREE_BATCH 16 +static void brd_free_pages(struct brd_device *brd) +{ + unsigned long pos = 0; + struct page *pages[FREE_BATCH]; + int nr_pages; + + do { + int i; + + nr_pages = radix_tree_gang_lookup(&brd->brd_pages, + (void **)pages, pos, FREE_BATCH); + + for (i = 0; i < nr_pages; i++) { + void *ret; + + BUG_ON(pages[i]->index < pos); + pos = pages[i]->index; + ret = radix_tree_delete(&brd->brd_pages, pos); + BUG_ON(!ret || ret != pages[i]); + __free_page(pages[i]); + } + + pos++; + + /* + * This assumes radix_tree_gang_lookup always returns as + * many pages as possible. If the radix-tree code changes, + * so will this have to. + */ + } while (nr_pages == FREE_BATCH); +} + +/* + * copy_to_brd_setup must be called before copy_to_brd. It may sleep. + */ +static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) +{ + unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; + size_t copy; + + copy = min_t(size_t, n, PAGE_SIZE - offset); + if (!brd_insert_page(brd, sector)) + return -ENOSPC; + if (copy < n) { + sector += copy >> SECTOR_SHIFT; + if (!brd_insert_page(brd, sector)) + return -ENOSPC; + } + return 0; +} + +static void discard_from_brd(struct brd_device *brd, + sector_t sector, size_t n) +{ + while (n >= PAGE_SIZE) { + /* + * Don't want to actually discard pages here because + * re-allocating the pages can result in writeback + * deadlocks under heavy load. + */ + if (0) + brd_free_page(brd, sector); + else + brd_zero_page(brd, sector); + sector += PAGE_SIZE >> SECTOR_SHIFT; + n -= PAGE_SIZE; + } +} + +/* + * Copy n bytes from src to the brd starting at sector. Does not sleep. + */ +static void copy_to_brd(struct brd_device *brd, const void *src, + sector_t sector, size_t n) +{ + struct page *page; + void *dst; + unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; + size_t copy; + + copy = min_t(size_t, n, PAGE_SIZE - offset); + page = brd_lookup_page(brd, sector); + BUG_ON(!page); + + dst = kmap_atomic(page); + memcpy(dst + offset, src, copy); + kunmap_atomic(dst); + + if (copy < n) { + src += copy; + sector += copy >> SECTOR_SHIFT; + copy = n - copy; + page = brd_lookup_page(brd, sector); + BUG_ON(!page); + + dst = kmap_atomic(page); + memcpy(dst, src, copy); + kunmap_atomic(dst); + } +} + +/* + * Copy n bytes to dst from the brd starting at sector. Does not sleep. + */ +static void copy_from_brd(void *dst, struct brd_device *brd, + sector_t sector, size_t n) +{ + struct page *page; + void *src; + unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; + size_t copy; + + copy = min_t(size_t, n, PAGE_SIZE - offset); + page = brd_lookup_page(brd, sector); + if (page) { + src = kmap_atomic(page); + memcpy(dst, src + offset, copy); + kunmap_atomic(src); + } else + memset(dst, 0, copy); + + if (copy < n) { + dst += copy; + sector += copy >> SECTOR_SHIFT; + copy = n - copy; + page = brd_lookup_page(brd, sector); + if (page) { + src = kmap_atomic(page); + memcpy(dst, src, copy); + kunmap_atomic(src); + } else + memset(dst, 0, copy); + } +} + +/* + * Process a single bvec of a bio. + */ +static int brd_do_bvec(struct brd_device *brd, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + void *mem; + int err = 0; + + if (rw != READ) { + err = copy_to_brd_setup(brd, sector, len); + if (err) + goto out; + } + + mem = kmap_atomic(page); + if (rw == READ) { + copy_from_brd(mem + off, brd, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + copy_to_brd(brd, mem + off, sector, len); + } + kunmap_atomic(mem); + +out: + return err; +} + +static void brd_make_request(struct request_queue *q, struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct brd_device *brd = bdev->bd_disk->private_data; + int rw; + struct bio_vec bvec; + sector_t sector; + struct bvec_iter iter; + int err = -EIO; + + sector = bio->bi_iter.bi_sector; + if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) + goto out; + + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + err = 0; + discard_from_brd(brd, sector, bio->bi_iter.bi_size); + goto out; + } + + rw = bio_rw(bio); + if (rw == READA) + rw = READ; + + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + err = brd_do_bvec(brd, bvec.bv_page, len, + bvec.bv_offset, rw, sector); + if (err) + break; + sector += len >> SECTOR_SHIFT; + } + +out: + bio_endio(bio, err); +} + +static int brd_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct brd_device *brd = bdev->bd_disk->private_data; + int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, err); + return err; +} + +#ifdef CONFIG_BLK_DEV_RAM_DAX +static long brd_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, unsigned long *pfn, long size) +{ + struct brd_device *brd = bdev->bd_disk->private_data; + struct page *page; + + if (!brd) + return -ENODEV; + page = brd_insert_page(brd, sector); + if (!page) + return -ENOSPC; + *kaddr = page_address(page); + *pfn = page_to_pfn(page); + + /* + * TODO: If size > PAGE_SIZE, we could look to see if the next page in + * the file happens to be mapped to the next page of physical RAM. + */ + return PAGE_SIZE; +} +#else +#define brd_direct_access NULL +#endif + +static int brd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + int error; + struct brd_device *brd = bdev->bd_disk->private_data; + + if (cmd != BLKFLSBUF) + return -ENOTTY; + + /* + * ram device BLKFLSBUF has special semantics, we want to actually + * release and destroy the ramdisk data. + */ + mutex_lock(&brd_mutex); + mutex_lock(&bdev->bd_mutex); + error = -EBUSY; + if (bdev->bd_openers <= 1) { + /* + * Kill the cache first, so it isn't written back to the + * device. + * + * Another thread might instantiate more buffercache here, + * but there is not much we can do to close that race. + */ + kill_bdev(bdev); + brd_free_pages(brd); + error = 0; + } + mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&brd_mutex); + + return error; +} + +static const struct block_device_operations brd_fops = { + .owner = THIS_MODULE, + .rw_page = brd_rw_page, + .ioctl = brd_ioctl, + .direct_access = brd_direct_access, +}; + +/* + * And now the modules code and kernel interface. + */ +static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT; +module_param(rd_nr, int, S_IRUGO); +MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); + +int rd_size = CONFIG_BLK_DEV_RAM_SIZE; +module_param(rd_size, int, S_IRUGO); +MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); + +static int max_part = 1; +module_param(max_part, int, S_IRUGO); +MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); +MODULE_ALIAS("rd"); + +#ifndef MODULE +/* Legacy boot options - nonmodular */ +static int __init ramdisk_size(char *str) +{ + rd_size = simple_strtol(str, NULL, 0); + return 1; +} +__setup("ramdisk_size=", ramdisk_size); +#endif + +/* + * The device scheme is derived from loop.c. Keep them in synch where possible + * (should share code eventually). + */ +static LIST_HEAD(brd_devices); +static DEFINE_MUTEX(brd_devices_mutex); + +static struct brd_device *brd_alloc(int i) +{ + struct brd_device *brd; + struct gendisk *disk; + + brd = kzalloc(sizeof(*brd), GFP_KERNEL); + if (!brd) + goto out; + brd->brd_number = i; + spin_lock_init(&brd->brd_lock); + INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); + + brd->brd_queue = blk_alloc_queue(GFP_KERNEL); + if (!brd->brd_queue) + goto out_free_dev; + + blk_queue_make_request(brd->brd_queue, brd_make_request); + blk_queue_max_hw_sectors(brd->brd_queue, 1024); + blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); + + /* This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); + + brd->brd_queue->limits.discard_granularity = PAGE_SIZE; + brd->brd_queue->limits.max_discard_sectors = UINT_MAX; + brd->brd_queue->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); + + disk = brd->brd_disk = alloc_disk(max_part); + if (!disk) + goto out_free_queue; + disk->major = RAMDISK_MAJOR; + disk->first_minor = i * max_part; + disk->fops = &brd_fops; + disk->private_data = brd; + disk->queue = brd->brd_queue; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "ram%d", i); + set_capacity(disk, rd_size * 2); + + return brd; + +out_free_queue: + blk_cleanup_queue(brd->brd_queue); +out_free_dev: + kfree(brd); +out: + return NULL; +} + +static void brd_free(struct brd_device *brd) +{ + put_disk(brd->brd_disk); + blk_cleanup_queue(brd->brd_queue); + brd_free_pages(brd); + kfree(brd); +} + +static struct brd_device *brd_init_one(int i, bool *new) +{ + struct brd_device *brd; + + *new = false; + list_for_each_entry(brd, &brd_devices, brd_list) { + if (brd->brd_number == i) + goto out; + } + + brd = brd_alloc(i); + if (brd) { + add_disk(brd->brd_disk); + list_add_tail(&brd->brd_list, &brd_devices); + } + *new = true; +out: + return brd; +} + +static void brd_del_one(struct brd_device *brd) +{ + list_del(&brd->brd_list); + del_gendisk(brd->brd_disk); + brd_free(brd); +} + +static struct kobject *brd_probe(dev_t dev, int *part, void *data) +{ + struct brd_device *brd; + struct kobject *kobj; + bool new; + + mutex_lock(&brd_devices_mutex); + brd = brd_init_one(MINOR(dev) / max_part, &new); + kobj = brd ? get_disk(brd->brd_disk) : NULL; + mutex_unlock(&brd_devices_mutex); + + if (new) + *part = 0; + + return kobj; +} + +static int __init brd_init(void) +{ + struct brd_device *brd, *next; + int i; + + /* + * brd module now has a feature to instantiate underlying device + * structure on-demand, provided that there is an access dev node. + * + * (1) if rd_nr is specified, create that many upfront. else + * it defaults to CONFIG_BLK_DEV_RAM_COUNT + * (2) User can further extend brd devices by create dev node themselves + * and have kernel automatically instantiate actual device + * on-demand. Example: + * mknod /path/devnod_name b 1 X # 1 is the rd major + * fdisk -l /path/devnod_name + * If (X / max_part) was not already created it will be created + * dynamically. + */ + + if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) + return -EIO; + + if (unlikely(!max_part)) + max_part = 1; + + for (i = 0; i < rd_nr; i++) { + brd = brd_alloc(i); + if (!brd) + goto out_free; + list_add_tail(&brd->brd_list, &brd_devices); + } + + /* point of no return */ + + list_for_each_entry(brd, &brd_devices, brd_list) + add_disk(brd->brd_disk); + + blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS, + THIS_MODULE, brd_probe, NULL, NULL); + + pr_info("brd: module loaded\n"); + return 0; + +out_free: + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { + list_del(&brd->brd_list); + brd_free(brd); + } + unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module NOT loaded !!!\n"); + return -ENOMEM; +} + +static void __exit brd_exit(void) +{ + struct brd_device *brd, *next; + + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) + brd_del_one(brd); + + blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS); + unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); + + pr_info("brd: module unloaded\n"); +} + +module_init(brd_init); +module_exit(brd_exit); + diff --git a/kernel/drivers/block/cciss.c b/kernel/drivers/block/cciss.c new file mode 100644 index 000000000..ff20f192b --- /dev/null +++ b/kernel/drivers/block/cciss.c @@ -0,0 +1,5392 @@ +/* + * Disk Array driver for HP Smart Array controllers. + * (C) Copyright 2000, 2007 Hewlett-Packard Development Company, L.P. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin)) +#define DRIVER_NAME "HP CISS Driver (v 3.6.26)" +#define DRIVER_VERSION CCISS_DRIVER_VERSION(3, 6, 26) + +/* Embedded module documentation macros - see modules.h */ +MODULE_AUTHOR("Hewlett-Packard Company"); +MODULE_DESCRIPTION("Driver for HP Smart Array Controllers"); +MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers"); +MODULE_VERSION("3.6.26"); +MODULE_LICENSE("GPL"); +static int cciss_tape_cmds = 6; +module_param(cciss_tape_cmds, int, 0644); +MODULE_PARM_DESC(cciss_tape_cmds, + "number of commands to allocate for tape devices (default: 6)"); +static int cciss_simple_mode; +module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cciss_simple_mode, + "Use 'simple mode' rather than 'performant mode'"); + +static int cciss_allow_hpsa; +module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cciss_allow_hpsa, + "Prevent cciss driver from accessing hardware known to be " + " supported by the hpsa driver"); + +static DEFINE_MUTEX(cciss_mutex); +static struct proc_dir_entry *proc_cciss; + +#include "cciss_cmd.h" +#include "cciss.h" +#include + +/* define the PCI info for the cards we can control */ +static const struct pci_device_id cciss_pci_device_id[] = { + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISS, 0x0E11, 0x4070}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSB, 0x0E11, 0x4080}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSB, 0x0E11, 0x4082}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSB, 0x0E11, 0x4083}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x4091}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409A}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409B}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409C}, + {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409D}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSA, 0x103C, 0x3225}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3223}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3234}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3235}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3211}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3212}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3213}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3214}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237}, + {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, cciss_pci_device_id); + +/* board_id = Subsystem Device ID & Vendor ID + * product = Marketing Name for the board + * access = Address of the struct of function pointers + */ +static struct board_type products[] = { + {0x40700E11, "Smart Array 5300", &SA5_access}, + {0x40800E11, "Smart Array 5i", &SA5B_access}, + {0x40820E11, "Smart Array 532", &SA5B_access}, + {0x40830E11, "Smart Array 5312", &SA5B_access}, + {0x409A0E11, "Smart Array 641", &SA5_access}, + {0x409B0E11, "Smart Array 642", &SA5_access}, + {0x409C0E11, "Smart Array 6400", &SA5_access}, + {0x409D0E11, "Smart Array 6400 EM", &SA5_access}, + {0x40910E11, "Smart Array 6i", &SA5_access}, + {0x3225103C, "Smart Array P600", &SA5_access}, + {0x3223103C, "Smart Array P800", &SA5_access}, + {0x3234103C, "Smart Array P400", &SA5_access}, + {0x3235103C, "Smart Array P400i", &SA5_access}, + {0x3211103C, "Smart Array E200i", &SA5_access}, + {0x3212103C, "Smart Array E200", &SA5_access}, + {0x3213103C, "Smart Array E200i", &SA5_access}, + {0x3214103C, "Smart Array E200i", &SA5_access}, + {0x3215103C, "Smart Array E200i", &SA5_access}, + {0x3237103C, "Smart Array E500", &SA5_access}, + {0x3223103C, "Smart Array P800", &SA5_access}, + {0x3234103C, "Smart Array P400", &SA5_access}, + {0x323D103C, "Smart Array P700m", &SA5_access}, +}; + +/* How long to wait (in milliseconds) for board to go into simple mode */ +#define MAX_CONFIG_WAIT 30000 +#define MAX_IOCTL_CONFIG_WAIT 1000 + +/*define how many times we will try a command because of bus resets */ +#define MAX_CMD_RETRIES 3 + +#define MAX_CTLR 32 + +/* Originally cciss driver only supports 8 major numbers */ +#define MAX_CTLR_ORIG 8 + +static ctlr_info_t *hba[MAX_CTLR]; + +static struct task_struct *cciss_scan_thread; +static DEFINE_MUTEX(scan_mutex); +static LIST_HEAD(scan_q); + +static void do_cciss_request(struct request_queue *q); +static irqreturn_t do_cciss_intx(int irq, void *dev_id); +static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id); +static int cciss_open(struct block_device *bdev, fmode_t mode); +static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode); +static void cciss_release(struct gendisk *disk, fmode_t mode); +static int cciss_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo); + +static int cciss_revalidate(struct gendisk *disk); +static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl); +static int deregister_disk(ctlr_info_t *h, int drv_index, + int clear_all, int via_ioctl); + +static void cciss_read_capacity(ctlr_info_t *h, int logvol, + sector_t *total_size, unsigned int *block_size); +static void cciss_read_capacity_16(ctlr_info_t *h, int logvol, + sector_t *total_size, unsigned int *block_size); +static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol, + sector_t total_size, + unsigned int block_size, InquiryData_struct *inq_buff, + drive_info_struct *drv); +static void cciss_interrupt_mode(ctlr_info_t *); +static int cciss_enter_simple_mode(struct ctlr_info *h); +static void start_io(ctlr_info_t *h); +static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size, + __u8 page_code, unsigned char scsi3addr[], + int cmd_type); +static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c, + int attempt_retry); +static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c); + +static int add_to_scan_list(struct ctlr_info *h); +static int scan_thread(void *data); +static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c); +static void cciss_hba_release(struct device *dev); +static void cciss_device_release(struct device *dev); +static void cciss_free_gendisk(ctlr_info_t *h, int drv_index); +static void cciss_free_drive_info(ctlr_info_t *h, int drv_index); +static inline u32 next_command(ctlr_info_t *h); +static int cciss_find_cfg_addrs(struct pci_dev *pdev, void __iomem *vaddr, + u32 *cfg_base_addr, u64 *cfg_base_addr_index, + u64 *cfg_offset); +static int cciss_pci_find_memory_BAR(struct pci_dev *pdev, + unsigned long *memory_bar); +static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag); +static int write_driver_ver_to_cfgtable(CfgTable_struct __iomem *cfgtable); + +/* performant mode helper functions */ +static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, + int *bucket_map); +static void cciss_put_controller_into_performant_mode(ctlr_info_t *h); + +#ifdef CONFIG_PROC_FS +static void cciss_procinit(ctlr_info_t *h); +#else +static void cciss_procinit(ctlr_info_t *h) +{ +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_COMPAT +static int cciss_compat_ioctl(struct block_device *, fmode_t, + unsigned, unsigned long); +#endif + +static const struct block_device_operations cciss_fops = { + .owner = THIS_MODULE, + .open = cciss_unlocked_open, + .release = cciss_release, + .ioctl = cciss_ioctl, + .getgeo = cciss_getgeo, +#ifdef CONFIG_COMPAT + .compat_ioctl = cciss_compat_ioctl, +#endif + .revalidate_disk = cciss_revalidate, +}; + +/* set_performant_mode: Modify the tag for cciss performant + * set bit 0 for pull model, bits 3-1 for block fetch + * register number + */ +static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c) +{ + if (likely(h->transMethod & CFGTBL_Trans_Performant)) + c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1); +} + +/* + * Enqueuing and dequeuing functions for cmdlists. + */ +static inline void addQ(struct list_head *list, CommandList_struct *c) +{ + list_add_tail(&c->list, list); +} + +static inline void removeQ(CommandList_struct *c) +{ + /* + * After kexec/dump some commands might still + * be in flight, which the firmware will try + * to complete. Resetting the firmware doesn't work + * with old fw revisions, so we have to mark + * them off as 'stale' to prevent the driver from + * falling over. + */ + if (WARN_ON(list_empty(&c->list))) { + c->cmd_type = CMD_MSG_STALE; + return; + } + + list_del_init(&c->list); +} + +static void enqueue_cmd_and_start_io(ctlr_info_t *h, + CommandList_struct *c) +{ + unsigned long flags; + set_performant_mode(h, c); + spin_lock_irqsave(&h->lock, flags); + addQ(&h->reqQ, c); + h->Qdepth++; + if (h->Qdepth > h->maxQsinceinit) + h->maxQsinceinit = h->Qdepth; + start_io(h); + spin_unlock_irqrestore(&h->lock, flags); +} + +static void cciss_free_sg_chain_blocks(SGDescriptor_struct **cmd_sg_list, + int nr_cmds) +{ + int i; + + if (!cmd_sg_list) + return; + for (i = 0; i < nr_cmds; i++) { + kfree(cmd_sg_list[i]); + cmd_sg_list[i] = NULL; + } + kfree(cmd_sg_list); +} + +static SGDescriptor_struct **cciss_allocate_sg_chain_blocks( + ctlr_info_t *h, int chainsize, int nr_cmds) +{ + int j; + SGDescriptor_struct **cmd_sg_list; + + if (chainsize <= 0) + return NULL; + + cmd_sg_list = kmalloc(sizeof(*cmd_sg_list) * nr_cmds, GFP_KERNEL); + if (!cmd_sg_list) + return NULL; + + /* Build up chain blocks for each command */ + for (j = 0; j < nr_cmds; j++) { + /* Need a block of chainsized s/g elements. */ + cmd_sg_list[j] = kmalloc((chainsize * + sizeof(*cmd_sg_list[j])), GFP_KERNEL); + if (!cmd_sg_list[j]) { + dev_err(&h->pdev->dev, "Cannot get memory " + "for s/g chains.\n"); + goto clean; + } + } + return cmd_sg_list; +clean: + cciss_free_sg_chain_blocks(cmd_sg_list, nr_cmds); + return NULL; +} + +static void cciss_unmap_sg_chain_block(ctlr_info_t *h, CommandList_struct *c) +{ + SGDescriptor_struct *chain_sg; + u64bit temp64; + + if (c->Header.SGTotal <= h->max_cmd_sgentries) + return; + + chain_sg = &c->SG[h->max_cmd_sgentries - 1]; + temp64.val32.lower = chain_sg->Addr.lower; + temp64.val32.upper = chain_sg->Addr.upper; + pci_unmap_single(h->pdev, temp64.val, chain_sg->Len, PCI_DMA_TODEVICE); +} + +static void cciss_map_sg_chain_block(ctlr_info_t *h, CommandList_struct *c, + SGDescriptor_struct *chain_block, int len) +{ + SGDescriptor_struct *chain_sg; + u64bit temp64; + + chain_sg = &c->SG[h->max_cmd_sgentries - 1]; + chain_sg->Ext = CCISS_SG_CHAIN; + chain_sg->Len = len; + temp64.val = pci_map_single(h->pdev, chain_block, len, + PCI_DMA_TODEVICE); + chain_sg->Addr.lower = temp64.val32.lower; + chain_sg->Addr.upper = temp64.val32.upper; +} + +#include "cciss_scsi.c" /* For SCSI tape support */ + +static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG", + "UNKNOWN" +}; +#define RAID_UNKNOWN (ARRAY_SIZE(raid_label)-1) + +#ifdef CONFIG_PROC_FS + +/* + * Report information about this controller. + */ +#define ENG_GIG 1000000000 +#define ENG_GIG_FACTOR (ENG_GIG/512) +#define ENGAGE_SCSI "engage scsi" + +static void cciss_seq_show_header(struct seq_file *seq) +{ + ctlr_info_t *h = seq->private; + + seq_printf(seq, "%s: HP %s Controller\n" + "Board ID: 0x%08lx\n" + "Firmware Version: %c%c%c%c\n" + "IRQ: %d\n" + "Logical drives: %d\n" + "Current Q depth: %d\n" + "Current # commands on controller: %d\n" + "Max Q depth since init: %d\n" + "Max # commands on controller since init: %d\n" + "Max SG entries since init: %d\n", + h->devname, + h->product_name, + (unsigned long)h->board_id, + h->firm_ver[0], h->firm_ver[1], h->firm_ver[2], + h->firm_ver[3], (unsigned int)h->intr[h->intr_mode], + h->num_luns, + h->Qdepth, h->commands_outstanding, + h->maxQsinceinit, h->max_outstanding, h->maxSG); + +#ifdef CONFIG_CISS_SCSI_TAPE + cciss_seq_tape_report(seq, h); +#endif /* CONFIG_CISS_SCSI_TAPE */ +} + +static void *cciss_seq_start(struct seq_file *seq, loff_t *pos) +{ + ctlr_info_t *h = seq->private; + unsigned long flags; + + /* prevent displaying bogus info during configuration + * or deconfiguration of a logical volume + */ + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) { + spin_unlock_irqrestore(&h->lock, flags); + return ERR_PTR(-EBUSY); + } + h->busy_configuring = 1; + spin_unlock_irqrestore(&h->lock, flags); + + if (*pos == 0) + cciss_seq_show_header(seq); + + return pos; +} + +static int cciss_seq_show(struct seq_file *seq, void *v) +{ + sector_t vol_sz, vol_sz_frac; + ctlr_info_t *h = seq->private; + unsigned ctlr = h->ctlr; + loff_t *pos = v; + drive_info_struct *drv = h->drv[*pos]; + + if (*pos > h->highest_lun) + return 0; + + if (drv == NULL) /* it's possible for h->drv[] to have holes. */ + return 0; + + if (drv->heads == 0) + return 0; + + vol_sz = drv->nr_blocks; + vol_sz_frac = sector_div(vol_sz, ENG_GIG_FACTOR); + vol_sz_frac *= 100; + sector_div(vol_sz_frac, ENG_GIG_FACTOR); + + if (drv->raid_level < 0 || drv->raid_level > RAID_UNKNOWN) + drv->raid_level = RAID_UNKNOWN; + seq_printf(seq, "cciss/c%dd%d:" + "\t%4u.%02uGB\tRAID %s\n", + ctlr, (int) *pos, (int)vol_sz, (int)vol_sz_frac, + raid_label[drv->raid_level]); + return 0; +} + +static void *cciss_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ctlr_info_t *h = seq->private; + + if (*pos > h->highest_lun) + return NULL; + *pos += 1; + + return pos; +} + +static void cciss_seq_stop(struct seq_file *seq, void *v) +{ + ctlr_info_t *h = seq->private; + + /* Only reset h->busy_configuring if we succeeded in setting + * it during cciss_seq_start. */ + if (v == ERR_PTR(-EBUSY)) + return; + + h->busy_configuring = 0; +} + +static const struct seq_operations cciss_seq_ops = { + .start = cciss_seq_start, + .show = cciss_seq_show, + .next = cciss_seq_next, + .stop = cciss_seq_stop, +}; + +static int cciss_seq_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &cciss_seq_ops); + struct seq_file *seq = file->private_data; + + if (!ret) + seq->private = PDE_DATA(inode); + + return ret; +} + +static ssize_t +cciss_proc_write(struct file *file, const char __user *buf, + size_t length, loff_t *ppos) +{ + int err; + char *buffer; + +#ifndef CONFIG_CISS_SCSI_TAPE + return -EINVAL; +#endif + + if (!buf || length > PAGE_SIZE - 1) + return -EINVAL; + + buffer = (char *)__get_free_page(GFP_KERNEL); + if (!buffer) + return -ENOMEM; + + err = -EFAULT; + if (copy_from_user(buffer, buf, length)) + goto out; + buffer[length] = '\0'; + +#ifdef CONFIG_CISS_SCSI_TAPE + if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { + struct seq_file *seq = file->private_data; + ctlr_info_t *h = seq->private; + + err = cciss_engage_scsi(h); + if (err == 0) + err = length; + } else +#endif /* CONFIG_CISS_SCSI_TAPE */ + err = -EINVAL; + /* might be nice to have "disengage" too, but it's not + safely possible. (only 1 module use count, lock issues.) */ + +out: + free_page((unsigned long)buffer); + return err; +} + +static const struct file_operations cciss_proc_fops = { + .owner = THIS_MODULE, + .open = cciss_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = cciss_proc_write, +}; + +static void cciss_procinit(ctlr_info_t *h) +{ + struct proc_dir_entry *pde; + + if (proc_cciss == NULL) + proc_cciss = proc_mkdir("driver/cciss", NULL); + if (!proc_cciss) + return; + pde = proc_create_data(h->devname, S_IWUSR | S_IRUSR | S_IRGRP | + S_IROTH, proc_cciss, + &cciss_proc_fops, h); +} +#endif /* CONFIG_PROC_FS */ + +#define MAX_PRODUCT_NAME_LEN 19 + +#define to_hba(n) container_of(n, struct ctlr_info, dev) +#define to_drv(n) container_of(n, drive_info_struct, dev) + +/* List of controllers which cannot be hard reset on kexec with reset_devices */ +static u32 unresettable_controller[] = { + 0x324a103C, /* Smart Array P712m */ + 0x324b103C, /* SmartArray P711m */ + 0x3223103C, /* Smart Array P800 */ + 0x3234103C, /* Smart Array P400 */ + 0x3235103C, /* Smart Array P400i */ + 0x3211103C, /* Smart Array E200i */ + 0x3212103C, /* Smart Array E200 */ + 0x3213103C, /* Smart Array E200i */ + 0x3214103C, /* Smart Array E200i */ + 0x3215103C, /* Smart Array E200i */ + 0x3237103C, /* Smart Array E500 */ + 0x323D103C, /* Smart Array P700m */ + 0x409C0E11, /* Smart Array 6400 */ + 0x409D0E11, /* Smart Array 6400 EM */ +}; + +/* List of controllers which cannot even be soft reset */ +static u32 soft_unresettable_controller[] = { + 0x409C0E11, /* Smart Array 6400 */ + 0x409D0E11, /* Smart Array 6400 EM */ +}; + +static int ctlr_is_hard_resettable(u32 board_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++) + if (unresettable_controller[i] == board_id) + return 0; + return 1; +} + +static int ctlr_is_soft_resettable(u32 board_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++) + if (soft_unresettable_controller[i] == board_id) + return 0; + return 1; +} + +static int ctlr_is_resettable(u32 board_id) +{ + return ctlr_is_hard_resettable(board_id) || + ctlr_is_soft_resettable(board_id); +} + +static ssize_t host_show_resettable(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ctlr_info *h = to_hba(dev); + + return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id)); +} +static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL); + +static ssize_t host_store_rescan(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct ctlr_info *h = to_hba(dev); + + add_to_scan_list(h); + wake_up_process(cciss_scan_thread); + wait_for_completion_interruptible(&h->scan_wait); + + return count; +} +static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan); + +static ssize_t host_show_transport_mode(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ctlr_info *h = to_hba(dev); + + return snprintf(buf, 20, "%s\n", + h->transMethod & CFGTBL_Trans_Performant ? + "performant" : "simple"); +} +static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL); + +static ssize_t dev_show_unique_id(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + __u8 sn[16]; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) + ret = -EBUSY; + else + memcpy(sn, drv->serial_no, sizeof(sn)); + spin_unlock_irqrestore(&h->lock, flags); + + if (ret) + return ret; + else + return snprintf(buf, 16 * 2 + 2, + "%02X%02X%02X%02X%02X%02X%02X%02X" + "%02X%02X%02X%02X%02X%02X%02X%02X\n", + sn[0], sn[1], sn[2], sn[3], + sn[4], sn[5], sn[6], sn[7], + sn[8], sn[9], sn[10], sn[11], + sn[12], sn[13], sn[14], sn[15]); +} +static DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL); + +static ssize_t dev_show_vendor(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + char vendor[VENDOR_LEN + 1]; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) + ret = -EBUSY; + else + memcpy(vendor, drv->vendor, VENDOR_LEN + 1); + spin_unlock_irqrestore(&h->lock, flags); + + if (ret) + return ret; + else + return snprintf(buf, sizeof(vendor) + 1, "%s\n", drv->vendor); +} +static DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL); + +static ssize_t dev_show_model(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + char model[MODEL_LEN + 1]; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) + ret = -EBUSY; + else + memcpy(model, drv->model, MODEL_LEN + 1); + spin_unlock_irqrestore(&h->lock, flags); + + if (ret) + return ret; + else + return snprintf(buf, sizeof(model) + 1, "%s\n", drv->model); +} +static DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL); + +static ssize_t dev_show_rev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + char rev[REV_LEN + 1]; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) + ret = -EBUSY; + else + memcpy(rev, drv->rev, REV_LEN + 1); + spin_unlock_irqrestore(&h->lock, flags); + + if (ret) + return ret; + else + return snprintf(buf, sizeof(rev) + 1, "%s\n", drv->rev); +} +static DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL); + +static ssize_t cciss_show_lunid(struct device *dev, + struct device_attribute *attr, char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + unsigned long flags; + unsigned char lunid[8]; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) { + spin_unlock_irqrestore(&h->lock, flags); + return -EBUSY; + } + if (!drv->heads) { + spin_unlock_irqrestore(&h->lock, flags); + return -ENOTTY; + } + memcpy(lunid, drv->LunID, sizeof(lunid)); + spin_unlock_irqrestore(&h->lock, flags); + return snprintf(buf, 20, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + lunid[0], lunid[1], lunid[2], lunid[3], + lunid[4], lunid[5], lunid[6], lunid[7]); +} +static DEVICE_ATTR(lunid, S_IRUGO, cciss_show_lunid, NULL); + +static ssize_t cciss_show_raid_level(struct device *dev, + struct device_attribute *attr, char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + int raid; + unsigned long flags; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) { + spin_unlock_irqrestore(&h->lock, flags); + return -EBUSY; + } + raid = drv->raid_level; + spin_unlock_irqrestore(&h->lock, flags); + if (raid < 0 || raid > RAID_UNKNOWN) + raid = RAID_UNKNOWN; + + return snprintf(buf, strlen(raid_label[raid]) + 7, "RAID %s\n", + raid_label[raid]); +} +static DEVICE_ATTR(raid_level, S_IRUGO, cciss_show_raid_level, NULL); + +static ssize_t cciss_show_usage_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + drive_info_struct *drv = to_drv(dev); + struct ctlr_info *h = to_hba(drv->dev.parent); + unsigned long flags; + int count; + + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) { + spin_unlock_irqrestore(&h->lock, flags); + return -EBUSY; + } + count = drv->usage_count; + spin_unlock_irqrestore(&h->lock, flags); + return snprintf(buf, 20, "%d\n", count); +} +static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL); + +static struct attribute *cciss_host_attrs[] = { + &dev_attr_rescan.attr, + &dev_attr_resettable.attr, + &dev_attr_transport_mode.attr, + NULL +}; + +static struct attribute_group cciss_host_attr_group = { + .attrs = cciss_host_attrs, +}; + +static const struct attribute_group *cciss_host_attr_groups[] = { + &cciss_host_attr_group, + NULL +}; + +static struct device_type cciss_host_type = { + .name = "cciss_host", + .groups = cciss_host_attr_groups, + .release = cciss_hba_release, +}; + +static struct attribute *cciss_dev_attrs[] = { + &dev_attr_unique_id.attr, + &dev_attr_model.attr, + &dev_attr_vendor.attr, + &dev_attr_rev.attr, + &dev_attr_lunid.attr, + &dev_attr_raid_level.attr, + &dev_attr_usage_count.attr, + NULL +}; + +static struct attribute_group cciss_dev_attr_group = { + .attrs = cciss_dev_attrs, +}; + +static const struct attribute_group *cciss_dev_attr_groups[] = { + &cciss_dev_attr_group, + NULL +}; + +static struct device_type cciss_dev_type = { + .name = "cciss_device", + .groups = cciss_dev_attr_groups, + .release = cciss_device_release, +}; + +static struct bus_type cciss_bus_type = { + .name = "cciss", +}; + +/* + * cciss_hba_release is called when the reference count + * of h->dev goes to zero. + */ +static void cciss_hba_release(struct device *dev) +{ + /* + * nothing to do, but need this to avoid a warning + * about not having a release handler from lib/kref.c. + */ +} + +/* + * Initialize sysfs entry for each controller. This sets up and registers + * the 'cciss#' directory for each individual controller under + * /sys/bus/pci/devices//. + */ +static int cciss_create_hba_sysfs_entry(struct ctlr_info *h) +{ + device_initialize(&h->dev); + h->dev.type = &cciss_host_type; + h->dev.bus = &cciss_bus_type; + dev_set_name(&h->dev, "%s", h->devname); + h->dev.parent = &h->pdev->dev; + + return device_add(&h->dev); +} + +/* + * Remove sysfs entries for an hba. + */ +static void cciss_destroy_hba_sysfs_entry(struct ctlr_info *h) +{ + device_del(&h->dev); + put_device(&h->dev); /* final put. */ +} + +/* cciss_device_release is called when the reference count + * of h->drv[x]dev goes to zero. + */ +static void cciss_device_release(struct device *dev) +{ + drive_info_struct *drv = to_drv(dev); + kfree(drv); +} + +/* + * Initialize sysfs for each logical drive. This sets up and registers + * the 'c#d#' directory for each individual logical drive under + * /sys/bus/pci/devices/drv[drv_index]->device_initialized) + return 0; + + dev = &h->drv[drv_index]->dev; + device_initialize(dev); + dev->type = &cciss_dev_type; + dev->bus = &cciss_bus_type; + dev_set_name(dev, "c%dd%d", h->ctlr, drv_index); + dev->parent = &h->dev; + h->drv[drv_index]->device_initialized = 1; + return device_add(dev); +} + +/* + * Remove sysfs entries for a logical drive. + */ +static void cciss_destroy_ld_sysfs_entry(struct ctlr_info *h, int drv_index, + int ctlr_exiting) +{ + struct device *dev = &h->drv[drv_index]->dev; + + /* special case for c*d0, we only destroy it on controller exit */ + if (drv_index == 0 && !ctlr_exiting) + return; + + device_del(dev); + put_device(dev); /* the "final" put. */ + h->drv[drv_index] = NULL; +} + +/* + * For operations that cannot sleep, a command block is allocated at init, + * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track + * which ones are free or in use. + */ +static CommandList_struct *cmd_alloc(ctlr_info_t *h) +{ + CommandList_struct *c; + int i; + u64bit temp64; + dma_addr_t cmd_dma_handle, err_dma_handle; + + do { + i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds); + if (i == h->nr_cmds) + return NULL; + } while (test_and_set_bit(i, h->cmd_pool_bits) != 0); + c = h->cmd_pool + i; + memset(c, 0, sizeof(CommandList_struct)); + cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct); + c->err_info = h->errinfo_pool + i; + memset(c->err_info, 0, sizeof(ErrorInfo_struct)); + err_dma_handle = h->errinfo_pool_dhandle + + i * sizeof(ErrorInfo_struct); + h->nr_allocs++; + + c->cmdindex = i; + + INIT_LIST_HEAD(&c->list); + c->busaddr = (__u32) cmd_dma_handle; + temp64.val = (__u64) err_dma_handle; + c->ErrDesc.Addr.lower = temp64.val32.lower; + c->ErrDesc.Addr.upper = temp64.val32.upper; + c->ErrDesc.Len = sizeof(ErrorInfo_struct); + + c->ctlr = h->ctlr; + return c; +} + +/* allocate a command using pci_alloc_consistent, used for ioctls, + * etc., not for the main i/o path. + */ +static CommandList_struct *cmd_special_alloc(ctlr_info_t *h) +{ + CommandList_struct *c; + u64bit temp64; + dma_addr_t cmd_dma_handle, err_dma_handle; + + c = pci_zalloc_consistent(h->pdev, sizeof(CommandList_struct), + &cmd_dma_handle); + if (c == NULL) + return NULL; + + c->cmdindex = -1; + + c->err_info = pci_zalloc_consistent(h->pdev, sizeof(ErrorInfo_struct), + &err_dma_handle); + + if (c->err_info == NULL) { + pci_free_consistent(h->pdev, + sizeof(CommandList_struct), c, cmd_dma_handle); + return NULL; + } + + INIT_LIST_HEAD(&c->list); + c->busaddr = (__u32) cmd_dma_handle; + temp64.val = (__u64) err_dma_handle; + c->ErrDesc.Addr.lower = temp64.val32.lower; + c->ErrDesc.Addr.upper = temp64.val32.upper; + c->ErrDesc.Len = sizeof(ErrorInfo_struct); + + c->ctlr = h->ctlr; + return c; +} + +static void cmd_free(ctlr_info_t *h, CommandList_struct *c) +{ + int i; + + i = c - h->cmd_pool; + clear_bit(i, h->cmd_pool_bits); + h->nr_frees++; +} + +static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c) +{ + u64bit temp64; + + temp64.val32.lower = c->ErrDesc.Addr.lower; + temp64.val32.upper = c->ErrDesc.Addr.upper; + pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct), + c->err_info, (dma_addr_t) temp64.val); + pci_free_consistent(h->pdev, sizeof(CommandList_struct), c, + (dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr)); +} + +static inline ctlr_info_t *get_host(struct gendisk *disk) +{ + return disk->queue->queuedata; +} + +static inline drive_info_struct *get_drv(struct gendisk *disk) +{ + return disk->private_data; +} + +/* + * Open. Make sure the device is really there. + */ +static int cciss_open(struct block_device *bdev, fmode_t mode) +{ + ctlr_info_t *h = get_host(bdev->bd_disk); + drive_info_struct *drv = get_drv(bdev->bd_disk); + + dev_dbg(&h->pdev->dev, "cciss_open %s\n", bdev->bd_disk->disk_name); + if (drv->busy_configuring) + return -EBUSY; + /* + * Root is allowed to open raw volume zero even if it's not configured + * so array config can still work. Root is also allowed to open any + * volume that has a LUN ID, so it can issue IOCTL to reread the + * disk information. I don't think I really like this + * but I'm already using way to many device nodes to claim another one + * for "raw controller". + */ + if (drv->heads == 0) { + if (MINOR(bdev->bd_dev) != 0) { /* not node 0? */ + /* if not node 0 make sure it is a partition = 0 */ + if (MINOR(bdev->bd_dev) & 0x0f) { + return -ENXIO; + /* if it is, make sure we have a LUN ID */ + } else if (memcmp(drv->LunID, CTLR_LUNID, + sizeof(drv->LunID))) { + return -ENXIO; + } + } + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + drv->usage_count++; + h->usage_count++; + return 0; +} + +static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + mutex_lock(&cciss_mutex); + ret = cciss_open(bdev, mode); + mutex_unlock(&cciss_mutex); + + return ret; +} + +/* + * Close. Sync first. + */ +static void cciss_release(struct gendisk *disk, fmode_t mode) +{ + ctlr_info_t *h; + drive_info_struct *drv; + + mutex_lock(&cciss_mutex); + h = get_host(disk); + drv = get_drv(disk); + dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name); + drv->usage_count--; + h->usage_count--; + mutex_unlock(&cciss_mutex); +} + +#ifdef CONFIG_COMPAT + +static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg); +static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg); + +static int cciss_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + switch (cmd) { + case CCISS_GETPCIINFO: + case CCISS_GETINTINFO: + case CCISS_SETINTINFO: + case CCISS_GETNODENAME: + case CCISS_SETNODENAME: + case CCISS_GETHEARTBEAT: + case CCISS_GETBUSTYPES: + case CCISS_GETFIRMVER: + case CCISS_GETDRIVVER: + case CCISS_REVALIDVOLS: + case CCISS_DEREGDISK: + case CCISS_REGNEWDISK: + case CCISS_REGNEWD: + case CCISS_RESCANDISK: + case CCISS_GETLUNINFO: + return cciss_ioctl(bdev, mode, cmd, arg); + + case CCISS_PASSTHRU32: + return cciss_ioctl32_passthru(bdev, mode, cmd, arg); + case CCISS_BIG_PASSTHRU32: + return cciss_ioctl32_big_passthru(bdev, mode, cmd, arg); + + default: + return -ENOIOCTLCMD; + } +} + +static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + IOCTL32_Command_struct __user *arg32 = + (IOCTL32_Command_struct __user *) arg; + IOCTL_Command_struct arg64; + IOCTL_Command_struct __user *p = compat_alloc_user_space(sizeof(arg64)); + int err; + u32 cp; + + memset(&arg64, 0, sizeof(arg64)); + err = 0; + err |= + copy_from_user(&arg64.LUN_info, &arg32->LUN_info, + sizeof(arg64.LUN_info)); + err |= + copy_from_user(&arg64.Request, &arg32->Request, + sizeof(arg64.Request)); + err |= + copy_from_user(&arg64.error_info, &arg32->error_info, + sizeof(arg64.error_info)); + err |= get_user(arg64.buf_size, &arg32->buf_size); + err |= get_user(cp, &arg32->buf); + arg64.buf = compat_ptr(cp); + err |= copy_to_user(p, &arg64, sizeof(arg64)); + + if (err) + return -EFAULT; + + err = cciss_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p); + if (err) + return err; + err |= + copy_in_user(&arg32->error_info, &p->error_info, + sizeof(arg32->error_info)); + if (err) + return -EFAULT; + return err; +} + +static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + BIG_IOCTL32_Command_struct __user *arg32 = + (BIG_IOCTL32_Command_struct __user *) arg; + BIG_IOCTL_Command_struct arg64; + BIG_IOCTL_Command_struct __user *p = + compat_alloc_user_space(sizeof(arg64)); + int err; + u32 cp; + + memset(&arg64, 0, sizeof(arg64)); + err = 0; + err |= + copy_from_user(&arg64.LUN_info, &arg32->LUN_info, + sizeof(arg64.LUN_info)); + err |= + copy_from_user(&arg64.Request, &arg32->Request, + sizeof(arg64.Request)); + err |= + copy_from_user(&arg64.error_info, &arg32->error_info, + sizeof(arg64.error_info)); + err |= get_user(arg64.buf_size, &arg32->buf_size); + err |= get_user(arg64.malloc_size, &arg32->malloc_size); + err |= get_user(cp, &arg32->buf); + arg64.buf = compat_ptr(cp); + err |= copy_to_user(p, &arg64, sizeof(arg64)); + + if (err) + return -EFAULT; + + err = cciss_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p); + if (err) + return err; + err |= + copy_in_user(&arg32->error_info, &p->error_info, + sizeof(arg32->error_info)); + if (err) + return -EFAULT; + return err; +} +#endif + +static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + drive_info_struct *drv = get_drv(bdev->bd_disk); + + if (!drv->cylinders) + return -ENXIO; + + geo->heads = drv->heads; + geo->sectors = drv->sectors; + geo->cylinders = drv->cylinders; + return 0; +} + +static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c) +{ + if (c->err_info->CommandStatus == CMD_TARGET_STATUS && + c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) + (void)check_for_unit_attention(h, c); +} + +static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_pci_info_struct pciinfo; + + if (!argp) + return -EINVAL; + pciinfo.domain = pci_domain_nr(h->pdev->bus); + pciinfo.bus = h->pdev->bus->number; + pciinfo.dev_fn = h->pdev->devfn; + pciinfo.board_id = h->board_id; + if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct))) + return -EFAULT; + return 0; +} + +static int cciss_getintinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_coalint_struct intinfo; + unsigned long flags; + + if (!argp) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay); + intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount); + spin_unlock_irqrestore(&h->lock, flags); + if (copy_to_user + (argp, &intinfo, sizeof(cciss_coalint_struct))) + return -EFAULT; + return 0; +} + +static int cciss_setintinfo(ctlr_info_t *h, void __user *argp) +{ + cciss_coalint_struct intinfo; + unsigned long flags; + int i; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&intinfo, argp, sizeof(intinfo))) + return -EFAULT; + if ((intinfo.delay == 0) && (intinfo.count == 0)) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + /* Update the field, and then ring the doorbell */ + writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay)); + writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount)); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + + for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + udelay(1000); /* delay and try again */ + } + spin_unlock_irqrestore(&h->lock, flags); + if (i >= MAX_IOCTL_CONFIG_WAIT) + return -EAGAIN; + return 0; +} + +static int cciss_getnodename(ctlr_info_t *h, void __user *argp) +{ + NodeName_type NodeName; + unsigned long flags; + int i; + + if (!argp) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + for (i = 0; i < 16; i++) + NodeName[i] = readb(&h->cfgtable->ServerName[i]); + spin_unlock_irqrestore(&h->lock, flags); + if (copy_to_user(argp, NodeName, sizeof(NodeName_type))) + return -EFAULT; + return 0; +} + +static int cciss_setnodename(ctlr_info_t *h, void __user *argp) +{ + NodeName_type NodeName; + unsigned long flags; + int i; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(NodeName, argp, sizeof(NodeName_type))) + return -EFAULT; + spin_lock_irqsave(&h->lock, flags); + /* Update the field, and then ring the doorbell */ + for (i = 0; i < 16; i++) + writeb(NodeName[i], &h->cfgtable->ServerName[i]); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + udelay(1000); /* delay and try again */ + } + spin_unlock_irqrestore(&h->lock, flags); + if (i >= MAX_IOCTL_CONFIG_WAIT) + return -EAGAIN; + return 0; +} + +static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp) +{ + Heartbeat_type heartbeat; + unsigned long flags; + + if (!argp) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + heartbeat = readl(&h->cfgtable->HeartBeat); + spin_unlock_irqrestore(&h->lock, flags); + if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type))) + return -EFAULT; + return 0; +} + +static int cciss_getbustypes(ctlr_info_t *h, void __user *argp) +{ + BusTypes_type BusTypes; + unsigned long flags; + + if (!argp) + return -EINVAL; + spin_lock_irqsave(&h->lock, flags); + BusTypes = readl(&h->cfgtable->BusTypes); + spin_unlock_irqrestore(&h->lock, flags); + if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type))) + return -EFAULT; + return 0; +} + +static int cciss_getfirmver(ctlr_info_t *h, void __user *argp) +{ + FirmwareVer_type firmware; + + if (!argp) + return -EINVAL; + memcpy(firmware, h->firm_ver, 4); + + if (copy_to_user + (argp, firmware, sizeof(FirmwareVer_type))) + return -EFAULT; + return 0; +} + +static int cciss_getdrivver(ctlr_info_t *h, void __user *argp) +{ + DriverVer_type DriverVer = DRIVER_VERSION; + + if (!argp) + return -EINVAL; + if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type))) + return -EFAULT; + return 0; +} + +static int cciss_getluninfo(ctlr_info_t *h, + struct gendisk *disk, void __user *argp) +{ + LogvolInfo_struct luninfo; + drive_info_struct *drv = get_drv(disk); + + if (!argp) + return -EINVAL; + memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID)); + luninfo.num_opens = drv->usage_count; + luninfo.num_parts = 0; + if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct))) + return -EFAULT; + return 0; +} + +static int cciss_passthru(ctlr_info_t *h, void __user *argp) +{ + IOCTL_Command_struct iocommand; + CommandList_struct *c; + char *buff = NULL; + u64bit temp64; + DECLARE_COMPLETION_ONSTACK(wait); + + if (!argp) + return -EINVAL; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + if (copy_from_user + (&iocommand, argp, sizeof(IOCTL_Command_struct))) + return -EFAULT; + if ((iocommand.buf_size < 1) && + (iocommand.Request.Type.Direction != XFER_NONE)) { + return -EINVAL; + } + if (iocommand.buf_size > 0) { + buff = kmalloc(iocommand.buf_size, GFP_KERNEL); + if (buff == NULL) + return -EFAULT; + } + if (iocommand.Request.Type.Direction == XFER_WRITE) { + /* Copy the data into the buffer we created */ + if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) { + kfree(buff); + return -EFAULT; + } + } else { + memset(buff, 0, iocommand.buf_size); + } + c = cmd_special_alloc(h); + if (!c) { + kfree(buff); + return -ENOMEM; + } + /* Fill in the command type */ + c->cmd_type = CMD_IOCTL_PEND; + /* Fill in Command Header */ + c->Header.ReplyQueue = 0; /* unused in simple mode */ + if (iocommand.buf_size > 0) { /* buffer to fill */ + c->Header.SGList = 1; + c->Header.SGTotal = 1; + } else { /* no buffers to fill */ + c->Header.SGList = 0; + c->Header.SGTotal = 0; + } + c->Header.LUN = iocommand.LUN_info; + /* use the kernel address the cmd block for tag */ + c->Header.Tag.lower = c->busaddr; + + /* Fill in Request block */ + c->Request = iocommand.Request; + + /* Fill in the scatter gather information */ + if (iocommand.buf_size > 0) { + temp64.val = pci_map_single(h->pdev, buff, + iocommand.buf_size, PCI_DMA_BIDIRECTIONAL); + c->SG[0].Addr.lower = temp64.val32.lower; + c->SG[0].Addr.upper = temp64.val32.upper; + c->SG[0].Len = iocommand.buf_size; + c->SG[0].Ext = 0; /* we are not chaining */ + } + c->waiting = &wait; + + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); + + /* unlock the buffers from DMA */ + temp64.val32.lower = c->SG[0].Addr.lower; + temp64.val32.upper = c->SG[0].Addr.upper; + pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size, + PCI_DMA_BIDIRECTIONAL); + check_ioctl_unit_attention(h, c); + + /* Copy the error information out */ + iocommand.error_info = *(c->err_info); + if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) { + kfree(buff); + cmd_special_free(h, c); + return -EFAULT; + } + + if (iocommand.Request.Type.Direction == XFER_READ) { + /* Copy the data out of the buffer we created */ + if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) { + kfree(buff); + cmd_special_free(h, c); + return -EFAULT; + } + } + kfree(buff); + cmd_special_free(h, c); + return 0; +} + +static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp) +{ + BIG_IOCTL_Command_struct *ioc; + CommandList_struct *c; + unsigned char **buff = NULL; + int *buff_size = NULL; + u64bit temp64; + BYTE sg_used = 0; + int status = 0; + int i; + DECLARE_COMPLETION_ONSTACK(wait); + __u32 left; + __u32 sz; + BYTE __user *data_ptr; + + if (!argp) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + ioc = kmalloc(sizeof(*ioc), GFP_KERNEL); + if (!ioc) { + status = -ENOMEM; + goto cleanup1; + } + if (copy_from_user(ioc, argp, sizeof(*ioc))) { + status = -EFAULT; + goto cleanup1; + } + if ((ioc->buf_size < 1) && + (ioc->Request.Type.Direction != XFER_NONE)) { + status = -EINVAL; + goto cleanup1; + } + /* Check kmalloc limits using all SGs */ + if (ioc->malloc_size > MAX_KMALLOC_SIZE) { + status = -EINVAL; + goto cleanup1; + } + if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) { + status = -EINVAL; + goto cleanup1; + } + buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL); + if (!buff) { + status = -ENOMEM; + goto cleanup1; + } + buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL); + if (!buff_size) { + status = -ENOMEM; + goto cleanup1; + } + left = ioc->buf_size; + data_ptr = ioc->buf; + while (left) { + sz = (left > ioc->malloc_size) ? ioc->malloc_size : left; + buff_size[sg_used] = sz; + buff[sg_used] = kmalloc(sz, GFP_KERNEL); + if (buff[sg_used] == NULL) { + status = -ENOMEM; + goto cleanup1; + } + if (ioc->Request.Type.Direction == XFER_WRITE) { + if (copy_from_user(buff[sg_used], data_ptr, sz)) { + status = -EFAULT; + goto cleanup1; + } + } else { + memset(buff[sg_used], 0, sz); + } + left -= sz; + data_ptr += sz; + sg_used++; + } + c = cmd_special_alloc(h); + if (!c) { + status = -ENOMEM; + goto cleanup1; + } + c->cmd_type = CMD_IOCTL_PEND; + c->Header.ReplyQueue = 0; + c->Header.SGList = sg_used; + c->Header.SGTotal = sg_used; + c->Header.LUN = ioc->LUN_info; + c->Header.Tag.lower = c->busaddr; + + c->Request = ioc->Request; + for (i = 0; i < sg_used; i++) { + temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i], + PCI_DMA_BIDIRECTIONAL); + c->SG[i].Addr.lower = temp64.val32.lower; + c->SG[i].Addr.upper = temp64.val32.upper; + c->SG[i].Len = buff_size[i]; + c->SG[i].Ext = 0; /* we are not chaining */ + } + c->waiting = &wait; + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); + /* unlock the buffers from DMA */ + for (i = 0; i < sg_used; i++) { + temp64.val32.lower = c->SG[i].Addr.lower; + temp64.val32.upper = c->SG[i].Addr.upper; + pci_unmap_single(h->pdev, + (dma_addr_t) temp64.val, buff_size[i], + PCI_DMA_BIDIRECTIONAL); + } + check_ioctl_unit_attention(h, c); + /* Copy the error information out */ + ioc->error_info = *(c->err_info); + if (copy_to_user(argp, ioc, sizeof(*ioc))) { + cmd_special_free(h, c); + status = -EFAULT; + goto cleanup1; + } + if (ioc->Request.Type.Direction == XFER_READ) { + /* Copy the data out of the buffer we created */ + BYTE __user *ptr = ioc->buf; + for (i = 0; i < sg_used; i++) { + if (copy_to_user(ptr, buff[i], buff_size[i])) { + cmd_special_free(h, c); + status = -EFAULT; + goto cleanup1; + } + ptr += buff_size[i]; + } + } + cmd_special_free(h, c); + status = 0; +cleanup1: + if (buff) { + for (i = 0; i < sg_used; i++) + kfree(buff[i]); + kfree(buff); + } + kfree(buff_size); + kfree(ioc); + return status; +} + +static int cciss_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct gendisk *disk = bdev->bd_disk; + ctlr_info_t *h = get_host(disk); + void __user *argp = (void __user *)arg; + + dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n", + cmd, arg); + switch (cmd) { + case CCISS_GETPCIINFO: + return cciss_getpciinfo(h, argp); + case CCISS_GETINTINFO: + return cciss_getintinfo(h, argp); + case CCISS_SETINTINFO: + return cciss_setintinfo(h, argp); + case CCISS_GETNODENAME: + return cciss_getnodename(h, argp); + case CCISS_SETNODENAME: + return cciss_setnodename(h, argp); + case CCISS_GETHEARTBEAT: + return cciss_getheartbeat(h, argp); + case CCISS_GETBUSTYPES: + return cciss_getbustypes(h, argp); + case CCISS_GETFIRMVER: + return cciss_getfirmver(h, argp); + case CCISS_GETDRIVVER: + return cciss_getdrivver(h, argp); + case CCISS_DEREGDISK: + case CCISS_REGNEWD: + case CCISS_REVALIDVOLS: + return rebuild_lun_table(h, 0, 1); + case CCISS_GETLUNINFO: + return cciss_getluninfo(h, disk, argp); + case CCISS_PASSTHRU: + return cciss_passthru(h, argp); + case CCISS_BIG_PASSTHRU: + return cciss_bigpassthru(h, argp); + + /* scsi_cmd_blk_ioctl handles these, below, though some are not */ + /* very meaningful for cciss. SG_IO is the main one people want. */ + + case SG_GET_VERSION_NUM: + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_RESERVED_SIZE: + case SG_SET_RESERVED_SIZE: + case SG_EMULATED_HOST: + case SG_IO: + case SCSI_IOCTL_SEND_COMMAND: + return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp); + + /* scsi_cmd_blk_ioctl would normally handle these, below, but */ + /* they aren't a good fit for cciss, as CD-ROMs are */ + /* not supported, and we don't have any bus/target/lun */ + /* which we present to the kernel. */ + + case CDROM_SEND_PACKET: + case CDROMCLOSETRAY: + case CDROMEJECT: + case SCSI_IOCTL_GET_IDLUN: + case SCSI_IOCTL_GET_BUS_NUMBER: + default: + return -ENOTTY; + } +} + +static void cciss_check_queues(ctlr_info_t *h) +{ + int start_queue = h->next_to_run; + int i; + + /* check to see if we have maxed out the number of commands that can + * be placed on the queue. If so then exit. We do this check here + * in case the interrupt we serviced was from an ioctl and did not + * free any new commands. + */ + if ((find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds)) == h->nr_cmds) + return; + + /* We have room on the queue for more commands. Now we need to queue + * them up. We will also keep track of the next queue to run so + * that every queue gets a chance to be started first. + */ + for (i = 0; i < h->highest_lun + 1; i++) { + int curr_queue = (start_queue + i) % (h->highest_lun + 1); + /* make sure the disk has been added and the drive is real + * because this can be called from the middle of init_one. + */ + if (!h->drv[curr_queue]) + continue; + if (!(h->drv[curr_queue]->queue) || + !(h->drv[curr_queue]->heads)) + continue; + blk_start_queue(h->gendisk[curr_queue]->queue); + + /* check to see if we have maxed out the number of commands + * that can be placed on the queue. + */ + if ((find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds)) == h->nr_cmds) { + if (curr_queue == start_queue) { + h->next_to_run = + (start_queue + 1) % (h->highest_lun + 1); + break; + } else { + h->next_to_run = curr_queue; + break; + } + } + } +} + +static void cciss_softirq_done(struct request *rq) +{ + CommandList_struct *c = rq->completion_data; + ctlr_info_t *h = hba[c->ctlr]; + SGDescriptor_struct *curr_sg = c->SG; + u64bit temp64; + unsigned long flags; + int i, ddir; + int sg_index = 0; + + if (c->Request.Type.Direction == XFER_READ) + ddir = PCI_DMA_FROMDEVICE; + else + ddir = PCI_DMA_TODEVICE; + + /* command did not need to be retried */ + /* unmap the DMA mapping for all the scatter gather elements */ + for (i = 0; i < c->Header.SGList; i++) { + if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) { + cciss_unmap_sg_chain_block(h, c); + /* Point to the next block */ + curr_sg = h->cmd_sg_list[c->cmdindex]; + sg_index = 0; + } + temp64.val32.lower = curr_sg[sg_index].Addr.lower; + temp64.val32.upper = curr_sg[sg_index].Addr.upper; + pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len, + ddir); + ++sg_index; + } + + dev_dbg(&h->pdev->dev, "Done with %p\n", rq); + + /* set the residual count for pc requests */ + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) + rq->resid_len = c->err_info->ResidualCnt; + + blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO); + + spin_lock_irqsave(&h->lock, flags); + cmd_free(h, c); + cciss_check_queues(h); + spin_unlock_irqrestore(&h->lock, flags); +} + +static inline void log_unit_to_scsi3addr(ctlr_info_t *h, + unsigned char scsi3addr[], uint32_t log_unit) +{ + memcpy(scsi3addr, h->drv[log_unit]->LunID, + sizeof(h->drv[log_unit]->LunID)); +} + +/* This function gets the SCSI vendor, model, and revision of a logical drive + * via the inquiry page 0. Model, vendor, and rev are set to empty strings if + * they cannot be read. + */ +static void cciss_get_device_descr(ctlr_info_t *h, int logvol, + char *vendor, char *model, char *rev) +{ + int rc; + InquiryData_struct *inq_buf; + unsigned char scsi3addr[8]; + + *vendor = '\0'; + *model = '\0'; + *rev = '\0'; + + inq_buf = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL); + if (!inq_buf) + return; + + log_unit_to_scsi3addr(h, scsi3addr, logvol); + rc = sendcmd_withirq(h, CISS_INQUIRY, inq_buf, sizeof(*inq_buf), 0, + scsi3addr, TYPE_CMD); + if (rc == IO_OK) { + memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); + vendor[VENDOR_LEN] = '\0'; + memcpy(model, &inq_buf->data_byte[16], MODEL_LEN); + model[MODEL_LEN] = '\0'; + memcpy(rev, &inq_buf->data_byte[32], REV_LEN); + rev[REV_LEN] = '\0'; + } + + kfree(inq_buf); + return; +} + +/* This function gets the serial number of a logical drive via + * inquiry page 0x83. Serial no. is 16 bytes. If the serial + * number cannot be had, for whatever reason, 16 bytes of 0xff + * are returned instead. + */ +static void cciss_get_serial_no(ctlr_info_t *h, int logvol, + unsigned char *serial_no, int buflen) +{ +#define PAGE_83_INQ_BYTES 64 + int rc; + unsigned char *buf; + unsigned char scsi3addr[8]; + + if (buflen > 16) + buflen = 16; + memset(serial_no, 0xff, buflen); + buf = kzalloc(PAGE_83_INQ_BYTES, GFP_KERNEL); + if (!buf) + return; + memset(serial_no, 0, buflen); + log_unit_to_scsi3addr(h, scsi3addr, logvol); + rc = sendcmd_withirq(h, CISS_INQUIRY, buf, + PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); + if (rc == IO_OK) + memcpy(serial_no, &buf[8], buflen); + kfree(buf); + return; +} + +/* + * cciss_add_disk sets up the block device queue for a logical drive + */ +static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, + int drv_index) +{ + disk->queue = blk_init_queue(do_cciss_request, &h->lock); + if (!disk->queue) + goto init_queue_failure; + sprintf(disk->disk_name, "cciss/c%dd%d", h->ctlr, drv_index); + disk->major = h->major; + disk->first_minor = drv_index << NWD_SHIFT; + disk->fops = &cciss_fops; + if (cciss_create_ld_sysfs_entry(h, drv_index)) + goto cleanup_queue; + disk->private_data = h->drv[drv_index]; + disk->driverfs_dev = &h->drv[drv_index]->dev; + + /* Set up queue information */ + blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); + + /* This is a hardware imposed limit. */ + blk_queue_max_segments(disk->queue, h->maxsgentries); + + blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors); + + blk_queue_softirq_done(disk->queue, cciss_softirq_done); + + disk->queue->queuedata = h; + + blk_queue_logical_block_size(disk->queue, + h->drv[drv_index]->block_size); + + /* Make sure all queue data is written out before */ + /* setting h->drv[drv_index]->queue, as setting this */ + /* allows the interrupt handler to start the queue */ + wmb(); + h->drv[drv_index]->queue = disk->queue; + add_disk(disk); + return 0; + +cleanup_queue: + blk_cleanup_queue(disk->queue); + disk->queue = NULL; +init_queue_failure: + return -1; +} + +/* This function will check the usage_count of the drive to be updated/added. + * If the usage_count is zero and it is a heretofore unknown drive, or, + * the drive's capacity, geometry, or serial number has changed, + * then the drive information will be updated and the disk will be + * re-registered with the kernel. If these conditions don't hold, + * then it will be left alone for the next reboot. The exception to this + * is disk 0 which will always be left registered with the kernel since it + * is also the controller node. Any changes to disk 0 will show up on + * the next reboot. + */ +static void cciss_update_drive_info(ctlr_info_t *h, int drv_index, + int first_time, int via_ioctl) +{ + struct gendisk *disk; + InquiryData_struct *inq_buff = NULL; + unsigned int block_size; + sector_t total_size; + unsigned long flags = 0; + int ret = 0; + drive_info_struct *drvinfo; + + /* Get information about the disk and modify the driver structure */ + inq_buff = kmalloc(sizeof(InquiryData_struct), GFP_KERNEL); + drvinfo = kzalloc(sizeof(*drvinfo), GFP_KERNEL); + if (inq_buff == NULL || drvinfo == NULL) + goto mem_msg; + + /* testing to see if 16-byte CDBs are already being used */ + if (h->cciss_read == CCISS_READ_16) { + cciss_read_capacity_16(h, drv_index, + &total_size, &block_size); + + } else { + cciss_read_capacity(h, drv_index, &total_size, &block_size); + /* if read_capacity returns all F's this volume is >2TB */ + /* in size so we switch to 16-byte CDB's for all */ + /* read/write ops */ + if (total_size == 0xFFFFFFFFULL) { + cciss_read_capacity_16(h, drv_index, + &total_size, &block_size); + h->cciss_read = CCISS_READ_16; + h->cciss_write = CCISS_WRITE_16; + } else { + h->cciss_read = CCISS_READ_10; + h->cciss_write = CCISS_WRITE_10; + } + } + + cciss_geometry_inquiry(h, drv_index, total_size, block_size, + inq_buff, drvinfo); + drvinfo->block_size = block_size; + drvinfo->nr_blocks = total_size + 1; + + cciss_get_device_descr(h, drv_index, drvinfo->vendor, + drvinfo->model, drvinfo->rev); + cciss_get_serial_no(h, drv_index, drvinfo->serial_no, + sizeof(drvinfo->serial_no)); + /* Save the lunid in case we deregister the disk, below. */ + memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, + sizeof(drvinfo->LunID)); + + /* Is it the same disk we already know, and nothing's changed? */ + if (h->drv[drv_index]->raid_level != -1 && + ((memcmp(drvinfo->serial_no, + h->drv[drv_index]->serial_no, 16) == 0) && + drvinfo->block_size == h->drv[drv_index]->block_size && + drvinfo->nr_blocks == h->drv[drv_index]->nr_blocks && + drvinfo->heads == h->drv[drv_index]->heads && + drvinfo->sectors == h->drv[drv_index]->sectors && + drvinfo->cylinders == h->drv[drv_index]->cylinders)) + /* The disk is unchanged, nothing to update */ + goto freeret; + + /* If we get here it's not the same disk, or something's changed, + * so we need to * deregister it, and re-register it, if it's not + * in use. + * If the disk already exists then deregister it before proceeding + * (unless it's the first disk (for the controller node). + */ + if (h->drv[drv_index]->raid_level != -1 && drv_index != 0) { + dev_warn(&h->pdev->dev, "disk %d has changed.\n", drv_index); + spin_lock_irqsave(&h->lock, flags); + h->drv[drv_index]->busy_configuring = 1; + spin_unlock_irqrestore(&h->lock, flags); + + /* deregister_disk sets h->drv[drv_index]->queue = NULL + * which keeps the interrupt handler from starting + * the queue. + */ + ret = deregister_disk(h, drv_index, 0, via_ioctl); + } + + /* If the disk is in use return */ + if (ret) + goto freeret; + + /* Save the new information from cciss_geometry_inquiry + * and serial number inquiry. If the disk was deregistered + * above, then h->drv[drv_index] will be NULL. + */ + if (h->drv[drv_index] == NULL) { + drvinfo->device_initialized = 0; + h->drv[drv_index] = drvinfo; + drvinfo = NULL; /* so it won't be freed below. */ + } else { + /* special case for cxd0 */ + h->drv[drv_index]->block_size = drvinfo->block_size; + h->drv[drv_index]->nr_blocks = drvinfo->nr_blocks; + h->drv[drv_index]->heads = drvinfo->heads; + h->drv[drv_index]->sectors = drvinfo->sectors; + h->drv[drv_index]->cylinders = drvinfo->cylinders; + h->drv[drv_index]->raid_level = drvinfo->raid_level; + memcpy(h->drv[drv_index]->serial_no, drvinfo->serial_no, 16); + memcpy(h->drv[drv_index]->vendor, drvinfo->vendor, + VENDOR_LEN + 1); + memcpy(h->drv[drv_index]->model, drvinfo->model, MODEL_LEN + 1); + memcpy(h->drv[drv_index]->rev, drvinfo->rev, REV_LEN + 1); + } + + ++h->num_luns; + disk = h->gendisk[drv_index]; + set_capacity(disk, h->drv[drv_index]->nr_blocks); + + /* If it's not disk 0 (drv_index != 0) + * or if it was disk 0, but there was previously + * no actual corresponding configured logical drive + * (raid_leve == -1) then we want to update the + * logical drive's information. + */ + if (drv_index || first_time) { + if (cciss_add_disk(h, disk, drv_index) != 0) { + cciss_free_gendisk(h, drv_index); + cciss_free_drive_info(h, drv_index); + dev_warn(&h->pdev->dev, "could not update disk %d\n", + drv_index); + --h->num_luns; + } + } + +freeret: + kfree(inq_buff); + kfree(drvinfo); + return; +mem_msg: + dev_err(&h->pdev->dev, "out of memory\n"); + goto freeret; +} + +/* This function will find the first index of the controllers drive array + * that has a null drv pointer and allocate the drive info struct and + * will return that index This is where new drives will be added. + * If the index to be returned is greater than the highest_lun index for + * the controller then highest_lun is set * to this new index. + * If there are no available indexes or if tha allocation fails, then -1 + * is returned. * "controller_node" is used to know if this is a real + * logical drive, or just the controller node, which determines if this + * counts towards highest_lun. + */ +static int cciss_alloc_drive_info(ctlr_info_t *h, int controller_node) +{ + int i; + drive_info_struct *drv; + + /* Search for an empty slot for our drive info */ + for (i = 0; i < CISS_MAX_LUN; i++) { + + /* if not cxd0 case, and it's occupied, skip it. */ + if (h->drv[i] && i != 0) + continue; + /* + * If it's cxd0 case, and drv is alloc'ed already, and a + * disk is configured there, skip it. + */ + if (i == 0 && h->drv[i] && h->drv[i]->raid_level != -1) + continue; + + /* + * We've found an empty slot. Update highest_lun + * provided this isn't just the fake cxd0 controller node. + */ + if (i > h->highest_lun && !controller_node) + h->highest_lun = i; + + /* If adding a real disk at cxd0, and it's already alloc'ed */ + if (i == 0 && h->drv[i] != NULL) + return i; + + /* + * Found an empty slot, not already alloc'ed. Allocate it. + * Mark it with raid_level == -1, so we know it's new later on. + */ + drv = kzalloc(sizeof(*drv), GFP_KERNEL); + if (!drv) + return -1; + drv->raid_level = -1; /* so we know it's new */ + h->drv[i] = drv; + return i; + } + return -1; +} + +static void cciss_free_drive_info(ctlr_info_t *h, int drv_index) +{ + kfree(h->drv[drv_index]); + h->drv[drv_index] = NULL; +} + +static void cciss_free_gendisk(ctlr_info_t *h, int drv_index) +{ + put_disk(h->gendisk[drv_index]); + h->gendisk[drv_index] = NULL; +} + +/* cciss_add_gendisk finds a free hba[]->drv structure + * and allocates a gendisk if needed, and sets the lunid + * in the drvinfo structure. It returns the index into + * the ->drv[] array, or -1 if none are free. + * is_controller_node indicates whether highest_lun should + * count this disk, or if it's only being added to provide + * a means to talk to the controller in case no logical + * drives have yet been configured. + */ +static int cciss_add_gendisk(ctlr_info_t *h, unsigned char lunid[], + int controller_node) +{ + int drv_index; + + drv_index = cciss_alloc_drive_info(h, controller_node); + if (drv_index == -1) + return -1; + + /*Check if the gendisk needs to be allocated */ + if (!h->gendisk[drv_index]) { + h->gendisk[drv_index] = + alloc_disk(1 << NWD_SHIFT); + if (!h->gendisk[drv_index]) { + dev_err(&h->pdev->dev, + "could not allocate a new disk %d\n", + drv_index); + goto err_free_drive_info; + } + } + memcpy(h->drv[drv_index]->LunID, lunid, + sizeof(h->drv[drv_index]->LunID)); + if (cciss_create_ld_sysfs_entry(h, drv_index)) + goto err_free_disk; + /* Don't need to mark this busy because nobody */ + /* else knows about this disk yet to contend */ + /* for access to it. */ + h->drv[drv_index]->busy_configuring = 0; + wmb(); + return drv_index; + +err_free_disk: + cciss_free_gendisk(h, drv_index); +err_free_drive_info: + cciss_free_drive_info(h, drv_index); + return -1; +} + +/* This is for the special case of a controller which + * has no logical drives. In this case, we still need + * to register a disk so the controller can be accessed + * by the Array Config Utility. + */ +static void cciss_add_controller_node(ctlr_info_t *h) +{ + struct gendisk *disk; + int drv_index; + + if (h->gendisk[0] != NULL) /* already did this? Then bail. */ + return; + + drv_index = cciss_add_gendisk(h, CTLR_LUNID, 1); + if (drv_index == -1) + goto error; + h->drv[drv_index]->block_size = 512; + h->drv[drv_index]->nr_blocks = 0; + h->drv[drv_index]->heads = 0; + h->drv[drv_index]->sectors = 0; + h->drv[drv_index]->cylinders = 0; + h->drv[drv_index]->raid_level = -1; + memset(h->drv[drv_index]->serial_no, 0, 16); + disk = h->gendisk[drv_index]; + if (cciss_add_disk(h, disk, drv_index) == 0) + return; + cciss_free_gendisk(h, drv_index); + cciss_free_drive_info(h, drv_index); +error: + dev_warn(&h->pdev->dev, "could not add disk 0.\n"); + return; +} + +/* This function will add and remove logical drives from the Logical + * drive array of the controller and maintain persistency of ordering + * so that mount points are preserved until the next reboot. This allows + * for the removal of logical drives in the middle of the drive array + * without a re-ordering of those drives. + * INPUT + * h = The controller to perform the operations on + */ +static int rebuild_lun_table(ctlr_info_t *h, int first_time, + int via_ioctl) +{ + int num_luns; + ReportLunData_struct *ld_buff = NULL; + int return_code; + int listlength = 0; + int i; + int drv_found; + int drv_index = 0; + unsigned char lunid[8] = CTLR_LUNID; + unsigned long flags; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* Set busy_configuring flag for this operation */ + spin_lock_irqsave(&h->lock, flags); + if (h->busy_configuring) { + spin_unlock_irqrestore(&h->lock, flags); + return -EBUSY; + } + h->busy_configuring = 1; + spin_unlock_irqrestore(&h->lock, flags); + + ld_buff = kzalloc(sizeof(ReportLunData_struct), GFP_KERNEL); + if (ld_buff == NULL) + goto mem_msg; + + return_code = sendcmd_withirq(h, CISS_REPORT_LOG, ld_buff, + sizeof(ReportLunData_struct), + 0, CTLR_LUNID, TYPE_CMD); + + if (return_code == IO_OK) + listlength = be32_to_cpu(*(__be32 *) ld_buff->LUNListLength); + else { /* reading number of logical volumes failed */ + dev_warn(&h->pdev->dev, + "report logical volume command failed\n"); + listlength = 0; + goto freeret; + } + + num_luns = listlength / 8; /* 8 bytes per entry */ + if (num_luns > CISS_MAX_LUN) { + num_luns = CISS_MAX_LUN; + dev_warn(&h->pdev->dev, "more luns configured" + " on controller than can be handled by" + " this driver.\n"); + } + + if (num_luns == 0) + cciss_add_controller_node(h); + + /* Compare controller drive array to driver's drive array + * to see if any drives are missing on the controller due + * to action of Array Config Utility (user deletes drive) + * and deregister logical drives which have disappeared. + */ + for (i = 0; i <= h->highest_lun; i++) { + int j; + drv_found = 0; + + /* skip holes in the array from already deleted drives */ + if (h->drv[i] == NULL) + continue; + + for (j = 0; j < num_luns; j++) { + memcpy(lunid, &ld_buff->LUN[j][0], sizeof(lunid)); + if (memcmp(h->drv[i]->LunID, lunid, + sizeof(lunid)) == 0) { + drv_found = 1; + break; + } + } + if (!drv_found) { + /* Deregister it from the OS, it's gone. */ + spin_lock_irqsave(&h->lock, flags); + h->drv[i]->busy_configuring = 1; + spin_unlock_irqrestore(&h->lock, flags); + return_code = deregister_disk(h, i, 1, via_ioctl); + if (h->drv[i] != NULL) + h->drv[i]->busy_configuring = 0; + } + } + + /* Compare controller drive array to driver's drive array. + * Check for updates in the drive information and any new drives + * on the controller due to ACU adding logical drives, or changing + * a logical drive's size, etc. Reregister any new/changed drives + */ + for (i = 0; i < num_luns; i++) { + int j; + + drv_found = 0; + + memcpy(lunid, &ld_buff->LUN[i][0], sizeof(lunid)); + /* Find if the LUN is already in the drive array + * of the driver. If so then update its info + * if not in use. If it does not exist then find + * the first free index and add it. + */ + for (j = 0; j <= h->highest_lun; j++) { + if (h->drv[j] != NULL && + memcmp(h->drv[j]->LunID, lunid, + sizeof(h->drv[j]->LunID)) == 0) { + drv_index = j; + drv_found = 1; + break; + } + } + + /* check if the drive was found already in the array */ + if (!drv_found) { + drv_index = cciss_add_gendisk(h, lunid, 0); + if (drv_index == -1) + goto freeret; + } + cciss_update_drive_info(h, drv_index, first_time, via_ioctl); + } /* end for */ + +freeret: + kfree(ld_buff); + h->busy_configuring = 0; + /* We return -1 here to tell the ACU that we have registered/updated + * all of the drives that we can and to keep it from calling us + * additional times. + */ + return -1; +mem_msg: + dev_err(&h->pdev->dev, "out of memory\n"); + h->busy_configuring = 0; + goto freeret; +} + +static void cciss_clear_drive_info(drive_info_struct *drive_info) +{ + /* zero out the disk size info */ + drive_info->nr_blocks = 0; + drive_info->block_size = 0; + drive_info->heads = 0; + drive_info->sectors = 0; + drive_info->cylinders = 0; + drive_info->raid_level = -1; + memset(drive_info->serial_no, 0, sizeof(drive_info->serial_no)); + memset(drive_info->model, 0, sizeof(drive_info->model)); + memset(drive_info->rev, 0, sizeof(drive_info->rev)); + memset(drive_info->vendor, 0, sizeof(drive_info->vendor)); + /* + * don't clear the LUNID though, we need to remember which + * one this one is. + */ +} + +/* This function will deregister the disk and it's queue from the + * kernel. It must be called with the controller lock held and the + * drv structures busy_configuring flag set. It's parameters are: + * + * disk = This is the disk to be deregistered + * drv = This is the drive_info_struct associated with the disk to be + * deregistered. It contains information about the disk used + * by the driver. + * clear_all = This flag determines whether or not the disk information + * is going to be completely cleared out and the highest_lun + * reset. Sometimes we want to clear out information about + * the disk in preparation for re-adding it. In this case + * the highest_lun should be left unchanged and the LunID + * should not be cleared. + * via_ioctl + * This indicates whether we've reached this path via ioctl. + * This affects the maximum usage count allowed for c0d0 to be messed with. + * If this path is reached via ioctl(), then the max_usage_count will + * be 1, as the process calling ioctl() has got to have the device open. + * If we get here via sysfs, then the max usage count will be zero. +*/ +static int deregister_disk(ctlr_info_t *h, int drv_index, + int clear_all, int via_ioctl) +{ + int i; + struct gendisk *disk; + drive_info_struct *drv; + int recalculate_highest_lun; + + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + + drv = h->drv[drv_index]; + disk = h->gendisk[drv_index]; + + /* make sure logical volume is NOT is use */ + if (clear_all || (h->gendisk[0] == disk)) { + if (drv->usage_count > via_ioctl) + return -EBUSY; + } else if (drv->usage_count > 0) + return -EBUSY; + + recalculate_highest_lun = (drv == h->drv[h->highest_lun]); + + /* invalidate the devices and deregister the disk. If it is disk + * zero do not deregister it but just zero out it's values. This + * allows us to delete disk zero but keep the controller registered. + */ + if (h->gendisk[0] != disk) { + struct request_queue *q = disk->queue; + if (disk->flags & GENHD_FL_UP) { + cciss_destroy_ld_sysfs_entry(h, drv_index, 0); + del_gendisk(disk); + } + if (q) + blk_cleanup_queue(q); + /* If clear_all is set then we are deleting the logical + * drive, not just refreshing its info. For drives + * other than disk 0 we will call put_disk. We do not + * do this for disk 0 as we need it to be able to + * configure the controller. + */ + if (clear_all){ + /* This isn't pretty, but we need to find the + * disk in our array and NULL our the pointer. + * This is so that we will call alloc_disk if + * this index is used again later. + */ + for (i=0; i < CISS_MAX_LUN; i++){ + if (h->gendisk[i] == disk) { + h->gendisk[i] = NULL; + break; + } + } + put_disk(disk); + } + } else { + set_capacity(disk, 0); + cciss_clear_drive_info(drv); + } + + --h->num_luns; + + /* if it was the last disk, find the new hightest lun */ + if (clear_all && recalculate_highest_lun) { + int newhighest = -1; + for (i = 0; i <= h->highest_lun; i++) { + /* if the disk has size > 0, it is available */ + if (h->drv[i] && h->drv[i]->heads) + newhighest = i; + } + h->highest_lun = newhighest; + } + return 0; +} + +static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, + size_t size, __u8 page_code, unsigned char *scsi3addr, + int cmd_type) +{ + u64bit buff_dma_handle; + int status = IO_OK; + + c->cmd_type = CMD_IOCTL_PEND; + c->Header.ReplyQueue = 0; + if (buff != NULL) { + c->Header.SGList = 1; + c->Header.SGTotal = 1; + } else { + c->Header.SGList = 0; + c->Header.SGTotal = 0; + } + c->Header.Tag.lower = c->busaddr; + memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8); + + c->Request.Type.Type = cmd_type; + if (cmd_type == TYPE_CMD) { + switch (cmd) { + case CISS_INQUIRY: + /* are we trying to read a vital product page */ + if (page_code != 0) { + c->Request.CDB[1] = 0x01; + c->Request.CDB[2] = page_code; + } + c->Request.CDBLen = 6; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; + c->Request.Timeout = 0; + c->Request.CDB[0] = CISS_INQUIRY; + c->Request.CDB[4] = size & 0xFF; + break; + case CISS_REPORT_LOG: + case CISS_REPORT_PHYS: + /* Talking to controller so It's a physical command + mode = 00 target = 0. Nothing to write. + */ + c->Request.CDBLen = 12; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; + c->Request.Timeout = 0; + c->Request.CDB[0] = cmd; + c->Request.CDB[6] = (size >> 24) & 0xFF; /* MSB */ + c->Request.CDB[7] = (size >> 16) & 0xFF; + c->Request.CDB[8] = (size >> 8) & 0xFF; + c->Request.CDB[9] = size & 0xFF; + break; + + case CCISS_READ_CAPACITY: + c->Request.CDBLen = 10; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; + c->Request.Timeout = 0; + c->Request.CDB[0] = cmd; + break; + case CCISS_READ_CAPACITY_16: + c->Request.CDBLen = 16; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_READ; + c->Request.Timeout = 0; + c->Request.CDB[0] = cmd; + c->Request.CDB[1] = 0x10; + c->Request.CDB[10] = (size >> 24) & 0xFF; + c->Request.CDB[11] = (size >> 16) & 0xFF; + c->Request.CDB[12] = (size >> 8) & 0xFF; + c->Request.CDB[13] = size & 0xFF; + c->Request.Timeout = 0; + c->Request.CDB[0] = cmd; + break; + case CCISS_CACHE_FLUSH: + c->Request.CDBLen = 12; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_WRITE; + c->Request.Timeout = 0; + c->Request.CDB[0] = BMIC_WRITE; + c->Request.CDB[6] = BMIC_CACHE_FLUSH; + c->Request.CDB[7] = (size >> 8) & 0xFF; + c->Request.CDB[8] = size & 0xFF; + break; + case TEST_UNIT_READY: + c->Request.CDBLen = 6; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_NONE; + c->Request.Timeout = 0; + break; + default: + dev_warn(&h->pdev->dev, "Unknown Command 0x%c\n", cmd); + return IO_ERROR; + } + } else if (cmd_type == TYPE_MSG) { + switch (cmd) { + case CCISS_ABORT_MSG: + c->Request.CDBLen = 12; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_WRITE; + c->Request.Timeout = 0; + c->Request.CDB[0] = cmd; /* abort */ + c->Request.CDB[1] = 0; /* abort a command */ + /* buff contains the tag of the command to abort */ + memcpy(&c->Request.CDB[4], buff, 8); + break; + case CCISS_RESET_MSG: + c->Request.CDBLen = 16; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_NONE; + c->Request.Timeout = 0; + memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB)); + c->Request.CDB[0] = cmd; /* reset */ + c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET; + break; + case CCISS_NOOP_MSG: + c->Request.CDBLen = 1; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = XFER_WRITE; + c->Request.Timeout = 0; + c->Request.CDB[0] = cmd; + break; + default: + dev_warn(&h->pdev->dev, + "unknown message type %d\n", cmd); + return IO_ERROR; + } + } else { + dev_warn(&h->pdev->dev, "unknown command type %d\n", cmd_type); + return IO_ERROR; + } + /* Fill in the scatter gather information */ + if (size > 0) { + buff_dma_handle.val = (__u64) pci_map_single(h->pdev, + buff, size, + PCI_DMA_BIDIRECTIONAL); + c->SG[0].Addr.lower = buff_dma_handle.val32.lower; + c->SG[0].Addr.upper = buff_dma_handle.val32.upper; + c->SG[0].Len = size; + c->SG[0].Ext = 0; /* we are not chaining */ + } + return status; +} + +static int cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr, + u8 reset_type) +{ + CommandList_struct *c; + int return_status; + + c = cmd_alloc(h); + if (!c) + return -ENOMEM; + return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0, + CTLR_LUNID, TYPE_MSG); + c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */ + if (return_status != IO_OK) { + cmd_special_free(h, c); + return return_status; + } + c->waiting = NULL; + enqueue_cmd_and_start_io(h, c); + /* Don't wait for completion, the reset won't complete. Don't free + * the command either. This is the last command we will send before + * re-initializing everything, so it doesn't matter and won't leak. + */ + return 0; +} + +static int check_target_status(ctlr_info_t *h, CommandList_struct *c) +{ + switch (c->err_info->ScsiStatus) { + case SAM_STAT_GOOD: + return IO_OK; + case SAM_STAT_CHECK_CONDITION: + switch (0xf & c->err_info->SenseInfo[2]) { + case 0: return IO_OK; /* no sense */ + case 1: return IO_OK; /* recovered error */ + default: + if (check_for_unit_attention(h, c)) + return IO_NEEDS_RETRY; + dev_warn(&h->pdev->dev, "cmd 0x%02x " + "check condition, sense key = 0x%02x\n", + c->Request.CDB[0], c->err_info->SenseInfo[2]); + } + break; + default: + dev_warn(&h->pdev->dev, "cmd 0x%02x" + "scsi status = 0x%02x\n", + c->Request.CDB[0], c->err_info->ScsiStatus); + break; + } + return IO_ERROR; +} + +static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c) +{ + int return_status = IO_OK; + + if (c->err_info->CommandStatus == CMD_SUCCESS) + return IO_OK; + + switch (c->err_info->CommandStatus) { + case CMD_TARGET_STATUS: + return_status = check_target_status(h, c); + break; + case CMD_DATA_UNDERRUN: + case CMD_DATA_OVERRUN: + /* expected for inquiry and report lun commands */ + break; + case CMD_INVALID: + dev_warn(&h->pdev->dev, "cmd 0x%02x is " + "reported invalid\n", c->Request.CDB[0]); + return_status = IO_ERROR; + break; + case CMD_PROTOCOL_ERR: + dev_warn(&h->pdev->dev, "cmd 0x%02x has " + "protocol error\n", c->Request.CDB[0]); + return_status = IO_ERROR; + break; + case CMD_HARDWARE_ERR: + dev_warn(&h->pdev->dev, "cmd 0x%02x had " + " hardware error\n", c->Request.CDB[0]); + return_status = IO_ERROR; + break; + case CMD_CONNECTION_LOST: + dev_warn(&h->pdev->dev, "cmd 0x%02x had " + "connection lost\n", c->Request.CDB[0]); + return_status = IO_ERROR; + break; + case CMD_ABORTED: + dev_warn(&h->pdev->dev, "cmd 0x%02x was " + "aborted\n", c->Request.CDB[0]); + return_status = IO_ERROR; + break; + case CMD_ABORT_FAILED: + dev_warn(&h->pdev->dev, "cmd 0x%02x reports " + "abort failed\n", c->Request.CDB[0]); + return_status = IO_ERROR; + break; + case CMD_UNSOLICITED_ABORT: + dev_warn(&h->pdev->dev, "unsolicited abort 0x%02x\n", + c->Request.CDB[0]); + return_status = IO_NEEDS_RETRY; + break; + case CMD_UNABORTABLE: + dev_warn(&h->pdev->dev, "cmd unabortable\n"); + return_status = IO_ERROR; + break; + default: + dev_warn(&h->pdev->dev, "cmd 0x%02x returned " + "unknown status %x\n", c->Request.CDB[0], + c->err_info->CommandStatus); + return_status = IO_ERROR; + } + return return_status; +} + +static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c, + int attempt_retry) +{ + DECLARE_COMPLETION_ONSTACK(wait); + u64bit buff_dma_handle; + int return_status = IO_OK; + +resend_cmd2: + c->waiting = &wait; + enqueue_cmd_and_start_io(h, c); + + wait_for_completion(&wait); + + if (c->err_info->CommandStatus == 0 || !attempt_retry) + goto command_done; + + return_status = process_sendcmd_error(h, c); + + if (return_status == IO_NEEDS_RETRY && + c->retry_count < MAX_CMD_RETRIES) { + dev_warn(&h->pdev->dev, "retrying 0x%02x\n", + c->Request.CDB[0]); + c->retry_count++; + /* erase the old error information */ + memset(c->err_info, 0, sizeof(ErrorInfo_struct)); + return_status = IO_OK; + reinit_completion(&wait); + goto resend_cmd2; + } + +command_done: + /* unlock the buffers from DMA */ + buff_dma_handle.val32.lower = c->SG[0].Addr.lower; + buff_dma_handle.val32.upper = c->SG[0].Addr.upper; + pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val, + c->SG[0].Len, PCI_DMA_BIDIRECTIONAL); + return return_status; +} + +static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size, + __u8 page_code, unsigned char scsi3addr[], + int cmd_type) +{ + CommandList_struct *c; + int return_status; + + c = cmd_special_alloc(h); + if (!c) + return -ENOMEM; + return_status = fill_cmd(h, c, cmd, buff, size, page_code, + scsi3addr, cmd_type); + if (return_status == IO_OK) + return_status = sendcmd_withirq_core(h, c, 1); + + cmd_special_free(h, c); + return return_status; +} + +static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol, + sector_t total_size, + unsigned int block_size, + InquiryData_struct *inq_buff, + drive_info_struct *drv) +{ + int return_code; + unsigned long t; + unsigned char scsi3addr[8]; + + memset(inq_buff, 0, sizeof(InquiryData_struct)); + log_unit_to_scsi3addr(h, scsi3addr, logvol); + return_code = sendcmd_withirq(h, CISS_INQUIRY, inq_buff, + sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD); + if (return_code == IO_OK) { + if (inq_buff->data_byte[8] == 0xFF) { + dev_warn(&h->pdev->dev, + "reading geometry failed, volume " + "does not support reading geometry\n"); + drv->heads = 255; + drv->sectors = 32; /* Sectors per track */ + drv->cylinders = total_size + 1; + drv->raid_level = RAID_UNKNOWN; + } else { + drv->heads = inq_buff->data_byte[6]; + drv->sectors = inq_buff->data_byte[7]; + drv->cylinders = (inq_buff->data_byte[4] & 0xff) << 8; + drv->cylinders += inq_buff->data_byte[5]; + drv->raid_level = inq_buff->data_byte[8]; + } + drv->block_size = block_size; + drv->nr_blocks = total_size + 1; + t = drv->heads * drv->sectors; + if (t > 1) { + sector_t real_size = total_size + 1; + unsigned long rem = sector_div(real_size, t); + if (rem) + real_size++; + drv->cylinders = real_size; + } + } else { /* Get geometry failed */ + dev_warn(&h->pdev->dev, "reading geometry failed\n"); + } +} + +static void +cciss_read_capacity(ctlr_info_t *h, int logvol, sector_t *total_size, + unsigned int *block_size) +{ + ReadCapdata_struct *buf; + int return_code; + unsigned char scsi3addr[8]; + + buf = kzalloc(sizeof(ReadCapdata_struct), GFP_KERNEL); + if (!buf) { + dev_warn(&h->pdev->dev, "out of memory\n"); + return; + } + + log_unit_to_scsi3addr(h, scsi3addr, logvol); + return_code = sendcmd_withirq(h, CCISS_READ_CAPACITY, buf, + sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD); + if (return_code == IO_OK) { + *total_size = be32_to_cpu(*(__be32 *) buf->total_size); + *block_size = be32_to_cpu(*(__be32 *) buf->block_size); + } else { /* read capacity command failed */ + dev_warn(&h->pdev->dev, "read capacity failed\n"); + *total_size = 0; + *block_size = BLOCK_SIZE; + } + kfree(buf); +} + +static void cciss_read_capacity_16(ctlr_info_t *h, int logvol, + sector_t *total_size, unsigned int *block_size) +{ + ReadCapdata_struct_16 *buf; + int return_code; + unsigned char scsi3addr[8]; + + buf = kzalloc(sizeof(ReadCapdata_struct_16), GFP_KERNEL); + if (!buf) { + dev_warn(&h->pdev->dev, "out of memory\n"); + return; + } + + log_unit_to_scsi3addr(h, scsi3addr, logvol); + return_code = sendcmd_withirq(h, CCISS_READ_CAPACITY_16, + buf, sizeof(ReadCapdata_struct_16), + 0, scsi3addr, TYPE_CMD); + if (return_code == IO_OK) { + *total_size = be64_to_cpu(*(__be64 *) buf->total_size); + *block_size = be32_to_cpu(*(__be32 *) buf->block_size); + } else { /* read capacity command failed */ + dev_warn(&h->pdev->dev, "read capacity failed\n"); + *total_size = 0; + *block_size = BLOCK_SIZE; + } + dev_info(&h->pdev->dev, " blocks= %llu block_size= %d\n", + (unsigned long long)*total_size+1, *block_size); + kfree(buf); +} + +static int cciss_revalidate(struct gendisk *disk) +{ + ctlr_info_t *h = get_host(disk); + drive_info_struct *drv = get_drv(disk); + int logvol; + int FOUND = 0; + unsigned int block_size; + sector_t total_size; + InquiryData_struct *inq_buff = NULL; + + for (logvol = 0; logvol <= h->highest_lun; logvol++) { + if (!h->drv[logvol]) + continue; + if (memcmp(h->drv[logvol]->LunID, drv->LunID, + sizeof(drv->LunID)) == 0) { + FOUND = 1; + break; + } + } + + if (!FOUND) + return 1; + + inq_buff = kmalloc(sizeof(InquiryData_struct), GFP_KERNEL); + if (inq_buff == NULL) { + dev_warn(&h->pdev->dev, "out of memory\n"); + return 1; + } + if (h->cciss_read == CCISS_READ_10) { + cciss_read_capacity(h, logvol, + &total_size, &block_size); + } else { + cciss_read_capacity_16(h, logvol, + &total_size, &block_size); + } + cciss_geometry_inquiry(h, logvol, total_size, block_size, + inq_buff, drv); + + blk_queue_logical_block_size(drv->queue, drv->block_size); + set_capacity(disk, drv->nr_blocks); + + kfree(inq_buff); + return 0; +} + +/* + * Map (physical) PCI mem into (virtual) kernel space + */ +static void __iomem *remap_pci_mem(ulong base, ulong size) +{ + ulong page_base = ((ulong) base) & PAGE_MASK; + ulong page_offs = ((ulong) base) - page_base; + void __iomem *page_remapped = ioremap(page_base, page_offs + size); + + return page_remapped ? (page_remapped + page_offs) : NULL; +} + +/* + * Takes jobs of the Q and sends them to the hardware, then puts it on + * the Q to wait for completion. + */ +static void start_io(ctlr_info_t *h) +{ + CommandList_struct *c; + + while (!list_empty(&h->reqQ)) { + c = list_entry(h->reqQ.next, CommandList_struct, list); + /* can't do anything if fifo is full */ + if ((h->access.fifo_full(h))) { + dev_warn(&h->pdev->dev, "fifo full\n"); + break; + } + + /* Get the first entry from the Request Q */ + removeQ(c); + h->Qdepth--; + + /* Tell the controller execute command */ + h->access.submit_command(h, c); + + /* Put job onto the completed Q */ + addQ(&h->cmpQ, c); + } +} + +/* Assumes that h->lock is held. */ +/* Zeros out the error record and then resends the command back */ +/* to the controller */ +static inline void resend_cciss_cmd(ctlr_info_t *h, CommandList_struct *c) +{ + /* erase the old error information */ + memset(c->err_info, 0, sizeof(ErrorInfo_struct)); + + /* add it to software queue and then send it to the controller */ + addQ(&h->reqQ, c); + h->Qdepth++; + if (h->Qdepth > h->maxQsinceinit) + h->maxQsinceinit = h->Qdepth; + + start_io(h); +} + +static inline unsigned int make_status_bytes(unsigned int scsi_status_byte, + unsigned int msg_byte, unsigned int host_byte, + unsigned int driver_byte) +{ + /* inverse of macros in scsi.h */ + return (scsi_status_byte & 0xff) | + ((msg_byte & 0xff) << 8) | + ((host_byte & 0xff) << 16) | + ((driver_byte & 0xff) << 24); +} + +static inline int evaluate_target_status(ctlr_info_t *h, + CommandList_struct *cmd, int *retry_cmd) +{ + unsigned char sense_key; + unsigned char status_byte, msg_byte, host_byte, driver_byte; + int error_value; + + *retry_cmd = 0; + /* If we get in here, it means we got "target status", that is, scsi status */ + status_byte = cmd->err_info->ScsiStatus; + driver_byte = DRIVER_OK; + msg_byte = cmd->err_info->CommandStatus; /* correct? seems too device specific */ + + if (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) + host_byte = DID_PASSTHROUGH; + else + host_byte = DID_OK; + + error_value = make_status_bytes(status_byte, msg_byte, + host_byte, driver_byte); + + if (cmd->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) { + if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) + dev_warn(&h->pdev->dev, "cmd %p " + "has SCSI Status 0x%x\n", + cmd, cmd->err_info->ScsiStatus); + return error_value; + } + + /* check the sense key */ + sense_key = 0xf & cmd->err_info->SenseInfo[2]; + /* no status or recovered error */ + if (((sense_key == 0x0) || (sense_key == 0x1)) && + (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC)) + error_value = 0; + + if (check_for_unit_attention(h, cmd)) { + *retry_cmd = !(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC); + return 0; + } + + /* Not SG_IO or similar? */ + if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) { + if (error_value != 0) + dev_warn(&h->pdev->dev, "cmd %p has CHECK CONDITION" + " sense key = 0x%x\n", cmd, sense_key); + return error_value; + } + + /* SG_IO or similar, copy sense data back */ + if (cmd->rq->sense) { + if (cmd->rq->sense_len > cmd->err_info->SenseLen) + cmd->rq->sense_len = cmd->err_info->SenseLen; + memcpy(cmd->rq->sense, cmd->err_info->SenseInfo, + cmd->rq->sense_len); + } else + cmd->rq->sense_len = 0; + + return error_value; +} + +/* checks the status of the job and calls complete buffers to mark all + * buffers for the completed job. Note that this function does not need + * to hold the hba/queue lock. + */ +static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, + int timeout) +{ + int retry_cmd = 0; + struct request *rq = cmd->rq; + + rq->errors = 0; + + if (timeout) + rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT); + + if (cmd->err_info->CommandStatus == 0) /* no error has occurred */ + goto after_error_processing; + + switch (cmd->err_info->CommandStatus) { + case CMD_TARGET_STATUS: + rq->errors = evaluate_target_status(h, cmd, &retry_cmd); + break; + case CMD_DATA_UNDERRUN: + if (cmd->rq->cmd_type == REQ_TYPE_FS) { + dev_warn(&h->pdev->dev, "cmd %p has" + " completed with data underrun " + "reported\n", cmd); + cmd->rq->resid_len = cmd->err_info->ResidualCnt; + } + break; + case CMD_DATA_OVERRUN: + if (cmd->rq->cmd_type == REQ_TYPE_FS) + dev_warn(&h->pdev->dev, "cciss: cmd %p has" + " completed with data overrun " + "reported\n", cmd); + break; + case CMD_INVALID: + dev_warn(&h->pdev->dev, "cciss: cmd %p is " + "reported invalid\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + break; + case CMD_PROTOCOL_ERR: + dev_warn(&h->pdev->dev, "cciss: cmd %p has " + "protocol error\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + break; + case CMD_HARDWARE_ERR: + dev_warn(&h->pdev->dev, "cciss: cmd %p had " + " hardware error\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + break; + case CMD_CONNECTION_LOST: + dev_warn(&h->pdev->dev, "cciss: cmd %p had " + "connection lost\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + break; + case CMD_ABORTED: + dev_warn(&h->pdev->dev, "cciss: cmd %p was " + "aborted\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ABORT); + break; + case CMD_ABORT_FAILED: + dev_warn(&h->pdev->dev, "cciss: cmd %p reports " + "abort failed\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + break; + case CMD_UNSOLICITED_ABORT: + dev_warn(&h->pdev->dev, "cciss%d: unsolicited " + "abort %p\n", h->ctlr, cmd); + if (cmd->retry_count < MAX_CMD_RETRIES) { + retry_cmd = 1; + dev_warn(&h->pdev->dev, "retrying %p\n", cmd); + cmd->retry_count++; + } else + dev_warn(&h->pdev->dev, + "%p retried too many times\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ABORT); + break; + case CMD_TIMEOUT: + dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + break; + case CMD_UNABORTABLE: + dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ? + DID_PASSTHROUGH : DID_ERROR); + break; + default: + dev_warn(&h->pdev->dev, "cmd %p returned " + "unknown status %x\n", cmd, + cmd->err_info->CommandStatus); + rq->errors = make_status_bytes(SAM_STAT_GOOD, + cmd->err_info->CommandStatus, DRIVER_OK, + (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + DID_PASSTHROUGH : DID_ERROR); + } + +after_error_processing: + + /* We need to return this command */ + if (retry_cmd) { + resend_cciss_cmd(h, cmd); + return; + } + cmd->rq->completion_data = cmd; + blk_complete_request(cmd->rq); +} + +static inline u32 cciss_tag_contains_index(u32 tag) +{ +#define DIRECT_LOOKUP_BIT 0x10 + return tag & DIRECT_LOOKUP_BIT; +} + +static inline u32 cciss_tag_to_index(u32 tag) +{ +#define DIRECT_LOOKUP_SHIFT 5 + return tag >> DIRECT_LOOKUP_SHIFT; +} + +static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag) +{ +#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1) +#define CCISS_SIMPLE_ERROR_BITS 0x03 + if (likely(h->transMethod & CFGTBL_Trans_Performant)) + return tag & ~CCISS_PERF_ERROR_BITS; + return tag & ~CCISS_SIMPLE_ERROR_BITS; +} + +static inline void cciss_mark_tag_indexed(u32 *tag) +{ + *tag |= DIRECT_LOOKUP_BIT; +} + +static inline void cciss_set_tag_index(u32 *tag, u32 index) +{ + *tag |= (index << DIRECT_LOOKUP_SHIFT); +} + +/* + * Get a request and submit it to the controller. + */ +static void do_cciss_request(struct request_queue *q) +{ + ctlr_info_t *h = q->queuedata; + CommandList_struct *c; + sector_t start_blk; + int seg; + struct request *creq; + u64bit temp64; + struct scatterlist *tmp_sg; + SGDescriptor_struct *curr_sg; + drive_info_struct *drv; + int i, dir; + int sg_index = 0; + int chained = 0; + + queue: + creq = blk_peek_request(q); + if (!creq) + goto startio; + + BUG_ON(creq->nr_phys_segments > h->maxsgentries); + + c = cmd_alloc(h); + if (!c) + goto full; + + blk_start_request(creq); + + tmp_sg = h->scatter_list[c->cmdindex]; + spin_unlock_irq(q->queue_lock); + + c->cmd_type = CMD_RWREQ; + c->rq = creq; + + /* fill in the request */ + drv = creq->rq_disk->private_data; + c->Header.ReplyQueue = 0; /* unused in simple mode */ + /* got command from pool, so use the command block index instead */ + /* for direct lookups. */ + /* The first 2 bits are reserved for controller error reporting. */ + cciss_set_tag_index(&c->Header.Tag.lower, c->cmdindex); + cciss_mark_tag_indexed(&c->Header.Tag.lower); + memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID)); + c->Request.CDBLen = 10; /* 12 byte commands not in FW yet; */ + c->Request.Type.Type = TYPE_CMD; /* It is a command. */ + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = + (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE; + c->Request.Timeout = 0; /* Don't time out */ + c->Request.CDB[0] = + (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write; + start_blk = blk_rq_pos(creq); + dev_dbg(&h->pdev->dev, "sector =%d nr_sectors=%d\n", + (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); + sg_init_table(tmp_sg, h->maxsgentries); + seg = blk_rq_map_sg(q, creq, tmp_sg); + + /* get the DMA records for the setup */ + if (c->Request.Type.Direction == XFER_READ) + dir = PCI_DMA_FROMDEVICE; + else + dir = PCI_DMA_TODEVICE; + + curr_sg = c->SG; + sg_index = 0; + chained = 0; + + for (i = 0; i < seg; i++) { + if (((sg_index+1) == (h->max_cmd_sgentries)) && + !chained && ((seg - i) > 1)) { + /* Point to next chain block. */ + curr_sg = h->cmd_sg_list[c->cmdindex]; + sg_index = 0; + chained = 1; + } + curr_sg[sg_index].Len = tmp_sg[i].length; + temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), + tmp_sg[i].offset, + tmp_sg[i].length, dir); + curr_sg[sg_index].Addr.lower = temp64.val32.lower; + curr_sg[sg_index].Addr.upper = temp64.val32.upper; + curr_sg[sg_index].Ext = 0; /* we are not chaining */ + ++sg_index; + } + if (chained) + cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex], + (seg - (h->max_cmd_sgentries - 1)) * + sizeof(SGDescriptor_struct)); + + /* track how many SG entries we are using */ + if (seg > h->maxSG) + h->maxSG = seg; + + dev_dbg(&h->pdev->dev, "Submitting %u sectors in %d segments " + "chained[%d]\n", + blk_rq_sectors(creq), seg, chained); + + c->Header.SGTotal = seg + chained; + if (seg <= h->max_cmd_sgentries) + c->Header.SGList = c->Header.SGTotal; + else + c->Header.SGList = h->max_cmd_sgentries; + set_performant_mode(h, c); + + if (likely(creq->cmd_type == REQ_TYPE_FS)) { + if(h->cciss_read == CCISS_READ_10) { + c->Request.CDB[1] = 0; + c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */ + c->Request.CDB[3] = (start_blk >> 16) & 0xff; + c->Request.CDB[4] = (start_blk >> 8) & 0xff; + c->Request.CDB[5] = start_blk & 0xff; + c->Request.CDB[6] = 0; /* (sect >> 24) & 0xff; MSB */ + c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff; + c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff; + c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0; + } else { + u32 upper32 = upper_32_bits(start_blk); + + c->Request.CDBLen = 16; + c->Request.CDB[1]= 0; + c->Request.CDB[2]= (upper32 >> 24) & 0xff; /* MSB */ + c->Request.CDB[3]= (upper32 >> 16) & 0xff; + c->Request.CDB[4]= (upper32 >> 8) & 0xff; + c->Request.CDB[5]= upper32 & 0xff; + c->Request.CDB[6]= (start_blk >> 24) & 0xff; + c->Request.CDB[7]= (start_blk >> 16) & 0xff; + c->Request.CDB[8]= (start_blk >> 8) & 0xff; + c->Request.CDB[9]= start_blk & 0xff; + c->Request.CDB[10]= (blk_rq_sectors(creq) >> 24) & 0xff; + c->Request.CDB[11]= (blk_rq_sectors(creq) >> 16) & 0xff; + c->Request.CDB[12]= (blk_rq_sectors(creq) >> 8) & 0xff; + c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff; + c->Request.CDB[14] = c->Request.CDB[15] = 0; + } + } else if (creq->cmd_type == REQ_TYPE_BLOCK_PC) { + c->Request.CDBLen = creq->cmd_len; + memcpy(c->Request.CDB, creq->cmd, BLK_MAX_CDB); + } else { + dev_warn(&h->pdev->dev, "bad request type %d\n", + creq->cmd_type); + BUG(); + } + + spin_lock_irq(q->queue_lock); + + addQ(&h->reqQ, c); + h->Qdepth++; + if (h->Qdepth > h->maxQsinceinit) + h->maxQsinceinit = h->Qdepth; + + goto queue; +full: + blk_stop_queue(q); +startio: + /* We will already have the driver lock here so not need + * to lock it. + */ + start_io(h); +} + +static inline unsigned long get_next_completion(ctlr_info_t *h) +{ + return h->access.command_completed(h); +} + +static inline int interrupt_pending(ctlr_info_t *h) +{ + return h->access.intr_pending(h); +} + +static inline long interrupt_not_for_us(ctlr_info_t *h) +{ + return ((h->access.intr_pending(h) == 0) || + (h->interrupts_enabled == 0)); +} + +static inline int bad_tag(ctlr_info_t *h, u32 tag_index, + u32 raw_tag) +{ + if (unlikely(tag_index >= h->nr_cmds)) { + dev_warn(&h->pdev->dev, "bad tag 0x%08x ignored.\n", raw_tag); + return 1; + } + return 0; +} + +static inline void finish_cmd(ctlr_info_t *h, CommandList_struct *c, + u32 raw_tag) +{ + removeQ(c); + if (likely(c->cmd_type == CMD_RWREQ)) + complete_command(h, c, 0); + else if (c->cmd_type == CMD_IOCTL_PEND) + complete(c->waiting); +#ifdef CONFIG_CISS_SCSI_TAPE + else if (c->cmd_type == CMD_SCSI) + complete_scsi_command(c, 0, raw_tag); +#endif +} + +static inline u32 next_command(ctlr_info_t *h) +{ + u32 a; + + if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant))) + return h->access.command_completed(h); + + if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) { + a = *(h->reply_pool_head); /* Next cmd in ring buffer */ + (h->reply_pool_head)++; + h->commands_outstanding--; + } else { + a = FIFO_EMPTY; + } + /* Check for wraparound */ + if (h->reply_pool_head == (h->reply_pool + h->max_commands)) { + h->reply_pool_head = h->reply_pool; + h->reply_pool_wraparound ^= 1; + } + return a; +} + +/* process completion of an indexed ("direct lookup") command */ +static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag) +{ + u32 tag_index; + CommandList_struct *c; + + tag_index = cciss_tag_to_index(raw_tag); + if (bad_tag(h, tag_index, raw_tag)) + return next_command(h); + c = h->cmd_pool + tag_index; + finish_cmd(h, c, raw_tag); + return next_command(h); +} + +/* process completion of a non-indexed command */ +static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag) +{ + CommandList_struct *c = NULL; + __u32 busaddr_masked, tag_masked; + + tag_masked = cciss_tag_discard_error_bits(h, raw_tag); + list_for_each_entry(c, &h->cmpQ, list) { + busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr); + if (busaddr_masked == tag_masked) { + finish_cmd(h, c, raw_tag); + return next_command(h); + } + } + bad_tag(h, h->nr_cmds + 1, raw_tag); + return next_command(h); +} + +/* Some controllers, like p400, will give us one interrupt + * after a soft reset, even if we turned interrupts off. + * Only need to check for this in the cciss_xxx_discard_completions + * functions. + */ +static int ignore_bogus_interrupt(ctlr_info_t *h) +{ + if (likely(!reset_devices)) + return 0; + + if (likely(h->interrupts_enabled)) + return 0; + + dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled " + "(known firmware bug.) Ignoring.\n"); + + return 1; +} + +static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (ignore_bogus_interrupt(h)) + return IRQ_NONE; + + if (interrupt_not_for_us(h)) + return IRQ_NONE; + spin_lock_irqsave(&h->lock, flags); + while (interrupt_pending(h)) { + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) + raw_tag = next_command(h); + } + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (ignore_bogus_interrupt(h)) + return IRQ_NONE; + + spin_lock_irqsave(&h->lock, flags); + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) + raw_tag = next_command(h); + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t do_cciss_intx(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (interrupt_not_for_us(h)) + return IRQ_NONE; + spin_lock_irqsave(&h->lock, flags); + while (interrupt_pending(h)) { + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) { + if (cciss_tag_contains_index(raw_tag)) + raw_tag = process_indexed_cmd(h, raw_tag); + else + raw_tag = process_nonindexed_cmd(h, raw_tag); + } + } + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + +/* Add a second interrupt handler for MSI/MSI-X mode. In this mode we never + * check the interrupt pending register because it is not set. + */ +static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + spin_lock_irqsave(&h->lock, flags); + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) { + if (cciss_tag_contains_index(raw_tag)) + raw_tag = process_indexed_cmd(h, raw_tag); + else + raw_tag = process_nonindexed_cmd(h, raw_tag); + } + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + +/** + * add_to_scan_list() - add controller to rescan queue + * @h: Pointer to the controller. + * + * Adds the controller to the rescan queue if not already on the queue. + * + * returns 1 if added to the queue, 0 if skipped (could be on the + * queue already, or the controller could be initializing or shutting + * down). + **/ +static int add_to_scan_list(struct ctlr_info *h) +{ + struct ctlr_info *test_h; + int found = 0; + int ret = 0; + + if (h->busy_initializing) + return 0; + + if (!mutex_trylock(&h->busy_shutting_down)) + return 0; + + mutex_lock(&scan_mutex); + list_for_each_entry(test_h, &scan_q, scan_list) { + if (test_h == h) { + found = 1; + break; + } + } + if (!found && !h->busy_scanning) { + reinit_completion(&h->scan_wait); + list_add_tail(&h->scan_list, &scan_q); + ret = 1; + } + mutex_unlock(&scan_mutex); + mutex_unlock(&h->busy_shutting_down); + + return ret; +} + +/** + * remove_from_scan_list() - remove controller from rescan queue + * @h: Pointer to the controller. + * + * Removes the controller from the rescan queue if present. Blocks if + * the controller is currently conducting a rescan. The controller + * can be in one of three states: + * 1. Doesn't need a scan + * 2. On the scan list, but not scanning yet (we remove it) + * 3. Busy scanning (and not on the list). In this case we want to wait for + * the scan to complete to make sure the scanning thread for this + * controller is completely idle. + **/ +static void remove_from_scan_list(struct ctlr_info *h) +{ + struct ctlr_info *test_h, *tmp_h; + + mutex_lock(&scan_mutex); + list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { + if (test_h == h) { /* state 2. */ + list_del(&h->scan_list); + complete_all(&h->scan_wait); + mutex_unlock(&scan_mutex); + return; + } + } + if (h->busy_scanning) { /* state 3. */ + mutex_unlock(&scan_mutex); + wait_for_completion(&h->scan_wait); + } else { /* state 1, nothing to do. */ + mutex_unlock(&scan_mutex); + } +} + +/** + * scan_thread() - kernel thread used to rescan controllers + * @data: Ignored. + * + * A kernel thread used scan for drive topology changes on + * controllers. The thread processes only one controller at a time + * using a queue. Controllers are added to the queue using + * add_to_scan_list() and removed from the queue either after done + * processing or using remove_from_scan_list(). + * + * returns 0. + **/ +static int scan_thread(void *data) +{ + struct ctlr_info *h; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + if (kthread_should_stop()) + break; + + while (1) { + mutex_lock(&scan_mutex); + if (list_empty(&scan_q)) { + mutex_unlock(&scan_mutex); + break; + } + + h = list_entry(scan_q.next, + struct ctlr_info, + scan_list); + list_del(&h->scan_list); + h->busy_scanning = 1; + mutex_unlock(&scan_mutex); + + rebuild_lun_table(h, 0, 0); + complete_all(&h->scan_wait); + mutex_lock(&scan_mutex); + h->busy_scanning = 0; + mutex_unlock(&scan_mutex); + } + } + + return 0; +} + +static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c) +{ + if (c->err_info->SenseInfo[2] != UNIT_ATTENTION) + return 0; + + switch (c->err_info->SenseInfo[12]) { + case STATE_CHANGED: + dev_warn(&h->pdev->dev, "a state change " + "detected, command retried\n"); + return 1; + break; + case LUN_FAILED: + dev_warn(&h->pdev->dev, "LUN failure " + "detected, action required\n"); + return 1; + break; + case REPORT_LUNS_CHANGED: + dev_warn(&h->pdev->dev, "report LUN data changed\n"); + /* + * Here, we could call add_to_scan_list and wake up the scan thread, + * except that it's quite likely that we will get more than one + * REPORT_LUNS_CHANGED condition in quick succession, which means + * that those which occur after the first one will likely happen + * *during* the scan_thread's rescan. And the rescan code is not + * robust enough to restart in the middle, undoing what it has already + * done, and it's not clear that it's even possible to do this, since + * part of what it does is notify the block layer, which starts + * doing it's own i/o to read partition tables and so on, and the + * driver doesn't have visibility to know what might need undoing. + * In any event, if possible, it is horribly complicated to get right + * so we just don't do it for now. + * + * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012. + */ + return 1; + break; + case POWER_OR_RESET: + dev_warn(&h->pdev->dev, + "a power on or device reset detected\n"); + return 1; + break; + case UNIT_ATTENTION_CLEARED: + dev_warn(&h->pdev->dev, + "unit attention cleared by another initiator\n"); + return 1; + break; + default: + dev_warn(&h->pdev->dev, "unknown unit attention detected\n"); + return 1; + } +} + +/* + * We cannot read the structure directly, for portability we must use + * the io functions. + * This is for debug only. + */ +static void print_cfg_table(ctlr_info_t *h) +{ + int i; + char temp_name[17]; + CfgTable_struct *tb = h->cfgtable; + + dev_dbg(&h->pdev->dev, "Controller Configuration information\n"); + dev_dbg(&h->pdev->dev, "------------------------------------\n"); + for (i = 0; i < 4; i++) + temp_name[i] = readb(&(tb->Signature[i])); + temp_name[4] = '\0'; + dev_dbg(&h->pdev->dev, " Signature = %s\n", temp_name); + dev_dbg(&h->pdev->dev, " Spec Number = %d\n", + readl(&(tb->SpecValence))); + dev_dbg(&h->pdev->dev, " Transport methods supported = 0x%x\n", + readl(&(tb->TransportSupport))); + dev_dbg(&h->pdev->dev, " Transport methods active = 0x%x\n", + readl(&(tb->TransportActive))); + dev_dbg(&h->pdev->dev, " Requested transport Method = 0x%x\n", + readl(&(tb->HostWrite.TransportRequest))); + dev_dbg(&h->pdev->dev, " Coalesce Interrupt Delay = 0x%x\n", + readl(&(tb->HostWrite.CoalIntDelay))); + dev_dbg(&h->pdev->dev, " Coalesce Interrupt Count = 0x%x\n", + readl(&(tb->HostWrite.CoalIntCount))); + dev_dbg(&h->pdev->dev, " Max outstanding commands = 0x%d\n", + readl(&(tb->CmdsOutMax))); + dev_dbg(&h->pdev->dev, " Bus Types = 0x%x\n", + readl(&(tb->BusTypes))); + for (i = 0; i < 16; i++) + temp_name[i] = readb(&(tb->ServerName[i])); + temp_name[16] = '\0'; + dev_dbg(&h->pdev->dev, " Server Name = %s\n", temp_name); + dev_dbg(&h->pdev->dev, " Heartbeat Counter = 0x%x\n\n\n", + readl(&(tb->HeartBeat))); +} + +static int find_PCI_BAR_index(struct pci_dev *pdev, unsigned long pci_bar_addr) +{ + int i, offset, mem_type, bar_type; + if (pci_bar_addr == PCI_BASE_ADDRESS_0) /* looking for BAR zero? */ + return 0; + offset = 0; + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + bar_type = pci_resource_flags(pdev, i) & PCI_BASE_ADDRESS_SPACE; + if (bar_type == PCI_BASE_ADDRESS_SPACE_IO) + offset += 4; + else { + mem_type = pci_resource_flags(pdev, i) & + PCI_BASE_ADDRESS_MEM_TYPE_MASK; + switch (mem_type) { + case PCI_BASE_ADDRESS_MEM_TYPE_32: + case PCI_BASE_ADDRESS_MEM_TYPE_1M: + offset += 4; /* 32 bit */ + break; + case PCI_BASE_ADDRESS_MEM_TYPE_64: + offset += 8; + break; + default: /* reserved in PCI 2.2 */ + dev_warn(&pdev->dev, + "Base address is invalid\n"); + return -1; + break; + } + } + if (offset == pci_bar_addr - PCI_BASE_ADDRESS_0) + return i + 1; + } + return -1; +} + +/* Fill in bucket_map[], given nsgs (the max number of + * scatter gather elements supported) and bucket[], + * which is an array of 8 integers. The bucket[] array + * contains 8 different DMA transfer sizes (in 16 + * byte increments) which the controller uses to fetch + * commands. This function fills in bucket_map[], which + * maps a given number of scatter gather elements to one of + * the 8 DMA transfer sizes. The point of it is to allow the + * controller to only do as much DMA as needed to fetch the + * command, with the DMA transfer size encoded in the lower + * bits of the command address. + */ +static void calc_bucket_map(int bucket[], int num_buckets, + int nsgs, int *bucket_map) +{ + int i, j, b, size; + + /* even a command with 0 SGs requires 4 blocks */ +#define MINIMUM_TRANSFER_BLOCKS 4 +#define NUM_BUCKETS 8 + /* Note, bucket_map must have nsgs+1 entries. */ + for (i = 0; i <= nsgs; i++) { + /* Compute size of a command with i SG entries */ + size = i + MINIMUM_TRANSFER_BLOCKS; + b = num_buckets; /* Assume the biggest bucket */ + /* Find the bucket that is just big enough */ + for (j = 0; j < 8; j++) { + if (bucket[j] >= size) { + b = j; + break; + } + } + /* for a command with i SG entries, use bucket b. */ + bucket_map[i] = b; + } +} + +static void cciss_wait_for_mode_change_ack(ctlr_info_t *h) +{ + int i; + + /* under certain very rare conditions, this can take awhile. + * (e.g.: hot replace a failed 144GB drive in a RAID 5 set right + * as we enter this code.) */ + for (i = 0; i < MAX_CONFIG_WAIT; i++) { + if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq)) + break; + usleep_range(10000, 20000); + } +} + +static void cciss_enter_performant_mode(ctlr_info_t *h, u32 use_short_tags) +{ + /* This is a bit complicated. There are 8 registers on + * the controller which we write to to tell it 8 different + * sizes of commands which there may be. It's a way of + * reducing the DMA done to fetch each command. Encoded into + * each command's tag are 3 bits which communicate to the controller + * which of the eight sizes that command fits within. The size of + * each command depends on how many scatter gather entries there are. + * Each SG entry requires 16 bytes. The eight registers are programmed + * with the number of 16-byte blocks a command of that size requires. + * The smallest command possible requires 5 such 16 byte blocks. + * the largest command possible requires MAXSGENTRIES + 4 16-byte + * blocks. Note, this only extends to the SG entries contained + * within the command block, and does not extend to chained blocks + * of SG elements. bft[] contains the eight values we write to + * the registers. They are not evenly distributed, but have more + * sizes for small commands, and fewer sizes for larger commands. + */ + __u32 trans_offset; + int bft[8] = { 5, 6, 8, 10, 12, 20, 28, MAXSGENTRIES + 4}; + /* + * 5 = 1 s/g entry or 4k + * 6 = 2 s/g entry or 8k + * 8 = 4 s/g entry or 16k + * 10 = 6 s/g entry or 24k + */ + unsigned long register_value; + BUILD_BUG_ON(28 > MAXSGENTRIES + 4); + + h->reply_pool_wraparound = 1; /* spec: init to 1 */ + + /* Controller spec: zero out this buffer. */ + memset(h->reply_pool, 0, h->max_commands * sizeof(__u64)); + h->reply_pool_head = h->reply_pool; + + trans_offset = readl(&(h->cfgtable->TransMethodOffset)); + calc_bucket_map(bft, ARRAY_SIZE(bft), h->maxsgentries, + h->blockFetchTable); + writel(bft[0], &h->transtable->BlockFetch0); + writel(bft[1], &h->transtable->BlockFetch1); + writel(bft[2], &h->transtable->BlockFetch2); + writel(bft[3], &h->transtable->BlockFetch3); + writel(bft[4], &h->transtable->BlockFetch4); + writel(bft[5], &h->transtable->BlockFetch5); + writel(bft[6], &h->transtable->BlockFetch6); + writel(bft[7], &h->transtable->BlockFetch7); + + /* size of controller ring buffer */ + writel(h->max_commands, &h->transtable->RepQSize); + writel(1, &h->transtable->RepQCount); + writel(0, &h->transtable->RepQCtrAddrLow32); + writel(0, &h->transtable->RepQCtrAddrHigh32); + writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32); + writel(0, &h->transtable->RepQAddr0High32); + writel(CFGTBL_Trans_Performant | use_short_tags, + &(h->cfgtable->HostWrite.TransportRequest)); + + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + cciss_wait_for_mode_change_ack(h); + register_value = readl(&(h->cfgtable->TransportActive)); + if (!(register_value & CFGTBL_Trans_Performant)) + dev_warn(&h->pdev->dev, "cciss: unable to get board into" + " performant mode\n"); +} + +static void cciss_put_controller_into_performant_mode(ctlr_info_t *h) +{ + __u32 trans_support; + + if (cciss_simple_mode) + return; + + dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n"); + /* Attempt to put controller into performant mode if supported */ + /* Does board support performant mode? */ + trans_support = readl(&(h->cfgtable->TransportSupport)); + if (!(trans_support & PERFORMANT_MODE)) + return; + + dev_dbg(&h->pdev->dev, "Placing controller into performant mode\n"); + /* Performant mode demands commands on a 32 byte boundary + * pci_alloc_consistent aligns on page boundarys already. + * Just need to check if divisible by 32 + */ + if ((sizeof(CommandList_struct) % 32) != 0) { + dev_warn(&h->pdev->dev, "%s %d %s\n", + "cciss info: command size[", + (int)sizeof(CommandList_struct), + "] not divisible by 32, no performant mode..\n"); + return; + } + + /* Performant mode ring buffer and supporting data structures */ + h->reply_pool = (__u64 *)pci_alloc_consistent( + h->pdev, h->max_commands * sizeof(__u64), + &(h->reply_pool_dhandle)); + + /* Need a block fetch table for performant mode */ + h->blockFetchTable = kmalloc(((h->maxsgentries+1) * + sizeof(__u32)), GFP_KERNEL); + + if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL)) + goto clean_up; + + cciss_enter_performant_mode(h, + trans_support & CFGTBL_Trans_use_short_tags); + + /* Change the access methods to the performant access methods */ + h->access = SA5_performant_access; + h->transMethod = CFGTBL_Trans_Performant; + + return; +clean_up: + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, + h->max_commands * sizeof(__u64), + h->reply_pool, + h->reply_pool_dhandle); + return; + +} /* cciss_put_controller_into_performant_mode */ + +/* If MSI/MSI-X is supported by the kernel we will try to enable it on + * controllers that are capable. If not, we use IO-APIC mode. + */ + +static void cciss_interrupt_mode(ctlr_info_t *h) +{ +#ifdef CONFIG_PCI_MSI + int err; + struct msix_entry cciss_msix_entries[4] = { {0, 0}, {0, 1}, + {0, 2}, {0, 3} + }; + + /* Some boards advertise MSI but don't really support it */ + if ((h->board_id == 0x40700E11) || (h->board_id == 0x40800E11) || + (h->board_id == 0x40820E11) || (h->board_id == 0x40830E11)) + goto default_int_mode; + + if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) { + err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4); + if (!err) { + h->intr[0] = cciss_msix_entries[0].vector; + h->intr[1] = cciss_msix_entries[1].vector; + h->intr[2] = cciss_msix_entries[2].vector; + h->intr[3] = cciss_msix_entries[3].vector; + h->msix_vector = 1; + return; + } else { + dev_warn(&h->pdev->dev, + "MSI-X init failed %d\n", err); + } + } + if (pci_find_capability(h->pdev, PCI_CAP_ID_MSI)) { + if (!pci_enable_msi(h->pdev)) + h->msi_vector = 1; + else + dev_warn(&h->pdev->dev, "MSI init failed\n"); + } +default_int_mode: +#endif /* CONFIG_PCI_MSI */ + /* if we get here we're going to use the default interrupt mode */ + h->intr[h->intr_mode] = h->pdev->irq; + return; +} + +static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id) +{ + int i; + u32 subsystem_vendor_id, subsystem_device_id; + + subsystem_vendor_id = pdev->subsystem_vendor; + subsystem_device_id = pdev->subsystem_device; + *board_id = ((subsystem_device_id << 16) & 0xffff0000) | + subsystem_vendor_id; + + for (i = 0; i < ARRAY_SIZE(products); i++) { + /* Stand aside for hpsa driver on request */ + if (cciss_allow_hpsa) + return -ENODEV; + if (*board_id == products[i].board_id) + return i; + } + dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n", + *board_id); + return -ENODEV; +} + +static inline bool cciss_board_disabled(ctlr_info_t *h) +{ + u16 command; + + (void) pci_read_config_word(h->pdev, PCI_COMMAND, &command); + return ((command & PCI_COMMAND_MEMORY) == 0); +} + +static int cciss_pci_find_memory_BAR(struct pci_dev *pdev, + unsigned long *memory_bar) +{ + int i; + + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) + if (pci_resource_flags(pdev, i) & IORESOURCE_MEM) { + /* addressing mode bits already removed */ + *memory_bar = pci_resource_start(pdev, i); + dev_dbg(&pdev->dev, "memory BAR = %lx\n", + *memory_bar); + return 0; + } + dev_warn(&pdev->dev, "no memory BAR found\n"); + return -ENODEV; +} + +static int cciss_wait_for_board_state(struct pci_dev *pdev, + void __iomem *vaddr, int wait_for_ready) +#define BOARD_READY 1 +#define BOARD_NOT_READY 0 +{ + int i, iterations; + u32 scratchpad; + + if (wait_for_ready) + iterations = CCISS_BOARD_READY_ITERATIONS; + else + iterations = CCISS_BOARD_NOT_READY_ITERATIONS; + + for (i = 0; i < iterations; i++) { + scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET); + if (wait_for_ready) { + if (scratchpad == CCISS_FIRMWARE_READY) + return 0; + } else { + if (scratchpad != CCISS_FIRMWARE_READY) + return 0; + } + msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS); + } + dev_warn(&pdev->dev, "board not ready, timed out.\n"); + return -ENODEV; +} + +static int cciss_find_cfg_addrs(struct pci_dev *pdev, void __iomem *vaddr, + u32 *cfg_base_addr, u64 *cfg_base_addr_index, + u64 *cfg_offset) +{ + *cfg_base_addr = readl(vaddr + SA5_CTCFG_OFFSET); + *cfg_offset = readl(vaddr + SA5_CTMEM_OFFSET); + *cfg_base_addr &= (u32) 0x0000ffff; + *cfg_base_addr_index = find_PCI_BAR_index(pdev, *cfg_base_addr); + if (*cfg_base_addr_index == -1) { + dev_warn(&pdev->dev, "cannot find cfg_base_addr_index, " + "*cfg_base_addr = 0x%08x\n", *cfg_base_addr); + return -ENODEV; + } + return 0; +} + +static int cciss_find_cfgtables(ctlr_info_t *h) +{ + u64 cfg_offset; + u32 cfg_base_addr; + u64 cfg_base_addr_index; + u32 trans_offset; + int rc; + + rc = cciss_find_cfg_addrs(h->pdev, h->vaddr, &cfg_base_addr, + &cfg_base_addr_index, &cfg_offset); + if (rc) + return rc; + h->cfgtable = remap_pci_mem(pci_resource_start(h->pdev, + cfg_base_addr_index) + cfg_offset, sizeof(*h->cfgtable)); + if (!h->cfgtable) + return -ENOMEM; + rc = write_driver_ver_to_cfgtable(h->cfgtable); + if (rc) + return rc; + /* Find performant mode table. */ + trans_offset = readl(&h->cfgtable->TransMethodOffset); + h->transtable = remap_pci_mem(pci_resource_start(h->pdev, + cfg_base_addr_index)+cfg_offset+trans_offset, + sizeof(*h->transtable)); + if (!h->transtable) + return -ENOMEM; + return 0; +} + +static void cciss_get_max_perf_mode_cmds(struct ctlr_info *h) +{ + h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands)); + + /* Limit commands in memory limited kdump scenario. */ + if (reset_devices && h->max_commands > 32) + h->max_commands = 32; + + if (h->max_commands < 16) { + dev_warn(&h->pdev->dev, "Controller reports " + "max supported commands of %d, an obvious lie. " + "Using 16. Ensure that firmware is up to date.\n", + h->max_commands); + h->max_commands = 16; + } +} + +/* Interrogate the hardware for some limits: + * max commands, max SG elements without chaining, and with chaining, + * SG chain block size, etc. + */ +static void cciss_find_board_params(ctlr_info_t *h) +{ + cciss_get_max_perf_mode_cmds(h); + h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; + h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); + /* + * The P600 may exhibit poor performnace under some workloads + * if we use the value in the configuration table. Limit this + * controller to MAXSGENTRIES (32) instead. + */ + if (h->board_id == 0x3225103C) + h->maxsgentries = MAXSGENTRIES; + /* + * Limit in-command s/g elements to 32 save dma'able memory. + * Howvever spec says if 0, use 31 + */ + h->max_cmd_sgentries = 31; + if (h->maxsgentries > 512) { + h->max_cmd_sgentries = 32; + h->chainsize = h->maxsgentries - h->max_cmd_sgentries + 1; + h->maxsgentries--; /* save one for chain pointer */ + } else { + h->maxsgentries = 31; /* default to traditional values */ + h->chainsize = 0; + } +} + +static inline bool CISS_signature_present(ctlr_info_t *h) +{ + if (!check_signature(h->cfgtable->Signature, "CISS", 4)) { + dev_warn(&h->pdev->dev, "not a valid CISS config table\n"); + return false; + } + return true; +} + +/* Need to enable prefetch in the SCSI core for 6400 in x86 */ +static inline void cciss_enable_scsi_prefetch(ctlr_info_t *h) +{ +#ifdef CONFIG_X86 + u32 prefetch; + + prefetch = readl(&(h->cfgtable->SCSI_Prefetch)); + prefetch |= 0x100; + writel(prefetch, &(h->cfgtable->SCSI_Prefetch)); +#endif +} + +/* Disable DMA prefetch for the P600. Otherwise an ASIC bug may result + * in a prefetch beyond physical memory. + */ +static inline void cciss_p600_dma_prefetch_quirk(ctlr_info_t *h) +{ + u32 dma_prefetch; + __u32 dma_refetch; + + if (h->board_id != 0x3225103C) + return; + dma_prefetch = readl(h->vaddr + I2O_DMA1_CFG); + dma_prefetch |= 0x8000; + writel(dma_prefetch, h->vaddr + I2O_DMA1_CFG); + pci_read_config_dword(h->pdev, PCI_COMMAND_PARITY, &dma_refetch); + dma_refetch |= 0x1; + pci_write_config_dword(h->pdev, PCI_COMMAND_PARITY, dma_refetch); +} + +static int cciss_pci_init(ctlr_info_t *h) +{ + int prod_index, err; + + prod_index = cciss_lookup_board_id(h->pdev, &h->board_id); + if (prod_index < 0) + return -ENODEV; + h->product_name = products[prod_index].product_name; + h->access = *(products[prod_index].access); + + if (cciss_board_disabled(h)) { + dev_warn(&h->pdev->dev, "controller appears to be disabled\n"); + return -ENODEV; + } + + pci_disable_link_state(h->pdev, PCIE_LINK_STATE_L0S | + PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM); + + err = pci_enable_device(h->pdev); + if (err) { + dev_warn(&h->pdev->dev, "Unable to Enable PCI device\n"); + return err; + } + + err = pci_request_regions(h->pdev, "cciss"); + if (err) { + dev_warn(&h->pdev->dev, + "Cannot obtain PCI resources, aborting\n"); + return err; + } + + dev_dbg(&h->pdev->dev, "irq = %x\n", h->pdev->irq); + dev_dbg(&h->pdev->dev, "board_id = %x\n", h->board_id); + +/* If the kernel supports MSI/MSI-X we will try to enable that functionality, + * else we use the IO-APIC interrupt assigned to us by system ROM. + */ + cciss_interrupt_mode(h); + err = cciss_pci_find_memory_BAR(h->pdev, &h->paddr); + if (err) + goto err_out_free_res; + h->vaddr = remap_pci_mem(h->paddr, 0x250); + if (!h->vaddr) { + err = -ENOMEM; + goto err_out_free_res; + } + err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY); + if (err) + goto err_out_free_res; + err = cciss_find_cfgtables(h); + if (err) + goto err_out_free_res; + print_cfg_table(h); + cciss_find_board_params(h); + + if (!CISS_signature_present(h)) { + err = -ENODEV; + goto err_out_free_res; + } + cciss_enable_scsi_prefetch(h); + cciss_p600_dma_prefetch_quirk(h); + err = cciss_enter_simple_mode(h); + if (err) + goto err_out_free_res; + cciss_put_controller_into_performant_mode(h); + return 0; + +err_out_free_res: + /* + * Deliberately omit pci_disable_device(): it does something nasty to + * Smart Array controllers that pci_enable_device does not undo + */ + if (h->transtable) + iounmap(h->transtable); + if (h->cfgtable) + iounmap(h->cfgtable); + if (h->vaddr) + iounmap(h->vaddr); + pci_release_regions(h->pdev); + return err; +} + +/* Function to find the first free pointer into our hba[] array + * Returns -1 if no free entries are left. + */ +static int alloc_cciss_hba(struct pci_dev *pdev) +{ + int i; + + for (i = 0; i < MAX_CTLR; i++) { + if (!hba[i]) { + ctlr_info_t *h; + + h = kzalloc(sizeof(ctlr_info_t), GFP_KERNEL); + if (!h) + goto Enomem; + hba[i] = h; + return i; + } + } + dev_warn(&pdev->dev, "This driver supports a maximum" + " of %d controllers.\n", MAX_CTLR); + return -1; +Enomem: + dev_warn(&pdev->dev, "out of memory.\n"); + return -1; +} + +static void free_hba(ctlr_info_t *h) +{ + int i; + + hba[h->ctlr] = NULL; + for (i = 0; i < h->highest_lun + 1; i++) + if (h->gendisk[i] != NULL) + put_disk(h->gendisk[i]); + kfree(h); +} + +/* Send a message CDB to the firmware. */ +static int cciss_message(struct pci_dev *pdev, unsigned char opcode, + unsigned char type) +{ + typedef struct { + CommandListHeader_struct CommandHeader; + RequestBlock_struct Request; + ErrDescriptor_struct ErrorDescriptor; + } Command; + static const size_t cmd_sz = sizeof(Command) + sizeof(ErrorInfo_struct); + Command *cmd; + dma_addr_t paddr64; + uint32_t paddr32, tag; + void __iomem *vaddr; + int i, err; + + vaddr = ioremap_nocache(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0)); + if (vaddr == NULL) + return -ENOMEM; + + /* The Inbound Post Queue only accepts 32-bit physical addresses for the + CCISS commands, so they must be allocated from the lower 4GiB of + memory. */ + err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (err) { + iounmap(vaddr); + return -ENOMEM; + } + + cmd = pci_alloc_consistent(pdev, cmd_sz, &paddr64); + if (cmd == NULL) { + iounmap(vaddr); + return -ENOMEM; + } + + /* This must fit, because of the 32-bit consistent DMA mask. Also, + although there's no guarantee, we assume that the address is at + least 4-byte aligned (most likely, it's page-aligned). */ + paddr32 = paddr64; + + cmd->CommandHeader.ReplyQueue = 0; + cmd->CommandHeader.SGList = 0; + cmd->CommandHeader.SGTotal = 0; + cmd->CommandHeader.Tag.lower = paddr32; + cmd->CommandHeader.Tag.upper = 0; + memset(&cmd->CommandHeader.LUN.LunAddrBytes, 0, 8); + + cmd->Request.CDBLen = 16; + cmd->Request.Type.Type = TYPE_MSG; + cmd->Request.Type.Attribute = ATTR_HEADOFQUEUE; + cmd->Request.Type.Direction = XFER_NONE; + cmd->Request.Timeout = 0; /* Don't time out */ + cmd->Request.CDB[0] = opcode; + cmd->Request.CDB[1] = type; + memset(&cmd->Request.CDB[2], 0, 14); /* the rest of the CDB is reserved */ + + cmd->ErrorDescriptor.Addr.lower = paddr32 + sizeof(Command); + cmd->ErrorDescriptor.Addr.upper = 0; + cmd->ErrorDescriptor.Len = sizeof(ErrorInfo_struct); + + writel(paddr32, vaddr + SA5_REQUEST_PORT_OFFSET); + + for (i = 0; i < 10; i++) { + tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); + if ((tag & ~3) == paddr32) + break; + msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS); + } + + iounmap(vaddr); + + /* we leak the DMA buffer here ... no choice since the controller could + still complete the command. */ + if (i == 10) { + dev_err(&pdev->dev, + "controller message %02x:%02x timed out\n", + opcode, type); + return -ETIMEDOUT; + } + + pci_free_consistent(pdev, cmd_sz, cmd, paddr64); + + if (tag & 2) { + dev_err(&pdev->dev, "controller message %02x:%02x failed\n", + opcode, type); + return -EIO; + } + + dev_info(&pdev->dev, "controller message %02x:%02x succeeded\n", + opcode, type); + return 0; +} + +#define cciss_noop(p) cciss_message(p, 3, 0) + +static int cciss_controller_hard_reset(struct pci_dev *pdev, + void * __iomem vaddr, u32 use_doorbell) +{ + u16 pmcsr; + int pos; + + if (use_doorbell) { + /* For everything after the P600, the PCI power state method + * of resetting the controller doesn't work, so we have this + * other way using the doorbell register. + */ + dev_info(&pdev->dev, "using doorbell to reset controller\n"); + writel(use_doorbell, vaddr + SA5_DOORBELL); + } else { /* Try to do it the PCI power state way */ + + /* Quoting from the Open CISS Specification: "The Power + * Management Control/Status Register (CSR) controls the power + * state of the device. The normal operating state is D0, + * CSR=00h. The software off state is D3, CSR=03h. To reset + * the controller, place the interface device in D3 then to D0, + * this causes a secondary PCI reset which will reset the + * controller." */ + + pos = pci_find_capability(pdev, PCI_CAP_ID_PM); + if (pos == 0) { + dev_err(&pdev->dev, + "cciss_controller_hard_reset: " + "PCI PM not supported\n"); + return -ENODEV; + } + dev_info(&pdev->dev, "using PCI PM to reset controller\n"); + /* enter the D3hot power management state */ + pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr); + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + pmcsr |= PCI_D3hot; + pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); + + msleep(500); + + /* enter the D0 power management state */ + pmcsr &= ~PCI_PM_CTRL_STATE_MASK; + pmcsr |= PCI_D0; + pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); + + /* + * The P600 requires a small delay when changing states. + * Otherwise we may think the board did not reset and we bail. + * This for kdump only and is particular to the P600. + */ + msleep(500); + } + return 0; +} + +static void init_driver_version(char *driver_version, int len) +{ + memset(driver_version, 0, len); + strncpy(driver_version, "cciss " DRIVER_NAME, len - 1); +} + +static int write_driver_ver_to_cfgtable(CfgTable_struct __iomem *cfgtable) +{ + char *driver_version; + int i, size = sizeof(cfgtable->driver_version); + + driver_version = kmalloc(size, GFP_KERNEL); + if (!driver_version) + return -ENOMEM; + + init_driver_version(driver_version, size); + for (i = 0; i < size; i++) + writeb(driver_version[i], &cfgtable->driver_version[i]); + kfree(driver_version); + return 0; +} + +static void read_driver_ver_from_cfgtable(CfgTable_struct __iomem *cfgtable, + unsigned char *driver_ver) +{ + int i; + + for (i = 0; i < sizeof(cfgtable->driver_version); i++) + driver_ver[i] = readb(&cfgtable->driver_version[i]); +} + +static int controller_reset_failed(CfgTable_struct __iomem *cfgtable) +{ + + char *driver_ver, *old_driver_ver; + int rc, size = sizeof(cfgtable->driver_version); + + old_driver_ver = kmalloc(2 * size, GFP_KERNEL); + if (!old_driver_ver) + return -ENOMEM; + driver_ver = old_driver_ver + size; + + /* After a reset, the 32 bytes of "driver version" in the cfgtable + * should have been changed, otherwise we know the reset failed. + */ + init_driver_version(old_driver_ver, size); + read_driver_ver_from_cfgtable(cfgtable, driver_ver); + rc = !memcmp(driver_ver, old_driver_ver, size); + kfree(old_driver_ver); + return rc; +} + +/* This does a hard reset of the controller using PCI power management + * states or using the doorbell register. */ +static int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) +{ + u64 cfg_offset; + u32 cfg_base_addr; + u64 cfg_base_addr_index; + void __iomem *vaddr; + unsigned long paddr; + u32 misc_fw_support; + int rc; + CfgTable_struct __iomem *cfgtable; + u32 use_doorbell; + u32 board_id; + u16 command_register; + + /* For controllers as old a the p600, this is very nearly + * the same thing as + * + * pci_save_state(pci_dev); + * pci_set_power_state(pci_dev, PCI_D3hot); + * pci_set_power_state(pci_dev, PCI_D0); + * pci_restore_state(pci_dev); + * + * For controllers newer than the P600, the pci power state + * method of resetting doesn't work so we have another way + * using the doorbell register. + */ + + /* Exclude 640x boards. These are two pci devices in one slot + * which share a battery backed cache module. One controls the + * cache, the other accesses the cache through the one that controls + * it. If we reset the one controlling the cache, the other will + * likely not be happy. Just forbid resetting this conjoined mess. + */ + cciss_lookup_board_id(pdev, &board_id); + if (!ctlr_is_resettable(board_id)) { + dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " + "due to shared cache module."); + return -ENODEV; + } + + /* if controller is soft- but not hard resettable... */ + if (!ctlr_is_hard_resettable(board_id)) + return -ENOTSUPP; /* try soft reset later. */ + + /* Save the PCI command register */ + pci_read_config_word(pdev, 4, &command_register); + /* Turn the board off. This is so that later pci_restore_state() + * won't turn the board on before the rest of config space is ready. + */ + pci_disable_device(pdev); + pci_save_state(pdev); + + /* find the first memory BAR, so we can find the cfg table */ + rc = cciss_pci_find_memory_BAR(pdev, &paddr); + if (rc) + return rc; + vaddr = remap_pci_mem(paddr, 0x250); + if (!vaddr) + return -ENOMEM; + + /* find cfgtable in order to check if reset via doorbell is supported */ + rc = cciss_find_cfg_addrs(pdev, vaddr, &cfg_base_addr, + &cfg_base_addr_index, &cfg_offset); + if (rc) + goto unmap_vaddr; + cfgtable = remap_pci_mem(pci_resource_start(pdev, + cfg_base_addr_index) + cfg_offset, sizeof(*cfgtable)); + if (!cfgtable) { + rc = -ENOMEM; + goto unmap_vaddr; + } + rc = write_driver_ver_to_cfgtable(cfgtable); + if (rc) + goto unmap_vaddr; + + /* If reset via doorbell register is supported, use that. + * There are two such methods. Favor the newest method. + */ + misc_fw_support = readl(&cfgtable->misc_fw_support); + use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2; + if (use_doorbell) { + use_doorbell = DOORBELL_CTLR_RESET2; + } else { + use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; + if (use_doorbell) { + dev_warn(&pdev->dev, "Controller claims that " + "'Bit 2 doorbell reset' is " + "supported, but not 'bit 5 doorbell reset'. " + "Firmware update is recommended.\n"); + rc = -ENOTSUPP; /* use the soft reset */ + goto unmap_cfgtable; + } + } + + rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); + if (rc) + goto unmap_cfgtable; + pci_restore_state(pdev); + rc = pci_enable_device(pdev); + if (rc) { + dev_warn(&pdev->dev, "failed to enable device.\n"); + goto unmap_cfgtable; + } + pci_write_config_word(pdev, 4, command_register); + + /* Some devices (notably the HP Smart Array 5i Controller) + need a little pause here */ + msleep(CCISS_POST_RESET_PAUSE_MSECS); + + /* Wait for board to become not ready, then ready. */ + dev_info(&pdev->dev, "Waiting for board to reset.\n"); + rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY); + if (rc) { + dev_warn(&pdev->dev, "Failed waiting for board to hard reset." + " Will try soft reset.\n"); + rc = -ENOTSUPP; /* Not expected, but try soft reset later */ + goto unmap_cfgtable; + } + rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY); + if (rc) { + dev_warn(&pdev->dev, + "failed waiting for board to become ready " + "after hard reset\n"); + goto unmap_cfgtable; + } + + rc = controller_reset_failed(vaddr); + if (rc < 0) + goto unmap_cfgtable; + if (rc) { + dev_warn(&pdev->dev, "Unable to successfully hard reset " + "controller. Will try soft reset.\n"); + rc = -ENOTSUPP; /* Not expected, but try soft reset later */ + } else { + dev_info(&pdev->dev, "Board ready after hard reset.\n"); + } + +unmap_cfgtable: + iounmap(cfgtable); + +unmap_vaddr: + iounmap(vaddr); + return rc; +} + +static int cciss_init_reset_devices(struct pci_dev *pdev) +{ + int rc, i; + + if (!reset_devices) + return 0; + + /* Reset the controller with a PCI power-cycle or via doorbell */ + rc = cciss_kdump_hard_reset_controller(pdev); + + /* -ENOTSUPP here means we cannot reset the controller + * but it's already (and still) up and running in + * "performant mode". Or, it might be 640x, which can't reset + * due to concerns about shared bbwc between 6402/6404 pair. + */ + if (rc == -ENOTSUPP) + return rc; /* just try to do the kdump anyhow. */ + if (rc) + return -ENODEV; + + /* Now try to get the controller to respond to a no-op */ + dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n"); + for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { + if (cciss_noop(pdev) == 0) + break; + else + dev_warn(&pdev->dev, "no-op failed%s\n", + (i < CCISS_POST_RESET_NOOP_RETRIES - 1 ? + "; re-trying" : "")); + msleep(CCISS_POST_RESET_NOOP_INTERVAL_MSECS); + } + return 0; +} + +static int cciss_allocate_cmd_pool(ctlr_info_t *h) +{ + h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) * + sizeof(unsigned long), GFP_KERNEL); + h->cmd_pool = pci_alloc_consistent(h->pdev, + h->nr_cmds * sizeof(CommandList_struct), + &(h->cmd_pool_dhandle)); + h->errinfo_pool = pci_alloc_consistent(h->pdev, + h->nr_cmds * sizeof(ErrorInfo_struct), + &(h->errinfo_pool_dhandle)); + if ((h->cmd_pool_bits == NULL) + || (h->cmd_pool == NULL) + || (h->errinfo_pool == NULL)) { + dev_err(&h->pdev->dev, "out of memory"); + return -ENOMEM; + } + return 0; +} + +static int cciss_allocate_scatterlists(ctlr_info_t *h) +{ + int i; + + /* zero it, so that on free we need not know how many were alloc'ed */ + h->scatter_list = kzalloc(h->max_commands * + sizeof(struct scatterlist *), GFP_KERNEL); + if (!h->scatter_list) + return -ENOMEM; + + for (i = 0; i < h->nr_cmds; i++) { + h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) * + h->maxsgentries, GFP_KERNEL); + if (h->scatter_list[i] == NULL) { + dev_err(&h->pdev->dev, "could not allocate " + "s/g lists\n"); + return -ENOMEM; + } + } + return 0; +} + +static void cciss_free_scatterlists(ctlr_info_t *h) +{ + int i; + + if (h->scatter_list) { + for (i = 0; i < h->nr_cmds; i++) + kfree(h->scatter_list[i]); + kfree(h->scatter_list); + } +} + +static void cciss_free_cmd_pool(ctlr_info_t *h) +{ + kfree(h->cmd_pool_bits); + if (h->cmd_pool) + pci_free_consistent(h->pdev, + h->nr_cmds * sizeof(CommandList_struct), + h->cmd_pool, h->cmd_pool_dhandle); + if (h->errinfo_pool) + pci_free_consistent(h->pdev, + h->nr_cmds * sizeof(ErrorInfo_struct), + h->errinfo_pool, h->errinfo_pool_dhandle); +} + +static int cciss_request_irq(ctlr_info_t *h, + irqreturn_t (*msixhandler)(int, void *), + irqreturn_t (*intxhandler)(int, void *)) +{ + if (h->msix_vector || h->msi_vector) { + if (!request_irq(h->intr[h->intr_mode], msixhandler, + 0, h->devname, h)) + return 0; + dev_err(&h->pdev->dev, "Unable to get msi irq %d" + " for %s\n", h->intr[h->intr_mode], + h->devname); + return -1; + } + + if (!request_irq(h->intr[h->intr_mode], intxhandler, + IRQF_SHARED, h->devname, h)) + return 0; + dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", + h->intr[h->intr_mode], h->devname); + return -1; +} + +static int cciss_kdump_soft_reset(ctlr_info_t *h) +{ + if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) { + dev_warn(&h->pdev->dev, "Resetting array controller failed.\n"); + return -EIO; + } + + dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n"); + if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) { + dev_warn(&h->pdev->dev, "Soft reset had no effect.\n"); + return -1; + } + + dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n"); + if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) { + dev_warn(&h->pdev->dev, "Board failed to become ready " + "after soft reset.\n"); + return -1; + } + + return 0; +} + +static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) +{ + int ctlr = h->ctlr; + + free_irq(h->intr[h->intr_mode], h); +#ifdef CONFIG_PCI_MSI + if (h->msix_vector) + pci_disable_msix(h->pdev); + else if (h->msi_vector) + pci_disable_msi(h->pdev); +#endif /* CONFIG_PCI_MSI */ + cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + cciss_free_scatterlists(h); + cciss_free_cmd_pool(h); + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64), + h->reply_pool, h->reply_pool_dhandle); + if (h->transtable) + iounmap(h->transtable); + if (h->cfgtable) + iounmap(h->cfgtable); + if (h->vaddr) + iounmap(h->vaddr); + unregister_blkdev(h->major, h->devname); + cciss_destroy_hba_sysfs_entry(h); + pci_release_regions(h->pdev); + kfree(h); + hba[ctlr] = NULL; +} + +/* + * This is it. Find all the controllers and register them. I really hate + * stealing all these major device numbers. + * returns the number of block devices registered. + */ +static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int i; + int j = 0; + int rc; + int try_soft_reset = 0; + int dac, return_code; + InquiryData_struct *inq_buff; + ctlr_info_t *h; + unsigned long flags; + + /* + * By default the cciss driver is used for all older HP Smart Array + * controllers. There are module paramaters that allow a user to + * override this behavior and instead use the hpsa SCSI driver. If + * this is the case cciss may be loaded first from the kdump initrd + * image and cause a kernel panic. So if reset_devices is true and + * cciss_allow_hpsa is set just bail. + */ + if ((reset_devices) && (cciss_allow_hpsa == 1)) + return -ENODEV; + rc = cciss_init_reset_devices(pdev); + if (rc) { + if (rc != -ENOTSUPP) + return rc; + /* If the reset fails in a particular way (it has no way to do + * a proper hard reset, so returns -ENOTSUPP) we can try to do + * a soft reset once we get the controller configured up to the + * point that it can accept a command. + */ + try_soft_reset = 1; + rc = 0; + } + +reinit_after_soft_reset: + + i = alloc_cciss_hba(pdev); + if (i < 0) + return -ENOMEM; + + h = hba[i]; + h->pdev = pdev; + h->busy_initializing = 1; + h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT; + INIT_LIST_HEAD(&h->cmpQ); + INIT_LIST_HEAD(&h->reqQ); + mutex_init(&h->busy_shutting_down); + + if (cciss_pci_init(h) != 0) + goto clean_no_release_regions; + + sprintf(h->devname, "cciss%d", i); + h->ctlr = i; + + if (cciss_tape_cmds < 2) + cciss_tape_cmds = 2; + if (cciss_tape_cmds > 16) + cciss_tape_cmds = 16; + + init_completion(&h->scan_wait); + + if (cciss_create_hba_sysfs_entry(h)) + goto clean0; + + /* configure PCI DMA stuff */ + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) + dac = 1; + else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32))) + dac = 0; + else { + dev_err(&h->pdev->dev, "no suitable DMA available\n"); + goto clean1; + } + + /* + * register with the major number, or get a dynamic major number + * by passing 0 as argument. This is done for greater than + * 8 controller support. + */ + if (i < MAX_CTLR_ORIG) + h->major = COMPAQ_CISS_MAJOR + i; + rc = register_blkdev(h->major, h->devname); + if (rc == -EBUSY || rc == -EINVAL) { + dev_err(&h->pdev->dev, + "Unable to get major number %d for %s " + "on hba %d\n", h->major, h->devname, i); + goto clean1; + } else { + if (i >= MAX_CTLR_ORIG) + h->major = rc; + } + + /* make sure the board interrupts are off */ + h->access.set_intr_mask(h, CCISS_INTR_OFF); + rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx); + if (rc) + goto clean2; + + dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", + h->devname, pdev->device, pci_name(pdev), + h->intr[h->intr_mode], dac ? "" : " not"); + + if (cciss_allocate_cmd_pool(h)) + goto clean4; + + if (cciss_allocate_scatterlists(h)) + goto clean4; + + h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, + h->chainsize, h->nr_cmds); + if (!h->cmd_sg_list && h->chainsize > 0) + goto clean4; + + spin_lock_init(&h->lock); + + /* Initialize the pdev driver private data. + have it point to h. */ + pci_set_drvdata(pdev, h); + /* command and error info recs zeroed out before + they are used */ + bitmap_zero(h->cmd_pool_bits, h->nr_cmds); + + h->num_luns = 0; + h->highest_lun = -1; + for (j = 0; j < CISS_MAX_LUN; j++) { + h->drv[j] = NULL; + h->gendisk[j] = NULL; + } + + /* At this point, the controller is ready to take commands. + * Now, if reset_devices and the hard reset didn't work, try + * the soft reset and see if that works. + */ + if (try_soft_reset) { + + /* This is kind of gross. We may or may not get a completion + * from the soft reset command, and if we do, then the value + * from the fifo may or may not be valid. So, we wait 10 secs + * after the reset throwing away any completions we get during + * that time. Unregister the interrupt handler and register + * fake ones to scoop up any residual completions. + */ + spin_lock_irqsave(&h->lock, flags); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + spin_unlock_irqrestore(&h->lock, flags); + free_irq(h->intr[h->intr_mode], h); + rc = cciss_request_irq(h, cciss_msix_discard_completions, + cciss_intx_discard_completions); + if (rc) { + dev_warn(&h->pdev->dev, "Failed to request_irq after " + "soft reset.\n"); + goto clean4; + } + + rc = cciss_kdump_soft_reset(h); + if (rc) { + dev_warn(&h->pdev->dev, "Soft reset failed.\n"); + goto clean4; + } + + dev_info(&h->pdev->dev, "Board READY.\n"); + dev_info(&h->pdev->dev, + "Waiting for stale completions to drain.\n"); + h->access.set_intr_mask(h, CCISS_INTR_ON); + msleep(10000); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + + rc = controller_reset_failed(h->cfgtable); + if (rc) + dev_info(&h->pdev->dev, + "Soft reset appears to have failed.\n"); + + /* since the controller's reset, we have to go back and re-init + * everything. Easiest to just forget what we've done and do it + * all over again. + */ + cciss_undo_allocations_after_kdump_soft_reset(h); + try_soft_reset = 0; + if (rc) + /* don't go to clean4, we already unallocated */ + return -ENODEV; + + goto reinit_after_soft_reset; + } + + cciss_scsi_setup(h); + + /* Turn the interrupts on so we can service requests */ + h->access.set_intr_mask(h, CCISS_INTR_ON); + + /* Get the firmware version */ + inq_buff = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL); + if (inq_buff == NULL) { + dev_err(&h->pdev->dev, "out of memory\n"); + goto clean4; + } + + return_code = sendcmd_withirq(h, CISS_INQUIRY, inq_buff, + sizeof(InquiryData_struct), 0, CTLR_LUNID, TYPE_CMD); + if (return_code == IO_OK) { + h->firm_ver[0] = inq_buff->data_byte[32]; + h->firm_ver[1] = inq_buff->data_byte[33]; + h->firm_ver[2] = inq_buff->data_byte[34]; + h->firm_ver[3] = inq_buff->data_byte[35]; + } else { /* send command failed */ + dev_warn(&h->pdev->dev, "unable to determine firmware" + " version of controller\n"); + } + kfree(inq_buff); + + cciss_procinit(h); + + h->cciss_max_sectors = 8192; + + rebuild_lun_table(h, 1, 0); + cciss_engage_scsi(h); + h->busy_initializing = 0; + return 0; + +clean4: + cciss_free_cmd_pool(h); + cciss_free_scatterlists(h); + cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + free_irq(h->intr[h->intr_mode], h); +clean2: + unregister_blkdev(h->major, h->devname); +clean1: + cciss_destroy_hba_sysfs_entry(h); +clean0: + pci_release_regions(pdev); +clean_no_release_regions: + h->busy_initializing = 0; + + /* + * Deliberately omit pci_disable_device(): it does something nasty to + * Smart Array controllers that pci_enable_device does not undo + */ + pci_set_drvdata(pdev, NULL); + free_hba(h); + return -ENODEV; +} + +static void cciss_shutdown(struct pci_dev *pdev) +{ + ctlr_info_t *h; + char *flush_buf; + int return_code; + + h = pci_get_drvdata(pdev); + flush_buf = kzalloc(4, GFP_KERNEL); + if (!flush_buf) { + dev_warn(&h->pdev->dev, "cache not flushed, out of memory.\n"); + return; + } + /* write all data in the battery backed cache to disk */ + return_code = sendcmd_withirq(h, CCISS_CACHE_FLUSH, flush_buf, + 4, 0, CTLR_LUNID, TYPE_CMD); + kfree(flush_buf); + if (return_code != IO_OK) + dev_warn(&h->pdev->dev, "Error flushing cache\n"); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + free_irq(h->intr[h->intr_mode], h); +} + +static int cciss_enter_simple_mode(struct ctlr_info *h) +{ + u32 trans_support; + + trans_support = readl(&(h->cfgtable->TransportSupport)); + if (!(trans_support & SIMPLE_MODE)) + return -ENOTSUPP; + + h->max_commands = readl(&(h->cfgtable->CmdsOutMax)); + writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest)); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + cciss_wait_for_mode_change_ack(h); + print_cfg_table(h); + if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) { + dev_warn(&h->pdev->dev, "unable to get board into simple mode\n"); + return -ENODEV; + } + h->transMethod = CFGTBL_Trans_Simple; + return 0; +} + + +static void cciss_remove_one(struct pci_dev *pdev) +{ + ctlr_info_t *h; + int i, j; + + if (pci_get_drvdata(pdev) == NULL) { + dev_err(&pdev->dev, "Unable to remove device\n"); + return; + } + + h = pci_get_drvdata(pdev); + i = h->ctlr; + if (hba[i] == NULL) { + dev_err(&pdev->dev, "device appears to already be removed\n"); + return; + } + + mutex_lock(&h->busy_shutting_down); + + remove_from_scan_list(h); + remove_proc_entry(h->devname, proc_cciss); + unregister_blkdev(h->major, h->devname); + + /* remove it from the disk list */ + for (j = 0; j < CISS_MAX_LUN; j++) { + struct gendisk *disk = h->gendisk[j]; + if (disk) { + struct request_queue *q = disk->queue; + + if (disk->flags & GENHD_FL_UP) { + cciss_destroy_ld_sysfs_entry(h, j, 1); + del_gendisk(disk); + } + if (q) + blk_cleanup_queue(q); + } + } + +#ifdef CONFIG_CISS_SCSI_TAPE + cciss_unregister_scsi(h); /* unhook from SCSI subsystem */ +#endif + + cciss_shutdown(pdev); + +#ifdef CONFIG_PCI_MSI + if (h->msix_vector) + pci_disable_msix(h->pdev); + else if (h->msi_vector) + pci_disable_msi(h->pdev); +#endif /* CONFIG_PCI_MSI */ + + iounmap(h->transtable); + iounmap(h->cfgtable); + iounmap(h->vaddr); + + cciss_free_cmd_pool(h); + /* Free up sg elements */ + for (j = 0; j < h->nr_cmds; j++) + kfree(h->scatter_list[j]); + kfree(h->scatter_list); + cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64), + h->reply_pool, h->reply_pool_dhandle); + /* + * Deliberately omit pci_disable_device(): it does something nasty to + * Smart Array controllers that pci_enable_device does not undo + */ + pci_release_regions(pdev); + pci_set_drvdata(pdev, NULL); + cciss_destroy_hba_sysfs_entry(h); + mutex_unlock(&h->busy_shutting_down); + free_hba(h); +} + +static struct pci_driver cciss_pci_driver = { + .name = "cciss", + .probe = cciss_init_one, + .remove = cciss_remove_one, + .id_table = cciss_pci_device_id, /* id_table */ + .shutdown = cciss_shutdown, +}; + +/* + * This is it. Register the PCI driver information for the cards we control + * the OS will call our registered routines when it finds one of our cards. + */ +static int __init cciss_init(void) +{ + int err; + + /* + * The hardware requires that commands are aligned on a 64-bit + * boundary. Given that we use pci_alloc_consistent() to allocate an + * array of them, the size must be a multiple of 8 bytes. + */ + BUILD_BUG_ON(sizeof(CommandList_struct) % COMMANDLIST_ALIGNMENT); + printk(KERN_INFO DRIVER_NAME "\n"); + + err = bus_register(&cciss_bus_type); + if (err) + return err; + + /* Start the scan thread */ + cciss_scan_thread = kthread_run(scan_thread, NULL, "cciss_scan"); + if (IS_ERR(cciss_scan_thread)) { + err = PTR_ERR(cciss_scan_thread); + goto err_bus_unregister; + } + + /* Register for our PCI devices */ + err = pci_register_driver(&cciss_pci_driver); + if (err) + goto err_thread_stop; + + return err; + +err_thread_stop: + kthread_stop(cciss_scan_thread); +err_bus_unregister: + bus_unregister(&cciss_bus_type); + + return err; +} + +static void __exit cciss_cleanup(void) +{ + int i; + + pci_unregister_driver(&cciss_pci_driver); + /* double check that all controller entrys have been removed */ + for (i = 0; i < MAX_CTLR; i++) { + if (hba[i] != NULL) { + dev_warn(&hba[i]->pdev->dev, + "had to remove controller\n"); + cciss_remove_one(hba[i]->pdev); + } + } + kthread_stop(cciss_scan_thread); + if (proc_cciss) + remove_proc_entry("driver/cciss", NULL); + bus_unregister(&cciss_bus_type); +} + +module_init(cciss_init); +module_exit(cciss_cleanup); diff --git a/kernel/drivers/block/cciss.h b/kernel/drivers/block/cciss.h new file mode 100644 index 000000000..7fda30e4a --- /dev/null +++ b/kernel/drivers/block/cciss.h @@ -0,0 +1,435 @@ +#ifndef CCISS_H +#define CCISS_H + +#include +#include + +#include "cciss_cmd.h" + + +#define NWD_SHIFT 4 +#define MAX_PART (1 << NWD_SHIFT) + +#define IO_OK 0 +#define IO_ERROR 1 +#define IO_NEEDS_RETRY 3 + +#define VENDOR_LEN 8 +#define MODEL_LEN 16 +#define REV_LEN 4 + +struct ctlr_info; +typedef struct ctlr_info ctlr_info_t; + +struct access_method { + void (*submit_command)(ctlr_info_t *h, CommandList_struct *c); + void (*set_intr_mask)(ctlr_info_t *h, unsigned long val); + unsigned long (*fifo_full)(ctlr_info_t *h); + bool (*intr_pending)(ctlr_info_t *h); + unsigned long (*command_completed)(ctlr_info_t *h); +}; +typedef struct _drive_info_struct +{ + unsigned char LunID[8]; + int usage_count; + struct request_queue *queue; + sector_t nr_blocks; + int block_size; + int heads; + int sectors; + int cylinders; + int raid_level; /* set to -1 to indicate that + * the drive is not in use/configured + */ + int busy_configuring; /* This is set when a drive is being removed + * to prevent it from being opened or it's + * queue from being started. + */ + struct device dev; + __u8 serial_no[16]; /* from inquiry page 0x83, + * not necc. null terminated. + */ + char vendor[VENDOR_LEN + 1]; /* SCSI vendor string */ + char model[MODEL_LEN + 1]; /* SCSI model string */ + char rev[REV_LEN + 1]; /* SCSI revision string */ + char device_initialized; /* indicates whether dev is initialized */ +} drive_info_struct; + +struct ctlr_info +{ + int ctlr; + char devname[8]; + char *product_name; + char firm_ver[4]; /* Firmware version */ + struct pci_dev *pdev; + __u32 board_id; + void __iomem *vaddr; + unsigned long paddr; + int nr_cmds; /* Number of commands allowed on this controller */ + CfgTable_struct __iomem *cfgtable; + int interrupts_enabled; + int major; + int max_commands; + int commands_outstanding; + int max_outstanding; /* Debug */ + int num_luns; + int highest_lun; + int usage_count; /* number of opens all all minor devices */ + /* Need space for temp sg list + * number of scatter/gathers supported + * number of scatter/gathers in chained block + */ + struct scatterlist **scatter_list; + int maxsgentries; + int chainsize; + int max_cmd_sgentries; + SGDescriptor_struct **cmd_sg_list; + +# define PERF_MODE_INT 0 +# define DOORBELL_INT 1 +# define SIMPLE_MODE_INT 2 +# define MEMQ_MODE_INT 3 + unsigned int intr[4]; + unsigned int msix_vector; + unsigned int msi_vector; + int intr_mode; + int cciss_max_sectors; + BYTE cciss_read; + BYTE cciss_write; + BYTE cciss_read_capacity; + + /* information about each logical volume */ + drive_info_struct *drv[CISS_MAX_LUN]; + + struct access_method access; + + /* queue and queue Info */ + struct list_head reqQ; + struct list_head cmpQ; + unsigned int Qdepth; + unsigned int maxQsinceinit; + unsigned int maxSG; + spinlock_t lock; + + /* pointers to command and error info pool */ + CommandList_struct *cmd_pool; + dma_addr_t cmd_pool_dhandle; + ErrorInfo_struct *errinfo_pool; + dma_addr_t errinfo_pool_dhandle; + unsigned long *cmd_pool_bits; + int nr_allocs; + int nr_frees; + int busy_configuring; + int busy_initializing; + int busy_scanning; + struct mutex busy_shutting_down; + + /* This element holds the zero based queue number of the last + * queue to be started. It is used for fairness. + */ + int next_to_run; + + /* Disk structures we need to pass back */ + struct gendisk *gendisk[CISS_MAX_LUN]; +#ifdef CONFIG_CISS_SCSI_TAPE + struct cciss_scsi_adapter_data_t *scsi_ctlr; +#endif + unsigned char alive; + struct list_head scan_list; + struct completion scan_wait; + struct device dev; + /* + * Performant mode tables. + */ + u32 trans_support; + u32 trans_offset; + struct TransTable_struct *transtable; + unsigned long transMethod; + + /* + * Performant mode completion buffer + */ + u64 *reply_pool; + dma_addr_t reply_pool_dhandle; + u64 *reply_pool_head; + size_t reply_pool_size; + unsigned char reply_pool_wraparound; + u32 *blockFetchTable; +}; + +/* Defining the diffent access_methods + * + * Memory mapped FIFO interface (SMART 53xx cards) + */ +#define SA5_DOORBELL 0x20 +#define SA5_REQUEST_PORT_OFFSET 0x40 +#define SA5_REPLY_INTR_MASK_OFFSET 0x34 +#define SA5_REPLY_PORT_OFFSET 0x44 +#define SA5_INTR_STATUS 0x30 +#define SA5_SCRATCHPAD_OFFSET 0xB0 + +#define SA5_CTCFG_OFFSET 0xB4 +#define SA5_CTMEM_OFFSET 0xB8 + +#define SA5_INTR_OFF 0x08 +#define SA5B_INTR_OFF 0x04 +#define SA5_INTR_PENDING 0x08 +#define SA5B_INTR_PENDING 0x04 +#define FIFO_EMPTY 0xffffffff +#define CCISS_FIRMWARE_READY 0xffff0000 /* value in scratchpad register */ +/* Perf. mode flags */ +#define SA5_PERF_INTR_PENDING 0x04 +#define SA5_PERF_INTR_OFF 0x05 +#define SA5_OUTDB_STATUS_PERF_BIT 0x01 +#define SA5_OUTDB_CLEAR_PERF_BIT 0x01 +#define SA5_OUTDB_CLEAR 0xA0 +#define SA5_OUTDB_CLEAR_PERF_BIT 0x01 +#define SA5_OUTDB_STATUS 0x9C + + +#define CISS_ERROR_BIT 0x02 + +#define CCISS_INTR_ON 1 +#define CCISS_INTR_OFF 0 + + +/* CCISS_BOARD_READY_WAIT_SECS is how long to wait for a board + * to become ready, in seconds, before giving up on it. + * CCISS_BOARD_READY_POLL_INTERVAL_MSECS * is how long to wait + * between polling the board to see if it is ready, in + * milliseconds. CCISS_BOARD_READY_ITERATIONS is derived + * the above. + */ +#define CCISS_BOARD_READY_WAIT_SECS (120) +#define CCISS_BOARD_NOT_READY_WAIT_SECS (100) +#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) +#define CCISS_BOARD_READY_ITERATIONS \ + ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ + CCISS_BOARD_READY_POLL_INTERVAL_MSECS) +#define CCISS_BOARD_NOT_READY_ITERATIONS \ + ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \ + CCISS_BOARD_READY_POLL_INTERVAL_MSECS) +#define CCISS_POST_RESET_PAUSE_MSECS (3000) +#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000) +#define CCISS_POST_RESET_NOOP_RETRIES (12) +#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000) + +/* + Send the command to the hardware +*/ +static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c) +{ +#ifdef CCISS_DEBUG + printk(KERN_WARNING "cciss%d: Sending %08x - down to controller\n", + h->ctlr, c->busaddr); +#endif /* CCISS_DEBUG */ + writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET); + readl(h->vaddr + SA5_SCRATCHPAD_OFFSET); + h->commands_outstanding++; + if ( h->commands_outstanding > h->max_outstanding) + h->max_outstanding = h->commands_outstanding; +} + +/* + * This card is the opposite of the other cards. + * 0 turns interrupts on... + * 0x08 turns them off... + */ +static void SA5_intr_mask(ctlr_info_t *h, unsigned long val) +{ + if (val) + { /* Turn interrupts on */ + h->interrupts_enabled = 1; + writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + } else /* Turn them off */ + { + h->interrupts_enabled = 0; + writel( SA5_INTR_OFF, + h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + } +} +/* + * This card is the opposite of the other cards. + * 0 turns interrupts on... + * 0x04 turns them off... + */ +static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val) +{ + if (val) + { /* Turn interrupts on */ + h->interrupts_enabled = 1; + writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + } else /* Turn them off */ + { + h->interrupts_enabled = 0; + writel( SA5B_INTR_OFF, + h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + } +} + +/* Performant mode intr_mask */ +static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val) +{ + if (val) { /* turn on interrupts */ + h->interrupts_enabled = 1; + writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + } else { + h->interrupts_enabled = 0; + writel(SA5_PERF_INTR_OFF, + h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + } +} + +/* + * Returns true if fifo is full. + * + */ +static unsigned long SA5_fifo_full(ctlr_info_t *h) +{ + if( h->commands_outstanding >= h->max_commands) + return(1); + else + return(0); + +} +/* + * returns value read from hardware. + * returns FIFO_EMPTY if there is nothing to read + */ +static unsigned long SA5_completed(ctlr_info_t *h) +{ + unsigned long register_value + = readl(h->vaddr + SA5_REPLY_PORT_OFFSET); + if(register_value != FIFO_EMPTY) + { + h->commands_outstanding--; +#ifdef CCISS_DEBUG + printk("cciss: Read %lx back from board\n", register_value); +#endif /* CCISS_DEBUG */ + } +#ifdef CCISS_DEBUG + else + { + printk("cciss: FIFO Empty read\n"); + } +#endif + return ( register_value); + +} + +/* Performant mode command completed */ +static unsigned long SA5_performant_completed(ctlr_info_t *h) +{ + unsigned long register_value = FIFO_EMPTY; + + /* flush the controller write of the reply queue by reading + * outbound doorbell status register. + */ + register_value = readl(h->vaddr + SA5_OUTDB_STATUS); + /* msi auto clears the interrupt pending bit. */ + if (!(h->msi_vector || h->msix_vector)) { + writel(SA5_OUTDB_CLEAR_PERF_BIT, h->vaddr + SA5_OUTDB_CLEAR); + /* Do a read in order to flush the write to the controller + * (as per spec.) + */ + register_value = readl(h->vaddr + SA5_OUTDB_STATUS); + } + + if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) { + register_value = *(h->reply_pool_head); + (h->reply_pool_head)++; + h->commands_outstanding--; + } else { + register_value = FIFO_EMPTY; + } + /* Check for wraparound */ + if (h->reply_pool_head == (h->reply_pool + h->max_commands)) { + h->reply_pool_head = h->reply_pool; + h->reply_pool_wraparound ^= 1; + } + + return register_value; +} +/* + * Returns true if an interrupt is pending.. + */ +static bool SA5_intr_pending(ctlr_info_t *h) +{ + unsigned long register_value = + readl(h->vaddr + SA5_INTR_STATUS); +#ifdef CCISS_DEBUG + printk("cciss: intr_pending %lx\n", register_value); +#endif /* CCISS_DEBUG */ + if( register_value & SA5_INTR_PENDING) + return 1; + return 0 ; +} + +/* + * Returns true if an interrupt is pending.. + */ +static bool SA5B_intr_pending(ctlr_info_t *h) +{ + unsigned long register_value = + readl(h->vaddr + SA5_INTR_STATUS); +#ifdef CCISS_DEBUG + printk("cciss: intr_pending %lx\n", register_value); +#endif /* CCISS_DEBUG */ + if( register_value & SA5B_INTR_PENDING) + return 1; + return 0 ; +} + +static bool SA5_performant_intr_pending(ctlr_info_t *h) +{ + unsigned long register_value = readl(h->vaddr + SA5_INTR_STATUS); + + if (!register_value) + return false; + + if (h->msi_vector || h->msix_vector) + return true; + + /* Read outbound doorbell to flush */ + register_value = readl(h->vaddr + SA5_OUTDB_STATUS); + return register_value & SA5_OUTDB_STATUS_PERF_BIT; +} + +static struct access_method SA5_access = { + SA5_submit_command, + SA5_intr_mask, + SA5_fifo_full, + SA5_intr_pending, + SA5_completed, +}; + +static struct access_method SA5B_access = { + SA5_submit_command, + SA5B_intr_mask, + SA5_fifo_full, + SA5B_intr_pending, + SA5_completed, +}; + +static struct access_method SA5_performant_access = { + SA5_submit_command, + SA5_performant_intr_mask, + SA5_fifo_full, + SA5_performant_intr_pending, + SA5_performant_completed, +}; + +struct board_type { + __u32 board_id; + char *product_name; + struct access_method *access; + int nr_cmds; /* Max cmds this kind of ctlr can handle. */ +}; + +#endif /* CCISS_H */ diff --git a/kernel/drivers/block/cciss_cmd.h b/kernel/drivers/block/cciss_cmd.h new file mode 100644 index 000000000..d9be6b4d4 --- /dev/null +++ b/kernel/drivers/block/cciss_cmd.h @@ -0,0 +1,269 @@ +#ifndef CCISS_CMD_H +#define CCISS_CMD_H + +#include + +/* DEFINES */ +#define CISS_VERSION "1.00" + +/* general boundary definitions */ +#define MAXSGENTRIES 32 +#define CCISS_SG_CHAIN 0x80000000 +#define MAXREPLYQS 256 + +/* Unit Attentions ASC's as defined for the MSA2012sa */ +#define POWER_OR_RESET 0x29 +#define STATE_CHANGED 0x2a +#define UNIT_ATTENTION_CLEARED 0x2f +#define LUN_FAILED 0x3e +#define REPORT_LUNS_CHANGED 0x3f + +/* Unit Attentions ASCQ's as defined for the MSA2012sa */ + + /* These ASCQ's defined for ASC = POWER_OR_RESET */ +#define POWER_ON_RESET 0x00 +#define POWER_ON_REBOOT 0x01 +#define SCSI_BUS_RESET 0x02 +#define MSA_TARGET_RESET 0x03 +#define CONTROLLER_FAILOVER 0x04 +#define TRANSCEIVER_SE 0x05 +#define TRANSCEIVER_LVD 0x06 + + /* These ASCQ's defined for ASC = STATE_CHANGED */ +#define RESERVATION_PREEMPTED 0x03 +#define ASYM_ACCESS_CHANGED 0x06 +#define LUN_CAPACITY_CHANGED 0x09 + +/* config space register offsets */ +#define CFG_VENDORID 0x00 +#define CFG_DEVICEID 0x02 +#define CFG_I2OBAR 0x10 +#define CFG_MEM1BAR 0x14 + +/* i2o space register offsets */ +#define I2O_IBDB_SET 0x20 +#define I2O_IBDB_CLEAR 0x70 +#define I2O_INT_STATUS 0x30 +#define I2O_INT_MASK 0x34 +#define I2O_IBPOST_Q 0x40 +#define I2O_OBPOST_Q 0x44 +#define I2O_DMA1_CFG 0x214 + +/* Configuration Table */ +#define CFGTBL_ChangeReq 0x00000001l +#define CFGTBL_AccCmds 0x00000001l +#define DOORBELL_CTLR_RESET 0x00000004l +#define DOORBELL_CTLR_RESET2 0x00000020l + +#define CFGTBL_Trans_Simple 0x00000002l +#define CFGTBL_Trans_Performant 0x00000004l +#define CFGTBL_Trans_use_short_tags 0x20000000l + +#define CFGTBL_BusType_Ultra2 0x00000001l +#define CFGTBL_BusType_Ultra3 0x00000002l +#define CFGTBL_BusType_Fibre1G 0x00000100l +#define CFGTBL_BusType_Fibre2G 0x00000200l +typedef struct _vals32 +{ + __u32 lower; + __u32 upper; +} vals32; + +typedef union _u64bit +{ + vals32 val32; + __u64 val; +} u64bit; + +/* Type defs used in the following structs */ +#define QWORD vals32 + +/* STRUCTURES */ +#define CISS_MAX_PHYS_LUN 1024 +/* SCSI-3 Cmmands */ + +#pragma pack(1) + +#define CISS_INQUIRY 0x12 +/* Date returned */ +typedef struct _InquiryData_struct +{ + BYTE data_byte[36]; +} InquiryData_struct; + +#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */ +#define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */ +/* Data returned */ +typedef struct _ReportLUNdata_struct +{ + BYTE LUNListLength[4]; + DWORD reserved; + BYTE LUN[CISS_MAX_LUN][8]; +} ReportLunData_struct; + +#define CCISS_READ_CAPACITY 0x25 /* Read Capacity */ +typedef struct _ReadCapdata_struct +{ + BYTE total_size[4]; /* Total size in blocks */ + BYTE block_size[4]; /* Size of blocks in bytes */ +} ReadCapdata_struct; + +#define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */ + +/* service action to differentiate a 16 byte read capacity from + other commands that use the 0x9e SCSI op code */ + +#define CCISS_READ_CAPACITY_16_SERVICE_ACT 0x10 + +typedef struct _ReadCapdata_struct_16 +{ + BYTE total_size[8]; /* Total size in blocks */ + BYTE block_size[4]; /* Size of blocks in bytes */ + BYTE prot_en:1; /* protection enable bit */ + BYTE rto_en:1; /* reference tag own enable bit */ + BYTE reserved:6; /* reserved bits */ + BYTE reserved2[18]; /* reserved bytes per spec */ +} ReadCapdata_struct_16; + +/* Define the supported read/write commands for cciss based controllers */ + +#define CCISS_READ_10 0x28 /* Read(10) */ +#define CCISS_WRITE_10 0x2a /* Write(10) */ +#define CCISS_READ_16 0x88 /* Read(16) */ +#define CCISS_WRITE_16 0x8a /* Write(16) */ + +/* Define the CDB lengths supported by cciss based controllers */ + +#define CDB_LEN10 10 +#define CDB_LEN16 16 + +/* BMIC commands */ +#define BMIC_READ 0x26 +#define BMIC_WRITE 0x27 +#define BMIC_CACHE_FLUSH 0xc2 +#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ + +#define CCISS_ABORT_MSG 0x00 +#define CCISS_RESET_MSG 0x01 +#define CCISS_RESET_TYPE_CONTROLLER 0x00 +#define CCISS_RESET_TYPE_BUS 0x01 +#define CCISS_RESET_TYPE_TARGET 0x03 +#define CCISS_RESET_TYPE_LUN 0x04 +#define CCISS_NOOP_MSG 0x03 + +/* Command List Structure */ +#define CTLR_LUNID "\0\0\0\0\0\0\0\0" + +typedef struct _CommandListHeader_struct { + BYTE ReplyQueue; + BYTE SGList; + HWORD SGTotal; + QWORD Tag; + LUNAddr_struct LUN; +} CommandListHeader_struct; +typedef struct _ErrDescriptor_struct { + QWORD Addr; + DWORD Len; +} ErrDescriptor_struct; +typedef struct _SGDescriptor_struct { + QWORD Addr; + DWORD Len; + DWORD Ext; +} SGDescriptor_struct; + +/* Command types */ +#define CMD_RWREQ 0x00 +#define CMD_IOCTL_PEND 0x01 +#define CMD_SCSI 0x03 +#define CMD_MSG_DONE 0x04 +#define CMD_MSG_TIMEOUT 0x05 +#define CMD_MSG_STALE 0xff + +/* This structure needs to be divisible by COMMANDLIST_ALIGNMENT + * because low bits of the address are used to to indicate that + * whether the tag contains an index or an address. PAD_32 and + * PAD_64 can be adjusted independently as needed for 32-bit + * and 64-bits systems. + */ +#define COMMANDLIST_ALIGNMENT (32) +#define IS_64_BIT ((sizeof(long) - 4)/4) +#define IS_32_BIT (!IS_64_BIT) +#define PAD_32 (0) +#define PAD_64 (4) +#define PADSIZE (IS_32_BIT * PAD_32 + IS_64_BIT * PAD_64) +#define DIRECT_LOOKUP_BIT 0x10 +#define DIRECT_LOOKUP_SHIFT 5 + +typedef struct _CommandList_struct { + CommandListHeader_struct Header; + RequestBlock_struct Request; + ErrDescriptor_struct ErrDesc; + SGDescriptor_struct SG[MAXSGENTRIES]; + /* information associated with the command */ + __u32 busaddr; /* physical address of this record */ + ErrorInfo_struct * err_info; /* pointer to the allocated mem */ + int ctlr; + int cmd_type; + long cmdindex; + struct list_head list; + struct request * rq; + struct completion *waiting; + int retry_count; + void * scsi_cmd; + char pad[PADSIZE]; +} CommandList_struct; + +/* Configuration Table Structure */ +typedef struct _HostWrite_struct { + DWORD TransportRequest; + DWORD Reserved; + DWORD CoalIntDelay; + DWORD CoalIntCount; +} HostWrite_struct; + +typedef struct _CfgTable_struct { + BYTE Signature[4]; + DWORD SpecValence; +#define SIMPLE_MODE 0x02 +#define PERFORMANT_MODE 0x04 +#define MEMQ_MODE 0x08 + DWORD TransportSupport; + DWORD TransportActive; + HostWrite_struct HostWrite; + DWORD CmdsOutMax; + DWORD BusTypes; + DWORD TransMethodOffset; + BYTE ServerName[16]; + DWORD HeartBeat; + DWORD SCSI_Prefetch; + DWORD MaxSGElements; + DWORD MaxLogicalUnits; + DWORD MaxPhysicalDrives; + DWORD MaxPhysicalDrivesPerLogicalUnit; + DWORD MaxPerformantModeCommands; + u8 reserved[0x78 - 0x58]; + u32 misc_fw_support; /* offset 0x78 */ +#define MISC_FW_DOORBELL_RESET (0x02) +#define MISC_FW_DOORBELL_RESET2 (0x10) + u8 driver_version[32]; +} CfgTable_struct; + +struct TransTable_struct { + u32 BlockFetch0; + u32 BlockFetch1; + u32 BlockFetch2; + u32 BlockFetch3; + u32 BlockFetch4; + u32 BlockFetch5; + u32 BlockFetch6; + u32 BlockFetch7; + u32 RepQSize; + u32 RepQCount; + u32 RepQCtrAddrLow32; + u32 RepQCtrAddrHigh32; + u32 RepQAddr0Low32; + u32 RepQAddr0High32; +}; + +#pragma pack() +#endif /* CCISS_CMD_H */ diff --git a/kernel/drivers/block/cciss_scsi.c b/kernel/drivers/block/cciss_scsi.c new file mode 100644 index 000000000..ecd845cd2 --- /dev/null +++ b/kernel/drivers/block/cciss_scsi.c @@ -0,0 +1,1712 @@ +/* + * Disk Array driver for HP Smart Array controllers, SCSI Tape module. + * (C) Copyright 2001, 2007 Hewlett-Packard Development Company, L.P. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 300, Boston, MA + * 02111-1307, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + * Author: Stephen M. Cameron + */ +#ifdef CONFIG_CISS_SCSI_TAPE + +/* Here we have code to present the driver as a scsi driver + as it is simultaneously presented as a block driver. The + reason for doing this is to allow access to SCSI tape drives + through the array controller. Note in particular, neither + physical nor logical disks are presented through the scsi layer. */ + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "cciss_scsi.h" + +#define CCISS_ABORT_MSG 0x00 +#define CCISS_RESET_MSG 0x01 + +static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, + size_t size, + __u8 page_code, unsigned char *scsi3addr, + int cmd_type); + +static CommandList_struct *cmd_alloc(ctlr_info_t *h); +static CommandList_struct *cmd_special_alloc(ctlr_info_t *h); +static void cmd_free(ctlr_info_t *h, CommandList_struct *c); +static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c); + +static int cciss_scsi_write_info(struct Scsi_Host *sh, + char *buffer, /* data buffer */ + int length); /* length of data in buffer */ +static int cciss_scsi_show_info(struct seq_file *m, + struct Scsi_Host *sh); + +static int cciss_scsi_queue_command (struct Scsi_Host *h, + struct scsi_cmnd *cmd); +static int cciss_eh_device_reset_handler(struct scsi_cmnd *); +static int cciss_eh_abort_handler(struct scsi_cmnd *); + +static struct cciss_scsi_hba_t ccissscsi[MAX_CTLR] = { + { .name = "cciss0", .ndevices = 0 }, + { .name = "cciss1", .ndevices = 0 }, + { .name = "cciss2", .ndevices = 0 }, + { .name = "cciss3", .ndevices = 0 }, + { .name = "cciss4", .ndevices = 0 }, + { .name = "cciss5", .ndevices = 0 }, + { .name = "cciss6", .ndevices = 0 }, + { .name = "cciss7", .ndevices = 0 }, +}; + +static struct scsi_host_template cciss_driver_template = { + .module = THIS_MODULE, + .name = "cciss", + .proc_name = "cciss", + .write_info = cciss_scsi_write_info, + .show_info = cciss_scsi_show_info, + .queuecommand = cciss_scsi_queue_command, + .this_id = 7, + .cmd_per_lun = 1, + .use_clustering = DISABLE_CLUSTERING, + /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */ + .eh_device_reset_handler= cciss_eh_device_reset_handler, + .eh_abort_handler = cciss_eh_abort_handler, +}; + +#pragma pack(1) + +#define SCSI_PAD_32 8 +#define SCSI_PAD_64 8 + +struct cciss_scsi_cmd_stack_elem_t { + CommandList_struct cmd; + ErrorInfo_struct Err; + __u32 busaddr; + int cmdindex; + u8 pad[IS_32_BIT * SCSI_PAD_32 + IS_64_BIT * SCSI_PAD_64]; +}; + +#pragma pack() + +#pragma pack(1) +struct cciss_scsi_cmd_stack_t { + struct cciss_scsi_cmd_stack_elem_t *pool; + struct cciss_scsi_cmd_stack_elem_t **elem; + dma_addr_t cmd_pool_handle; + int top; + int nelems; +}; +#pragma pack() + +struct cciss_scsi_adapter_data_t { + struct Scsi_Host *scsi_host; + struct cciss_scsi_cmd_stack_t cmd_stack; + SGDescriptor_struct **cmd_sg_list; + int registered; + spinlock_t lock; // to protect ccissscsi[ctlr]; +}; + +#define CPQ_TAPE_LOCK(h, flags) spin_lock_irqsave( \ + &h->scsi_ctlr->lock, flags); +#define CPQ_TAPE_UNLOCK(h, flags) spin_unlock_irqrestore( \ + &h->scsi_ctlr->lock, flags); + +static CommandList_struct * +scsi_cmd_alloc(ctlr_info_t *h) +{ + /* assume only one process in here at a time, locking done by caller. */ + /* use h->lock */ + /* might be better to rewrite how we allocate scsi commands in a way that */ + /* needs no locking at all. */ + + /* take the top memory chunk off the stack and return it, if any. */ + struct cciss_scsi_cmd_stack_elem_t *c; + struct cciss_scsi_adapter_data_t *sa; + struct cciss_scsi_cmd_stack_t *stk; + u64bit temp64; + + sa = h->scsi_ctlr; + stk = &sa->cmd_stack; + + if (stk->top < 0) + return NULL; + c = stk->elem[stk->top]; + /* memset(c, 0, sizeof(*c)); */ + memset(&c->cmd, 0, sizeof(c->cmd)); + memset(&c->Err, 0, sizeof(c->Err)); + /* set physical addr of cmd and addr of scsi parameters */ + c->cmd.busaddr = c->busaddr; + c->cmd.cmdindex = c->cmdindex; + /* (__u32) (stk->cmd_pool_handle + + (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */ + + temp64.val = (__u64) (c->busaddr + sizeof(CommandList_struct)); + /* (__u64) (stk->cmd_pool_handle + + (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top) + + sizeof(CommandList_struct)); */ + stk->top--; + c->cmd.ErrDesc.Addr.lower = temp64.val32.lower; + c->cmd.ErrDesc.Addr.upper = temp64.val32.upper; + c->cmd.ErrDesc.Len = sizeof(ErrorInfo_struct); + + c->cmd.ctlr = h->ctlr; + c->cmd.err_info = &c->Err; + + return (CommandList_struct *) c; +} + +static void +scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c) +{ + /* assume only one process in here at a time, locking done by caller. */ + /* use h->lock */ + /* drop the free memory chunk on top of the stack. */ + + struct cciss_scsi_adapter_data_t *sa; + struct cciss_scsi_cmd_stack_t *stk; + + sa = h->scsi_ctlr; + stk = &sa->cmd_stack; + stk->top++; + if (stk->top >= stk->nelems) { + dev_err(&h->pdev->dev, + "scsi_cmd_free called too many times.\n"); + BUG(); + } + stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) c; +} + +static int +scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) +{ + int i; + struct cciss_scsi_cmd_stack_t *stk; + size_t size; + + stk = &sa->cmd_stack; + stk->nelems = cciss_tape_cmds + 2; + sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, + h->chainsize, stk->nelems); + if (!sa->cmd_sg_list && h->chainsize > 0) + return -ENOMEM; + + size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems; + + /* Check alignment, see cciss_cmd.h near CommandList_struct def. */ + BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); + /* pci_alloc_consistent guarantees 32-bit DMA address will be used */ + stk->pool = (struct cciss_scsi_cmd_stack_elem_t *) + pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle); + + if (stk->pool == NULL) { + cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems); + sa->cmd_sg_list = NULL; + return -ENOMEM; + } + stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL); + if (!stk->elem) { + pci_free_consistent(h->pdev, size, stk->pool, + stk->cmd_pool_handle); + return -1; + } + for (i = 0; i < stk->nelems; i++) { + stk->elem[i] = &stk->pool[i]; + stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + + (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); + stk->elem[i]->cmdindex = i; + } + stk->top = stk->nelems-1; + return 0; +} + +static void +scsi_cmd_stack_free(ctlr_info_t *h) +{ + struct cciss_scsi_adapter_data_t *sa; + struct cciss_scsi_cmd_stack_t *stk; + size_t size; + + sa = h->scsi_ctlr; + stk = &sa->cmd_stack; + if (stk->top != stk->nelems-1) { + dev_warn(&h->pdev->dev, + "bug: %d scsi commands are still outstanding.\n", + stk->nelems - stk->top); + } + size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems; + + pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle); + stk->pool = NULL; + cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems); + kfree(stk->elem); + stk->elem = NULL; +} + +#if 0 +static int xmargin=8; +static int amargin=60; + +static void +print_bytes (unsigned char *c, int len, int hex, int ascii) +{ + + int i; + unsigned char *x; + + if (hex) + { + x = c; + for (i=0;i0) printk("\n"); + if ((i % xmargin) == 0) printk("0x%04x:", i); + printk(" %02x", *x); + x++; + } + printk("\n"); + } + if (ascii) + { + x = c; + for (i=0;i0) printk("\n"); + if ((i % amargin) == 0) printk("0x%04x:", i); + if (*x > 26 && *x < 128) printk("%c", *x); + else printk("."); + x++; + } + printk("\n"); + } +} + +static void +print_cmd(CommandList_struct *cp) +{ + printk("queue:%d\n", cp->Header.ReplyQueue); + printk("sglist:%d\n", cp->Header.SGList); + printk("sgtot:%d\n", cp->Header.SGTotal); + printk("Tag:0x%08x/0x%08x\n", cp->Header.Tag.upper, + cp->Header.Tag.lower); + printk("LUN:0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + cp->Header.LUN.LunAddrBytes[0], + cp->Header.LUN.LunAddrBytes[1], + cp->Header.LUN.LunAddrBytes[2], + cp->Header.LUN.LunAddrBytes[3], + cp->Header.LUN.LunAddrBytes[4], + cp->Header.LUN.LunAddrBytes[5], + cp->Header.LUN.LunAddrBytes[6], + cp->Header.LUN.LunAddrBytes[7]); + printk("CDBLen:%d\n", cp->Request.CDBLen); + printk("Type:%d\n",cp->Request.Type.Type); + printk("Attr:%d\n",cp->Request.Type.Attribute); + printk(" Dir:%d\n",cp->Request.Type.Direction); + printk("Timeout:%d\n",cp->Request.Timeout); + printk( "CDB: %02x %02x %02x %02x %02x %02x %02x %02x" + " %02x %02x %02x %02x %02x %02x %02x %02x\n", + cp->Request.CDB[0], cp->Request.CDB[1], + cp->Request.CDB[2], cp->Request.CDB[3], + cp->Request.CDB[4], cp->Request.CDB[5], + cp->Request.CDB[6], cp->Request.CDB[7], + cp->Request.CDB[8], cp->Request.CDB[9], + cp->Request.CDB[10], cp->Request.CDB[11], + cp->Request.CDB[12], cp->Request.CDB[13], + cp->Request.CDB[14], cp->Request.CDB[15]), + printk("edesc.Addr: 0x%08x/0%08x, Len = %d\n", + cp->ErrDesc.Addr.upper, cp->ErrDesc.Addr.lower, + cp->ErrDesc.Len); + printk("sgs..........Errorinfo:\n"); + printk("scsistatus:%d\n", cp->err_info->ScsiStatus); + printk("senselen:%d\n", cp->err_info->SenseLen); + printk("cmd status:%d\n", cp->err_info->CommandStatus); + printk("resid cnt:%d\n", cp->err_info->ResidualCnt); + printk("offense size:%d\n", cp->err_info->MoreErrInfo.Invalid_Cmd.offense_size); + printk("offense byte:%d\n", cp->err_info->MoreErrInfo.Invalid_Cmd.offense_num); + printk("offense value:%d\n", cp->err_info->MoreErrInfo.Invalid_Cmd.offense_value); + +} + +#endif + +static int +find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun) +{ + /* finds an unused bus, target, lun for a new device */ + /* assumes h->scsi_ctlr->lock is held */ + int i, found=0; + unsigned char target_taken[CCISS_MAX_SCSI_DEVS_PER_HBA]; + + memset(&target_taken[0], 0, CCISS_MAX_SCSI_DEVS_PER_HBA); + + target_taken[SELF_SCSI_ID] = 1; + for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) + target_taken[ccissscsi[h->ctlr].dev[i].target] = 1; + + for (i = 0; i < CCISS_MAX_SCSI_DEVS_PER_HBA; i++) { + if (!target_taken[i]) { + *bus = 0; *target=i; *lun = 0; found=1; + break; + } + } + return (!found); +} +struct scsi2map { + char scsi3addr[8]; + int bus, target, lun; +}; + +static int +cciss_scsi_add_entry(ctlr_info_t *h, int hostno, + struct cciss_scsi_dev_t *device, + struct scsi2map *added, int *nadded) +{ + /* assumes h->scsi_ctlr->lock is held */ + int n = ccissscsi[h->ctlr].ndevices; + struct cciss_scsi_dev_t *sd; + int i, bus, target, lun; + unsigned char addr1[8], addr2[8]; + + if (n >= CCISS_MAX_SCSI_DEVS_PER_HBA) { + dev_warn(&h->pdev->dev, "Too many devices, " + "some will be inaccessible.\n"); + return -1; + } + + bus = target = -1; + lun = 0; + /* Is this device a non-zero lun of a multi-lun device */ + /* byte 4 of the 8-byte LUN addr will contain the logical unit no. */ + if (device->scsi3addr[4] != 0) { + /* Search through our list and find the device which */ + /* has the same 8 byte LUN address, excepting byte 4. */ + /* Assign the same bus and target for this new LUN. */ + /* Use the logical unit number from the firmware. */ + memcpy(addr1, device->scsi3addr, 8); + addr1[4] = 0; + for (i = 0; i < n; i++) { + sd = &ccissscsi[h->ctlr].dev[i]; + memcpy(addr2, sd->scsi3addr, 8); + addr2[4] = 0; + /* differ only in byte 4? */ + if (memcmp(addr1, addr2, 8) == 0) { + bus = sd->bus; + target = sd->target; + lun = device->scsi3addr[4]; + break; + } + } + } + + sd = &ccissscsi[h->ctlr].dev[n]; + if (lun == 0) { + if (find_bus_target_lun(h, + &sd->bus, &sd->target, &sd->lun) != 0) + return -1; + } else { + sd->bus = bus; + sd->target = target; + sd->lun = lun; + } + added[*nadded].bus = sd->bus; + added[*nadded].target = sd->target; + added[*nadded].lun = sd->lun; + (*nadded)++; + + memcpy(sd->scsi3addr, device->scsi3addr, 8); + memcpy(sd->vendor, device->vendor, sizeof(sd->vendor)); + memcpy(sd->revision, device->revision, sizeof(sd->revision)); + memcpy(sd->device_id, device->device_id, sizeof(sd->device_id)); + sd->devtype = device->devtype; + + ccissscsi[h->ctlr].ndevices++; + + /* initially, (before registering with scsi layer) we don't + know our hostno and we don't want to print anything first + time anyway (the scsi layer's inquiries will show that info) */ + if (hostno != -1) + dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d added.\n", + scsi_device_type(sd->devtype), hostno, + sd->bus, sd->target, sd->lun); + return 0; +} + +static void +cciss_scsi_remove_entry(ctlr_info_t *h, int hostno, int entry, + struct scsi2map *removed, int *nremoved) +{ + /* assumes h->ctlr]->scsi_ctlr->lock is held */ + int i; + struct cciss_scsi_dev_t sd; + + if (entry < 0 || entry >= CCISS_MAX_SCSI_DEVS_PER_HBA) return; + sd = ccissscsi[h->ctlr].dev[entry]; + removed[*nremoved].bus = sd.bus; + removed[*nremoved].target = sd.target; + removed[*nremoved].lun = sd.lun; + (*nremoved)++; + for (i = entry; i < ccissscsi[h->ctlr].ndevices-1; i++) + ccissscsi[h->ctlr].dev[i] = ccissscsi[h->ctlr].dev[i+1]; + ccissscsi[h->ctlr].ndevices--; + dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d removed.\n", + scsi_device_type(sd.devtype), hostno, + sd.bus, sd.target, sd.lun); +} + + +#define SCSI3ADDR_EQ(a,b) ( \ + (a)[7] == (b)[7] && \ + (a)[6] == (b)[6] && \ + (a)[5] == (b)[5] && \ + (a)[4] == (b)[4] && \ + (a)[3] == (b)[3] && \ + (a)[2] == (b)[2] && \ + (a)[1] == (b)[1] && \ + (a)[0] == (b)[0]) + +static void fixup_botched_add(ctlr_info_t *h, char *scsi3addr) +{ + /* called when scsi_add_device fails in order to re-adjust */ + /* ccissscsi[] to match the mid layer's view. */ + unsigned long flags; + int i, j; + CPQ_TAPE_LOCK(h, flags); + for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) { + if (memcmp(scsi3addr, + ccissscsi[h->ctlr].dev[i].scsi3addr, 8) == 0) { + for (j = i; j < ccissscsi[h->ctlr].ndevices-1; j++) + ccissscsi[h->ctlr].dev[j] = + ccissscsi[h->ctlr].dev[j+1]; + ccissscsi[h->ctlr].ndevices--; + break; + } + } + CPQ_TAPE_UNLOCK(h, flags); +} + +static int device_is_the_same(struct cciss_scsi_dev_t *dev1, + struct cciss_scsi_dev_t *dev2) +{ + return dev1->devtype == dev2->devtype && + memcmp(dev1->scsi3addr, dev2->scsi3addr, + sizeof(dev1->scsi3addr)) == 0 && + memcmp(dev1->device_id, dev2->device_id, + sizeof(dev1->device_id)) == 0 && + memcmp(dev1->vendor, dev2->vendor, + sizeof(dev1->vendor)) == 0 && + memcmp(dev1->model, dev2->model, + sizeof(dev1->model)) == 0 && + memcmp(dev1->revision, dev2->revision, + sizeof(dev1->revision)) == 0; +} + +static int +adjust_cciss_scsi_table(ctlr_info_t *h, int hostno, + struct cciss_scsi_dev_t sd[], int nsds) +{ + /* sd contains scsi3 addresses and devtypes, but + bus target and lun are not filled in. This funciton + takes what's in sd to be the current and adjusts + ccissscsi[] to be in line with what's in sd. */ + + int i,j, found, changes=0; + struct cciss_scsi_dev_t *csd; + unsigned long flags; + struct scsi2map *added, *removed; + int nadded, nremoved; + struct Scsi_Host *sh = NULL; + + added = kzalloc(sizeof(*added) * CCISS_MAX_SCSI_DEVS_PER_HBA, + GFP_KERNEL); + removed = kzalloc(sizeof(*removed) * CCISS_MAX_SCSI_DEVS_PER_HBA, + GFP_KERNEL); + + if (!added || !removed) { + dev_warn(&h->pdev->dev, + "Out of memory in adjust_cciss_scsi_table\n"); + goto free_and_out; + } + + CPQ_TAPE_LOCK(h, flags); + + if (hostno != -1) /* if it's not the first time... */ + sh = h->scsi_ctlr->scsi_host; + + /* find any devices in ccissscsi[] that are not in + sd[] and remove them from ccissscsi[] */ + + i = 0; + nremoved = 0; + nadded = 0; + while (i < ccissscsi[h->ctlr].ndevices) { + csd = &ccissscsi[h->ctlr].dev[i]; + found=0; + for (j=0;jscsi3addr)) { + if (device_is_the_same(&sd[j], csd)) + found=2; + else + found=1; + break; + } + } + + if (found == 0) { /* device no longer present. */ + changes++; + cciss_scsi_remove_entry(h, hostno, i, + removed, &nremoved); + /* remove ^^^, hence i not incremented */ + } else if (found == 1) { /* device is different in some way */ + changes++; + dev_info(&h->pdev->dev, + "device c%db%dt%dl%d has changed.\n", + hostno, csd->bus, csd->target, csd->lun); + cciss_scsi_remove_entry(h, hostno, i, + removed, &nremoved); + /* remove ^^^, hence i not incremented */ + if (cciss_scsi_add_entry(h, hostno, &sd[j], + added, &nadded) != 0) + /* we just removed one, so add can't fail. */ + BUG(); + csd->devtype = sd[j].devtype; + memcpy(csd->device_id, sd[j].device_id, + sizeof(csd->device_id)); + memcpy(csd->vendor, sd[j].vendor, + sizeof(csd->vendor)); + memcpy(csd->model, sd[j].model, + sizeof(csd->model)); + memcpy(csd->revision, sd[j].revision, + sizeof(csd->revision)); + } else /* device is same as it ever was, */ + i++; /* so just move along. */ + } + + /* Now, make sure every device listed in sd[] is also + listed in ccissscsi[], adding them if they aren't found */ + + for (i=0;ictlr].ndevices; j++) { + csd = &ccissscsi[h->ctlr].dev[j]; + if (SCSI3ADDR_EQ(sd[i].scsi3addr, + csd->scsi3addr)) { + if (device_is_the_same(&sd[i], csd)) + found=2; /* found device */ + else + found=1; /* found a bug. */ + break; + } + } + if (!found) { + changes++; + if (cciss_scsi_add_entry(h, hostno, &sd[i], + added, &nadded) != 0) + break; + } else if (found == 1) { + /* should never happen... */ + changes++; + dev_warn(&h->pdev->dev, + "device unexpectedly changed\n"); + /* but if it does happen, we just ignore that device */ + } + } + CPQ_TAPE_UNLOCK(h, flags); + + /* Don't notify scsi mid layer of any changes the first time through */ + /* (or if there are no changes) scsi_scan_host will do it later the */ + /* first time through. */ + if (hostno == -1 || !changes) + goto free_and_out; + + /* Notify scsi mid layer of any removed devices */ + for (i = 0; i < nremoved; i++) { + struct scsi_device *sdev = + scsi_device_lookup(sh, removed[i].bus, + removed[i].target, removed[i].lun); + if (sdev != NULL) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + } else { + /* We don't expect to get here. */ + /* future cmds to this device will get selection */ + /* timeout as if the device was gone. */ + dev_warn(&h->pdev->dev, "didn't find " + "c%db%dt%dl%d\n for removal.", + hostno, removed[i].bus, + removed[i].target, removed[i].lun); + } + } + + /* Notify scsi mid layer of any added devices */ + for (i = 0; i < nadded; i++) { + int rc; + rc = scsi_add_device(sh, added[i].bus, + added[i].target, added[i].lun); + if (rc == 0) + continue; + dev_warn(&h->pdev->dev, "scsi_add_device " + "c%db%dt%dl%d failed, device not added.\n", + hostno, added[i].bus, added[i].target, added[i].lun); + /* now we have to remove it from ccissscsi, */ + /* since it didn't get added to scsi mid layer */ + fixup_botched_add(h, added[i].scsi3addr); + } + +free_and_out: + kfree(added); + kfree(removed); + return 0; +} + +static int +lookup_scsi3addr(ctlr_info_t *h, int bus, int target, int lun, char *scsi3addr) +{ + int i; + struct cciss_scsi_dev_t *sd; + unsigned long flags; + + CPQ_TAPE_LOCK(h, flags); + for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) { + sd = &ccissscsi[h->ctlr].dev[i]; + if (sd->bus == bus && + sd->target == target && + sd->lun == lun) { + memcpy(scsi3addr, &sd->scsi3addr[0], 8); + CPQ_TAPE_UNLOCK(h, flags); + return 0; + } + } + CPQ_TAPE_UNLOCK(h, flags); + return -1; +} + +static void +cciss_scsi_setup(ctlr_info_t *h) +{ + struct cciss_scsi_adapter_data_t * shba; + + ccissscsi[h->ctlr].ndevices = 0; + shba = (struct cciss_scsi_adapter_data_t *) + kmalloc(sizeof(*shba), GFP_KERNEL); + if (shba == NULL) + return; + shba->scsi_host = NULL; + spin_lock_init(&shba->lock); + shba->registered = 0; + if (scsi_cmd_stack_setup(h, shba) != 0) { + kfree(shba); + shba = NULL; + } + h->scsi_ctlr = shba; + return; +} + +static void complete_scsi_command(CommandList_struct *c, int timeout, + __u32 tag) +{ + struct scsi_cmnd *cmd; + ctlr_info_t *h; + ErrorInfo_struct *ei; + + ei = c->err_info; + + /* First, see if it was a message rather than a command */ + if (c->Request.Type.Type == TYPE_MSG) { + c->cmd_type = CMD_MSG_DONE; + return; + } + + cmd = (struct scsi_cmnd *) c->scsi_cmd; + h = hba[c->ctlr]; + + scsi_dma_unmap(cmd); + if (c->Header.SGTotal > h->max_cmd_sgentries) + cciss_unmap_sg_chain_block(h, c); + + cmd->result = (DID_OK << 16); /* host byte */ + cmd->result |= (COMMAND_COMPLETE << 8); /* msg byte */ + /* cmd->result |= (GOOD < 1); */ /* status byte */ + + cmd->result |= (ei->ScsiStatus); + /* printk("Scsistatus is 0x%02x\n", ei->ScsiStatus); */ + + /* copy the sense data whether we need to or not. */ + + memcpy(cmd->sense_buffer, ei->SenseInfo, + ei->SenseLen > SCSI_SENSE_BUFFERSIZE ? + SCSI_SENSE_BUFFERSIZE : + ei->SenseLen); + scsi_set_resid(cmd, ei->ResidualCnt); + + if(ei->CommandStatus != 0) + { /* an error has occurred */ + switch(ei->CommandStatus) + { + case CMD_TARGET_STATUS: + /* Pass it up to the upper layers... */ + if (!ei->ScsiStatus) { + + /* Ordinarily, this case should never happen, but there is a bug + in some released firmware revisions that allows it to happen + if, for example, a 4100 backplane loses power and the tape + drive is in it. We assume that it's a fatal error of some + kind because we can't show that it wasn't. We will make it + look like selection timeout since that is the most common + reason for this to occur, and it's severe enough. */ + + cmd->result = DID_NO_CONNECT << 16; + } + break; + case CMD_DATA_UNDERRUN: /* let mid layer handle it. */ + break; + case CMD_DATA_OVERRUN: + dev_warn(&h->pdev->dev, "%p has" + " completed with data overrun " + "reported\n", c); + break; + case CMD_INVALID: { + /* print_bytes(c, sizeof(*c), 1, 0); + print_cmd(c); */ + /* We get CMD_INVALID if you address a non-existent tape drive instead + of a selection timeout (no response). You will see this if you yank + out a tape drive, then try to access it. This is kind of a shame + because it means that any other CMD_INVALID (e.g. driver bug) will + get interpreted as a missing target. */ + cmd->result = DID_NO_CONNECT << 16; + } + break; + case CMD_PROTOCOL_ERR: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, + "%p has protocol error\n", c); + break; + case CMD_HARDWARE_ERR: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, + "%p had hardware error\n", c); + break; + case CMD_CONNECTION_LOST: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, + "%p had connection lost\n", c); + break; + case CMD_ABORTED: + cmd->result = DID_ABORT << 16; + dev_warn(&h->pdev->dev, "%p was aborted\n", c); + break; + case CMD_ABORT_FAILED: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, + "%p reports abort failed\n", c); + break; + case CMD_UNSOLICITED_ABORT: + cmd->result = DID_ABORT << 16; + dev_warn(&h->pdev->dev, "%p aborted due to an " + "unsolicited abort\n", c); + break; + case CMD_TIMEOUT: + cmd->result = DID_TIME_OUT << 16; + dev_warn(&h->pdev->dev, "%p timedout\n", c); + break; + case CMD_UNABORTABLE: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, "c %p command " + "unabortable\n", c); + break; + default: + cmd->result = DID_ERROR << 16; + dev_warn(&h->pdev->dev, + "%p returned unknown status %x\n", c, + ei->CommandStatus); + } + } + cmd->scsi_done(cmd); + scsi_cmd_free(h, c); +} + +static int +cciss_scsi_detect(ctlr_info_t *h) +{ + struct Scsi_Host *sh; + int error; + + sh = scsi_host_alloc(&cciss_driver_template, sizeof(struct ctlr_info *)); + if (sh == NULL) + goto fail; + sh->io_port = 0; // good enough? FIXME, + sh->n_io_port = 0; // I don't think we use these two... + sh->this_id = SELF_SCSI_ID; + sh->can_queue = cciss_tape_cmds; + sh->sg_tablesize = h->maxsgentries; + sh->max_cmd_len = MAX_COMMAND_SIZE; + sh->max_sectors = h->cciss_max_sectors; + + ((struct cciss_scsi_adapter_data_t *) + h->scsi_ctlr)->scsi_host = sh; + sh->hostdata[0] = (unsigned long) h; + sh->irq = h->intr[SIMPLE_MODE_INT]; + sh->unique_id = sh->irq; + error = scsi_add_host(sh, &h->pdev->dev); + if (error) + goto fail_host_put; + scsi_scan_host(sh); + return 1; + + fail_host_put: + scsi_host_put(sh); + fail: + return 0; +} + +static void +cciss_unmap_one(struct pci_dev *pdev, + CommandList_struct *c, + size_t buflen, + int data_direction) +{ + u64bit addr64; + + addr64.val32.lower = c->SG[0].Addr.lower; + addr64.val32.upper = c->SG[0].Addr.upper; + pci_unmap_single(pdev, (dma_addr_t) addr64.val, buflen, data_direction); +} + +static void +cciss_map_one(struct pci_dev *pdev, + CommandList_struct *c, + unsigned char *buf, + size_t buflen, + int data_direction) +{ + __u64 addr64; + + addr64 = (__u64) pci_map_single(pdev, buf, buflen, data_direction); + c->SG[0].Addr.lower = + (__u32) (addr64 & (__u64) 0x00000000FFFFFFFF); + c->SG[0].Addr.upper = + (__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF); + c->SG[0].Len = buflen; + c->Header.SGList = (__u8) 1; /* no. SGs contig in this cmd */ + c->Header.SGTotal = (__u16) 1; /* total sgs in this cmd list */ +} + +static int +cciss_scsi_do_simple_cmd(ctlr_info_t *h, + CommandList_struct *c, + unsigned char *scsi3addr, + unsigned char *cdb, + unsigned char cdblen, + unsigned char *buf, int bufsize, + int direction) +{ + DECLARE_COMPLETION_ONSTACK(wait); + + c->cmd_type = CMD_IOCTL_PEND; /* treat this like an ioctl */ + c->scsi_cmd = NULL; + c->Header.ReplyQueue = 0; /* unused in simple mode */ + memcpy(&c->Header.LUN, scsi3addr, sizeof(c->Header.LUN)); + c->Header.Tag.lower = c->busaddr; /* Use k. address of cmd as tag */ + // Fill in the request block... + + /* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n", + scsi3addr[0], scsi3addr[1], scsi3addr[2], scsi3addr[3], + scsi3addr[4], scsi3addr[5], scsi3addr[6], scsi3addr[7]); */ + + memset(c->Request.CDB, 0, sizeof(c->Request.CDB)); + memcpy(c->Request.CDB, cdb, cdblen); + c->Request.Timeout = 0; + c->Request.CDBLen = cdblen; + c->Request.Type.Type = TYPE_CMD; + c->Request.Type.Attribute = ATTR_SIMPLE; + c->Request.Type.Direction = direction; + + /* Fill in the SG list and do dma mapping */ + cciss_map_one(h->pdev, c, (unsigned char *) buf, + bufsize, DMA_FROM_DEVICE); + + c->waiting = &wait; + enqueue_cmd_and_start_io(h, c); + wait_for_completion(&wait); + + /* undo the dma mapping */ + cciss_unmap_one(h->pdev, c, bufsize, DMA_FROM_DEVICE); + return(0); +} + +static void +cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c) +{ + ErrorInfo_struct *ei; + + ei = c->err_info; + switch(ei->CommandStatus) + { + case CMD_TARGET_STATUS: + dev_warn(&h->pdev->dev, + "cmd %p has completed with errors\n", c); + dev_warn(&h->pdev->dev, + "cmd %p has SCSI Status = %x\n", + c, ei->ScsiStatus); + if (ei->ScsiStatus == 0) + dev_warn(&h->pdev->dev, + "SCSI status is abnormally zero. " + "(probably indicates selection timeout " + "reported incorrectly due to a known " + "firmware bug, circa July, 2001.)\n"); + break; + case CMD_DATA_UNDERRUN: /* let mid layer handle it. */ + dev_info(&h->pdev->dev, "UNDERRUN\n"); + break; + case CMD_DATA_OVERRUN: + dev_warn(&h->pdev->dev, "%p has" + " completed with data overrun " + "reported\n", c); + break; + case CMD_INVALID: { + /* controller unfortunately reports SCSI passthru's */ + /* to non-existent targets as invalid commands. */ + dev_warn(&h->pdev->dev, + "%p is reported invalid (probably means " + "target device no longer present)\n", c); + /* print_bytes((unsigned char *) c, sizeof(*c), 1, 0); + print_cmd(c); */ + } + break; + case CMD_PROTOCOL_ERR: + dev_warn(&h->pdev->dev, "%p has protocol error\n", c); + break; + case CMD_HARDWARE_ERR: + /* cmd->result = DID_ERROR << 16; */ + dev_warn(&h->pdev->dev, "%p had hardware error\n", c); + break; + case CMD_CONNECTION_LOST: + dev_warn(&h->pdev->dev, "%p had connection lost\n", c); + break; + case CMD_ABORTED: + dev_warn(&h->pdev->dev, "%p was aborted\n", c); + break; + case CMD_ABORT_FAILED: + dev_warn(&h->pdev->dev, + "%p reports abort failed\n", c); + break; + case CMD_UNSOLICITED_ABORT: + dev_warn(&h->pdev->dev, + "%p aborted due to an unsolicited abort\n", c); + break; + case CMD_TIMEOUT: + dev_warn(&h->pdev->dev, "%p timedout\n", c); + break; + case CMD_UNABORTABLE: + dev_warn(&h->pdev->dev, + "%p unabortable\n", c); + break; + default: + dev_warn(&h->pdev->dev, + "%p returned unknown status %x\n", + c, ei->CommandStatus); + } +} + +static int +cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr, + unsigned char page, unsigned char *buf, + unsigned char bufsize) +{ + int rc; + CommandList_struct *c; + char cdb[6]; + ErrorInfo_struct *ei; + unsigned long flags; + + spin_lock_irqsave(&h->lock, flags); + c = scsi_cmd_alloc(h); + spin_unlock_irqrestore(&h->lock, flags); + + if (c == NULL) { /* trouble... */ + printk("cmd_alloc returned NULL!\n"); + return -1; + } + + ei = c->err_info; + + cdb[0] = CISS_INQUIRY; + cdb[1] = (page != 0); + cdb[2] = page; + cdb[3] = 0; + cdb[4] = bufsize; + cdb[5] = 0; + rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr, cdb, + 6, buf, bufsize, XFER_READ); + + if (rc != 0) return rc; /* something went wrong */ + + if (ei->CommandStatus != 0 && + ei->CommandStatus != CMD_DATA_UNDERRUN) { + cciss_scsi_interpret_error(h, c); + rc = -1; + } + spin_lock_irqsave(&h->lock, flags); + scsi_cmd_free(h, c); + spin_unlock_irqrestore(&h->lock, flags); + return rc; +} + +/* Get the device id from inquiry page 0x83 */ +static int cciss_scsi_get_device_id(ctlr_info_t *h, unsigned char *scsi3addr, + unsigned char *device_id, int buflen) +{ + int rc; + unsigned char *buf; + + if (buflen > 16) + buflen = 16; + buf = kzalloc(64, GFP_KERNEL); + if (!buf) + return -1; + rc = cciss_scsi_do_inquiry(h, scsi3addr, 0x83, buf, 64); + if (rc == 0) + memcpy(device_id, &buf[8], buflen); + kfree(buf); + return rc != 0; +} + +static int +cciss_scsi_do_report_phys_luns(ctlr_info_t *h, + ReportLunData_struct *buf, int bufsize) +{ + int rc; + CommandList_struct *c; + unsigned char cdb[12]; + unsigned char scsi3addr[8]; + ErrorInfo_struct *ei; + unsigned long flags; + + spin_lock_irqsave(&h->lock, flags); + c = scsi_cmd_alloc(h); + spin_unlock_irqrestore(&h->lock, flags); + if (c == NULL) { /* trouble... */ + printk("cmd_alloc returned NULL!\n"); + return -1; + } + + memset(&scsi3addr[0], 0, 8); /* address the controller */ + cdb[0] = CISS_REPORT_PHYS; + cdb[1] = 0; + cdb[2] = 0; + cdb[3] = 0; + cdb[4] = 0; + cdb[5] = 0; + cdb[6] = (bufsize >> 24) & 0xFF; //MSB + cdb[7] = (bufsize >> 16) & 0xFF; + cdb[8] = (bufsize >> 8) & 0xFF; + cdb[9] = bufsize & 0xFF; + cdb[10] = 0; + cdb[11] = 0; + + rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr, + cdb, 12, + (unsigned char *) buf, + bufsize, XFER_READ); + + if (rc != 0) return rc; /* something went wrong */ + + ei = c->err_info; + if (ei->CommandStatus != 0 && + ei->CommandStatus != CMD_DATA_UNDERRUN) { + cciss_scsi_interpret_error(h, c); + rc = -1; + } + spin_lock_irqsave(&h->lock, flags); + scsi_cmd_free(h, c); + spin_unlock_irqrestore(&h->lock, flags); + return rc; +} + +static void +cciss_update_non_disk_devices(ctlr_info_t *h, int hostno) +{ + /* the idea here is we could get notified from /proc + that some devices have changed, so we do a report + physical luns cmd, and adjust our list of devices + accordingly. (We can't rely on the scsi-mid layer just + doing inquiries, because the "busses" that the scsi + mid-layer probes are totally fabricated by this driver, + so new devices wouldn't show up. + + the scsi3addr's of devices won't change so long as the + adapter is not reset. That means we can rescan and + tell which devices we already know about, vs. new + devices, vs. disappearing devices. + + Also, if you yank out a tape drive, then put in a disk + in it's place, (say, a configured volume from another + array controller for instance) _don't_ poke this driver + (so it thinks it's still a tape, but _do_ poke the scsi + mid layer, so it does an inquiry... the scsi mid layer + will see the physical disk. This would be bad. Need to + think about how to prevent that. One idea would be to + snoop all scsi responses and if an inquiry repsonse comes + back that reports a disk, chuck it an return selection + timeout instead and adjust our table... Not sure i like + that though. + + */ +#define OBDR_TAPE_INQ_SIZE 49 +#define OBDR_TAPE_SIG "$DR-10" + ReportLunData_struct *ld_buff; + unsigned char *inq_buff; + unsigned char scsi3addr[8]; + __u32 num_luns=0; + unsigned char *ch; + struct cciss_scsi_dev_t *currentsd, *this_device; + int ncurrent=0; + int reportlunsize = sizeof(*ld_buff) + CISS_MAX_PHYS_LUN * 8; + int i; + + ld_buff = kzalloc(reportlunsize, GFP_KERNEL); + inq_buff = kmalloc(OBDR_TAPE_INQ_SIZE, GFP_KERNEL); + currentsd = kzalloc(sizeof(*currentsd) * + (CCISS_MAX_SCSI_DEVS_PER_HBA+1), GFP_KERNEL); + if (ld_buff == NULL || inq_buff == NULL || currentsd == NULL) { + printk(KERN_ERR "cciss: out of memory\n"); + goto out; + } + this_device = ¤tsd[CCISS_MAX_SCSI_DEVS_PER_HBA]; + if (cciss_scsi_do_report_phys_luns(h, ld_buff, reportlunsize) == 0) { + ch = &ld_buff->LUNListLength[0]; + num_luns = ((ch[0]<<24) | (ch[1]<<16) | (ch[2]<<8) | ch[3]) / 8; + if (num_luns > CISS_MAX_PHYS_LUN) { + printk(KERN_WARNING + "cciss: Maximum physical LUNs (%d) exceeded. " + "%d LUNs ignored.\n", CISS_MAX_PHYS_LUN, + num_luns - CISS_MAX_PHYS_LUN); + num_luns = CISS_MAX_PHYS_LUN; + } + } + else { + printk(KERN_ERR "cciss: Report physical LUNs failed.\n"); + goto out; + } + + + /* adjust our table of devices */ + for (i = 0; i < num_luns; i++) { + /* for each physical lun, do an inquiry */ + if (ld_buff->LUN[i][3] & 0xC0) continue; + memset(inq_buff, 0, OBDR_TAPE_INQ_SIZE); + memcpy(&scsi3addr[0], &ld_buff->LUN[i][0], 8); + + if (cciss_scsi_do_inquiry(h, scsi3addr, 0, inq_buff, + (unsigned char) OBDR_TAPE_INQ_SIZE) != 0) + /* Inquiry failed (msg printed already) */ + continue; /* so we will skip this device. */ + + this_device->devtype = (inq_buff[0] & 0x1f); + this_device->bus = -1; + this_device->target = -1; + this_device->lun = -1; + memcpy(this_device->scsi3addr, scsi3addr, 8); + memcpy(this_device->vendor, &inq_buff[8], + sizeof(this_device->vendor)); + memcpy(this_device->model, &inq_buff[16], + sizeof(this_device->model)); + memcpy(this_device->revision, &inq_buff[32], + sizeof(this_device->revision)); + memset(this_device->device_id, 0, + sizeof(this_device->device_id)); + cciss_scsi_get_device_id(h, scsi3addr, + this_device->device_id, sizeof(this_device->device_id)); + + switch (this_device->devtype) + { + case 0x05: /* CD-ROM */ { + + /* We don't *really* support actual CD-ROM devices, + * just this "One Button Disaster Recovery" tape drive + * which temporarily pretends to be a CD-ROM drive. + * So we check that the device is really an OBDR tape + * device by checking for "$DR-10" in bytes 43-48 of + * the inquiry data. + */ + char obdr_sig[7]; + + strncpy(obdr_sig, &inq_buff[43], 6); + obdr_sig[6] = '\0'; + if (strncmp(obdr_sig, OBDR_TAPE_SIG, 6) != 0) + /* Not OBDR device, ignore it. */ + break; + } + /* fall through . . . */ + case 0x01: /* sequential access, (tape) */ + case 0x08: /* medium changer */ + if (ncurrent >= CCISS_MAX_SCSI_DEVS_PER_HBA) { + printk(KERN_INFO "cciss%d: %s ignored, " + "too many devices.\n", h->ctlr, + scsi_device_type(this_device->devtype)); + break; + } + currentsd[ncurrent] = *this_device; + ncurrent++; + break; + default: + break; + } + } + + adjust_cciss_scsi_table(h, hostno, currentsd, ncurrent); +out: + kfree(inq_buff); + kfree(ld_buff); + kfree(currentsd); + return; +} + +static int +is_keyword(char *ptr, int len, char *verb) // Thanks to ncr53c8xx.c +{ + int verb_len = strlen(verb); + if (len >= verb_len && !memcmp(verb,ptr,verb_len)) + return verb_len; + else + return 0; +} + +static int +cciss_scsi_user_command(ctlr_info_t *h, int hostno, char *buffer, int length) +{ + int arg_len; + + if ((arg_len = is_keyword(buffer, length, "rescan")) != 0) + cciss_update_non_disk_devices(h, hostno); + else + return -EINVAL; + return length; +} + +static int +cciss_scsi_write_info(struct Scsi_Host *sh, + char *buffer, /* data buffer */ + int length) /* length of data in buffer */ +{ + ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0]; + if (h == NULL) /* This really shouldn't ever happen. */ + return -EINVAL; + + return cciss_scsi_user_command(h, sh->host_no, + buffer, length); +} + +static int +cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh) +{ + + ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0]; + int i; + + if (h == NULL) /* This really shouldn't ever happen. */ + return -EINVAL; + + seq_printf(m, "cciss%d: SCSI host: %d\n", + h->ctlr, sh->host_no); + + /* this information is needed by apps to know which cciss + device corresponds to which scsi host number without + having to open a scsi target device node. The device + information is not a duplicate of /proc/scsi/scsi because + the two may be out of sync due to scsi hotplug, rather + this info is for an app to be able to use to know how to + get them back in sync. */ + + for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) { + struct cciss_scsi_dev_t *sd = + &ccissscsi[h->ctlr].dev[i]; + seq_printf(m, "c%db%dt%dl%d %02d " + "0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + sh->host_no, sd->bus, sd->target, sd->lun, + sd->devtype, + sd->scsi3addr[0], sd->scsi3addr[1], + sd->scsi3addr[2], sd->scsi3addr[3], + sd->scsi3addr[4], sd->scsi3addr[5], + sd->scsi3addr[6], sd->scsi3addr[7]); + } + return 0; +} + +/* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci + dma mapping and fills in the scatter gather entries of the + cciss command, c. */ + +static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c, + struct scsi_cmnd *cmd) +{ + unsigned int len; + struct scatterlist *sg; + __u64 addr64; + int request_nsgs, i, chained, sg_index; + struct cciss_scsi_adapter_data_t *sa = h->scsi_ctlr; + SGDescriptor_struct *curr_sg; + + BUG_ON(scsi_sg_count(cmd) > h->maxsgentries); + + chained = 0; + sg_index = 0; + curr_sg = c->SG; + request_nsgs = scsi_dma_map(cmd); + if (request_nsgs) { + scsi_for_each_sg(cmd, sg, request_nsgs, i) { + if (sg_index + 1 == h->max_cmd_sgentries && + !chained && request_nsgs - i > 1) { + chained = 1; + sg_index = 0; + curr_sg = sa->cmd_sg_list[c->cmdindex]; + } + addr64 = (__u64) sg_dma_address(sg); + len = sg_dma_len(sg); + curr_sg[sg_index].Addr.lower = + (__u32) (addr64 & 0x0FFFFFFFFULL); + curr_sg[sg_index].Addr.upper = + (__u32) ((addr64 >> 32) & 0x0FFFFFFFFULL); + curr_sg[sg_index].Len = len; + curr_sg[sg_index].Ext = 0; + ++sg_index; + } + if (chained) + cciss_map_sg_chain_block(h, c, + sa->cmd_sg_list[c->cmdindex], + (request_nsgs - (h->max_cmd_sgentries - 1)) * + sizeof(SGDescriptor_struct)); + } + /* track how many SG entries we are using */ + if (request_nsgs > h->maxSG) + h->maxSG = request_nsgs; + c->Header.SGTotal = (u16) request_nsgs + chained; + if (request_nsgs > h->max_cmd_sgentries) + c->Header.SGList = h->max_cmd_sgentries; + else + c->Header.SGList = c->Header.SGTotal; + return; +} + + +static int +cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) +{ + ctlr_info_t *h; + int rc; + unsigned char scsi3addr[8]; + CommandList_struct *c; + unsigned long flags; + + // Get the ptr to our adapter structure (hba[i]) out of cmd->host. + // We violate cmd->host privacy here. (Is there another way?) + h = (ctlr_info_t *) cmd->device->host->hostdata[0]; + + rc = lookup_scsi3addr(h, cmd->device->channel, cmd->device->id, + cmd->device->lun, scsi3addr); + if (rc != 0) { + /* the scsi nexus does not match any that we presented... */ + /* pretend to mid layer that we got selection timeout */ + cmd->result = DID_NO_CONNECT << 16; + done(cmd); + /* we might want to think about registering controller itself + as a processor device on the bus so sg binds to it. */ + return 0; + } + + /* Ok, we have a reasonable scsi nexus, so send the cmd down, and + see what the device thinks of it. */ + + spin_lock_irqsave(&h->lock, flags); + c = scsi_cmd_alloc(h); + spin_unlock_irqrestore(&h->lock, flags); + if (c == NULL) { /* trouble... */ + dev_warn(&h->pdev->dev, "scsi_cmd_alloc returned NULL!\n"); + /* FIXME: next 3 lines are -> BAD! <- */ + cmd->result = DID_NO_CONNECT << 16; + done(cmd); + return 0; + } + + // Fill in the command list header + + cmd->scsi_done = done; // save this for use by completion code + + /* save c in case we have to abort it */ + cmd->host_scribble = (unsigned char *) c; + + c->cmd_type = CMD_SCSI; + c->scsi_cmd = cmd; + c->Header.ReplyQueue = 0; /* unused in simple mode */ + memcpy(&c->Header.LUN.LunAddrBytes[0], &scsi3addr[0], 8); + c->Header.Tag.lower = c->busaddr; /* Use k. address of cmd as tag */ + + // Fill in the request block... + + c->Request.Timeout = 0; + memset(c->Request.CDB, 0, sizeof(c->Request.CDB)); + BUG_ON(cmd->cmd_len > sizeof(c->Request.CDB)); + c->Request.CDBLen = cmd->cmd_len; + memcpy(c->Request.CDB, cmd->cmnd, cmd->cmd_len); + c->Request.Type.Type = TYPE_CMD; + c->Request.Type.Attribute = ATTR_SIMPLE; + switch(cmd->sc_data_direction) + { + case DMA_TO_DEVICE: + c->Request.Type.Direction = XFER_WRITE; + break; + case DMA_FROM_DEVICE: + c->Request.Type.Direction = XFER_READ; + break; + case DMA_NONE: + c->Request.Type.Direction = XFER_NONE; + break; + case DMA_BIDIRECTIONAL: + // This can happen if a buggy application does a scsi passthru + // and sets both inlen and outlen to non-zero. ( see + // ../scsi/scsi_ioctl.c:scsi_ioctl_send_command() ) + + c->Request.Type.Direction = XFER_RSVD; + // This is technically wrong, and cciss controllers should + // reject it with CMD_INVALID, which is the most correct + // response, but non-fibre backends appear to let it + // slide by, and give the same results as if this field + // were set correctly. Either way is acceptable for + // our purposes here. + + break; + + default: + dev_warn(&h->pdev->dev, "unknown data direction: %d\n", + cmd->sc_data_direction); + BUG(); + break; + } + cciss_scatter_gather(h, c, cmd); + enqueue_cmd_and_start_io(h, c); + /* the cmd'll come back via intr handler in complete_scsi_command() */ + return 0; +} + +static DEF_SCSI_QCMD(cciss_scsi_queue_command) + +static void cciss_unregister_scsi(ctlr_info_t *h) +{ + struct cciss_scsi_adapter_data_t *sa; + struct cciss_scsi_cmd_stack_t *stk; + unsigned long flags; + + /* we are being forcibly unloaded, and may not refuse. */ + + spin_lock_irqsave(&h->lock, flags); + sa = h->scsi_ctlr; + stk = &sa->cmd_stack; + + /* if we weren't ever actually registered, don't unregister */ + if (sa->registered) { + spin_unlock_irqrestore(&h->lock, flags); + scsi_remove_host(sa->scsi_host); + scsi_host_put(sa->scsi_host); + spin_lock_irqsave(&h->lock, flags); + } + + /* set scsi_host to NULL so our detect routine will + find us on register */ + sa->scsi_host = NULL; + spin_unlock_irqrestore(&h->lock, flags); + scsi_cmd_stack_free(h); + kfree(sa); +} + +static int cciss_engage_scsi(ctlr_info_t *h) +{ + struct cciss_scsi_adapter_data_t *sa; + struct cciss_scsi_cmd_stack_t *stk; + unsigned long flags; + + spin_lock_irqsave(&h->lock, flags); + sa = h->scsi_ctlr; + stk = &sa->cmd_stack; + + if (sa->registered) { + dev_info(&h->pdev->dev, "SCSI subsystem already engaged.\n"); + spin_unlock_irqrestore(&h->lock, flags); + return -ENXIO; + } + sa->registered = 1; + spin_unlock_irqrestore(&h->lock, flags); + cciss_update_non_disk_devices(h, -1); + cciss_scsi_detect(h); + return 0; +} + +static void +cciss_seq_tape_report(struct seq_file *seq, ctlr_info_t *h) +{ + unsigned long flags; + + CPQ_TAPE_LOCK(h, flags); + seq_printf(seq, + "Sequential access devices: %d\n\n", + ccissscsi[h->ctlr].ndevices); + CPQ_TAPE_UNLOCK(h, flags); +} + +static int wait_for_device_to_become_ready(ctlr_info_t *h, + unsigned char lunaddr[]) +{ + int rc; + int count = 0; + int waittime = HZ; + CommandList_struct *c; + + c = cmd_alloc(h); + if (!c) { + dev_warn(&h->pdev->dev, "out of memory in " + "wait_for_device_to_become_ready.\n"); + return IO_ERROR; + } + + /* Send test unit ready until device ready, or give up. */ + while (count < 20) { + + /* Wait for a bit. do this first, because if we send + * the TUR right away, the reset will just abort it. + */ + schedule_timeout_uninterruptible(waittime); + count++; + + /* Increase wait time with each try, up to a point. */ + if (waittime < (HZ * 30)) + waittime = waittime * 2; + + /* Send the Test Unit Ready */ + rc = fill_cmd(h, c, TEST_UNIT_READY, NULL, 0, 0, + lunaddr, TYPE_CMD); + if (rc == 0) + rc = sendcmd_withirq_core(h, c, 0); + + (void) process_sendcmd_error(h, c); + + if (rc != 0) + goto retry_tur; + + if (c->err_info->CommandStatus == CMD_SUCCESS) + break; + + if (c->err_info->CommandStatus == CMD_TARGET_STATUS && + c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION) { + if (c->err_info->SenseInfo[2] == NO_SENSE) + break; + if (c->err_info->SenseInfo[2] == UNIT_ATTENTION) { + unsigned char asc; + asc = c->err_info->SenseInfo[12]; + check_for_unit_attention(h, c); + if (asc == POWER_OR_RESET) + break; + } + } +retry_tur: + dev_warn(&h->pdev->dev, "Waiting %d secs " + "for device to become ready.\n", + waittime / HZ); + rc = 1; /* device not ready. */ + } + + if (rc) + dev_warn(&h->pdev->dev, "giving up on device.\n"); + else + dev_warn(&h->pdev->dev, "device is ready.\n"); + + cmd_free(h, c); + return rc; +} + +/* Need at least one of these error handlers to keep ../scsi/hosts.c from + * complaining. Doing a host- or bus-reset can't do anything good here. + * Despite what it might say in scsi_error.c, there may well be commands + * on the controller, as the cciss driver registers twice, once as a block + * device for the logical drives, and once as a scsi device, for any tape + * drives. So we know there are no commands out on the tape drives, but we + * don't know there are no commands on the controller, and it is likely + * that there probably are, as the cciss block device is most commonly used + * as a boot device (embedded controller on HP/Compaq systems.) +*/ + +static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd) +{ + int rc; + CommandList_struct *cmd_in_trouble; + unsigned char lunaddr[8]; + ctlr_info_t *h; + + /* find the controller to which the command to be aborted was sent */ + h = (ctlr_info_t *) scsicmd->device->host->hostdata[0]; + if (h == NULL) /* paranoia */ + return FAILED; + dev_warn(&h->pdev->dev, "resetting tape drive or medium changer.\n"); + /* find the command that's giving us trouble */ + cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble; + if (cmd_in_trouble == NULL) /* paranoia */ + return FAILED; + memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8); + /* send a reset to the SCSI LUN which the command was sent to */ + rc = sendcmd_withirq(h, CCISS_RESET_MSG, NULL, 0, 0, lunaddr, + TYPE_MSG); + if (rc == 0 && wait_for_device_to_become_ready(h, lunaddr) == 0) + return SUCCESS; + dev_warn(&h->pdev->dev, "resetting device failed.\n"); + return FAILED; +} + +static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd) +{ + int rc; + CommandList_struct *cmd_to_abort; + unsigned char lunaddr[8]; + ctlr_info_t *h; + + /* find the controller to which the command to be aborted was sent */ + h = (ctlr_info_t *) scsicmd->device->host->hostdata[0]; + if (h == NULL) /* paranoia */ + return FAILED; + dev_warn(&h->pdev->dev, "aborting tardy SCSI cmd\n"); + + /* find the command to be aborted */ + cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble; + if (cmd_to_abort == NULL) /* paranoia */ + return FAILED; + memcpy(lunaddr, &cmd_to_abort->Header.LUN.LunAddrBytes[0], 8); + rc = sendcmd_withirq(h, CCISS_ABORT_MSG, &cmd_to_abort->Header.Tag, + 0, 0, lunaddr, TYPE_MSG); + if (rc == 0) + return SUCCESS; + return FAILED; + +} + +#else /* no CONFIG_CISS_SCSI_TAPE */ + +/* If no tape support, then these become defined out of existence */ + +#define cciss_scsi_setup(cntl_num) +#define cciss_engage_scsi(h) + +#endif /* CONFIG_CISS_SCSI_TAPE */ diff --git a/kernel/drivers/block/cciss_scsi.h b/kernel/drivers/block/cciss_scsi.h new file mode 100644 index 000000000..e71d98672 --- /dev/null +++ b/kernel/drivers/block/cciss_scsi.h @@ -0,0 +1,79 @@ +/* + * Disk Array driver for HP Smart Array controllers, SCSI Tape module. + * (C) Copyright 2001, 2007 Hewlett-Packard Development Company, L.P. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 300, Boston, MA + * 02111-1307, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + */ +#ifdef CONFIG_CISS_SCSI_TAPE +#ifndef _CCISS_SCSI_H_ +#define _CCISS_SCSI_H_ + +#include /* possibly irrelevant, since we don't show disks */ + + /* the scsi id of the adapter... */ +#define SELF_SCSI_ID 15 + /* 15 is somewhat arbitrary, since the scsi-2 bus + that's presented by the driver to the OS is + fabricated. The "real" scsi-3 bus the + hardware presents is fabricated too. + The actual, honest-to-goodness physical + bus that the devices are attached to is not + addressible natively, and may in fact turn + out to be not scsi at all. */ + + +/* + +If the upper scsi layer tries to track how many commands we have +outstanding, it will be operating under the misapprehension that it is +the only one sending us requests. We also have the block interface, +which is where most requests must surely come from, so the upper layer's +notion of how many requests we have outstanding will be wrong most or +all of the time. + +Note, the normal SCSI mid-layer error handling doesn't work well +for this driver because 1) it takes the io_request_lock before +calling error handlers and uses a local variable to store flags, +so the io_request_lock cannot be released and interrupts enabled +inside the error handlers, and, the error handlers cannot poll +for command completion because they might get commands from the +block half of the driver completing, and not know what to do +with them. That's what we get for making a hybrid scsi/block +driver, I suppose. + +*/ + +struct cciss_scsi_dev_t { + int devtype; + int bus, target, lun; /* as presented to the OS */ + unsigned char scsi3addr[8]; /* as presented to the HW */ + unsigned char device_id[16]; /* from inquiry pg. 0x83 */ + unsigned char vendor[8]; /* bytes 8-15 of inquiry data */ + unsigned char model[16]; /* bytes 16-31 of inquiry data */ + unsigned char revision[4]; /* bytes 32-35 of inquiry data */ +}; + +struct cciss_scsi_hba_t { + char *name; + int ndevices; +#define CCISS_MAX_SCSI_DEVS_PER_HBA 16 + struct cciss_scsi_dev_t dev[CCISS_MAX_SCSI_DEVS_PER_HBA]; +}; + +#endif /* _CCISS_SCSI_H_ */ +#endif /* CONFIG_CISS_SCSI_TAPE */ diff --git a/kernel/drivers/block/cpqarray.c b/kernel/drivers/block/cpqarray.c new file mode 100644 index 000000000..f749df9e1 --- /dev/null +++ b/kernel/drivers/block/cpqarray.c @@ -0,0 +1,1820 @@ +/* + * Disk Array driver for Compaq SMART2 Controllers + * Copyright 1998 Compaq Computer Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define SMART2_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin)) + +#define DRIVER_NAME "Compaq SMART2 Driver (v 2.6.0)" +#define DRIVER_VERSION SMART2_DRIVER_VERSION(2,6,0) + +/* Embedded module documentation macros - see modules.h */ +/* Original author Chris Frantz - Compaq Computer Corporation */ +MODULE_AUTHOR("Compaq Computer Corporation"); +MODULE_DESCRIPTION("Driver for Compaq Smart2 Array Controllers version 2.6.0"); +MODULE_LICENSE("GPL"); + +#include "cpqarray.h" +#include "ida_cmd.h" +#include "smart1,2.h" +#include "ida_ioctl.h" + +#define READ_AHEAD 128 +#define NR_CMDS 128 /* This could probably go as high as ~400 */ + +#define MAX_CTLR 8 +#define CTLR_SHIFT 8 + +#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */ + +static DEFINE_MUTEX(cpqarray_mutex); +static int nr_ctlr; +static ctlr_info_t *hba[MAX_CTLR]; + +static int eisa[8]; + +#define NR_PRODUCTS ARRAY_SIZE(products) + +/* board_id = Subsystem Device ID & Vendor ID + * product = Marketing Name for the board + * access = Address of the struct of function pointers + */ +static struct board_type products[] = { + { 0x0040110E, "IDA", &smart1_access }, + { 0x0140110E, "IDA-2", &smart1_access }, + { 0x1040110E, "IAES", &smart1_access }, + { 0x2040110E, "SMART", &smart1_access }, + { 0x3040110E, "SMART-2/E", &smart2e_access }, + { 0x40300E11, "SMART-2/P", &smart2_access }, + { 0x40310E11, "SMART-2SL", &smart2_access }, + { 0x40320E11, "Smart Array 3200", &smart2_access }, + { 0x40330E11, "Smart Array 3100ES", &smart2_access }, + { 0x40340E11, "Smart Array 221", &smart2_access }, + { 0x40400E11, "Integrated Array", &smart4_access }, + { 0x40480E11, "Compaq Raid LC2", &smart4_access }, + { 0x40500E11, "Smart Array 4200", &smart4_access }, + { 0x40510E11, "Smart Array 4250ES", &smart4_access }, + { 0x40580E11, "Smart Array 431", &smart4_access }, +}; + +/* define the PCI info for the PCI cards this driver can control */ +static const struct pci_device_id cpqarray_pci_device_id[] = +{ + { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX, + 0x0E11, 0x4058, 0, 0, 0}, /* SA431 */ + { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX, + 0x0E11, 0x4051, 0, 0, 0}, /* SA4250ES */ + { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX, + 0x0E11, 0x4050, 0, 0, 0}, /* SA4200 */ + { PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C1510, + 0x0E11, 0x4048, 0, 0, 0}, /* LC2 */ + { PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C1510, + 0x0E11, 0x4040, 0, 0, 0}, /* Integrated Array */ + { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, + 0x0E11, 0x4034, 0, 0, 0}, /* SA 221 */ + { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, + 0x0E11, 0x4033, 0, 0, 0}, /* SA 3100ES*/ + { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, + 0x0E11, 0x4032, 0, 0, 0}, /* SA 3200*/ + { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, + 0x0E11, 0x4031, 0, 0, 0}, /* SA 2SL*/ + { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P, + 0x0E11, 0x4030, 0, 0, 0}, /* SA 2P */ + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, cpqarray_pci_device_id); + +static struct gendisk *ida_gendisk[MAX_CTLR][NWD]; + +/* Debug... */ +#define DBG(s) do { s } while(0) +/* Debug (general info)... */ +#define DBGINFO(s) do { } while(0) +/* Debug Paranoid... */ +#define DBGP(s) do { } while(0) +/* Debug Extra Paranoid... */ +#define DBGPX(s) do { } while(0) + +static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev); +static void __iomem *remap_pci_mem(ulong base, ulong size); +static int cpqarray_eisa_detect(void); +static int pollcomplete(int ctlr); +static void getgeometry(int ctlr); +static void start_fwbk(int ctlr); + +static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool); +static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool); + +static void free_hba(int i); +static int alloc_cpqarray_hba(void); + +static int sendcmd( + __u8 cmd, + int ctlr, + void *buff, + size_t size, + unsigned int blk, + unsigned int blkcnt, + unsigned int log_unit ); + +static int ida_unlocked_open(struct block_device *bdev, fmode_t mode); +static void ida_release(struct gendisk *disk, fmode_t mode); +static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); +static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo); +static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io); + +static void do_ida_request(struct request_queue *q); +static void start_io(ctlr_info_t *h); + +static inline void addQ(cmdlist_t **Qptr, cmdlist_t *c); +static inline cmdlist_t *removeQ(cmdlist_t **Qptr, cmdlist_t *c); +static inline void complete_command(cmdlist_t *cmd, int timeout); + +static irqreturn_t do_ida_intr(int irq, void *dev_id); +static void ida_timer(unsigned long tdata); +static int ida_revalidate(struct gendisk *disk); +static int revalidate_allvol(ctlr_info_t *host); +static int cpqarray_register_ctlr(int ctlr, struct pci_dev *pdev); + +#ifdef CONFIG_PROC_FS +static void ida_procinit(int i); +#else +static void ida_procinit(int i) {} +#endif + +static inline drv_info_t *get_drv(struct gendisk *disk) +{ + return disk->private_data; +} + +static inline ctlr_info_t *get_host(struct gendisk *disk) +{ + return disk->queue->queuedata; +} + + +static const struct block_device_operations ida_fops = { + .owner = THIS_MODULE, + .open = ida_unlocked_open, + .release = ida_release, + .ioctl = ida_ioctl, + .getgeo = ida_getgeo, + .revalidate_disk= ida_revalidate, +}; + + +#ifdef CONFIG_PROC_FS + +static struct proc_dir_entry *proc_array; +static const struct file_operations ida_proc_fops; + +/* + * Get us a file in /proc/array that says something about each controller. + * Create /proc/array if it doesn't exist yet. + */ +static void __init ida_procinit(int i) +{ + if (proc_array == NULL) { + proc_array = proc_mkdir("driver/cpqarray", NULL); + if (!proc_array) return; + } + + proc_create_data(hba[i]->devname, 0, proc_array, &ida_proc_fops, hba[i]); +} + +/* + * Report information about this controller. + */ +static int ida_proc_show(struct seq_file *m, void *v) +{ + int i, ctlr; + ctlr_info_t *h = (ctlr_info_t*)m->private; + drv_info_t *drv; +#ifdef CPQ_PROC_PRINT_QUEUES + cmdlist_t *c; + unsigned long flags; +#endif + + ctlr = h->ctlr; + seq_printf(m, "%s: Compaq %s Controller\n" + " Board ID: 0x%08lx\n" + " Firmware Revision: %c%c%c%c\n" + " Controller Sig: 0x%08lx\n" + " Memory Address: 0x%08lx\n" + " I/O Port: 0x%04x\n" + " IRQ: %d\n" + " Logical drives: %d\n" + " Physical drives: %d\n\n" + " Current Q depth: %d\n" + " Max Q depth since init: %d\n\n", + h->devname, + h->product_name, + (unsigned long)h->board_id, + h->firm_rev[0], h->firm_rev[1], h->firm_rev[2], h->firm_rev[3], + (unsigned long)h->ctlr_sig, (unsigned long)h->vaddr, + (unsigned int) h->io_mem_addr, (unsigned int)h->intr, + h->log_drives, h->phys_drives, + h->Qdepth, h->maxQsinceinit); + + seq_puts(m, "Logical Drive Info:\n"); + + for(i=0; ilog_drives; i++) { + drv = &h->drv[i]; + seq_printf(m, "ida/c%dd%d: blksz=%d nr_blks=%d\n", + ctlr, i, drv->blk_size, drv->nr_blks); + } + +#ifdef CPQ_PROC_PRINT_QUEUES + spin_lock_irqsave(IDA_LOCK(h->ctlr), flags); + seq_puts(m, "\nCurrent Queues:\n"); + + c = h->reqQ; + seq_printf(m, "reqQ = %p", c); + if (c) c=c->next; + while(c && c != h->reqQ) { + seq_printf(m, "->%p", c); + c=c->next; + } + + c = h->cmpQ; + seq_printf(m, "\ncmpQ = %p", c); + if (c) c=c->next; + while(c && c != h->cmpQ) { + seq_printf(m, "->%p", c); + c=c->next; + } + + seq_putc(m, '\n'); + spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags); +#endif + seq_printf(m, "nr_allocs = %d\nnr_frees = %d\n", + h->nr_allocs, h->nr_frees); + return 0; +} + +static int ida_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ida_proc_show, PDE_DATA(inode)); +} + +static const struct file_operations ida_proc_fops = { + .owner = THIS_MODULE, + .open = ida_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +module_param_array(eisa, int, NULL, 0); + +static void release_io_mem(ctlr_info_t *c) +{ + /* if IO mem was not protected do nothing */ + if( c->io_mem_addr == 0) + return; + release_region(c->io_mem_addr, c->io_mem_length); + c->io_mem_addr = 0; + c->io_mem_length = 0; +} + +static void cpqarray_remove_one(int i) +{ + int j; + char buff[4]; + + /* sendcmd will turn off interrupt, and send the flush... + * To write all data in the battery backed cache to disks + * no data returned, but don't want to send NULL to sendcmd */ + if( sendcmd(FLUSH_CACHE, i, buff, 4, 0, 0, 0)) + { + printk(KERN_WARNING "Unable to flush cache on controller %d\n", + i); + } + free_irq(hba[i]->intr, hba[i]); + iounmap(hba[i]->vaddr); + unregister_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname); + del_timer(&hba[i]->timer); + remove_proc_entry(hba[i]->devname, proc_array); + pci_free_consistent(hba[i]->pci_dev, + NR_CMDS * sizeof(cmdlist_t), (hba[i]->cmd_pool), + hba[i]->cmd_pool_dhandle); + kfree(hba[i]->cmd_pool_bits); + for(j = 0; j < NWD; j++) { + if (ida_gendisk[i][j]->flags & GENHD_FL_UP) + del_gendisk(ida_gendisk[i][j]); + put_disk(ida_gendisk[i][j]); + } + blk_cleanup_queue(hba[i]->queue); + release_io_mem(hba[i]); + free_hba(i); +} + +static void cpqarray_remove_one_pci(struct pci_dev *pdev) +{ + int i; + ctlr_info_t *tmp_ptr; + + if (pci_get_drvdata(pdev) == NULL) { + printk( KERN_ERR "cpqarray: Unable to remove device \n"); + return; + } + + tmp_ptr = pci_get_drvdata(pdev); + i = tmp_ptr->ctlr; + if (hba[i] == NULL) { + printk(KERN_ERR "cpqarray: controller %d appears to have" + "already been removed \n", i); + return; + } + pci_set_drvdata(pdev, NULL); + + cpqarray_remove_one(i); +} + +/* removing an instance that was not removed automatically.. + * must be an eisa card. + */ +static void cpqarray_remove_one_eisa(int i) +{ + if (hba[i] == NULL) { + printk(KERN_ERR "cpqarray: controller %d appears to have" + "already been removed \n", i); + return; + } + cpqarray_remove_one(i); +} + +/* pdev is NULL for eisa */ +static int cpqarray_register_ctlr(int i, struct pci_dev *pdev) +{ + struct request_queue *q; + int j; + + /* + * register block devices + * Find disks and fill in structs + * Get an interrupt, set the Q depth and get into /proc + */ + + /* If this successful it should insure that we are the only */ + /* instance of the driver */ + if (register_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname)) { + goto Enomem4; + } + hba[i]->access.set_intr_mask(hba[i], 0); + if (request_irq(hba[i]->intr, do_ida_intr, IRQF_SHARED, + hba[i]->devname, hba[i])) + { + printk(KERN_ERR "cpqarray: Unable to get irq %d for %s\n", + hba[i]->intr, hba[i]->devname); + goto Enomem3; + } + + for (j=0; jcmd_pool = pci_alloc_consistent( + hba[i]->pci_dev, NR_CMDS * sizeof(cmdlist_t), + &(hba[i]->cmd_pool_dhandle)); + hba[i]->cmd_pool_bits = kcalloc( + DIV_ROUND_UP(NR_CMDS, BITS_PER_LONG), sizeof(unsigned long), + GFP_KERNEL); + + if (!hba[i]->cmd_pool_bits || !hba[i]->cmd_pool) + goto Enomem1; + + memset(hba[i]->cmd_pool, 0, NR_CMDS * sizeof(cmdlist_t)); + printk(KERN_INFO "cpqarray: Finding drives on %s", + hba[i]->devname); + + spin_lock_init(&hba[i]->lock); + q = blk_init_queue(do_ida_request, &hba[i]->lock); + if (!q) + goto Enomem1; + + hba[i]->queue = q; + q->queuedata = hba[i]; + + getgeometry(i); + start_fwbk(i); + + ida_procinit(i); + + if (pdev) + blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask); + + /* This is a hardware imposed limit. */ + blk_queue_max_segments(q, SG_MAX); + + init_timer(&hba[i]->timer); + hba[i]->timer.expires = jiffies + IDA_TIMER; + hba[i]->timer.data = (unsigned long)hba[i]; + hba[i]->timer.function = ida_timer; + add_timer(&hba[i]->timer); + + /* Enable IRQ now that spinlock and rate limit timer are set up */ + hba[i]->access.set_intr_mask(hba[i], FIFO_NOT_EMPTY); + + for(j=0; jdrv[j]; + sprintf(disk->disk_name, "ida/c%dd%d", i, j); + disk->major = COMPAQ_SMART2_MAJOR + i; + disk->first_minor = j<fops = &ida_fops; + if (j && !drv->nr_blks) + continue; + blk_queue_logical_block_size(hba[i]->queue, drv->blk_size); + set_capacity(disk, drv->nr_blks); + disk->queue = hba[i]->queue; + disk->private_data = drv; + add_disk(disk); + } + + /* done ! */ + return(i); + +Enomem1: + nr_ctlr = i; + kfree(hba[i]->cmd_pool_bits); + if (hba[i]->cmd_pool) + pci_free_consistent(hba[i]->pci_dev, NR_CMDS*sizeof(cmdlist_t), + hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle); +Enomem2: + while (j--) { + put_disk(ida_gendisk[i][j]); + ida_gendisk[i][j] = NULL; + } + free_irq(hba[i]->intr, hba[i]); +Enomem3: + unregister_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname); +Enomem4: + if (pdev) + pci_set_drvdata(pdev, NULL); + release_io_mem(hba[i]); + free_hba(i); + + printk( KERN_ERR "cpqarray: out of memory"); + + return -1; +} + +static int cpqarray_init_one(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int i; + + printk(KERN_DEBUG "cpqarray: Device 0x%x has been found at" + " bus %d dev %d func %d\n", + pdev->device, pdev->bus->number, PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn)); + i = alloc_cpqarray_hba(); + if( i < 0 ) + return (-1); + memset(hba[i], 0, sizeof(ctlr_info_t)); + sprintf(hba[i]->devname, "ida%d", i); + hba[i]->ctlr = i; + /* Initialize the pdev driver private data */ + pci_set_drvdata(pdev, hba[i]); + + if (cpqarray_pci_init(hba[i], pdev) != 0) { + pci_set_drvdata(pdev, NULL); + release_io_mem(hba[i]); + free_hba(i); + return -1; + } + + return (cpqarray_register_ctlr(i, pdev)); +} + +static struct pci_driver cpqarray_pci_driver = { + .name = "cpqarray", + .probe = cpqarray_init_one, + .remove = cpqarray_remove_one_pci, + .id_table = cpqarray_pci_device_id, +}; + +/* + * This is it. Find all the controllers and register them. + * returns the number of block devices registered. + */ +static int __init cpqarray_init(void) +{ + int num_cntlrs_reg = 0; + int i; + int rc = 0; + + /* detect controllers */ + printk(DRIVER_NAME "\n"); + + rc = pci_register_driver(&cpqarray_pci_driver); + if (rc) + return rc; + cpqarray_eisa_detect(); + + for (i=0; i < MAX_CTLR; i++) { + if (hba[i] != NULL) + num_cntlrs_reg++; + } + + if (num_cntlrs_reg) + return 0; + else { + pci_unregister_driver(&cpqarray_pci_driver); + return -ENODEV; + } +} + +/* Function to find the first free pointer into our hba[] array */ +/* Returns -1 if no free entries are left. */ +static int alloc_cpqarray_hba(void) +{ + int i; + + for(i=0; i< MAX_CTLR; i++) { + if (hba[i] == NULL) { + hba[i] = kmalloc(sizeof(ctlr_info_t), GFP_KERNEL); + if(hba[i]==NULL) { + printk(KERN_ERR "cpqarray: out of memory.\n"); + return (-1); + } + return (i); + } + } + printk(KERN_WARNING "cpqarray: This driver supports a maximum" + " of 8 controllers.\n"); + return(-1); +} + +static void free_hba(int i) +{ + kfree(hba[i]); + hba[i]=NULL; +} + +/* + * Find the IO address of the controller, its IRQ and so forth. Fill + * in some basic stuff into the ctlr_info_t structure. + */ +static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) +{ + ushort vendor_id, device_id, command; + unchar cache_line_size, latency_timer; + unchar irq, revision; + unsigned long addr[6]; + __u32 board_id; + + int i; + + c->pci_dev = pdev; + pci_set_master(pdev); + if (pci_enable_device(pdev)) { + printk(KERN_ERR "cpqarray: Unable to Enable PCI device\n"); + return -1; + } + vendor_id = pdev->vendor; + device_id = pdev->device; + revision = pdev->revision; + irq = pdev->irq; + + for(i=0; i<6; i++) + addr[i] = pci_resource_start(pdev, i); + + if (pci_set_dma_mask(pdev, CPQARRAY_DMA_MASK) != 0) + { + printk(KERN_ERR "cpqarray: Unable to set DMA mask\n"); + return -1; + } + + pci_read_config_word(pdev, PCI_COMMAND, &command); + pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size); + pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer); + + pci_read_config_dword(pdev, 0x2c, &board_id); + + /* check to see if controller has been disabled */ + if(!(command & 0x02)) { + printk(KERN_WARNING + "cpqarray: controller appears to be disabled\n"); + return(-1); + } + +DBGINFO( + printk("vendor_id = %x\n", vendor_id); + printk("device_id = %x\n", device_id); + printk("command = %x\n", command); + for(i=0; i<6; i++) + printk("addr[%d] = %lx\n", i, addr[i]); + printk("revision = %x\n", revision); + printk("irq = %x\n", irq); + printk("cache_line_size = %x\n", cache_line_size); + printk("latency_timer = %x\n", latency_timer); + printk("board_id = %x\n", board_id); +); + + c->intr = irq; + + for(i=0; i<6; i++) { + if (pci_resource_flags(pdev, i) & PCI_BASE_ADDRESS_SPACE_IO) + { /* IO space */ + c->io_mem_addr = addr[i]; + c->io_mem_length = pci_resource_end(pdev, i) + - pci_resource_start(pdev, i) + 1; + if(!request_region( c->io_mem_addr, c->io_mem_length, + "cpqarray")) + { + printk( KERN_WARNING "cpqarray I/O memory range already in use addr %lx length = %ld\n", c->io_mem_addr, c->io_mem_length); + c->io_mem_addr = 0; + c->io_mem_length = 0; + } + break; + } + } + + c->paddr = 0; + for(i=0; i<6; i++) + if (!(pci_resource_flags(pdev, i) & + PCI_BASE_ADDRESS_SPACE_IO)) { + c->paddr = pci_resource_start (pdev, i); + break; + } + if (!c->paddr) + return -1; + c->vaddr = remap_pci_mem(c->paddr, 128); + if (!c->vaddr) + return -1; + c->board_id = board_id; + + for(i=0; iproduct_name = products[i].product_name; + c->access = *(products[i].access); + break; + } + } + if (i == NR_PRODUCTS) { + printk(KERN_WARNING "cpqarray: Sorry, I don't know how" + " to access the SMART Array controller %08lx\n", + (unsigned long)board_id); + return -1; + } + + return 0; +} + +/* + * Map (physical) PCI mem into (virtual) kernel space + */ +static void __iomem *remap_pci_mem(ulong base, ulong size) +{ + ulong page_base = ((ulong) base) & PAGE_MASK; + ulong page_offs = ((ulong) base) - page_base; + void __iomem *page_remapped = ioremap(page_base, page_offs+size); + + return (page_remapped ? (page_remapped + page_offs) : NULL); +} + +#ifndef MODULE +/* + * Config string is a comma separated set of i/o addresses of EISA cards. + */ +static int cpqarray_setup(char *str) +{ + int i, ints[9]; + + (void)get_options(str, ARRAY_SIZE(ints), ints); + + for(i=0; iio_mem_addr = eisa[i]; + hba[ctlr]->io_mem_length = 0x7FF; + if(!request_region(hba[ctlr]->io_mem_addr, + hba[ctlr]->io_mem_length, + "cpqarray")) + { + printk(KERN_WARNING "cpqarray: I/O range already in " + "use addr = %lx length = %ld\n", + hba[ctlr]->io_mem_addr, + hba[ctlr]->io_mem_length); + free_hba(ctlr); + continue; + } + + /* + * Read the config register to find our interrupt + */ + intr = inb(eisa[i]+0xCC0) >> 4; + if (intr & 1) intr = 11; + else if (intr & 2) intr = 10; + else if (intr & 4) intr = 14; + else if (intr & 8) intr = 15; + + hba[ctlr]->intr = intr; + sprintf(hba[ctlr]->devname, "ida%d", nr_ctlr); + hba[ctlr]->product_name = products[j].product_name; + hba[ctlr]->access = *(products[j].access); + hba[ctlr]->ctlr = ctlr; + hba[ctlr]->board_id = board_id; + hba[ctlr]->pci_dev = NULL; /* not PCI */ + +DBGINFO( + printk("i = %d, j = %d\n", i, j); + printk("irq = %x\n", intr); + printk("product name = %s\n", products[j].product_name); + printk("board_id = %x\n", board_id); +); + + num_ctlr++; + i++; + + if (cpqarray_register_ctlr(ctlr, NULL) == -1) + printk(KERN_WARNING + "cpqarray: Can't register EISA controller %d\n", + ctlr); + + } + + return num_ctlr; +} + +/* + * Open. Make sure the device is really there. + */ +static int ida_open(struct block_device *bdev, fmode_t mode) +{ + drv_info_t *drv = get_drv(bdev->bd_disk); + ctlr_info_t *host = get_host(bdev->bd_disk); + + DBGINFO(printk("ida_open %s\n", bdev->bd_disk->disk_name)); + /* + * Root is allowed to open raw volume zero even if it's not configured + * so array config can still work. I don't think I really like this, + * but I'm already using way to many device nodes to claim another one + * for "raw controller". + */ + if (!drv->nr_blks) { + if (!capable(CAP_SYS_RAWIO)) + return -ENXIO; + if (!capable(CAP_SYS_ADMIN) && drv != host->drv) + return -ENXIO; + } + host->usage_count++; + return 0; +} + +static int ida_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + mutex_lock(&cpqarray_mutex); + ret = ida_open(bdev, mode); + mutex_unlock(&cpqarray_mutex); + + return ret; +} + +/* + * Close. Sync first. + */ +static void ida_release(struct gendisk *disk, fmode_t mode) +{ + ctlr_info_t *host; + + mutex_lock(&cpqarray_mutex); + host = get_host(disk); + host->usage_count--; + mutex_unlock(&cpqarray_mutex); +} + +/* + * Enqueuing and dequeuing functions for cmdlists. + */ +static inline void addQ(cmdlist_t **Qptr, cmdlist_t *c) +{ + if (*Qptr == NULL) { + *Qptr = c; + c->next = c->prev = c; + } else { + c->prev = (*Qptr)->prev; + c->next = (*Qptr); + (*Qptr)->prev->next = c; + (*Qptr)->prev = c; + } +} + +static inline cmdlist_t *removeQ(cmdlist_t **Qptr, cmdlist_t *c) +{ + if (c && c->next != c) { + if (*Qptr == c) *Qptr = c->next; + c->prev->next = c->next; + c->next->prev = c->prev; + } else { + *Qptr = NULL; + } + return c; +} + +/* + * Get a request and submit it to the controller. + * This routine needs to grab all the requests it possibly can from the + * req Q and submit them. Interrupts are off (and need to be off) when you + * are in here (either via the dummy do_ida_request functions or by being + * called from the interrupt handler + */ +static void do_ida_request(struct request_queue *q) +{ + ctlr_info_t *h = q->queuedata; + cmdlist_t *c; + struct request *creq; + struct scatterlist tmp_sg[SG_MAX]; + int i, dir, seg; + +queue_next: + creq = blk_peek_request(q); + if (!creq) + goto startio; + + BUG_ON(creq->nr_phys_segments > SG_MAX); + + if ((c = cmd_alloc(h,1)) == NULL) + goto startio; + + blk_start_request(creq); + + c->ctlr = h->ctlr; + c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv; + c->hdr.size = sizeof(rblk_t) >> 2; + c->size += sizeof(rblk_t); + + c->req.hdr.blk = blk_rq_pos(creq); + c->rq = creq; +DBGPX( + printk("sector=%d, nr_sectors=%u\n", + blk_rq_pos(creq), blk_rq_sectors(creq)); +); + sg_init_table(tmp_sg, SG_MAX); + seg = blk_rq_map_sg(q, creq, tmp_sg); + + /* Now do all the DMA Mappings */ + if (rq_data_dir(creq) == READ) + dir = PCI_DMA_FROMDEVICE; + else + dir = PCI_DMA_TODEVICE; + for( i=0; i < seg; i++) + { + c->req.sg[i].size = tmp_sg[i].length; + c->req.sg[i].addr = (__u32) pci_map_page(h->pci_dev, + sg_page(&tmp_sg[i]), + tmp_sg[i].offset, + tmp_sg[i].length, dir); + } +DBGPX( printk("Submitting %u sectors in %d segments\n", blk_rq_sectors(creq), seg); ); + c->req.hdr.sg_cnt = seg; + c->req.hdr.blk_cnt = blk_rq_sectors(creq); + c->req.hdr.cmd = (rq_data_dir(creq) == READ) ? IDA_READ : IDA_WRITE; + c->type = CMD_RWREQ; + + /* Put the request on the tail of the request queue */ + addQ(&h->reqQ, c); + h->Qdepth++; + if (h->Qdepth > h->maxQsinceinit) + h->maxQsinceinit = h->Qdepth; + + goto queue_next; + +startio: + start_io(h); +} + +/* + * start_io submits everything on a controller's request queue + * and moves it to the completion queue. + * + * Interrupts had better be off if you're in here + */ +static void start_io(ctlr_info_t *h) +{ + cmdlist_t *c; + + while((c = h->reqQ) != NULL) { + /* Can't do anything if we're busy */ + if (h->access.fifo_full(h) == 0) + return; + + /* Get the first entry from the request Q */ + removeQ(&h->reqQ, c); + h->Qdepth--; + + /* Tell the controller to do our bidding */ + h->access.submit_command(h, c); + + /* Get onto the completion Q */ + addQ(&h->cmpQ, c); + } +} + +/* + * Mark all buffers that cmd was responsible for + */ +static inline void complete_command(cmdlist_t *cmd, int timeout) +{ + struct request *rq = cmd->rq; + int error = 0; + int i, ddir; + + if (cmd->req.hdr.rcode & RCODE_NONFATAL && + (hba[cmd->ctlr]->misc_tflags & MISC_NONFATAL_WARN) == 0) { + printk(KERN_NOTICE "Non Fatal error on ida/c%dd%d\n", + cmd->ctlr, cmd->hdr.unit); + hba[cmd->ctlr]->misc_tflags |= MISC_NONFATAL_WARN; + } + if (cmd->req.hdr.rcode & RCODE_FATAL) { + printk(KERN_WARNING "Fatal error on ida/c%dd%d\n", + cmd->ctlr, cmd->hdr.unit); + error = -EIO; + } + if (cmd->req.hdr.rcode & RCODE_INVREQ) { + printk(KERN_WARNING "Invalid request on ida/c%dd%d = (cmd=%x sect=%d cnt=%d sg=%d ret=%x)\n", + cmd->ctlr, cmd->hdr.unit, cmd->req.hdr.cmd, + cmd->req.hdr.blk, cmd->req.hdr.blk_cnt, + cmd->req.hdr.sg_cnt, cmd->req.hdr.rcode); + error = -EIO; + } + if (timeout) + error = -EIO; + /* unmap the DMA mapping for all the scatter gather elements */ + if (cmd->req.hdr.cmd == IDA_READ) + ddir = PCI_DMA_FROMDEVICE; + else + ddir = PCI_DMA_TODEVICE; + for(i=0; ireq.hdr.sg_cnt; i++) + pci_unmap_page(hba[cmd->ctlr]->pci_dev, cmd->req.sg[i].addr, + cmd->req.sg[i].size, ddir); + + DBGPX(printk("Done with %p\n", rq);); + __blk_end_request_all(rq, error); +} + +/* + * The controller will interrupt us upon completion of commands. + * Find the command on the completion queue, remove it, tell the OS and + * try to queue up more IO + */ +static irqreturn_t do_ida_intr(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + cmdlist_t *c; + unsigned long istat; + unsigned long flags; + __u32 a,a1; + + istat = h->access.intr_pending(h); + /* Is this interrupt for us? */ + if (istat == 0) + return IRQ_NONE; + + /* + * If there are completed commands in the completion queue, + * we had better do something about it. + */ + spin_lock_irqsave(IDA_LOCK(h->ctlr), flags); + if (istat & FIFO_NOT_EMPTY) { + while((a = h->access.command_completed(h))) { + a1 = a; a &= ~3; + if ((c = h->cmpQ) == NULL) + { + printk(KERN_WARNING "cpqarray: Completion of %08lx ignored\n", (unsigned long)a1); + continue; + } + while(c->busaddr != a) { + c = c->next; + if (c == h->cmpQ) + break; + } + /* + * If we've found the command, take it off the + * completion Q and free it + */ + if (c->busaddr == a) { + removeQ(&h->cmpQ, c); + /* Check for invalid command. + * Controller returns command error, + * But rcode = 0. + */ + + if((a1 & 0x03) && (c->req.hdr.rcode == 0)) + { + c->req.hdr.rcode = RCODE_INVREQ; + } + if (c->type == CMD_RWREQ) { + complete_command(c, 0); + cmd_free(h, c, 1); + } else if (c->type == CMD_IOCTL_PEND) { + c->type = CMD_IOCTL_DONE; + } + continue; + } + } + } + + /* + * See if we can queue up some more IO + */ + do_ida_request(h->queue); + spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags); + return IRQ_HANDLED; +} + +/* + * This timer was for timing out requests that haven't happened after + * IDA_TIMEOUT. That wasn't such a good idea. This timer is used to + * reset a flags structure so we don't flood the user with + * "Non-Fatal error" messages. + */ +static void ida_timer(unsigned long tdata) +{ + ctlr_info_t *h = (ctlr_info_t*)tdata; + + h->timer.expires = jiffies + IDA_TIMER; + add_timer(&h->timer); + h->misc_tflags = 0; +} + +static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + drv_info_t *drv = get_drv(bdev->bd_disk); + + if (drv->cylinders) { + geo->heads = drv->heads; + geo->sectors = drv->sectors; + geo->cylinders = drv->cylinders; + } else { + geo->heads = 0xff; + geo->sectors = 0x3f; + geo->cylinders = drv->nr_blks / (0xff*0x3f); + } + + return 0; +} + +/* + * ida_ioctl does some miscellaneous stuff like reporting drive geometry, + * setting readahead and submitting commands from userspace to the controller. + */ +static int ida_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +{ + drv_info_t *drv = get_drv(bdev->bd_disk); + ctlr_info_t *host = get_host(bdev->bd_disk); + int error; + ida_ioctl_t __user *io = (ida_ioctl_t __user *)arg; + ida_ioctl_t *my_io; + + switch(cmd) { + case IDAGETDRVINFO: + if (copy_to_user(&io->c.drv, drv, sizeof(drv_info_t))) + return -EFAULT; + return 0; + case IDAPASSTHRU: + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + my_io = kmalloc(sizeof(ida_ioctl_t), GFP_KERNEL); + if (!my_io) + return -ENOMEM; + error = -EFAULT; + if (copy_from_user(my_io, io, sizeof(*my_io))) + goto out_passthru; + error = ida_ctlr_ioctl(host, drv - host->drv, my_io); + if (error) + goto out_passthru; + error = -EFAULT; + if (copy_to_user(io, my_io, sizeof(*my_io))) + goto out_passthru; + error = 0; +out_passthru: + kfree(my_io); + return error; + case IDAGETCTLRSIG: + if (!arg) return -EINVAL; + if (put_user(host->ctlr_sig, (int __user *)arg)) + return -EFAULT; + return 0; + case IDAREVALIDATEVOLS: + if (MINOR(bdev->bd_dev) != 0) + return -ENXIO; + return revalidate_allvol(host); + case IDADRIVERVERSION: + if (!arg) return -EINVAL; + if (put_user(DRIVER_VERSION, (unsigned long __user *)arg)) + return -EFAULT; + return 0; + case IDAGETPCIINFO: + { + + ida_pci_info_struct pciinfo; + + if (!arg) return -EINVAL; + memset(&pciinfo, 0, sizeof(pciinfo)); + pciinfo.bus = host->pci_dev->bus->number; + pciinfo.dev_fn = host->pci_dev->devfn; + pciinfo.board_id = host->board_id; + if(copy_to_user((void __user *) arg, &pciinfo, + sizeof( ida_pci_info_struct))) + return -EFAULT; + return(0); + } + + default: + return -EINVAL; + } + +} + +static int ida_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + int ret; + + mutex_lock(&cpqarray_mutex); + ret = ida_locked_ioctl(bdev, mode, cmd, param); + mutex_unlock(&cpqarray_mutex); + + return ret; +} + +/* + * ida_ctlr_ioctl is for passing commands to the controller from userspace. + * The command block (io) has already been copied to kernel space for us, + * however, any elements in the sglist need to be copied to kernel space + * or copied back to userspace. + * + * Only root may perform a controller passthru command, however I'm not doing + * any serious sanity checking on the arguments. Doing an IDA_WRITE_MEDIA and + * putting a 64M buffer in the sglist is probably a *bad* idea. + */ +static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io) +{ + int ctlr = h->ctlr; + cmdlist_t *c; + void *p = NULL; + unsigned long flags; + int error; + + if ((c = cmd_alloc(h, 0)) == NULL) + return -ENOMEM; + c->ctlr = ctlr; + c->hdr.unit = (io->unit & UNITVALID) ? (io->unit & ~UNITVALID) : dsk; + c->hdr.size = sizeof(rblk_t) >> 2; + c->size += sizeof(rblk_t); + + c->req.hdr.cmd = io->cmd; + c->req.hdr.blk = io->blk; + c->req.hdr.blk_cnt = io->blk_cnt; + c->type = CMD_IOCTL_PEND; + + /* Pre submit processing */ + switch(io->cmd) { + case PASSTHRU_A: + p = memdup_user(io->sg[0].addr, io->sg[0].size); + if (IS_ERR(p)) { + error = PTR_ERR(p); + cmd_free(h, c, 0); + return error; + } + c->req.hdr.blk = pci_map_single(h->pci_dev, &(io->c), + sizeof(ida_ioctl_t), + PCI_DMA_BIDIRECTIONAL); + c->req.sg[0].size = io->sg[0].size; + c->req.sg[0].addr = pci_map_single(h->pci_dev, p, + c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); + c->req.hdr.sg_cnt = 1; + break; + case IDA_READ: + case READ_FLASH_ROM: + case SENSE_CONTROLLER_PERFORMANCE: + p = kmalloc(io->sg[0].size, GFP_KERNEL); + if (!p) + { + error = -ENOMEM; + cmd_free(h, c, 0); + return(error); + } + + c->req.sg[0].size = io->sg[0].size; + c->req.sg[0].addr = pci_map_single(h->pci_dev, p, + c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); + c->req.hdr.sg_cnt = 1; + break; + case IDA_WRITE: + case IDA_WRITE_MEDIA: + case DIAG_PASS_THRU: + case COLLECT_BUFFER: + case WRITE_FLASH_ROM: + p = memdup_user(io->sg[0].addr, io->sg[0].size); + if (IS_ERR(p)) { + error = PTR_ERR(p); + cmd_free(h, c, 0); + return error; + } + c->req.sg[0].size = io->sg[0].size; + c->req.sg[0].addr = pci_map_single(h->pci_dev, p, + c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); + c->req.hdr.sg_cnt = 1; + break; + default: + c->req.sg[0].size = sizeof(io->c); + c->req.sg[0].addr = pci_map_single(h->pci_dev,&io->c, + c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); + c->req.hdr.sg_cnt = 1; + } + + /* Put the request on the tail of the request queue */ + spin_lock_irqsave(IDA_LOCK(ctlr), flags); + addQ(&h->reqQ, c); + h->Qdepth++; + start_io(h); + spin_unlock_irqrestore(IDA_LOCK(ctlr), flags); + + /* Wait for completion */ + while(c->type != CMD_IOCTL_DONE) + schedule(); + + /* Unmap the DMA */ + pci_unmap_single(h->pci_dev, c->req.sg[0].addr, c->req.sg[0].size, + PCI_DMA_BIDIRECTIONAL); + /* Post submit processing */ + switch(io->cmd) { + case PASSTHRU_A: + pci_unmap_single(h->pci_dev, c->req.hdr.blk, + sizeof(ida_ioctl_t), + PCI_DMA_BIDIRECTIONAL); + case IDA_READ: + case DIAG_PASS_THRU: + case SENSE_CONTROLLER_PERFORMANCE: + case READ_FLASH_ROM: + if (copy_to_user(io->sg[0].addr, p, io->sg[0].size)) { + kfree(p); + return -EFAULT; + } + /* fall through and free p */ + case IDA_WRITE: + case IDA_WRITE_MEDIA: + case COLLECT_BUFFER: + case WRITE_FLASH_ROM: + kfree(p); + break; + default:; + /* Nothing to do */ + } + + io->rcode = c->req.hdr.rcode; + cmd_free(h, c, 0); + return(0); +} + +/* + * Commands are pre-allocated in a large block. Here we use a simple bitmap + * scheme to suballocte them to the driver. Operations that are not time + * critical (and can wait for kmalloc and possibly sleep) can pass in NULL + * as the first argument to get a new command. + */ +static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool) +{ + cmdlist_t * c; + int i; + dma_addr_t cmd_dhandle; + + if (!get_from_pool) { + c = (cmdlist_t*)pci_alloc_consistent(h->pci_dev, + sizeof(cmdlist_t), &cmd_dhandle); + if(c==NULL) + return NULL; + } else { + do { + i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS); + if (i == NR_CMDS) + return NULL; + } while(test_and_set_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG)) != 0); + c = h->cmd_pool + i; + cmd_dhandle = h->cmd_pool_dhandle + i*sizeof(cmdlist_t); + h->nr_allocs++; + } + + memset(c, 0, sizeof(cmdlist_t)); + c->busaddr = cmd_dhandle; + return c; +} + +static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool) +{ + int i; + + if (!got_from_pool) { + pci_free_consistent(h->pci_dev, sizeof(cmdlist_t), c, + c->busaddr); + } else { + i = c - h->cmd_pool; + clear_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG)); + h->nr_frees++; + } +} + +/*********************************************************************** + name: sendcmd + Send a command to an IDA using the memory mapped FIFO interface + and wait for it to complete. + This routine should only be called at init time. +***********************************************************************/ +static int sendcmd( + __u8 cmd, + int ctlr, + void *buff, + size_t size, + unsigned int blk, + unsigned int blkcnt, + unsigned int log_unit ) +{ + cmdlist_t *c; + int complete; + unsigned long temp; + unsigned long i; + ctlr_info_t *info_p = hba[ctlr]; + + c = cmd_alloc(info_p, 1); + if(!c) + return IO_ERROR; + c->ctlr = ctlr; + c->hdr.unit = log_unit; + c->hdr.prio = 0; + c->hdr.size = sizeof(rblk_t) >> 2; + c->size += sizeof(rblk_t); + + /* The request information. */ + c->req.hdr.next = 0; + c->req.hdr.rcode = 0; + c->req.bp = 0; + c->req.hdr.sg_cnt = 1; + c->req.hdr.reserved = 0; + + if (size == 0) + c->req.sg[0].size = 512; + else + c->req.sg[0].size = size; + + c->req.hdr.blk = blk; + c->req.hdr.blk_cnt = blkcnt; + c->req.hdr.cmd = (unsigned char) cmd; + c->req.sg[0].addr = (__u32) pci_map_single(info_p->pci_dev, + buff, c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); + /* + * Disable interrupt + */ + info_p->access.set_intr_mask(info_p, 0); + /* Make sure there is room in the command FIFO */ + /* Actually it should be completely empty at this time. */ + for (i = 200000; i > 0; i--) { + temp = info_p->access.fifo_full(info_p); + if (temp != 0) { + break; + } + udelay(10); +DBG( + printk(KERN_WARNING "cpqarray ida%d: idaSendPciCmd FIFO full," + " waiting!\n", ctlr); +); + } + /* + * Send the cmd + */ + info_p->access.submit_command(info_p, c); + complete = pollcomplete(ctlr); + + pci_unmap_single(info_p->pci_dev, (dma_addr_t) c->req.sg[0].addr, + c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL); + if (complete != 1) { + if (complete != c->busaddr) { + printk( KERN_WARNING + "cpqarray ida%d: idaSendPciCmd " + "Invalid command list address returned! (%08lx)\n", + ctlr, (unsigned long)complete); + cmd_free(info_p, c, 1); + return (IO_ERROR); + } + } else { + printk( KERN_WARNING + "cpqarray ida%d: idaSendPciCmd Timeout out, " + "No command list address returned!\n", + ctlr); + cmd_free(info_p, c, 1); + return (IO_ERROR); + } + + if (c->req.hdr.rcode & 0x00FE) { + if (!(c->req.hdr.rcode & BIG_PROBLEM)) { + printk( KERN_WARNING + "cpqarray ida%d: idaSendPciCmd, error: " + "Controller failed at init time " + "cmd: 0x%x, return code = 0x%x\n", + ctlr, c->req.hdr.cmd, c->req.hdr.rcode); + + cmd_free(info_p, c, 1); + return (IO_ERROR); + } + } + cmd_free(info_p, c, 1); + return (IO_OK); +} + +/* + * revalidate_allvol is for online array config utilities. After a + * utility reconfigures the drives in the array, it can use this function + * (through an ioctl) to make the driver zap any previous disk structs for + * that controller and get new ones. + * + * Right now I'm using the getgeometry() function to do this, but this + * function should probably be finer grained and allow you to revalidate one + * particualar logical volume (instead of all of them on a particular + * controller). + */ +static int revalidate_allvol(ctlr_info_t *host) +{ + int ctlr = host->ctlr; + int i; + unsigned long flags; + + spin_lock_irqsave(IDA_LOCK(ctlr), flags); + if (host->usage_count > 1) { + spin_unlock_irqrestore(IDA_LOCK(ctlr), flags); + printk(KERN_WARNING "cpqarray: Device busy for volume" + " revalidation (usage=%d)\n", host->usage_count); + return -EBUSY; + } + host->usage_count++; + spin_unlock_irqrestore(IDA_LOCK(ctlr), flags); + + /* + * Set the partition and block size structures for all volumes + * on this controller to zero. We will reread all of this data + */ + set_capacity(ida_gendisk[ctlr][0], 0); + for (i = 1; i < NWD; i++) { + struct gendisk *disk = ida_gendisk[ctlr][i]; + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + } + memset(host->drv, 0, sizeof(drv_info_t)*NWD); + + /* + * Tell the array controller not to give us any interrupts while + * we check the new geometry. Then turn interrupts back on when + * we're done. + */ + host->access.set_intr_mask(host, 0); + getgeometry(ctlr); + host->access.set_intr_mask(host, FIFO_NOT_EMPTY); + + for(i=0; idrv[i]; + if (i && !drv->nr_blks) + continue; + blk_queue_logical_block_size(host->queue, drv->blk_size); + set_capacity(disk, drv->nr_blks); + disk->queue = host->queue; + disk->private_data = drv; + if (i) + add_disk(disk); + } + + host->usage_count--; + return 0; +} + +static int ida_revalidate(struct gendisk *disk) +{ + drv_info_t *drv = disk->private_data; + set_capacity(disk, drv->nr_blks); + return 0; +} + +/******************************************************************** + name: pollcomplete + Wait polling for a command to complete. + The memory mapped FIFO is polled for the completion. + Used only at init time, interrupts disabled. + ********************************************************************/ +static int pollcomplete(int ctlr) +{ + int done; + int i; + + /* Wait (up to 2 seconds) for a command to complete */ + + for (i = 200000; i > 0; i--) { + done = hba[ctlr]->access.command_completed(hba[ctlr]); + if (done == 0) { + udelay(10); /* a short fixed delay */ + } else + return (done); + } + /* Invalid address to tell caller we ran out of time */ + return 1; +} +/***************************************************************** + start_fwbk + Starts controller firmwares background processing. + Currently only the Integrated Raid controller needs this done. + If the PCI mem address registers are written to after this, + data corruption may occur +*****************************************************************/ +static void start_fwbk(int ctlr) +{ + id_ctlr_t *id_ctlr_buf; + int ret_code; + + if( (hba[ctlr]->board_id != 0x40400E11) + && (hba[ctlr]->board_id != 0x40480E11) ) + + /* Not a Integrated Raid, so there is nothing for us to do */ + return; + printk(KERN_DEBUG "cpqarray: Starting firmware's background" + " processing\n"); + /* Command does not return anything, but idasend command needs a + buffer */ + id_ctlr_buf = kmalloc(sizeof(id_ctlr_t), GFP_KERNEL); + if(id_ctlr_buf==NULL) + { + printk(KERN_WARNING "cpqarray: Out of memory. " + "Unable to start background processing.\n"); + return; + } + ret_code = sendcmd(RESUME_BACKGROUND_ACTIVITY, ctlr, + id_ctlr_buf, 0, 0, 0, 0); + if(ret_code != IO_OK) + printk(KERN_WARNING "cpqarray: Unable to start" + " background processing\n"); + + kfree(id_ctlr_buf); +} +/***************************************************************** + getgeometry + Get ida logical volume geometry from the controller + This is a large bit of code which once existed in two flavors, + It is used only at init time. +*****************************************************************/ +static void getgeometry(int ctlr) +{ + id_log_drv_t *id_ldrive; + id_ctlr_t *id_ctlr_buf; + sense_log_drv_stat_t *id_lstatus_buf; + config_t *sense_config_buf; + unsigned int log_unit, log_index; + int ret_code, size; + drv_info_t *drv; + ctlr_info_t *info_p = hba[ctlr]; + int i; + + info_p->log_drv_map = 0; + + id_ldrive = kzalloc(sizeof(id_log_drv_t), GFP_KERNEL); + if (!id_ldrive) { + printk( KERN_ERR "cpqarray: out of memory.\n"); + goto err_0; + } + + id_ctlr_buf = kzalloc(sizeof(id_ctlr_t), GFP_KERNEL); + if (!id_ctlr_buf) { + printk( KERN_ERR "cpqarray: out of memory.\n"); + goto err_1; + } + + id_lstatus_buf = kzalloc(sizeof(sense_log_drv_stat_t), GFP_KERNEL); + if (!id_lstatus_buf) { + printk( KERN_ERR "cpqarray: out of memory.\n"); + goto err_2; + } + + sense_config_buf = kzalloc(sizeof(config_t), GFP_KERNEL); + if (!sense_config_buf) { + printk( KERN_ERR "cpqarray: out of memory.\n"); + goto err_3; + } + + info_p->phys_drives = 0; + info_p->log_drv_map = 0; + info_p->drv_assign_map = 0; + info_p->drv_spare_map = 0; + info_p->mp_failed_drv_map = 0; /* only initialized here */ + /* Get controllers info for this logical drive */ + ret_code = sendcmd(ID_CTLR, ctlr, id_ctlr_buf, 0, 0, 0, 0); + if (ret_code == IO_ERROR) { + /* + * If can't get controller info, set the logical drive map to 0, + * so the idastubopen will fail on all logical drives + * on the controller. + */ + printk(KERN_ERR "cpqarray: error sending ID controller\n"); + goto err_4; + } + + info_p->log_drives = id_ctlr_buf->nr_drvs; + for(i=0;i<4;i++) + info_p->firm_rev[i] = id_ctlr_buf->firm_rev[i]; + info_p->ctlr_sig = id_ctlr_buf->cfg_sig; + + printk(" (%s)\n", info_p->product_name); + /* + * Initialize logical drive map to zero + */ + log_index = 0; + /* + * Get drive geometry for all logical drives + */ + if (id_ctlr_buf->nr_drvs > 16) + printk(KERN_WARNING "cpqarray ida%d: This driver supports " + "16 logical drives per controller.\n. " + " Additional drives will not be " + "detected\n", ctlr); + + for (log_unit = 0; + (log_index < id_ctlr_buf->nr_drvs) + && (log_unit < NWD); + log_unit++) { + size = sizeof(sense_log_drv_stat_t); + + /* + Send "Identify logical drive status" cmd + */ + ret_code = sendcmd(SENSE_LOG_DRV_STAT, + ctlr, id_lstatus_buf, size, 0, 0, log_unit); + if (ret_code == IO_ERROR) { + /* + If can't get logical drive status, set + the logical drive map to 0, so the + idastubopen will fail for all logical drives + on the controller. + */ + info_p->log_drv_map = 0; + printk( KERN_WARNING + "cpqarray ida%d: idaGetGeometry - Controller" + " failed to report status of logical drive %d\n" + "Access to this controller has been disabled\n", + ctlr, log_unit); + goto err_4; + } + /* + Make sure the logical drive is configured + */ + if (id_lstatus_buf->status != LOG_NOT_CONF) { + ret_code = sendcmd(ID_LOG_DRV, ctlr, id_ldrive, + sizeof(id_log_drv_t), 0, 0, log_unit); + /* + If error, the bit for this + logical drive won't be set and + idastubopen will return error. + */ + if (ret_code != IO_ERROR) { + drv = &info_p->drv[log_unit]; + drv->blk_size = id_ldrive->blk_size; + drv->nr_blks = id_ldrive->nr_blks; + drv->cylinders = id_ldrive->drv.cyl; + drv->heads = id_ldrive->drv.heads; + drv->sectors = id_ldrive->drv.sect_per_track; + info_p->log_drv_map |= (1 << log_unit); + + printk(KERN_INFO "cpqarray ida/c%dd%d: blksz=%d nr_blks=%d\n", + ctlr, log_unit, drv->blk_size, drv->nr_blks); + ret_code = sendcmd(SENSE_CONFIG, + ctlr, sense_config_buf, + sizeof(config_t), 0, 0, log_unit); + if (ret_code == IO_ERROR) { + info_p->log_drv_map = 0; + printk(KERN_ERR "cpqarray: error sending sense config\n"); + goto err_4; + } + + info_p->phys_drives = + sense_config_buf->ctlr_phys_drv; + info_p->drv_assign_map + |= sense_config_buf->drv_asgn_map; + info_p->drv_assign_map + |= sense_config_buf->spare_asgn_map; + info_p->drv_spare_map + |= sense_config_buf->spare_asgn_map; + } /* end of if no error on id_ldrive */ + log_index = log_index + 1; + } /* end of if logical drive configured */ + } /* end of for log_unit */ + + /* Free all the buffers and return */ +err_4: + kfree(sense_config_buf); +err_3: + kfree(id_lstatus_buf); +err_2: + kfree(id_ctlr_buf); +err_1: + kfree(id_ldrive); +err_0: + return; +} + +static void __exit cpqarray_exit(void) +{ + int i; + + pci_unregister_driver(&cpqarray_pci_driver); + + /* Double check that all controller entries have been removed */ + for(i=0; i +#include +#include +#include +#endif + +#include "ida_cmd.h" + +#define IO_OK 0 +#define IO_ERROR 1 +#define NWD 16 +#define NWD_SHIFT 4 + +#define IDA_TIMER (5*HZ) +#define IDA_TIMEOUT (10*HZ) + +#define MISC_NONFATAL_WARN 0x01 + +typedef struct { + unsigned blk_size; + unsigned nr_blks; + unsigned cylinders; + unsigned heads; + unsigned sectors; + int usage_count; +} drv_info_t; + +#ifdef __KERNEL__ + +struct ctlr_info; +typedef struct ctlr_info ctlr_info_t; + +struct access_method { + void (*submit_command)(ctlr_info_t *h, cmdlist_t *c); + void (*set_intr_mask)(ctlr_info_t *h, unsigned long val); + unsigned long (*fifo_full)(ctlr_info_t *h); + unsigned long (*intr_pending)(ctlr_info_t *h); + unsigned long (*command_completed)(ctlr_info_t *h); +}; + +struct board_type { + __u32 board_id; + char *product_name; + struct access_method *access; +}; + +struct ctlr_info { + int ctlr; + char devname[8]; + __u32 log_drv_map; + __u32 drv_assign_map; + __u32 drv_spare_map; + __u32 mp_failed_drv_map; + + char firm_rev[4]; + int ctlr_sig; + + int log_drives; + int phys_drives; + + struct pci_dev *pci_dev; /* NULL if EISA */ + __u32 board_id; + char *product_name; + + void __iomem *vaddr; + unsigned long paddr; + unsigned long io_mem_addr; + unsigned long io_mem_length; + int intr; + int usage_count; + drv_info_t drv[NWD]; + struct proc_dir_entry *proc; + + struct access_method access; + + cmdlist_t *reqQ; + cmdlist_t *cmpQ; + cmdlist_t *cmd_pool; + dma_addr_t cmd_pool_dhandle; + unsigned long *cmd_pool_bits; + struct request_queue *queue; + spinlock_t lock; + + unsigned int Qdepth; + unsigned int maxQsinceinit; + + unsigned int nr_requests; + unsigned int nr_allocs; + unsigned int nr_frees; + struct timer_list timer; + unsigned int misc_tflags; +}; + +#define IDA_LOCK(i) (&hba[i]->lock) + +#endif + +#endif /* CPQARRAY_H */ diff --git a/kernel/drivers/block/cryptoloop.c b/kernel/drivers/block/cryptoloop.c new file mode 100644 index 000000000..99e773cb7 --- /dev/null +++ b/kernel/drivers/block/cryptoloop.c @@ -0,0 +1,216 @@ +/* + Linux loop encryption enabling module + + Copyright (C) 2002 Herbert Valerio Riedel + Copyright (C) 2003 Fruhwirth Clemens + + This module is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This module is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this module; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +#include +#include +#include +#include +#include +#include +#include "loop.h" + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI"); +MODULE_AUTHOR("Herbert Valerio Riedel "); + +#define LOOP_IV_SECTOR_BITS 9 +#define LOOP_IV_SECTOR_SIZE (1 << LOOP_IV_SECTOR_BITS) + +static int +cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) +{ + int err = -EINVAL; + int cipher_len; + int mode_len; + char cms[LO_NAME_SIZE]; /* cipher-mode string */ + char *cipher; + char *mode; + char *cmsp = cms; /* c-m string pointer */ + struct crypto_blkcipher *tfm; + + /* encryption breaks for non sector aligned offsets */ + + if (info->lo_offset % LOOP_IV_SECTOR_SIZE) + goto out; + + strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); + cms[LO_NAME_SIZE - 1] = 0; + + cipher = cmsp; + cipher_len = strcspn(cmsp, "-"); + + mode = cmsp + cipher_len; + mode_len = 0; + if (*mode) { + mode++; + mode_len = strcspn(mode, "-"); + } + + if (!mode_len) { + mode = "cbc"; + mode_len = 3; + } + + if (cipher_len + mode_len + 3 > LO_NAME_SIZE) + return -EINVAL; + + memmove(cms, mode, mode_len); + cmsp = cms + mode_len; + *cmsp++ = '('; + memcpy(cmsp, info->lo_crypt_name, cipher_len); + cmsp += cipher_len; + *cmsp++ = ')'; + *cmsp = 0; + + tfm = crypto_alloc_blkcipher(cms, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + err = crypto_blkcipher_setkey(tfm, info->lo_encrypt_key, + info->lo_encrypt_key_size); + + if (err != 0) + goto out_free_tfm; + + lo->key_data = tfm; + return 0; + + out_free_tfm: + crypto_free_blkcipher(tfm); + + out: + return err; +} + + +typedef int (*encdec_cbc_t)(struct blkcipher_desc *desc, + struct scatterlist *sg_out, + struct scatterlist *sg_in, + unsigned int nsg); + +static int +cryptoloop_transfer(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t IV) +{ + struct crypto_blkcipher *tfm = lo->key_data; + struct blkcipher_desc desc = { + .tfm = tfm, + .flags = CRYPTO_TFM_REQ_MAY_SLEEP, + }; + struct scatterlist sg_out; + struct scatterlist sg_in; + + encdec_cbc_t encdecfunc; + struct page *in_page, *out_page; + unsigned in_offs, out_offs; + int err; + + sg_init_table(&sg_out, 1); + sg_init_table(&sg_in, 1); + + if (cmd == READ) { + in_page = raw_page; + in_offs = raw_off; + out_page = loop_page; + out_offs = loop_off; + encdecfunc = crypto_blkcipher_crt(tfm)->decrypt; + } else { + in_page = loop_page; + in_offs = loop_off; + out_page = raw_page; + out_offs = raw_off; + encdecfunc = crypto_blkcipher_crt(tfm)->encrypt; + } + + while (size > 0) { + const int sz = min(size, LOOP_IV_SECTOR_SIZE); + u32 iv[4] = { 0, }; + iv[0] = cpu_to_le32(IV & 0xffffffff); + + sg_set_page(&sg_in, in_page, sz, in_offs); + sg_set_page(&sg_out, out_page, sz, out_offs); + + desc.info = iv; + err = encdecfunc(&desc, &sg_out, &sg_in, sz); + if (err) + return err; + + IV++; + size -= sz; + in_offs += sz; + out_offs += sz; + } + + return 0; +} + +static int +cryptoloop_ioctl(struct loop_device *lo, int cmd, unsigned long arg) +{ + return -EINVAL; +} + +static int +cryptoloop_release(struct loop_device *lo) +{ + struct crypto_blkcipher *tfm = lo->key_data; + if (tfm != NULL) { + crypto_free_blkcipher(tfm); + lo->key_data = NULL; + return 0; + } + printk(KERN_ERR "cryptoloop_release(): tfm == NULL?\n"); + return -EINVAL; +} + +static struct loop_func_table cryptoloop_funcs = { + .number = LO_CRYPT_CRYPTOAPI, + .init = cryptoloop_init, + .ioctl = cryptoloop_ioctl, + .transfer = cryptoloop_transfer, + .release = cryptoloop_release, + .owner = THIS_MODULE +}; + +static int __init +init_cryptoloop(void) +{ + int rc = loop_register_transfer(&cryptoloop_funcs); + + if (rc) + printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n"); + return rc; +} + +static void __exit +cleanup_cryptoloop(void) +{ + if (loop_unregister_transfer(LO_CRYPT_CRYPTOAPI)) + printk(KERN_ERR + "cryptoloop: loop_unregister_transfer failed\n"); +} + +module_init(init_cryptoloop); +module_exit(cleanup_cryptoloop); diff --git a/kernel/drivers/block/drbd/Kconfig b/kernel/drivers/block/drbd/Kconfig new file mode 100644 index 000000000..7845bd6ee --- /dev/null +++ b/kernel/drivers/block/drbd/Kconfig @@ -0,0 +1,73 @@ +# +# DRBD device driver configuration +# + +comment "DRBD disabled because PROC_FS or INET not selected" + depends on PROC_FS='n' || INET='n' + +config BLK_DEV_DRBD + tristate "DRBD Distributed Replicated Block Device support" + depends on PROC_FS && INET + select LRU_CACHE + select LIBCRC32C + default n + help + + NOTE: In order to authenticate connections you have to select + CRYPTO_HMAC and a hash function as well. + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Each minor device has a role, which can be 'primary' or 'secondary'. + On the node with the primary device the application is supposed to + run and to access the device (/dev/drbdX). Every write is sent to + the local 'lower level block device' and, across the network, to the + node with the device in 'secondary' state. The secondary device + simply writes the data to its lower level block device. + + DRBD can also be used in dual-Primary mode (device writable on both + nodes), which means it can exhibit shared disk semantics in a + shared-nothing cluster. Needless to say, on top of dual-Primary + DRBD utilizing a cluster file system is necessary to maintain for + cache coherency. + + For automatic failover you need a cluster manager (e.g. heartbeat). + See also: http://www.drbd.org/, http://www.linux-ha.org + + If unsure, say N. + +config DRBD_FAULT_INJECTION + bool "DRBD fault injection" + depends on BLK_DEV_DRBD + help + + Say Y here if you want to simulate IO errors, in order to test DRBD's + behavior. + + The actual simulation of IO errors is done by writing 3 values to + /sys/module/drbd/parameters/ + + enable_faults: bitmask of... + 1 meta data write + 2 read + 4 resync data write + 8 read + 16 data write + 32 data read + 64 read ahead + 128 kmalloc of bitmap + 256 allocation of peer_requests + 512 insert data corruption on receiving side + + fault_devs: bitmask of minor numbers + fault_rate: frequency in percent + + Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. + echo 16 > /sys/module/drbd/parameters/enable_faults + echo 1 > /sys/module/drbd/parameters/fault_devs + echo 5 > /sys/module/drbd/parameters/fault_rate + + If unsure, say N. diff --git a/kernel/drivers/block/drbd/Makefile b/kernel/drivers/block/drbd/Makefile new file mode 100644 index 000000000..4464e353c --- /dev/null +++ b/kernel/drivers/block/drbd/Makefile @@ -0,0 +1,8 @@ +drbd-y := drbd_bitmap.o drbd_proc.o +drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o +drbd-y += drbd_main.o drbd_strings.o drbd_nl.o +drbd-y += drbd_interval.o drbd_state.o +drbd-y += drbd_nla.o +drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o + +obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff --git a/kernel/drivers/block/drbd/drbd_actlog.c b/kernel/drivers/block/drbd/drbd_actlog.c new file mode 100644 index 000000000..1318e3217 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_actlog.c @@ -0,0 +1,1219 @@ +/* + drbd_actlog.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include +#include +#include "drbd_int.h" + + +enum al_transaction_types { + AL_TR_UPDATE = 0, + AL_TR_INITIALIZED = 0xffff +}; +/* all fields on disc in big endian */ +struct __packed al_transaction_on_disk { + /* don't we all like magic */ + __be32 magic; + + /* to identify the most recent transaction block + * in the on disk ring buffer */ + __be32 tr_number; + + /* checksum on the full 4k block, with this field set to 0. */ + __be32 crc32c; + + /* type of transaction, special transaction types like: + * purge-all, set-all-idle, set-all-active, ... to-be-defined + * see also enum al_transaction_types */ + __be16 transaction_type; + + /* we currently allow only a few thousand extents, + * so 16bit will be enough for the slot number. */ + + /* how many updates in this transaction */ + __be16 n_updates; + + /* maximum slot number, "al-extents" in drbd.conf speak. + * Having this in each transaction should make reconfiguration + * of that parameter easier. */ + __be16 context_size; + + /* slot number the context starts with */ + __be16 context_start_slot_nr; + + /* Some reserved bytes. Expected usage is a 64bit counter of + * sectors-written since device creation, and other data generation tag + * supporting usage */ + __be32 __reserved[4]; + + /* --- 36 byte used --- */ + + /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes + * in one transaction, then use the remaining byte in the 4k block for + * context information. "Flexible" number of updates per transaction + * does not help, as we have to account for the case when all update + * slots are used anyways, so it would only complicate code without + * additional benefit. + */ + __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; + + /* but the extent number is 32bit, which at an extent size of 4 MiB + * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ + __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; + + /* --- 420 bytes used (36 + 64*6) --- */ + + /* 4096 - 420 = 3676 = 919 * 4 */ + __be32 context[AL_CONTEXT_PER_TRANSACTION]; +}; + +void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) +{ + int r; + + wait_event(device->misc_wait, + (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || + device->state.disk <= D_FAILED); + + if (r) + return NULL; + + device->md_io.current_use = intent; + device->md_io.start_jif = jiffies; + device->md_io.submit_jif = device->md_io.start_jif - 1; + return page_address(device->md_io.page); +} + +void drbd_md_put_buffer(struct drbd_device *device) +{ + if (atomic_dec_and_test(&device->md_io.in_use)) + wake_up(&device->misc_wait); +} + +void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt; + + rcu_read_lock(); + dt = rcu_dereference(bdev->disk_conf)->disk_timeout; + rcu_read_unlock(); + dt = dt * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(device->misc_wait, + *done || test_bit(FORCE_DETACH, &device->flags), dt); + if (dt == 0) { + drbd_err(device, "meta-data IO operation timed out\n"); + drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH); + } +} + +static int _drbd_md_sync_page_io(struct drbd_device *device, + struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + struct bio *bio; + /* we do all our meta data IO in aligned 4k blocks. */ + const int size = 4096; + int err; + + device->md_io.done = 0; + device->md_io.error = -ENODEV; + + if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) + rw |= REQ_FUA | REQ_FLUSH; + rw |= REQ_SYNC | REQ_NOIDLE; + + bio = bio_alloc_drbd(GFP_NOIO); + bio->bi_bdev = bdev->md_bdev; + bio->bi_iter.bi_sector = sector; + err = -EIO; + if (bio_add_page(bio, device->md_io.page, size, 0) != size) + goto out; + bio->bi_private = device; + bio->bi_end_io = drbd_md_endio; + bio->bi_rw = rw; + + if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL) + /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ + ; + else if (!get_ldev_if_state(device, D_ATTACHING)) { + /* Corresponding put_ldev in drbd_md_endio() */ + drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + err = -ENODEV; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ + device->md_io.submit_jif = jiffies; + if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + bio_endio(bio, -EIO); + else + submit_bio(rw, bio); + wait_until_done_or_force_detached(device, bdev, &device->md_io.done); + if (bio_flagged(bio, BIO_UPTODATE)) + err = device->md_io.error; + + out: + bio_put(bio); + return err; +} + +int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + int err; + D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); + + BUG_ON(!bdev->md_bdev); + + dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", + (void*)_RET_IP_ ); + + if (sector < drbd_md_first_sector(bdev) || + sector + 7 > drbd_md_last_sector(bdev)) + drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + + err = _drbd_md_sync_page_io(device, bdev, sector, rw); + if (err) { + drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); + } + return err; +} + +static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr) +{ + struct lc_element *tmp; + tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) + return bm_ext; + } + return NULL; +} + +static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + int wake; + + spin_lock_irq(&device->al_lock); + bm_ext = find_active_resync_extent(device, enr); + if (bm_ext) { + wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); + spin_unlock_irq(&device->al_lock); + if (wake) + wake_up(&device->al_wait); + return NULL; + } + if (nonblock) + al_ext = lc_try_get(device->act_log, enr); + else + al_ext = lc_get(device->act_log, enr); + spin_unlock_irq(&device->al_lock); + return al_ext; +} + +bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + + D_ASSERT(device, (unsigned)(last - first) <= 1); + D_ASSERT(device, atomic_read(&device->local_cnt) > 0); + + /* FIXME figure out a fast path for bios crossing AL extent boundaries */ + if (first != last) + return false; + + return _al_get(device, first, true); +} + +bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + bool need_transaction = false; + + D_ASSERT(device, first <= last); + D_ASSERT(device, atomic_read(&device->local_cnt) > 0); + + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + wait_event(device->al_wait, + (al_ext = _al_get(device, enr, false)) != NULL); + if (al_ext->lc_number != enr) + need_transaction = true; + } + return need_transaction; +} + +static int al_write_transaction(struct drbd_device *device); + +void drbd_al_begin_io_commit(struct drbd_device *device) +{ + bool locked = false; + + /* Serialize multiple transactions. + * This uses test_and_set_bit, memory barrier is implicit. + */ + wait_event(device->al_wait, + device->act_log->pending_changes == 0 || + (locked = lc_try_lock_for_transaction(device->act_log))); + + if (locked) { + /* Double check: it may have been committed by someone else, + * while we have been waiting for the lock. */ + if (device->act_log->pending_changes) { + bool write_al_updates; + + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + + if (write_al_updates) + al_write_transaction(device); + spin_lock_irq(&device->al_lock); + /* FIXME + if (err) + we need an "lc_cancel" here; + */ + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); + } + lc_unlock(device->act_log); + wake_up(&device->al_wait); + } +} + +/* + * @delegate: delegate activity log I/O to the worker thread + */ +void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) +{ + if (drbd_al_begin_io_prepare(device, i)) + drbd_al_begin_io_commit(device); +} + +int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) +{ + struct lru_cache *al = device->act_log; + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned nr_al_extents; + unsigned available_update_slots; + unsigned enr; + + D_ASSERT(device, first <= last); + + nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ + available_update_slots = min(al->nr_elements - al->used, + al->max_pending_changes - al->pending_changes); + + /* We want all necessary updates for a given request within the same transaction + * We could first check how many updates are *actually* needed, + * and use that instead of the worst-case nr_al_extents */ + if (available_update_slots < nr_al_extents) { + /* Too many activity log extents are currently "hot". + * + * If we have accumulated pending changes already, + * we made progress. + * + * If we cannot get even a single pending change through, + * stop the fast path until we made some progress, + * or requests to "cold" extents could be starved. */ + if (!al->pending_changes) + __set_bit(__LC_STARVING, &device->act_log->flags); + return -ENOBUFS; + } + + /* Is resync active in this area? */ + for (enr = first; enr <= last; enr++) { + struct lc_element *tmp; + tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) + return -EBUSY; + return -EWOULDBLOCK; + } + } + } + + /* Checkout the refcounts. + * Given that we checked for available elements and update slots above, + * this has to be successful. */ + for (enr = first; enr <= last; enr++) { + struct lc_element *al_ext; + al_ext = lc_get_cumulative(device->act_log, enr); + if (!al_ext) + drbd_info(device, "LOGIC BUG for enr=%u\n", enr); + } + return 0; +} + +void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) +{ + /* for bios crossing activity log extent boundaries, + * we may need to activate two extents in one go */ + unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); + unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); + unsigned enr; + struct lc_element *extent; + unsigned long flags; + + D_ASSERT(device, first <= last); + spin_lock_irqsave(&device->al_lock, flags); + + for (enr = first; enr <= last; enr++) { + extent = lc_find(device->act_log, enr); + if (!extent) { + drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); + continue; + } + lc_put(device->act_log, extent); + } + spin_unlock_irqrestore(&device->al_lock, flags); + wake_up(&device->al_wait); +} + +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. */ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + +static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&device->al_lock); + rv = (al_ext->refcnt == 0); + if (likely(rv)) + lc_del(device->act_log, al_ext); + spin_unlock_irq(&device->al_lock); + + return rv; +} + +/** + * drbd_al_shrink() - Removes all active extents form the activity log + * @device: DRBD device. + * + * Removes all active extents form the activity log, waiting until + * the reference count of each entry dropped to 0 first, of course. + * + * You need to lock device->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct drbd_device *device) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags)); + + for (i = 0; i < device->act_log->nr_elements; i++) { + al_ext = lc_element_by_index(device->act_log, i); + if (al_ext->lc_number == LC_FREE) + continue; + wait_event(device->al_wait, _try_lc_del(device, al_ext)); + } + + wake_up(&device->al_wait); +} + +int drbd_initialize_al(struct drbd_device *device, void *buffer) +{ + struct al_transaction_on_disk *al = buffer; + struct drbd_md *md = &device->ldev->md; + sector_t al_base = md->md_offset + md->al_offset; + int al_size_4k = md->al_stripes * md->al_stripe_size_4k; + int i; + + memset(al, 0, 4096); + al->magic = cpu_to_be32(DRBD_AL_MAGIC); + al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); + al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + + for (i = 0; i < al_size_4k; i++) { + int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + if (err) + return err; + } + return 0; +} + +static const char *drbd_change_sync_fname[] = { + [RECORD_RS_FAILED] = "drbd_rs_failed_io", + [SET_IN_SYNC] = "drbd_set_in_sync", + [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" +}; + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the + * resync LRU-cache are 16MB each. + * The caller of this function has to hold an get_ldev() reference. + * + * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), + * potentially pulling in (and recounting the corresponding bits) + * this resync extent into the resync extent lru cache. + * + * Returns whether all bits have been cleared for this resync extent, + * precisely: (rs_left <= rs_failed) + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +static bool update_rs_extent(struct drbd_device *device, + unsigned int enr, int count, + enum update_sync_bits_mode mode) +{ + struct lc_element *e; + + D_ASSERT(device, atomic_read(&device->local_cnt)); + + /* When setting out-of-sync bits, + * we don't need it cached (lc_find). + * But if it is present in the cache, + * we should update the cached bit count. + * Otherwise, that extent should be in the resync extent lru cache + * already -- or we want to pull it in if necessary -- (lc_get), + * then update and check rs_left and rs_failed. */ + if (mode == SET_OUT_OF_SYNC) + e = lc_find(device->resync, enr); + else + e = lc_get(device->resync, enr); + if (e) { + struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); + if (ext->lce.lc_number == enr) { + if (mode == SET_IN_SYNC) + ext->rs_left -= count; + else if (mode == SET_OUT_OF_SYNC) + ext->rs_left += count; + else + ext->rs_failed += count; + if (ext->rs_left < ext->rs_failed) { + drbd_warn(device, "BAD! enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", + ext->lce.lc_number, ext->rs_left, + ext->rs_failed, count, + drbd_conn_str(device->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. + * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(device, enr); + } + } else { + /* Normally this element should be in the cache, + * since drbd_rs_begin_io() pulled it already in. + * + * But maybe an application write finished, and we set + * something outside the resync lru_cache in sync. + */ + int rs_left = drbd_bm_e_weight(device, enr); + if (ext->flags != 0) { + drbd_warn(device, "changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + if (ext->rs_failed) { + drbd_warn(device, "Kicking resync_lru element enr=%u " + "out with rs_failed=%d\n", + ext->lce.lc_number, ext->rs_failed); + } + ext->rs_left = rs_left; + ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; + /* we don't keep a persistent log of the resync lru, + * we can commit any change right away. */ + lc_committed(device->resync); + } + if (mode != SET_OUT_OF_SYNC) + lc_put(device->resync, &ext->lce); + /* no race, we are within the al_lock! */ + + if (ext->rs_left <= ext->rs_failed) { + ext->rs_failed = 0; + return true; + } + } else if (mode != SET_OUT_OF_SYNC) { + /* be quiet if lc_find() did not find it. */ + drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", + device->resync_locked, + device->resync->nr_elements, + device->resync->flags); + } + return false; +} + +void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) +{ + unsigned long now = jiffies; + unsigned long last = device->rs_mark_time[device->rs_last_mark]; + int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; + if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { + if (device->rs_mark_left[device->rs_last_mark] != still_to_go && + device->state.conn != C_PAUSED_SYNC_T && + device->state.conn != C_PAUSED_SYNC_S) { + device->rs_mark_time[next] = now; + device->rs_mark_left[next] = still_to_go; + device->rs_last_mark = next; + } + } +} + +/* It is called lazy update, so don't do write-out too often. */ +static bool lazy_bitmap_update_due(struct drbd_device *device) +{ + return time_after(jiffies, device->rs_last_bcast + 2*HZ); +} + +static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) +{ + if (rs_done) + set_bit(RS_DONE, &device->flags); + /* and also set RS_PROGRESS below */ + else if (!lazy_bitmap_update_due(device)) + return; + + drbd_device_post_work(device, RS_PROGRESS); +} + +static int update_sync_bits(struct drbd_device *device, + unsigned long sbnr, unsigned long ebnr, + enum update_sync_bits_mode mode) +{ + /* + * We keep a count of set bits per resync-extent in the ->rs_left + * caching member, so we need to loop and work within the resync extent + * alignment. Typically this loop will execute exactly once. + */ + unsigned long flags; + unsigned long count = 0; + unsigned int cleared = 0; + while (sbnr <= ebnr) { + /* set temporary boundary bit number to last bit number within + * the resync extent of the current start bit number, + * but cap at provided end bit number */ + unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); + unsigned long c; + + if (mode == RECORD_RS_FAILED) + /* Only called from drbd_rs_failed_io(), bits + * supposedly still set. Recount, maybe some + * of the bits have been successfully cleared + * by application IO meanwhile. + */ + c = drbd_bm_count_bits(device, sbnr, tbnr); + else if (mode == SET_IN_SYNC) + c = drbd_bm_clear_bits(device, sbnr, tbnr); + else /* if (mode == SET_OUT_OF_SYNC) */ + c = drbd_bm_set_bits(device, sbnr, tbnr); + + if (c) { + spin_lock_irqsave(&device->al_lock, flags); + cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); + spin_unlock_irqrestore(&device->al_lock, flags); + count += c; + } + sbnr = tbnr + 1; + } + if (count) { + if (mode == SET_IN_SYNC) { + unsigned long still_to_go = drbd_bm_total_weight(device); + bool rs_is_done = (still_to_go <= device->rs_failed); + drbd_advance_rs_marks(device, still_to_go); + if (cleared || rs_is_done) + maybe_schedule_on_disk_bitmap_update(device, rs_is_done); + } else if (mode == RECORD_RS_FAILED) + device->rs_failed += count; + wake_up(&device->al_wait); + } + return count; +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on C_SYNC_TARGET and receiver on SyncSource. + * + */ +int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + + /* This would be an empty REQ_FLUSH, be silent. */ + if ((mode == SET_OUT_OF_SYNC) && size == 0) + return 0; + + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { + drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", + drbd_change_sync_fname[mode], + (unsigned long long)sector, size); + return 0; + } + + if (!get_ldev(device)) + return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ + + nr_sectors = drbd_get_capacity(device->this_bdev); + esector = sector + (size >> 9) - 1; + + if (!expect(sector < nr_sectors)) + goto out; + if (!expect(esector < nr_sectors)) + esector = nr_sectors - 1; + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + if (mode == SET_IN_SYNC) { + /* Round up start sector, round down end sector. We make sure + * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + goto out; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + } else { + /* We set it out of sync, or record resync failure. + * Should not round anything here. */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + } + + count = update_sync_bits(device, sbnr, ebnr, mode); +out: + put_ldev(device); + return count; +} + +static +struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&device->al_lock); + if (device->resync_locked > device->resync->nr_elements/2) { + spin_unlock_irq(&device->al_lock); + return NULL; + } + e = lc_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(device, enr); + bm_ext->rs_failed = 0; + lc_committed(device->resync); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) + device->resync_locked++; + set_bit(BME_NO_WRITES, &bm_ext->flags); + } + rs_flags = device->resync->flags; + spin_unlock_irq(&device->al_lock); + if (wakeup) + wake_up(&device->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) + drbd_warn(device, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_LOCKED); + } + + return bm_ext; +} + +static int _is_in_al(struct drbd_device *device, unsigned int enr) +{ + int rv; + + spin_lock_irq(&device->al_lock); + rv = lc_is_used(device->act_log, enr); + spin_unlock_irq(&device->al_lock); + + return rv; +} + +/** + * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED + * @device: DRBD device. + * @sector: The sector number. + * + * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. + */ +int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent *bm_ext; + int i, sig; + bool sa; + +retry: + sig = wait_event_interruptible(device->al_wait, + (bm_ext = _bme_get(device, enr))); + if (sig) + return -EINTR; + + if (test_bit(BME_LOCKED, &bm_ext->flags)) + return 0; + + /* step aside only while we are above c-min-rate; unless disabled. */ + sa = drbd_rs_c_min_rate_throttle(device); + + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + sig = wait_event_interruptible(device->al_wait, + !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || + (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); + + if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { + spin_lock_irq(&device->al_lock); + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ + device->resync_locked--; + wake_up(&device->al_wait); + } + spin_unlock_irq(&device->al_lock); + if (sig) + return -EINTR; + if (schedule_timeout_interruptible(HZ/10)) + return -EINTR; + goto retry; + } + } + set_bit(BME_LOCKED, &bm_ext->flags); + return 0; +} + +/** + * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep + * @device: DRBD device. + * @sector: The sector number. + * + * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then + * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN + * if there is still application IO going on in this area. + */ +int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + bool throttle = drbd_rs_should_slow_down(device, sector, true); + + /* If we need to throttle, a half-locked (only marked BME_NO_WRITES, + * not yet BME_LOCKED) extent needs to be kicked out explicitly if we + * need to throttle. There is at most one such half-locked extent, + * which is remembered in resync_wenr. */ + + if (throttle && device->resync_wenr != enr) + return -EAGAIN; + + spin_lock_irq(&device->al_lock); + if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { + /* in case you have very heavy scattered io, it may + * stall the syncer undefined if we give up the ref count + * when we try again and requeue. + * + * if we don't give up the refcount, but the next time + * we are scheduled this extent has been "synced" by new + * application writes, we'd miss the lc_put on the + * extent we keep the refcount on. + * so we remembered which extent we had to try again, and + * if the next requested one is something else, we do + * the lc_put here... + * we also have to wake_up + */ + e = lc_find(device->resync, device->resync_wenr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else { + drbd_alert(device, "LOGIC BUG\n"); + } + } + /* TRY. */ + e = lc_try_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (test_bit(BME_LOCKED, &bm_ext->flags)) + goto proceed; + if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { + device->resync_locked++; + } else { + /* we did set the BME_NO_WRITES, + * but then could not set BME_LOCKED, + * so we tried again. + * drop the extra reference. */ + bm_ext->lce.refcnt--; + D_ASSERT(device, bm_ext->lce.refcnt > 0); + } + goto check_al; + } else { + /* do we rather want to try later? */ + if (device->resync_locked > device->resync->nr_elements-3) + goto try_again; + /* Do or do not. There is no try. -- Yoda */ + e = lc_get(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + const unsigned long rs_flags = device->resync->flags; + if (rs_flags & LC_STARVING) + drbd_warn(device, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_LOCKED); + goto try_again; + } + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(device, enr); + bm_ext->rs_failed = 0; + lc_committed(device->resync); + wake_up(&device->al_wait); + D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0); + } + set_bit(BME_NO_WRITES, &bm_ext->flags); + D_ASSERT(device, bm_ext->lce.refcnt == 1); + device->resync_locked++; + goto check_al; + } +check_al: + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + if (lc_is_used(device->act_log, al_enr+i)) + goto try_again; + } + set_bit(BME_LOCKED, &bm_ext->flags); +proceed: + device->resync_wenr = LC_FREE; + spin_unlock_irq(&device->al_lock); + return 0; + +try_again: + if (bm_ext) { + if (throttle) { + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; + device->resync_locked--; + } + wake_up(&device->al_wait); + } else + device->resync_wenr = enr; + } + spin_unlock_irq(&device->al_lock); + return -EAGAIN; +} + +void drbd_rs_complete_io(struct drbd_device *device, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct lc_element *e; + struct bm_extent *bm_ext; + unsigned long flags; + + spin_lock_irqsave(&device->al_lock, flags); + e = lc_find(device->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + spin_unlock_irqrestore(&device->al_lock, flags); + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if (bm_ext->lce.refcnt == 0) { + spin_unlock_irqrestore(&device->al_lock, flags); + drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, " + "but refcnt is 0!?\n", + (unsigned long long)sector, enr); + return; + } + + if (lc_put(device->resync, &bm_ext->lce) == 0) { + bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ + device->resync_locked--; + wake_up(&device->al_wait); + } + + spin_unlock_irqrestore(&device->al_lock, flags); +} + +/** + * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) + * @device: DRBD device. + */ +void drbd_rs_cancel_all(struct drbd_device *device) +{ + spin_lock_irq(&device->al_lock); + + if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */ + lc_reset(device->resync); + put_ldev(device); + } + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock_irq(&device->al_lock); + wake_up(&device->al_wait); +} + +/** + * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU + * @device: DRBD device. + * + * Returns 0 upon success, -EAGAIN if at least one reference count was + * not zero. + */ +int drbd_rs_del_all(struct drbd_device *device) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + spin_lock_irq(&device->al_lock); + + if (get_ldev_if_state(device, D_FAILED)) { + /* ok, ->resync is there. */ + for (i = 0; i < device->resync->nr_elements; i++) { + e = lc_element_by_index(device->resync, i); + bm_ext = lc_entry(e, struct bm_extent, lce); + if (bm_ext->lce.lc_number == LC_FREE) + continue; + if (bm_ext->lce.lc_number == device->resync_wenr) { + drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" + " got 'synced' by application io\n", + device->resync_wenr); + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + device->resync_wenr = LC_FREE; + lc_put(device->resync, &bm_ext->lce); + } + if (bm_ext->lce.refcnt != 0) { + drbd_info(device, "Retrying drbd_rs_del_all() later. " + "refcnt=%d\n", bm_ext->lce.refcnt); + put_ldev(device); + spin_unlock_irq(&device->al_lock); + return -EAGAIN; + } + D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); + lc_del(device->resync, &bm_ext->lce); + } + D_ASSERT(device, device->resync->used == 0); + put_ldev(device); + } + spin_unlock_irq(&device->al_lock); + wake_up(&device->al_wait); + + return 0; +} diff --git a/kernel/drivers/block/drbd/drbd_bitmap.c b/kernel/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 000000000..434c77dcc --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_bitmap.c @@ -0,0 +1,1648 @@ +/* + drbd_bitmap.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2008, Philipp Reisner . + Copyright (C) 2004-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#include "drbd_int.h" + + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + + * convention: + * function name drbd_bm_... => used elsewhere, "public". + * function name bm_... => internal to implementation, "private". + */ + + +/* + * LIMITATIONS: + * We want to support >= peta byte of backend storage, while for now still using + * a granularity of one bit per 4KiB of storage. + * 1 << 50 bytes backend storage (1 PiB) + * 1 << (50 - 12) bits needed + * 38 --> we need u64 to index and count bits + * 1 << (38 - 3) bitmap bytes needed + * 35 --> we still need u64 to index and count bytes + * (that's 32 GiB of bitmap for 1 PiB storage) + * 1 << (35 - 2) 32bit longs needed + * 33 --> we'd even need u64 to index and count 32bit long words. + * 1 << (35 - 3) 64bit longs needed + * 32 --> we could get away with a 32bit unsigned int to index and count + * 64bit long words, but I rather stay with unsigned long for now. + * We probably should neither count nor point to bytes or long words + * directly, but either by bitnumber, or by page index and offset. + * 1 << (35 - 12) + * 22 --> we need that much 4KiB pages of bitmap. + * 1 << (22 + 3) --> on a 64bit arch, + * we need 32 MiB to store the array of page pointers. + * + * Because I'm lazy, and because the resulting patch was too large, too ugly + * and still incomplete, on 32bit we still "only" support 16 TiB (minus some), + * (1 << 32) bits * 4k storage. + * + + * bitmap storage and IO: + * Bitmap is stored little endian on disk, and is kept little endian in + * core memory. Currently we still hold the full bitmap in core as long + * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage + * seems excessive. + * + * We plan to reduce the amount of in-core bitmap pages by paging them in + * and out against their on-disk location as necessary, but need to make + * sure we don't cause too much meta data IO, and must not deadlock in + * tight memory situations. This needs some more work. + */ + +/* + * NOTE + * Access to the *bm_pages is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bits is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * And we need the kmap_atomic. + */ +struct drbd_bitmap { + struct page **bm_pages; + spinlock_t bm_lock; + + /* see LIMITATIONS: above */ + + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ + unsigned long bm_bits; + size_t bm_words; + size_t bm_number_of_pages; + sector_t bm_dev_capacity; + struct mutex bm_change; /* serializes resize operations */ + + wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */ + + enum bm_flag bm_flags; + + /* debugging aid, in case we are still racy somewhere */ + char *bm_why; + struct task_struct *bm_task; +}; + +#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) +static void __bm_print_lock_info(struct drbd_device *device, const char *func) +{ + struct drbd_bitmap *b = device->bitmap; + if (!__ratelimit(&drbd_ratelimit_state)) + return; + drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n", + current->comm, task_pid_nr(current), + func, b->bm_why ?: "?", + b->bm_task->comm, task_pid_nr(b->bm_task)); +} + +void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags) +{ + struct drbd_bitmap *b = device->bitmap; + int trylock_failed; + + if (!b) { + drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n"); + return; + } + + trylock_failed = !mutex_trylock(&b->bm_change); + + if (trylock_failed) { + drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n", + current->comm, task_pid_nr(current), + why, b->bm_why ?: "?", + b->bm_task->comm, task_pid_nr(b->bm_task)); + mutex_lock(&b->bm_change); + } + if (BM_LOCKED_MASK & b->bm_flags) + drbd_err(device, "FIXME bitmap already locked in bm_lock\n"); + b->bm_flags |= flags & BM_LOCKED_MASK; + + b->bm_why = why; + b->bm_task = current; +} + +void drbd_bm_unlock(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!b) { + drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n"); + return; + } + + if (!(BM_LOCKED_MASK & device->bitmap->bm_flags)) + drbd_err(device, "FIXME bitmap not locked in bm_unlock\n"); + + b->bm_flags &= ~BM_LOCKED_MASK; + b->bm_why = NULL; + b->bm_task = NULL; + mutex_unlock(&b->bm_change); +} + +/* we store some "meta" info about our pages in page->private */ +/* at a granularity of 4k storage per bitmap bit: + * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks + * 1<<38 bits, + * 1<<23 4k bitmap pages. + * Use 24 bits as page index, covers 2 peta byte storage + * at a granularity of 4k per bit. + * Used to report the failed page idx on io error from the endio handlers. + */ +#define BM_PAGE_IDX_MASK ((1UL<<24)-1) +/* this page is currently read in, or written back */ +#define BM_PAGE_IO_LOCK 31 +/* if there has been an IO error for this page */ +#define BM_PAGE_IO_ERROR 30 +/* this is to be able to intelligently skip disk IO, + * set if bits have been set since last IO. */ +#define BM_PAGE_NEED_WRITEOUT 29 +/* to mark for lazy writeout once syncer cleared all clearable bits, + * we if bits have been cleared since last IO. */ +#define BM_PAGE_LAZY_WRITEOUT 28 +/* pages marked with this "HINT" will be considered for writeout + * on activity log transactions */ +#define BM_PAGE_HINT_WRITEOUT 27 + +/* store_page_idx uses non-atomic assignment. It is only used directly after + * allocating the page. All other bm_set_page_* and bm_clear_page_* need to + * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap + * changes) may happen from various contexts, and wait_on_bit/wake_up_bit + * requires it all to be atomic as well. */ +static void bm_store_page_idx(struct page *page, unsigned long idx) +{ + BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); + set_page_private(page, idx); +} + +static unsigned long bm_page_to_idx(struct page *page) +{ + return page_private(page) & BM_PAGE_IDX_MASK; +} + +/* As is very unlikely that the same page is under IO from more than one + * context, we can get away with a bit per page and one wait queue per bitmap. + */ +static void bm_page_lock_io(struct drbd_device *device, int page_nr) +{ + struct drbd_bitmap *b = device->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr)); +} + +static void bm_page_unlock_io(struct drbd_device *device, int page_nr) +{ + struct drbd_bitmap *b = device->bitmap; + void *addr = &page_private(b->bm_pages[page_nr]); + clear_bit_unlock(BM_PAGE_IO_LOCK, addr); + wake_up(&device->bitmap->bm_io_wait); +} + +/* set _before_ submit_io, so it may be reset due to being changed + * while this page is in flight... will get submitted later again */ +static void bm_set_page_unchanged(struct page *page) +{ + /* use cmpxchg? */ + clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); + clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page)); +} + +static void bm_set_page_need_writeout(struct page *page) +{ + set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); +} + +/** + * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout + * @device: DRBD device. + * @page_nr: the bitmap page to mark with the "hint" flag + * + * From within an activity log transaction, we mark a few pages with these + * hints, then call drbd_bm_write_hinted(), which will only write out changed + * pages which are flagged with this mark. + */ +void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) +{ + struct page *page; + if (page_nr >= device->bitmap->bm_number_of_pages) { + drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", + page_nr, (int)device->bitmap->bm_number_of_pages); + return; + } + page = device->bitmap->bm_pages[page_nr]; + set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); +} + +static int bm_test_page_unchanged(struct page *page) +{ + volatile const unsigned long *addr = &page_private(page); + return (*addr & ((1UL<> PAGE_SHIFT; */ + unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) +{ + /* page_nr = (bitnr/8) >> PAGE_SHIFT; */ + unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + return page_nr; +} + +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) +{ + struct page *page = b->bm_pages[idx]; + return (unsigned long *) kmap_atomic(page); +} + +static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) +{ + return __bm_map_pidx(b, idx); +} + +static void __bm_unmap(unsigned long *p_addr) +{ + kunmap_atomic(p_addr); +}; + +static void bm_unmap(unsigned long *p_addr) +{ + return __bm_unmap(p_addr); +} + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* word offset from start of bitmap to word number _in_page_ + * modulo longs per page +#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) + hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) + so do it explicitly: + */ +#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) + +/* Long words per page */ +#define LWPP (PAGE_SIZE/sizeof(long)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * struct drbd_device*, but for the debug macros I like to have the device around + * to be able to report device specific. + */ + + +static void bm_free_pages(struct page **pages, unsigned long number) +{ + unsigned long i; + if (!pages) + return; + + for (i = 0; i < number; i++) { + if (!pages[i]) { + pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n", + i, number); + continue; + } + __free_page(pages[i]); + pages[i] = NULL; + } +} + +static void bm_vk_free(void *ptr, int v) +{ + if (v) + vfree(ptr); + else + kfree(ptr); +} + +/* + * "have" and "want" are NUMBER OF PAGES. + */ +static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) +{ + struct page **old_pages = b->bm_pages; + struct page **new_pages, *page; + unsigned int i, bytes, vmalloced = 0; + unsigned long have = b->bm_number_of_pages; + + BUG_ON(have == 0 && old_pages != NULL); + BUG_ON(have != 0 && old_pages == NULL); + + if (have == want) + return old_pages; + + /* Trying kmalloc first, falling back to vmalloc. + * GFP_NOIO, as this is called while drbd IO is "suspended", + * and during resize or attach on diskless Primary, + * we must not block on IO to ourselves. + * Context is receiver thread or dmsetup. */ + bytes = sizeof(struct page *)*want; + new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); + if (!new_pages) { + new_pages = __vmalloc(bytes, + GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); + if (!new_pages) + return NULL; + vmalloced = 1; + } + + if (want >= have) { + for (i = 0; i < have; i++) + new_pages[i] = old_pages[i]; + for (; i < want; i++) { + page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); + if (!page) { + bm_free_pages(new_pages + have, i - have); + bm_vk_free(new_pages, vmalloced); + return NULL; + } + /* we want to know which page it is + * from the endio handlers */ + bm_store_page_idx(page, i); + new_pages[i] = page; + } + } else { + for (i = 0; i < want; i++) + new_pages[i] = old_pages[i]; + /* NOT HERE, we are outside the spinlock! + bm_free_pages(old_pages + want, have - want); + */ + } + + if (vmalloced) + b->bm_flags |= BM_P_VMALLOCED; + else + b->bm_flags &= ~BM_P_VMALLOCED; + + return new_pages; +} + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in device->bitmap. + */ +int drbd_bm_init(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + WARN_ON(b != NULL); + b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); + if (!b) + return -ENOMEM; + spin_lock_init(&b->bm_lock); + mutex_init(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + device->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(struct drbd_device *device) +{ + if (!expect(device->bitmap)) + return 0; + return device->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(struct drbd_device *device) +{ + if (!expect(device->bitmap)) + return; + bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); + bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); + kfree(device->bitmap); + device->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Returns the number of bits cleared. + */ +#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) +static int bm_clear_surplus(struct drbd_bitmap *b) +{ + unsigned long mask; + unsigned long *p_addr, *bm; + int tmp; + int cleared = 0; + + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. */ + cleared = hweight_long(*bm & ~mask); + *bm &= mask; + bm++; + } + + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + cleared += hweight_long(*bm); + *bm = 0; + } + bm_unmap(p_addr); + return cleared; +} + +static void bm_set_surplus(struct drbd_bitmap *b) +{ + unsigned long mask; + unsigned long *p_addr, *bm; + int tmp; + + /* number of bits modulo bits per page */ + tmp = (b->bm_bits & BITS_PER_PAGE_MASK); + /* mask the used bits of the word containing the last bit */ + mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1; + /* bitmap is always stored little endian, + * on disk and in core memory alike */ + mask = cpu_to_lel(mask); + + p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1); + bm = p_addr + (tmp/BITS_PER_LONG); + if (mask) { + /* If mask != 0, we are not exactly aligned, so bm now points + * to the long containing the last bit. + * If mask == 0, bm already points to the word immediately + * after the last (long word aligned) bit. */ + *bm |= ~mask; + bm++; + } + + if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) { + /* on a 32bit arch, we may need to zero out + * a padding long to align with a 64bit remote */ + *bm = ~0UL; + } + bm_unmap(p_addr); +} + +/* you better not modify the bitmap while this is running, + * or its results will be stale */ +static unsigned long bm_count_bits(struct drbd_bitmap *b) +{ + unsigned long *p_addr; + unsigned long bits = 0; + unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; + int idx, i, last_word; + + /* all but last page */ + for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { + p_addr = __bm_map_pidx(b, idx); + for (i = 0; i < LWPP; i++) + bits += hweight_long(p_addr[i]); + __bm_unmap(p_addr); + cond_resched(); + } + /* last (or only) page */ + last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; + p_addr = __bm_map_pidx(b, idx); + for (i = 0; i < last_word; i++) + bits += hweight_long(p_addr[i]); + p_addr[last_word] &= cpu_to_lel(mask); + bits += hweight_long(p_addr[last_word]); + /* 32bit arch, may have an unused padding long */ + if (BITS_PER_LONG == 32 && (last_word & 1) == 0) + p_addr[last_word+1] = 0; + __bm_unmap(p_addr); + return bits; +} + +/* offset and len in long words.*/ +static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) +{ + unsigned long *p_addr, *bm; + unsigned int idx; + size_t do_now, end; + + end = offset + len; + + if (end > b->bm_words) { + pr_alert("bm_memset end > bm_words\n"); + return; + } + + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); + bm = p_addr + MLPP(offset); + if (bm+do_now > p_addr + LWPP) { + pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", + p_addr, bm, (int)do_now); + } else + memset(bm, c, do_now * sizeof(long)); + bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); + offset += do_now; + } +} + +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev) +{ + u64 bitmap_sectors; + if (ldev->md.al_offset == 8) + bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset; + else + bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset; + return bitmap_sectors << (9 + 3); +} + +/* + * make sure the bitmap has enough room for the attached storage, + * if necessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initialized to all bits set. + */ +int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long bits, words, owords, obits; + unsigned long want, have, onpages; /* number of pages */ + struct page **npages, **opages = NULL; + int err = 0, growing; + int opages_vmalloced; + + if (!expect(b)) + return -ENOMEM; + + drbd_bm_lock(device, "resize", BM_LOCKED_MASK); + + drbd_info(device, "drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + onpages = b->bm_number_of_pages; + owords = b->bm_words; + b->bm_pages = NULL; + b->bm_number_of_pages = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + bm_free_pages(opages, onpages); + bm_vk_free(opages, opages_vmalloced); + goto out; + } + bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits, 64) >> LN2_BPL; + + if (get_ldev(device)) { + u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev); + put_ldev(device); + if (bits > bits_on_disk) { + drbd_info(device, "bits = %lu\n", bits); + drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk); + err = -ENOSPC; + goto out; + } + } + + want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + have = b->bm_number_of_pages; + if (want == have) { + D_ASSERT(device, b->bm_pages != NULL); + npages = b->bm_pages; + } else { + if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC)) + npages = NULL; + else + npages = bm_realloc_pages(b, want); + } + + if (!npages) { + err = -ENOMEM; + goto out; + } + + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + owords = b->bm_words; + obits = b->bm_bits; + + growing = bits > obits; + if (opages && growing && set_new_bits) + bm_set_surplus(b); + + b->bm_pages = npages; + b->bm_number_of_pages = want; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + + if (growing) { + if (set_new_bits) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } else + bm_memset(b, owords, 0x00, words-owords); + + } + + if (want < have) { + /* implicit: (opages != NULL) && (opages != npages) */ + bm_free_pages(opages + want, have - want); + } + + (void)bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); + if (opages != npages) + bm_vk_free(opages, opages_vmalloced); + if (!growing) + b->bm_set = bm_count_bits(b); + drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); + + out: + drbd_bm_unlock(device); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +unsigned long _drbd_bm_total_weight(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long s; + unsigned long flags; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock, flags); + + return s; +} + +unsigned long drbd_bm_total_weight(struct drbd_device *device) +{ + unsigned long s; + /* if I don't have a disk, I don't know about out-of-sync status */ + if (!get_ldev_if_state(device, D_NEGOTIATING)) + return 0; + s = _drbd_bm_total_weight(device); + put_ldev(device); + return s; +} + +size_t drbd_bm_words(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + return b->bm_words; +} + +unsigned long drbd_bm_bits(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return 0; + + return b->bm_bits; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + * bitmap must be locked by drbd_bm_lock. + * currently only used from receive_bitmap. + */ +void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr, *bm; + unsigned long word, bits; + unsigned int idx; + size_t end, do_now; + + end = offset + number; + + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + if (number == 0) + return; + WARN_ON(offset >= b->bm_words); + WARN_ON(end > b->bm_words); + + spin_lock_irq(&b->bm_lock); + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + idx = bm_word_to_page_idx(b, offset); + p_addr = bm_map_pidx(b, idx); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) { + bits = hweight_long(*bm); + word = *bm | *buffer++; + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + bm_unmap(p_addr); + bm_set_page_need_writeout(b->bm_pages[idx]); + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (end == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr, *bm; + size_t end, do_now; + + end = offset + number; + + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + if ((offset >= b->bm_words) || + (end > b->bm_words) || + (number <= 0)) + drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + else { + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset)); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) + *buffer++ = *bm++; + bm_unmap(p_addr); + } + } + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0xff, b->bm_words); + (void)bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(struct drbd_device *device) +{ + struct drbd_bitmap *b = device->bitmap; + if (!expect(b)) + return; + if (!expect(b->bm_pages)) + return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0, b->bm_words); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +static void drbd_bm_aio_ctx_destroy(struct kref *kref) +{ + struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref); + unsigned long flags; + + spin_lock_irqsave(&ctx->device->resource->req_lock, flags); + list_del(&ctx->list); + spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags); + put_ldev(ctx->device); + kfree(ctx); +} + +/* bv_page may be a copy, or may be the original */ +static void drbd_bm_endio(struct bio *bio, int error) +{ + struct drbd_bm_aio_ctx *ctx = bio->bi_private; + struct drbd_device *device = ctx->device; + struct drbd_bitmap *b = device->bitmap; + unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! + * do we want to WARN() on this? */ + if (!error && !uptodate) + error = -EIO; + + if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && + !bm_test_page_unchanged(b->bm_pages[idx])) + drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); + + if (error) { + /* ctx error will hold the completed-last non-zero error code, + * in case error codes differ. */ + ctx->error = error; + bm_set_page_io_err(b->bm_pages[idx]); + /* Not identical to on disk version of it. + * Is BM_PAGE_IO_ERROR enough? */ + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", + error, idx); + } else { + bm_clear_page_io_err(b->bm_pages[idx]); + dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); + } + + bm_page_unlock_io(device, idx); + + if (ctx->flags & BM_AIO_COPY_PAGES) + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); + + bio_put(bio); + + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&device->misc_wait); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + } +} + +static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) +{ + struct bio *bio = bio_alloc_drbd(GFP_NOIO); + struct drbd_device *device = ctx->device; + struct drbd_bitmap *b = device->bitmap; + struct page *page; + unsigned int len; + unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE; + + sector_t on_disk_sector = + device->ldev->md.md_offset + device->ldev->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small + * flexible external meta data device, + * or with PAGE_SIZE > 4k */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9); + + /* serialize IO on this page */ + bm_page_lock_io(device, page_nr); + /* before memcpy and submit, + * so it can be redirtied any time */ + bm_set_page_unchanged(b->bm_pages[page_nr]); + + if (ctx->flags & BM_AIO_COPY_PAGES) { + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + copy_highpage(page, b->bm_pages[page_nr]); + bm_store_page_idx(page, page_nr); + } else + page = b->bm_pages[page_nr]; + bio->bi_bdev = device->ldev->md_bdev; + bio->bi_iter.bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ + bio_add_page(bio, page, len, 0); + bio->bi_private = ctx; + bio->bi_end_io = drbd_bm_endio; + + if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio, -EIO); + } else { + submit_bio(rw, bio); + /* this should not count as user activity and cause the + * resync to throttle -- see drbd_rs_should_slow_down(). */ + atomic_add(len >> 9, &device->rs_sect_ev); + } +} + +/* + * bm_rw: read/write the whole bitmap from/to its on disk location. + */ +static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local) +{ + struct drbd_bm_aio_ctx *ctx; + struct drbd_bitmap *b = device->bitmap; + int num_pages, i, count = 0; + unsigned long now; + char ppb[10]; + int err = 0; + + /* + * We are protected against bitmap disappearing/resizing by holding an + * ldev reference (caller must have called get_ldev()). + * For read/write, we are protected against changes to the bitmap by + * the bitmap lock (see drbd_bitmap_io). + * For lazy writeout, we don't care for ongoing changes to the bitmap, + * as we submit copies of pages anyways. + */ + + ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct drbd_bm_aio_ctx) { + .device = device, + .start_jif = jiffies, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */ + drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from + drbd_adm_attach(), after device->ldev was assigned. */ + + if (0 == (ctx->flags & ~BM_AIO_READ)) + WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&ctx->list, &device->pending_bitmap_io); + spin_unlock_irq(&device->resource->req_lock); + + num_pages = b->bm_number_of_pages; + + now = jiffies; + + /* let the layers below us try to merge these bios... */ + for (i = 0; i < num_pages; i++) { + /* ignore completely unchanged pages */ + if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) + break; + if (!(flags & BM_AIO_READ)) { + if ((flags & BM_AIO_WRITE_HINTED) && + !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, + &page_private(b->bm_pages[i]))) + continue; + + if (!(flags & BM_AIO_WRITE_ALL_PAGES) && + bm_test_page_unchanged(b->bm_pages[i])) { + dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); + continue; + } + /* during lazy writeout, + * ignore those pages not marked for lazy writeout. */ + if (lazy_writeout_upper_idx && + !bm_test_page_lazy_writeout(b->bm_pages[i])) { + dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); + continue; + } + } + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); + } + + /* + * We initialize ctx->in_flight to one to make sure drbd_bm_endio + * will not set ctx->done early, and decrement / test it here. If there + * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. + */ + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_force_detached(device, device->ldev, &ctx->done); + else + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + + /* summary for global bitmap IO */ + if (flags == 0) + drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", + (flags & BM_AIO_READ) ? "READ" : "WRITE", + count, jiffies - now); + + if (ctx->error) { + drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + err = -EIO; /* ctx->error ? */ + } + + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk timeout/force-detach during IO... */ + + now = jiffies; + if (flags & BM_AIO_READ) { + b->bm_set = bm_count_bits(b); + drbd_info(device, "recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + now = b->bm_set; + + if ((flags & ~BM_AIO_READ) == 0) + drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + return err; +} + +/** + * drbd_bm_read() - Read the whole bitmap from its on disk location. + * @device: DRBD device. + */ +int drbd_bm_read(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_READ, 0); +} + +/** + * drbd_bm_write() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will only write pages that have changed since last IO. + */ +int drbd_bm_write(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, 0, 0); +} + +/** + * drbd_bm_write_all() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will write all pages. + */ +int drbd_bm_write_all(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0); +} + +/** + * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed. + * @device: DRBD device. + * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages + */ +int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @device: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_COPY_PAGES, 0); +} + +/** + * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed. + * @device: DRBD device. + */ +int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local) +{ + return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0); +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * For this to work on 32bit arch with bitnumbers > (1<<32), + * we'd need to return u64, and get a whole lot of other places + * fixed where we still use unsigned long. + * + * this returns a bit number, NOT a sector! + */ +static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo, + const int find_zero_bit) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr; + unsigned long bit_offset; + unsigned i; + + + if (bm_fo > b->bm_bits) { + drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); + bm_fo = DRBD_END_OF_BITMAP; + } else { + while (bm_fo < b->bm_bits) { + /* bit offset of the first bit in the page */ + bit_offset = bm_fo & ~BITS_PER_PAGE_MASK; + p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo)); + + if (find_zero_bit) + i = find_next_zero_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); + else + i = find_next_bit_le(p_addr, + PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); + + __bm_unmap(p_addr); + if (i < PAGE_SIZE*8) { + bm_fo = bit_offset + i; + if (bm_fo >= b->bm_bits) + break; + goto found; + } + bm_fo = bit_offset + PAGE_SIZE*8; + } + bm_fo = DRBD_END_OF_BITMAP; + } + found: + return bm_fo; +} + +static unsigned long bm_find_next(struct drbd_device *device, + unsigned long bm_fo, const int find_zero_bit) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long i = DRBD_END_OF_BITMAP; + + if (!expect(b)) + return i; + if (!expect(b->bm_pages)) + return i; + + spin_lock_irq(&b->bm_lock); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + + i = __bm_find_next(device, bm_fo, find_zero_bit); + + spin_unlock_irq(&b->bm_lock); + return i; +} + +unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) +{ + return bm_find_next(device, bm_fo, 0); +} + +#if 0 +/* not yet needed for anything. */ +unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) +{ + return bm_find_next(device, bm_fo, 1); +} +#endif + +/* does not spin_lock_irqsave. + * you must take drbd_bm_lock() first */ +unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo) +{ + /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */ + return __bm_find_next(device, bm_fo, 0); +} + +unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo) +{ + /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */ + return __bm_find_next(device, bm_fo, 1); +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector. + * expected to be called for only a few bits (e - s about BITS_PER_LONG). + * Must hold bitmap lock already. */ +static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s, + unsigned long e, int val) +{ + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned int last_page_nr = -1U; + int c = 0; + int changed_total = 0; + + if (e >= b->bm_bits) { + drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", + s, e, b->bm_bits); + e = b->bm_bits ? b->bm_bits -1 : 0; + } + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); + if (page_nr != last_page_nr) { + if (p_addr) + __bm_unmap(p_addr); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + c = 0; + p_addr = __bm_map_pidx(b, page_nr); + last_page_nr = page_nr; + } + if (val) + c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); + else + c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); + } + if (p_addr) + __bm_unmap(p_addr); + if (c < 0) + bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); + else if (c > 0) + bm_set_page_need_writeout(b->bm_pages[last_page_nr]); + changed_total += c; + b->bm_set += changed_total; + return changed_total; +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector */ +static int bm_change_bits_to(struct drbd_device *device, const unsigned long s, + const unsigned long e, int val) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + int c = 0; + + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags) + bm_print_lock_info(device); + + c = __bm_change_bits_to(device, s, e, val); + + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + +/* returns number of bits changed 0 -> 1 */ +int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + return bm_change_bits_to(device, s, e, 1); +} + +/* returns number of bits changed 1 -> 0 */ +int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + return -bm_change_bits_to(device, s, e, 0); +} + +/* sets all bits in full words, + * from first_word up to, but not including, last_word */ +static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, + int page_nr, int first_word, int last_word) +{ + int i; + int bits; + int changed = 0; + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + for (i = first_word; i < last_word; i++) { + bits = hweight_long(paddr[i]); + paddr[i] = ~0UL; + changed += BITS_PER_LONG - bits; + } + kunmap_atomic(paddr); + if (changed) { + /* We only need lazy writeout, the information is still in the + * remote bitmap as well, and is reconstructed during the next + * bitmap exchange, if lost locally due to a crash. */ + bm_set_page_lazy_writeout(b->bm_pages[page_nr]); + b->bm_set += changed; + } +} + +/* Same thing as drbd_bm_set_bits, + * but more efficient for a large bit range. + * You must first drbd_bm_lock(). + * Can be called to set the whole bitmap in one go. + * Sets bits from s to e _inclusive_. */ +void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + /* First set_bit from the first bit (s) + * up to the next long boundary (sl), + * then assign full words up to the last long boundary (el), + * then set_bit up to and including the last bit (e). + * + * Do not use memset, because we must account for changes, + * so we need to loop over the words with hweight() anyways. + */ + struct drbd_bitmap *b = device->bitmap; + unsigned long sl = ALIGN(s,BITS_PER_LONG); + unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); + int first_page; + int last_page; + int page_nr; + int first_word; + int last_word; + + if (e - s <= 3*BITS_PER_LONG) { + /* don't bother; el and sl may even be wrong. */ + spin_lock_irq(&b->bm_lock); + __bm_change_bits_to(device, s, e, 1); + spin_unlock_irq(&b->bm_lock); + return; + } + + /* difference is large enough that we can trust sl and el */ + + spin_lock_irq(&b->bm_lock); + + /* bits filling the current long */ + if (sl) + __bm_change_bits_to(device, s, sl-1, 1); + + first_page = sl >> (3 + PAGE_SHIFT); + last_page = el >> (3 + PAGE_SHIFT); + + /* MLPP: modulo longs per page */ + /* LWPP: long words per page */ + first_word = MLPP(sl >> LN2_BPL); + last_word = LWPP; + + /* first and full pages, unless first page == last page */ + for (page_nr = first_page; page_nr < last_page; page_nr++) { + bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word); + spin_unlock_irq(&b->bm_lock); + cond_resched(); + first_word = 0; + spin_lock_irq(&b->bm_lock); + } + /* last page (respectively only page, for first page == last page) */ + last_word = MLPP(el >> LN2_BPL); + + /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples). + * ==> e = 32767, el = 32768, last_page = 2, + * and now last_word = 0. + * We do not want to touch last_page in this case, + * as we did not allocate it, it is not present in bitmap->bm_pages. + */ + if (last_word) + bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word); + + /* possibly trailing bits. + * example: (e & 63) == 63, el will be e+1. + * if that even was the very last bit, + * it would trigger an assert in __bm_change_bits_to() + */ + if (el <= e) + __bm_change_bits_to(device, el, e, 1); + spin_unlock_irq(&b->bm_lock); +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr; + int i; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + if (bitnr < b->bm_bits) { + p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr)); + i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0; + bm_unmap(p_addr); + } else if (bitnr == b->bm_bits) { + i = -1; + } else { /* (bitnr > b->bm_bits) */ + drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irqrestore(&b->bm_lock, flags); + return i; +} + +/* returns number of bits set in the range [s, e] */ +int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e) +{ + unsigned long flags; + struct drbd_bitmap *b = device->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned int page_nr = -1U; + int c = 0; + + /* If this is called without a bitmap, that is a bug. But just to be + * robust in case we screwed up elsewhere, in that case pretend there + * was one dirty bit in the requested area, so we won't try to do a + * local read there (no bitmap probably implies no disk) */ + if (!expect(b)) + return 1; + if (!expect(b->bm_pages)) + return 1; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned int idx = bm_bit_to_page_idx(b, bitnr); + if (page_nr != idx) { + page_nr = idx; + if (p_addr) + bm_unmap(p_addr); + p_addr = bm_map_pidx(b, idx); + } + if (expect(bitnr < b->bm_bits)) + c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + else + drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); + } + if (p_addr) + bm_unmap(p_addr); + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) +{ + struct drbd_bitmap *b = device->bitmap; + int count, s, e; + unsigned long flags; + unsigned long *p_addr, *bm; + + if (!expect(b)) + return 0; + if (!expect(b->bm_pages)) + return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (BM_DONT_TEST & b->bm_flags) + bm_print_lock_info(device); + + s = S2W(enr); + e = min((size_t)S2W(enr+1), b->bm_words); + count = 0; + if (s < b->bm_words) { + int n = e-s; + p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); + bm = p_addr + MLPP(s); + while (n--) + count += hweight_long(*bm++); + bm_unmap(p_addr); + } else { + drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock, flags); + return count; +} diff --git a/kernel/drivers/block/drbd/drbd_debugfs.c b/kernel/drivers/block/drbd/drbd_debugfs.c new file mode 100644 index 000000000..a6ee3d750 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_debugfs.c @@ -0,0 +1,958 @@ +#define pr_fmt(fmt) "drbd debugfs: " fmt +#include +#include +#include +#include +#include +#include +#include + +#include "drbd_int.h" +#include "drbd_req.h" +#include "drbd_debugfs.h" + + +/********************************************************************** + * Whenever you change the file format, remember to bump the version. * + **********************************************************************/ + +static struct dentry *drbd_debugfs_root; +static struct dentry *drbd_debugfs_version; +static struct dentry *drbd_debugfs_resources; +static struct dentry *drbd_debugfs_minors; + +static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt) +{ + if (valid) + seq_printf(m, "\t%d", jiffies_to_msecs(dt)); + else + seq_printf(m, "\t-"); +} + +static void __seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name, const char *unset_name) +{ + if (is_set && set_name) { + seq_putc(m, *sep); + seq_puts(m, set_name); + *sep = '|'; + } else if (!is_set && unset_name) { + seq_putc(m, *sep); + seq_puts(m, unset_name); + *sep = '|'; + } +} + +static void seq_print_rq_state_bit(struct seq_file *m, + bool is_set, char *sep, const char *set_name) +{ + __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL); +} + +/* pretty print enum drbd_req_state_bits req->rq_state */ +static void seq_print_request_state(struct seq_file *m, struct drbd_request *req) +{ + unsigned int s = req->rq_state; + char sep = ' '; + seq_printf(m, "\t0x%08x", s); + seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed"); + + /* RQ_WRITE ignored, already reported */ + seq_puts(m, "\tlocal:"); + seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL"); + seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed"); + seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted"); + seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + /* for_each_connection ... */ + seq_printf(m, "\tnet:"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending"); + seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued"); + seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent"); + seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done"); + seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis"); + seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok"); + if (sep == ' ') + seq_puts(m, " -"); + + seq_printf(m, " :"); + sep = ' '; + seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B"); + seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr"); + if (sep == ' ') + seq_puts(m, " -"); + seq_printf(m, "\n"); +} + +static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + /* change anything here, fixup header below! */ + unsigned int s = req->rq_state; + +#define RQ_HDR_1 "epoch\tsector\tsize\trw" + seq_printf(m, "0x%x\t%llu\t%u\t%s", + req->epoch, + (unsigned long long)req->i.sector, req->i.size >> 9, + (s & RQ_WRITE) ? "W" : "R"); + +#define RQ_HDR_2 "\tstart\tin AL\tsubmit" + seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif)); + seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif); + seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif); + +#define RQ_HDR_3 "\tsent\tacked\tdone" + seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif); + seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif); + seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif); + +#define RQ_HDR_4 "\tstate\n" + seq_print_request_state(m, req); +} +#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4 + +static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now) +{ + seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr); + seq_print_one_request(m, req, now); +} + +static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + struct drbd_md_io tmp; + /* In theory this is racy, + * in the sense that there could have been a + * drbd_md_put_buffer(); drbd_md_get_buffer(); + * between accessing these members here. */ + tmp = device->md_io; + if (atomic_read(&tmp.in_use)) { + seq_printf(m, "%u\t%u\t%d\t", + device->minor, device->vnr, + jiffies_to_msecs(now - tmp.start_jif)); + if (time_before(tmp.submit_jif, tmp.start_jif)) + seq_puts(m, "-\t"); + else + seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif)); + seq_printf(m, "%s\n", tmp.current_use); + } + } + rcu_read_unlock(); +} + +static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\tage\t#waiting\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + unsigned long jif; + struct drbd_request *req; + int n = atomic_read(&device->ap_actlog_cnt); + if (n) { + spin_lock_irq(&device->resource->req_lock); + req = list_first_entry_or_null(&device->pending_master_completion[1], + struct drbd_request, req_pending_master_completion); + /* if the oldest request does not wait for the activity log + * it is not interesting for us here */ + if (req && !(req->rq_state & RQ_IN_ACT_LOG)) + jif = req->start_jif; + else + req = NULL; + spin_unlock_irq(&device->resource->req_lock); + } + if (n) { + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + if (req) + seq_printf(m, "%u\t", jiffies_to_msecs(now - jif)); + else + seq_puts(m, "-\t"); + seq_printf(m, "%u\n", n); + } + } + rcu_read_unlock(); +} + +static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now) +{ + struct drbd_bm_aio_ctx *ctx; + unsigned long start_jif; + unsigned int in_flight; + unsigned int flags; + spin_lock_irq(&device->resource->req_lock); + ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list); + if (ctx && ctx->done) + ctx = NULL; + if (ctx) { + start_jif = ctx->start_jif; + in_flight = atomic_read(&ctx->in_flight); + flags = ctx->flags; + } + spin_unlock_irq(&device->resource->req_lock); + if (ctx) { + seq_printf(m, "%u\t%u\t%c\t%u\t%u\n", + device->minor, device->vnr, + (flags & BM_AIO_READ) ? 'R' : 'W', + jiffies_to_msecs(now - start_jif), + in_flight); + } +} + +static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n"); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_bitmap_io(m, device, now); + } + rcu_read_unlock(); +} + +/* pretty print enum peer_req->flags */ +static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req) +{ + unsigned long f = peer_req->flags; + char sep = ' '; + + __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing"); + __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal"); + seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL"); + seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); + seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); + + if (f & EE_IS_TRIM) { + seq_putc(m, sep); + sep = '|'; + if (f & EE_IS_TRIM_USE_ZEROOUT) + seq_puts(m, "zero-out"); + else + seq_puts(m, "trim"); + } + seq_putc(m, '\n'); +} + +static void seq_print_peer_request(struct seq_file *m, + struct drbd_device *device, struct list_head *lh, + unsigned long now) +{ + bool reported_preparing = false; + struct drbd_peer_request *peer_req; + list_for_each_entry(peer_req, lh, w.list) { + if (reported_preparing && !(peer_req->flags & EE_SUBMITTED)) + continue; + + if (device) + seq_printf(m, "%u\t%u\t", device->minor, device->vnr); + + seq_printf(m, "%llu\t%u\t%c\t%u\t", + (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9, + (peer_req->flags & EE_WRITE) ? 'W' : 'R', + jiffies_to_msecs(now - peer_req->submit_jif)); + seq_print_peer_request_flags(m, peer_req); + if (peer_req->flags & EE_SUBMITTED) + break; + else + reported_preparing = true; + } +} + +static void seq_print_device_peer_requests(struct seq_file *m, + struct drbd_device *device, unsigned long now) +{ + seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n"); + spin_lock_irq(&device->resource->req_lock); + seq_print_peer_request(m, device, &device->active_ee, now); + seq_print_peer_request(m, device, &device->read_ee, now); + seq_print_peer_request(m, device, &device->sync_ee, now); + spin_unlock_irq(&device->resource->req_lock); + if (test_bit(FLUSH_PENDING, &device->flags)) { + seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n", + device->minor, device->vnr, + jiffies_to_msecs(now - device->flush_jif)); + } +} + +static void seq_print_resource_pending_peer_requests(struct seq_file *m, + struct drbd_resource *resource, unsigned long now) +{ + struct drbd_device *device; + unsigned int i; + + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, i) { + seq_print_device_peer_requests(m, device, now); + } + rcu_read_unlock(); +} + +static void seq_print_resource_transfer_log_summary(struct seq_file *m, + struct drbd_resource *resource, + struct drbd_connection *connection, + unsigned long now) +{ + struct drbd_request *req; + unsigned int count = 0; + unsigned int show_state = 0; + + seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR); + spin_lock_irq(&resource->req_lock); + list_for_each_entry(req, &connection->transfer_log, tl_requests) { + unsigned int tmp = 0; + unsigned int s; + ++count; + + /* don't disable irq "forever" */ + if (!(count & 0x1ff)) { + struct drbd_request *req_next; + kref_get(&req->kref); + spin_unlock_irq(&resource->req_lock); + cond_resched(); + spin_lock_irq(&resource->req_lock); + req_next = list_next_entry(req, tl_requests); + if (kref_put(&req->kref, drbd_req_destroy)) + req = req_next; + if (&req->tl_requests == &connection->transfer_log) + break; + } + + s = req->rq_state; + + /* This is meant to summarize timing issues, to be able to tell + * local disk problems from network problems. + * Skip requests, if we have shown an even older request with + * similar aspects already. */ + if (req->master_bio == NULL) + tmp |= 1; + if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING)) + tmp |= 2; + if (s & RQ_NET_MASK) { + if (!(s & RQ_NET_SENT)) + tmp |= 4; + if (s & RQ_NET_PENDING) + tmp |= 8; + if (!(s & RQ_NET_DONE)) + tmp |= 16; + } + if ((tmp & show_state) == tmp) + continue; + show_state |= tmp; + seq_printf(m, "%u\t", count); + seq_print_minor_vnr_req(m, req, now); + if (show_state == 0x1f) + break; + } + spin_unlock_irq(&resource->req_lock); +} + +/* TODO: transfer_log and friends should be moved to resource */ +static int in_flight_summary_show(struct seq_file *m, void *pos) +{ + struct drbd_resource *resource = m->private; + struct drbd_connection *connection; + unsigned long jif = jiffies; + + connection = first_connection(resource); + /* This does not happen, actually. + * But be robust and prepare for future code changes. */ + if (!connection || !kref_get_unless_zero(&connection->kref)) + return -ESTALE; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "oldest bitmap IO\n"); + seq_print_resource_pending_bitmap_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "meta data IO\n"); + seq_print_resource_pending_meta_io(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "socket buffer stats\n"); + /* for each connection ... once we have more than one */ + rcu_read_lock(); + if (connection->data.socket) { + /* open coded SIOCINQ, the "relevant" part */ + struct tcp_sock *tp = tcp_sk(connection->data.socket->sk); + int answ = tp->rcv_nxt - tp->copied_seq; + seq_printf(m, "unread receive buffer: %u Byte\n", answ); + /* open coded SIOCOUTQ, the "relevant" part */ + answ = tp->write_seq - tp->snd_una; + seq_printf(m, "unacked send buffer: %u Byte\n", answ); + } + rcu_read_unlock(); + seq_putc(m, '\n'); + + seq_puts(m, "oldest peer requests\n"); + seq_print_resource_pending_peer_requests(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "application requests waiting for activity log\n"); + seq_print_waiting_for_AL(m, resource, jif); + seq_putc(m, '\n'); + + seq_puts(m, "oldest application requests\n"); + seq_print_resource_transfer_log_summary(m, resource, connection, jif); + seq_putc(m, '\n'); + + jif = jiffies - jif; + if (jif) + seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif)); + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), + * but neither is "reachable" from here. + * So we have our own inline version of it above. :-( */ +static inline int debugfs_positive(struct dentry *dentry) +{ + return d_really_is_positive(dentry) && !d_unhashed(dentry); +} + +/* make sure at *open* time that the respective object won't go away. */ +static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), + void *data, struct kref *kref, + void (*release)(struct kref *)) +{ + struct dentry *parent; + int ret = -ESTALE; + + /* Are we still linked, + * or has debugfs_remove() already been called? */ + parent = file->f_path.dentry->d_parent; + /* not sure if this can happen: */ + if (!parent || d_really_is_negative(parent)) + goto out; + /* serialize with d_delete() */ + mutex_lock(&d_inode(parent)->i_mutex); + /* Make sure the object is still alive */ + if (debugfs_positive(file->f_path.dentry) + && kref_get_unless_zero(kref)) + ret = 0; + mutex_unlock(&d_inode(parent)->i_mutex); + if (!ret) { + ret = single_open(file, show, data); + if (ret) + kref_put(kref, release); + } +out: + return ret; +} + +static int in_flight_summary_open(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + return drbd_single_open(file, in_flight_summary_show, resource, + &resource->kref, drbd_destroy_resource); +} + +static int in_flight_summary_release(struct inode *inode, struct file *file) +{ + struct drbd_resource *resource = inode->i_private; + kref_put(&resource->kref, drbd_destroy_resource); + return single_release(inode, file); +} + +static const struct file_operations in_flight_summary_fops = { + .owner = THIS_MODULE, + .open = in_flight_summary_open, + .read = seq_read, + .llseek = seq_lseek, + .release = in_flight_summary_release, +}; + +void drbd_debugfs_resource_add(struct drbd_resource *resource) +{ + struct dentry *dentry; + if (!drbd_debugfs_resources) + return; + + dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res = dentry; + + dentry = debugfs_create_dir("volumes", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_volumes = dentry; + + dentry = debugfs_create_dir("connections", resource->debugfs_res); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_connections = dentry; + + dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP, + resource->debugfs_res, resource, + &in_flight_summary_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + resource->debugfs_res_in_flight_summary = dentry; + return; + +fail: + drbd_debugfs_resource_cleanup(resource); + drbd_err(resource, "failed to create debugfs dentry\n"); +} + +static void drbd_debugfs_remove(struct dentry **dp) +{ + debugfs_remove(*dp); + *dp = NULL; +} + +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) +{ + /* it is ok to call debugfs_remove(NULL) */ + drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary); + drbd_debugfs_remove(&resource->debugfs_res_connections); + drbd_debugfs_remove(&resource->debugfs_res_volumes); + drbd_debugfs_remove(&resource->debugfs_res); +} + +static void seq_print_one_timing_detail(struct seq_file *m, + const struct drbd_thread_timing_details *tdp, + unsigned long now) +{ + struct drbd_thread_timing_details td; + /* No locking... + * use temporary assignment to get at consistent data. */ + do { + td = *tdp; + } while (td.cb_nr != tdp->cb_nr); + if (!td.cb_addr) + return; + seq_printf(m, "%u\t%d\t%s:%u\t%ps\n", + td.cb_nr, + jiffies_to_msecs(now - td.start_jif), + td.caller_fn, td.line, + td.cb_addr); +} + +static void seq_print_timing_details(struct seq_file *m, + const char *title, + unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now) +{ + unsigned int start_idx; + unsigned int i; + + seq_printf(m, "%s\n", title); + /* If not much is going on, this will result in natural ordering. + * If it is very busy, we will possibly skip events, or even see wrap + * arounds, which could only be avoided with locking. + */ + start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST; + for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++) + seq_print_one_timing_detail(m, tdp+i, now); + for (i = 0; i < start_idx; i++) + seq_print_one_timing_detail(m, tdp+i, now); +} + +static int callback_history_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long jif = jiffies; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, "n\tage\tcallsite\tfn\n"); + seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif); + seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif); + return 0; +} + +static int callback_history_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, callback_history_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int callback_history_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_callback_history_fops = { + .owner = THIS_MODULE, + .open = callback_history_open, + .read = seq_read, + .llseek = seq_lseek, + .release = callback_history_release, +}; + +static int connection_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_connection *connection = m->private; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + spin_lock_irq(&connection->resource->req_lock); + r1 = connection->req_next; + if (r1) + seq_print_minor_vnr_req(m, r1, now); + r2 = connection->req_ack_pending; + if (r2 && r2 != r1) { + r1 = r2; + seq_print_minor_vnr_req(m, r1, now); + } + r2 = connection->req_not_net_done; + if (r2 && r2 != r1) + seq_print_minor_vnr_req(m, r2, now); + spin_unlock_irq(&connection->resource->req_lock); + return 0; +} + +static int connection_oldest_requests_open(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + return drbd_single_open(file, connection_oldest_requests_show, connection, + &connection->kref, drbd_destroy_connection); +} + +static int connection_oldest_requests_release(struct inode *inode, struct file *file) +{ + struct drbd_connection *connection = inode->i_private; + kref_put(&connection->kref, drbd_destroy_connection); + return single_release(inode, file); +} + +static const struct file_operations connection_oldest_requests_fops = { + .owner = THIS_MODULE, + .open = connection_oldest_requests_open, + .read = seq_read, + .llseek = seq_lseek, + .release = connection_oldest_requests_release, +}; + +void drbd_debugfs_connection_add(struct drbd_connection *connection) +{ + struct dentry *conns_dir = connection->resource->debugfs_res_connections; + struct dentry *dentry; + if (!conns_dir) + return; + + /* Once we enable mutliple peers, + * these connections will have descriptive names. + * For now, it is just the one connection to the (only) "peer". */ + dentry = debugfs_create_dir("peer", conns_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn = dentry; + + dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_callback_history_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_callback_history = dentry; + + dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP, + connection->debugfs_conn, connection, + &connection_oldest_requests_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + connection->debugfs_conn_oldest_requests = dentry; + return; + +fail: + drbd_debugfs_connection_cleanup(connection); + drbd_err(connection, "failed to create debugfs dentry\n"); +} + +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) +{ + drbd_debugfs_remove(&connection->debugfs_conn_callback_history); + drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests); + drbd_debugfs_remove(&connection->debugfs_conn); +} + +static void resync_dump_detail(struct seq_file *m, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(m, "%5d %s %s %s", bme->rs_left, + test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------", + test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------", + test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------" + ); +} + +static int device_resync_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->resync); + lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail); + put_ldev(device); + } + return 0; +} + +static int device_act_log_extents_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + if (get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(m, device->act_log); + lc_seq_dump_details(m, device->act_log, "", NULL); + put_ldev(device); + } + return 0; +} + +static int device_oldest_requests_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_resource *resource = device->resource; + unsigned long now = jiffies; + struct drbd_request *r1, *r2; + int i; + + /* BUMP me if you change the file format/content/presentation */ + seq_printf(m, "v: %u\n\n", 0); + + seq_puts(m, RQ_HDR); + spin_lock_irq(&resource->req_lock); + /* WRITE, then READ */ + for (i = 1; i >= 0; --i) { + r1 = list_first_entry_or_null(&device->pending_master_completion[i], + struct drbd_request, req_pending_master_completion); + r2 = list_first_entry_or_null(&device->pending_completion[i], + struct drbd_request, req_pending_local); + if (r1) + seq_print_one_request(m, r1, now); + if (r2 && r2 != r1) + seq_print_one_request(m, r2, now); + } + spin_unlock_irq(&resource->req_lock); + return 0; +} + +static int device_data_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + struct drbd_md *md; + enum drbd_uuid_index idx; + + if (!get_ldev_if_state(device, D_FAILED)) + return -ENODEV; + + md = &device->ldev->md; + spin_lock_irq(&md->uuid_lock); + for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) { + seq_printf(m, "0x%016llX\n", md->uuid[idx]); + } + spin_unlock_irq(&md->uuid_lock); + put_ldev(device); + return 0; +} + +#define drbd_debugfs_device_attr(name) \ +static int device_ ## name ## _open(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + return drbd_single_open(file, device_ ## name ## _show, device, \ + &device->kref, drbd_destroy_device); \ +} \ +static int device_ ## name ## _release(struct inode *inode, struct file *file) \ +{ \ + struct drbd_device *device = inode->i_private; \ + kref_put(&device->kref, drbd_destroy_device); \ + return single_release(inode, file); \ +} \ +static const struct file_operations device_ ## name ## _fops = { \ + .owner = THIS_MODULE, \ + .open = device_ ## name ## _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = device_ ## name ## _release, \ +}; + +drbd_debugfs_device_attr(oldest_requests) +drbd_debugfs_device_attr(act_log_extents) +drbd_debugfs_device_attr(resync_extents) +drbd_debugfs_device_attr(data_gen_id) + +void drbd_debugfs_device_add(struct drbd_device *device) +{ + struct dentry *vols_dir = device->resource->debugfs_res_volumes; + char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */ + char vnr_buf[8]; /* volume number vnr is even 16 bit only; */ + char *slink_name = NULL; + + struct dentry *dentry; + if (!vols_dir || !drbd_debugfs_minors) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr); + dentry = debugfs_create_dir(vnr_buf, vols_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_vol = dentry; + + snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor); + slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u", + device->resource->name, device->vnr); + if (!slink_name) + goto fail; + dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name); + kfree(slink_name); + slink_name = NULL; + if (IS_ERR_OR_NULL(dentry)) + goto fail; + device->debugfs_minor = dentry; + +#define DCF(name) do { \ + dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \ + device->debugfs_vol, device, \ + &device_ ## name ## _fops); \ + if (IS_ERR_OR_NULL(dentry)) \ + goto fail; \ + device->debugfs_vol_ ## name = dentry; \ + } while (0) + + DCF(oldest_requests); + DCF(act_log_extents); + DCF(resync_extents); + DCF(data_gen_id); +#undef DCF + return; + +fail: + drbd_debugfs_device_cleanup(device); + drbd_err(device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_device_cleanup(struct drbd_device *device) +{ + drbd_debugfs_remove(&device->debugfs_minor); + drbd_debugfs_remove(&device->debugfs_vol_oldest_requests); + drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); + drbd_debugfs_remove(&device->debugfs_vol_resync_extents); + drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol); +} + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) +{ + struct dentry *conn_dir = peer_device->connection->debugfs_conn; + struct dentry *dentry; + char vnr_buf[8]; + + if (!conn_dir) + return; + + snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr); + dentry = debugfs_create_dir(vnr_buf, conn_dir); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + peer_device->debugfs_peer_dev = dentry; + return; + +fail: + drbd_debugfs_peer_device_cleanup(peer_device); + drbd_err(peer_device, "failed to create debugfs entries\n"); +} + +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) +{ + drbd_debugfs_remove(&peer_device->debugfs_peer_dev); +} + +static int drbd_version_show(struct seq_file *m, void *ignored) +{ + seq_printf(m, "# %s\n", drbd_buildtag()); + seq_printf(m, "VERSION=%s\n", REL_VERSION); + seq_printf(m, "API_VERSION=%u\n", API_VERSION); + seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN); + seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX); + return 0; +} + +static int drbd_version_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_version_show, NULL); +} + +static struct file_operations drbd_version_fops = { + .owner = THIS_MODULE, + .open = drbd_version_open, + .llseek = seq_lseek, + .read = seq_read, + .release = single_release, +}; + +/* not __exit, may be indirectly called + * from the module-load-failure path as well. */ +void drbd_debugfs_cleanup(void) +{ + drbd_debugfs_remove(&drbd_debugfs_resources); + drbd_debugfs_remove(&drbd_debugfs_minors); + drbd_debugfs_remove(&drbd_debugfs_version); + drbd_debugfs_remove(&drbd_debugfs_root); +} + +int __init drbd_debugfs_init(void) +{ + struct dentry *dentry; + + dentry = debugfs_create_dir("drbd", NULL); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_root = dentry; + + dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_version = dentry; + + dentry = debugfs_create_dir("resources", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_resources = dentry; + + dentry = debugfs_create_dir("minors", drbd_debugfs_root); + if (IS_ERR_OR_NULL(dentry)) + goto fail; + drbd_debugfs_minors = dentry; + return 0; + +fail: + drbd_debugfs_cleanup(); + if (dentry) + return PTR_ERR(dentry); + else + return -EINVAL; +} diff --git a/kernel/drivers/block/drbd/drbd_debugfs.h b/kernel/drivers/block/drbd/drbd_debugfs.h new file mode 100644 index 000000000..8bee21340 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_debugfs.h @@ -0,0 +1,39 @@ +#include +#include +#include + +#include "drbd_int.h" + +#ifdef CONFIG_DEBUG_FS +int __init drbd_debugfs_init(void); +void drbd_debugfs_cleanup(void); + +void drbd_debugfs_resource_add(struct drbd_resource *resource); +void drbd_debugfs_resource_cleanup(struct drbd_resource *resource); + +void drbd_debugfs_connection_add(struct drbd_connection *connection); +void drbd_debugfs_connection_cleanup(struct drbd_connection *connection); + +void drbd_debugfs_device_add(struct drbd_device *device); +void drbd_debugfs_device_cleanup(struct drbd_device *device); + +void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device); +void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device); +#else + +static inline int __init drbd_debugfs_init(void) { return -ENODEV; } +static inline void drbd_debugfs_cleanup(void) { } + +static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { } +static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { } + +static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { } +static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { } + +static inline void drbd_debugfs_device_add(struct drbd_device *device) { } +static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { } + +static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { } +static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { } + +#endif diff --git a/kernel/drivers/block/drbd/drbd_int.h b/kernel/drivers/block/drbd/drbd_int.h new file mode 100644 index 000000000..b905e9888 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_int.h @@ -0,0 +1,2328 @@ +/* + drbd_int.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "drbd_strings.h" +#include "drbd_state.h" +#include "drbd_protocol.h" + +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +/* module parameter, defined in drbd_main.c */ +extern unsigned int minor_count; +extern bool disable_sendpage; +extern bool allow_oos; +void tl_abort_disk_io(struct drbd_device *device); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +extern char usermode_helper[]; + + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + */ +#define DRBD_SIGKILL SIGHUP + +#define ID_IN_SYNC (4711ULL) +#define ID_OUT_OF_SYNC (4712ULL) +#define ID_SYNCER (-1ULL) + +#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL) + +struct drbd_device; +struct drbd_connection; + +#define __drbd_printk_device(level, device, fmt, args...) \ + dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args) +#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \ + dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args) +#define __drbd_printk_resource(level, resource, fmt, args...) \ + printk(level "drbd %s: " fmt, (resource)->name, ## args) +#define __drbd_printk_connection(level, connection, fmt, args...) \ + printk(level "drbd %s: " fmt, (connection)->resource->name, ## args) + +void drbd_printk_with_wrong_object_type(void); + +#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \ + (__builtin_types_compatible_p(typeof(obj), type) || \ + __builtin_types_compatible_p(typeof(obj), const type)), \ + func(level, (const type)(obj), fmt, ## args) + +#define drbd_printk(level, obj, fmt, args...) \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_device *, \ + __drbd_printk_device, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_resource *, \ + __drbd_printk_resource, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_connection *, \ + __drbd_printk_connection, level, fmt, ## args), \ + __builtin_choose_expr( \ + __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \ + __drbd_printk_peer_device, level, fmt, ## args), \ + drbd_printk_with_wrong_object_type())))) + +#define drbd_dbg(obj, fmt, args...) \ + drbd_printk(KERN_DEBUG, obj, fmt, ## args) +#define drbd_alert(obj, fmt, args...) \ + drbd_printk(KERN_ALERT, obj, fmt, ## args) +#define drbd_err(obj, fmt, args...) \ + drbd_printk(KERN_ERR, obj, fmt, ## args) +#define drbd_warn(obj, fmt, args...) \ + drbd_printk(KERN_WARNING, obj, fmt, ## args) +#define drbd_info(obj, fmt, args...) \ + drbd_printk(KERN_INFO, obj, fmt, ## args) +#define drbd_emerg(obj, fmt, args...) \ + drbd_printk(KERN_EMERG, obj, fmt, ## args) + +#define dynamic_drbd_dbg(device, fmt, args...) \ + dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args) + +#define D_ASSERT(device, exp) do { \ + if (!(exp)) \ + drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \ + } while (0) + +/** + * expect - Make an assertion + * + * Unlike the assert macro, this macro returns a boolean result. + */ +#define expect(exp) ({ \ + bool _bool = (exp); \ + if (!_bool) \ + drbd_err(device, "ASSERTION %s FAILED in %s\n", \ + #exp, __func__); \ + _bool; \ + }) + +/* Defines to control fault insertion */ +enum { + DRBD_FAULT_MD_WR = 0, /* meta data write */ + DRBD_FAULT_MD_RD = 1, /* read */ + DRBD_FAULT_RS_WR = 2, /* resync */ + DRBD_FAULT_RS_RD = 3, + DRBD_FAULT_DT_WR = 4, /* data */ + DRBD_FAULT_DT_RD = 5, + DRBD_FAULT_DT_RA = 6, /* data read ahead */ + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ + DRBD_FAULT_AL_EE = 8, /* alloc ee */ + DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */ + + DRBD_FAULT_MAX, +}; + +extern unsigned int +_drbd_insert_fault(struct drbd_device *device, unsigned int type); + +static inline int +drbd_insert_fault(struct drbd_device *device, unsigned int type) { +#ifdef CONFIG_DRBD_FAULT_INJECTION + return fault_rate && + (enable_faults & (1<command == P_BITMAP) */ + unsigned packets[2]; + unsigned bytes[2]; +}; + +extern void INFO_bm_xfer_stats(struct drbd_device *device, + const char *direction, struct bm_xfer_ctx *c); + +static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) +{ + /* word_offset counts "native long words" (32 or 64 bit), + * aligned at 64 bit. + * Encoded packet may end at an unaligned bit offset. + * In case a fallback clear text packet is transmitted in + * between, we adjust this offset back to the last 64bit + * aligned "native long word", which makes coding and decoding + * the plain text bitmap much more convenient. */ +#if BITS_PER_LONG == 64 + c->word_offset = c->bit_offset >> 6; +#elif BITS_PER_LONG == 32 + c->word_offset = c->bit_offset >> 5; + c->word_offset &= ~(1UL); +#else +# error "unsupported BITS_PER_LONG" +#endif +} + +extern unsigned int drbd_header_size(struct drbd_connection *connection); + +/**********************************************************************/ +enum drbd_thread_state { + NONE, + RUNNING, + EXITING, + RESTARTING +}; + +struct drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion stop; + enum drbd_thread_state t_state; + int (*function) (struct drbd_thread *); + struct drbd_resource *resource; + struct drbd_connection *connection; + int reset_cpu_mask; + const char *name; +}; + +static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return thi->t_state; +} + +struct drbd_work { + struct list_head list; + int (*cb)(struct drbd_work *, int cancel); +}; + +struct drbd_device_work { + struct drbd_work w; + struct drbd_device *device; +}; + +#include "drbd_interval.h" + +extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); + +struct drbd_request { + struct drbd_work w; + struct drbd_device *device; + + /* if local IO is not allowed, will be NULL. + * if local IO _is_ allowed, holds the locally submitted bio clone, + * or, after local IO completion, the ERR_PTR(error). + * see drbd_request_endio(). */ + struct bio *private_bio; + + struct drbd_interval i; + + /* epoch: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * causing a p_barrier packet to be send, starting a new epoch. + * + * This corresponds to "barrier" in struct p_barrier[_ack], + * and to "barrier_nr" in struct drbd_epoch (and various + * comments/function parameters/local variable names). + */ + unsigned int epoch; + + struct list_head tl_requests; /* ring list in the transfer log */ + struct bio *master_bio; /* master bio pointer */ + + /* see struct drbd_device */ + struct list_head req_pending_master_completion; + struct list_head req_pending_local; + + /* for generic IO accounting */ + unsigned long start_jif; + + /* for DRBD internal statistics */ + + /* Minimal set of time stamps to determine if we wait for activity log + * transactions, local disk or peer. 32 bit "jiffies" are good enough, + * we don't expect a DRBD request to be stalled for several month. + */ + + /* before actual request processing */ + unsigned long in_actlog_jif; + + /* local disk */ + unsigned long pre_submit_jif; + + /* per connection */ + unsigned long pre_send_jif; + unsigned long acked_jif; + unsigned long net_done_jif; + + /* Possibly even more detail to track each phase: + * master_completion_jif + * how long did it take to complete the master bio + * (application visible latency) + * allocated_jif + * how long the master bio was blocked until we finally allocated + * a tracking struct + * in_actlog_jif + * how long did we wait for activity log transactions + * + * net_queued_jif + * when did we finally queue it for sending + * pre_send_jif + * when did we start sending it + * post_send_jif + * how long did we block in the network stack trying to send it + * acked_jif + * when did we receive (or fake, in protocol A) a remote ACK + * net_done_jif + * when did we receive final acknowledgement (P_BARRIER_ACK), + * or decide, e.g. on connection loss, that we do no longer expect + * anything from this peer for this request. + * + * pre_submit_jif + * post_sub_jif + * when did we start submiting to the lower level device, + * and how long did we block in that submit function + * local_completion_jif + * how long did it take the lower level device to complete this request + */ + + + /* once it hits 0, we may complete the master_bio */ + atomic_t completion_ref; + /* once it hits 0, we may destroy this drbd_request object */ + struct kref kref; + + unsigned rq_state; /* see comments above _req_mod() */ +}; + +struct drbd_epoch { + struct drbd_connection *connection; + struct list_head list; + unsigned int barrier_nr; + atomic_t epoch_size; /* increased on every request added. */ + atomic_t active; /* increased on every req. added, and dec on every finished. */ + unsigned long flags; +}; + +/* Prototype declaration of function defined in drbd_receiver.c */ +int drbdd_init(struct drbd_thread *); +int drbd_asender(struct drbd_thread *); + +/* drbd_epoch flag bits */ +enum { + DE_HAVE_BARRIER_NUMBER, +}; + +enum epoch_event { + EV_PUT, + EV_GOT_BARRIER_NR, + EV_BECAME_LAST, + EV_CLEANUP = 32, /* used as flag */ +}; + +struct digest_info { + int digest_size; + void *digest; +}; + +struct drbd_peer_request { + struct drbd_work w; + struct drbd_peer_device *peer_device; + struct drbd_epoch *epoch; /* for writes */ + struct page *pages; + atomic_t pending_bios; + struct drbd_interval i; + /* see comments on ee flag bits below */ + unsigned long flags; + unsigned long submit_jif; + union { + u64 block_id; + struct digest_info *digest; + }; +}; + +/* ee flag bits. + * While corresponding bios are in flight, the only modification will be + * set_bit WAS_ERROR, which has to be atomic. + * If no bios are in flight yet, or all have been completed, + * non-atomic modification to ee->flags is ok. + */ +enum { + __EE_CALL_AL_COMPLETE_IO, + __EE_MAY_SET_IN_SYNC, + + /* is this a TRIM aka REQ_DISCARD? */ + __EE_IS_TRIM, + /* our lower level cannot handle trim, + * and we want to fall back to zeroout instead */ + __EE_IS_TRIM_USE_ZEROOUT, + + /* In case a barrier failed, + * we need to resubmit without the barrier flag. */ + __EE_RESUBMITTED, + + /* we may have several bios per peer request. + * if any of those fail, we set this flag atomically + * from the endio callback */ + __EE_WAS_ERROR, + + /* This ee has a pointer to a digest instead of a block id */ + __EE_HAS_DIGEST, + + /* Conflicting local requests need to be restarted after this request */ + __EE_RESTART_REQUESTS, + + /* The peer wants a write ACK for this (wire proto C) */ + __EE_SEND_WRITE_ACK, + + /* Is set when net_conf had two_primaries set while creating this peer_req */ + __EE_IN_INTERVAL_TREE, + + /* for debugfs: */ + /* has this been submitted, or does it still wait for something else? */ + __EE_SUBMITTED, + + /* this is/was a write request */ + __EE_WRITE, + + /* this originates from application on peer + * (not some resync or verify or other DRBD internal request) */ + __EE_APPLICATION, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) +#define EE_IS_TRIM (1<<__EE_IS_TRIM) +#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT) +#define EE_RESUBMITTED (1<<__EE_RESUBMITTED) +#define EE_WAS_ERROR (1<<__EE_WAS_ERROR) +#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST) +#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS) +#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK) +#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) +#define EE_SUBMITTED (1<<__EE_SUBMITTED) +#define EE_WRITE (1<<__EE_WRITE) +#define EE_APPLICATION (1<<__EE_APPLICATION) + +/* flag bits per device */ +enum { + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ + MD_DIRTY, /* current uuids and flags not yet on disk */ + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, /* This node was a crashed primary. + * Gets cleared when the state.conn + * goes into C_CONNECTED state. */ + CONSIDER_RESYNC, + + MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ + + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + WAS_IO_ERROR, /* Local disk failed, returned IO error */ + WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */ + FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + RESIZE_PENDING, /* Size change detected locally, waiting for the response from + * the peer, if it changed there as well. */ + NEW_CUR_UUID, /* Create new current UUID when thawing IO */ + AL_SUSPENDED, /* Activity logging is currently suspended. */ + AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + B_RS_H_DONE, /* Before resync handler done (already executed) */ + DISCARD_MY_DATA, /* discard_my_data flag per volume */ + READ_BALANCE_RR, + + FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush + * from drbd_flush_after_epoch() */ + + /* cleared only after backing device related structures have been destroyed. */ + GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */ + + /* to be used in drbd_device_post_work() */ + GO_DISKLESS, /* tell worker to schedule cleanup before detach */ + DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */ + MD_SYNC, /* tell worker to call drbd_md_sync() */ + RS_START, /* tell worker to start resync/OV */ + RS_PROGRESS, /* tell worker that resync made significant progress */ + RS_DONE, /* tell worker that resync is done */ +}; + +struct drbd_bitmap; /* opaque for drbd_device */ + +/* definition of bits in bm_flags to be used in drbd_bm_lock + * and drbd_bitmap_io and friends. */ +enum bm_flag { + /* do we need to kfree, or vfree bm_pages? */ + BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ + + /* currently locked for bulk operation */ + BM_LOCKED_MASK = 0xf, + + /* in detail, that is: */ + BM_DONT_CLEAR = 0x1, + BM_DONT_SET = 0x2, + BM_DONT_TEST = 0x4, + + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + + /* (test bit, count bit) allowed (common case) */ + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, + + /* testing bits, as well as setting new bits allowed, but clearing bits + * would be unexpected. Used during bitmap receive. Setting new bits + * requires sending of "out-of-sync" information, though. */ + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, + + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, +}; + +struct drbd_work_queue { + struct list_head q; + spinlock_t q_lock; /* to protect the list. */ + wait_queue_head_t q_wait; +}; + +struct drbd_socket { + struct mutex mutex; + struct socket *socket; + /* this way we get our + * send/receive buffers off the stack */ + void *sbuf; + void *rbuf; +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + spinlock_t uuid_lock; + u64 uuid[UI_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to activity log */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* cached value of bdev->disk_conf->meta_dev_idx (see below) */ + s32 meta_dev_idx; + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + u32 al_size_4k; /* cached product of the above */ +}; + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct drbd_md md; + struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */ + sector_t known_size; /* last known size of that backing device */ +}; + +struct drbd_md_io { + struct page *page; + unsigned long start_jif; /* last call to drbd_md_get_buffer */ + unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */ + const char *current_use; + atomic_t in_use; + unsigned int done; + int error; +}; + +struct bm_io_work { + struct drbd_work w; + char *why; + enum bm_flag flags; + int (*io_fn)(struct drbd_device *device); + void (*done)(struct drbd_device *device, int rv); +}; + +enum write_ordering_e { + WO_none, + WO_drain_io, + WO_bdev_flush, +}; + +struct fifo_buffer { + unsigned int head_index; + unsigned int size; + int total; /* sum of all values */ + int values[0]; +}; +extern struct fifo_buffer *fifo_alloc(int fifo_size); + +/* flag bits per connection */ +enum { + NET_CONGESTED, /* The data socket is congested */ + RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */ + SEND_PING, /* whether asender should send a ping asap */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ + CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ + CONN_WD_ST_CHG_OKAY, + CONN_WD_ST_CHG_FAIL, + CONN_DRY_RUN, /* Expect disconnect after resync handshake. */ + CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ + DISCONNECT_SENT, + + DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ +}; + +struct drbd_resource { + char *name; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_res; + struct dentry *debugfs_res_volumes; + struct dentry *debugfs_res_connections; + struct dentry *debugfs_res_in_flight_summary; +#endif + struct kref kref; + struct idr devices; /* volume number to device mapping */ + struct list_head connections; + struct list_head resources; + struct res_opts res_opts; + struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */ + struct mutex adm_mutex; /* mutex to serialize administrative requests */ + spinlock_t req_lock; + + unsigned susp:1; /* IO suspended by user */ + unsigned susp_nod:1; /* IO suspended because no data */ + unsigned susp_fen:1; /* IO suspended because fence peer handler runs */ + + enum write_ordering_e write_ordering; + + cpumask_var_t cpu_mask; +}; + +struct drbd_thread_timing_details +{ + unsigned long start_jif; + void *cb_addr; + const char *caller_fn; + unsigned int line; + unsigned int cb_nr; +}; + +struct drbd_connection { + struct list_head connections; + struct drbd_resource *resource; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_conn; + struct dentry *debugfs_conn_callback_history; + struct dentry *debugfs_conn_oldest_requests; +#endif + struct kref kref; + struct idr peer_devices; /* volume number to peer device mapping */ + enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */ + struct mutex cstate_mutex; /* Protects graceful disconnects */ + unsigned int connect_cnt; /* Inc each time a connection is established */ + + unsigned long flags; + struct net_conf *net_conf; /* content protected by rcu */ + wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */ + + struct sockaddr_storage my_addr; + int my_addr_len; + struct sockaddr_storage peer_addr; + int peer_addr_len; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + u32 agreed_features; + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + + struct list_head transfer_log; /* all requests not yet fully processed */ + + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */ + struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + void *int_dig_in; + void *int_dig_vv; + + /* receiver side */ + struct drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + atomic_t current_tle_nr; /* transfer log epoch number */ + unsigned current_tle_writes; /* writes seen within this tl epoch */ + + unsigned long last_reconnect_jif; + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; + + /* cached pointers, + * so we can look up the oldest pending requests more quickly. + * protected by resource->req_lock */ + struct drbd_request *req_next; /* DRBD 9: todo.req_next */ + struct drbd_request *req_ack_pending; + struct drbd_request *req_not_net_done; + + /* sender side */ + struct drbd_work_queue sender_work; + +#define DRBD_THREAD_DETAILS_HIST 16 + unsigned int w_cb_nr; /* keeps counting up */ + unsigned int r_cb_nr; /* keeps counting up */ + struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST]; + struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; + + struct { + /* whether this sender thread + * has processed a single write yet. */ + bool seen_any_write_yet; + + /* Which barrier number to send with the next P_BARRIER */ + int current_epoch_nr; + + /* how many write requests have been sent + * with req->epoch == current_epoch_nr. + * If none, no P_BARRIER will be sent. */ + unsigned current_epoch_writes; + } send; +}; + +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line); + +#define update_worker_timing_details(c, cb) \ + __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ ) +#define update_receiver_timing_details(c, cb) \ + __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ ) + +struct submit_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + /* protected by ..->resource->req_lock */ + struct list_head writes; +}; + +struct drbd_peer_device { + struct list_head peer_devices; + struct drbd_device *device; + struct drbd_connection *connection; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_peer_dev; +#endif +}; + +struct drbd_device { + struct drbd_resource *resource; + struct list_head peer_devices; + struct list_head pending_bitmap_io; + + unsigned long flush_jif; +#ifdef CONFIG_DEBUG_FS + struct dentry *debugfs_minor; + struct dentry *debugfs_vol; + struct dentry *debugfs_vol_oldest_requests; + struct dentry *debugfs_vol_act_log_extents; + struct dentry *debugfs_vol_resync_extents; + struct dentry *debugfs_vol_data_gen_id; +#endif + + unsigned int vnr; /* volume number within the connection */ + unsigned int minor; /* device minor number */ + + struct kref kref; + + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct drbd_backing_dev *ldev __protected_by(local); + + sector_t p_size; /* partner's disk size */ + struct request_queue *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + unsigned long last_reattach_jif; + struct drbd_work resync_work; + struct drbd_work unplug_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + struct timer_list start_resync_timer; + struct timer_list request_timer; + + /* Used after attach while negotiating new disk state. */ + union drbd_state new_state_tmp; + + union drbd_dev_state state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; /* upon each state change. */ + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_actlog_cnt; /* Requests waiting for activity log */ + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ + atomic_t unacked_cnt; /* Need to send replies for */ + atomic_t local_cnt; /* Waiting for local completion */ + + /* Interval tree of pending local requests */ + struct rb_root read_requests; + struct rb_root write_requests; + + /* for statistics and timeouts */ + /* [0] read, [1] write */ + struct list_head pending_master_completion[2]; + struct list_head pending_completion[2]; + + /* use checksums for *this* resync */ + bool use_csums; + /* blocks to resync in this run [unit BM_BLOCK_SIZE] */ + unsigned long rs_total; + /* number of resync blocks that failed in this run */ + unsigned long rs_failed; + /* Syncer's start time [unit jiffies] */ + unsigned long rs_start; + /* cumulated time in PausedSyncX state [unit jiffies] */ + unsigned long rs_paused; + /* skipped because csum was equal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; +#define DRBD_SYNC_MARKS 8 +#define DRBD_SYNC_MARK_STEP (3*HZ) + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ + unsigned long rs_mark_left[DRBD_SYNC_MARKS]; + /* marks's time [unit jiffies] */ + unsigned long rs_mark_time[DRBD_SYNC_MARKS]; + /* current index into rs_mark_{left,time} */ + int rs_last_mark; + unsigned long rs_last_bcast; /* [unit jiffies] */ + + /* where does the admin want us to start? (sector) */ + sector_t ov_start_sector; + sector_t ov_stop_sector; + /* where are we now? (sector) */ + sector_t ov_position; + /* Start sector of out of sync range (to merge printk reporting). */ + sector_t ov_last_oos_start; + /* size of out-of-sync range in sectors. */ + sector_t ov_last_oos_size; + unsigned long ov_left; /* in bits */ + + struct drbd_bitmap *bitmap; + unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ + + /* Used to track operations of resync... */ + struct lru_cache *resync; + /* Number of locked elements in resync LRU */ + unsigned int resync_locked; + /* resync extent number waiting for application requests */ + unsigned int resync_wenr; + + int open_cnt; + u64 *p_uuid; + + struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */ + struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */ + struct list_head done_ee; /* need to send P_WRITE_ACK */ + struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ + struct list_head net_ee; /* zero-copy network send in progress */ + + int next_barrier_nr; + struct list_head resync_reads; + atomic_t pp_in_use; /* allocated from page pool */ + atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ + wait_queue_head_t ee_wait; + struct drbd_md_io md_io; + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache *act_log; /* activity log */ + unsigned int al_tr_number; + int al_tr_cycle; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + unsigned long comm_bm_set; /* communicated number of set bits. */ + struct bm_io_work bm_io_work; + u64 ed_uuid; /* UUID of the exposed data */ + struct mutex own_state_mutex; + struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */ + char congestion_reason; /* Why we where congested... */ + atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */ + atomic_t rs_sect_ev; /* for submitted resync data rate, both */ + int rs_last_sect_ev; /* counter to compare with */ + int rs_last_events; /* counter of read or write "events" (unit sectors) + * on the lower level device when we last looked. */ + int c_sync_rate; /* current resync rate after syncer throttle magic */ + struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */ + int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ + atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ + unsigned int peer_max_bio_size; + unsigned int local_max_bio_size; + + /* any requests that would block in drbd_make_request() + * are deferred to this single-threaded work queue */ + struct submit_worker submit; +}; + +struct drbd_bm_aio_ctx { + struct drbd_device *device; + struct list_head list; /* on device->pending_bitmap_io */; + unsigned long start_jif; + atomic_t in_flight; + unsigned int done; + unsigned flags; +#define BM_AIO_COPY_PAGES 1 +#define BM_AIO_WRITE_HINTED 2 +#define BM_AIO_WRITE_ALL_PAGES 4 +#define BM_AIO_READ 8 + int error; + struct kref kref; +}; + +struct drbd_config_context { + /* assigned from drbd_genlmsghdr */ + unsigned int minor; + /* assigned from request attributes, if present */ + unsigned int volume; +#define VOLUME_UNSPECIFIED (-1U) + /* pointer into the request skb, + * limited lifetime! */ + char *resource_name; + struct nlattr *my_addr; + struct nlattr *peer_addr; + + /* reply buffer */ + struct sk_buff *reply_skb; + /* pointer into reply buffer */ + struct drbd_genlmsghdr *reply_dh; + /* resolved from attributes, if possible */ + struct drbd_device *device; + struct drbd_resource *resource; + struct drbd_connection *connection; +}; + +static inline struct drbd_device *minor_to_device(unsigned int minor) +{ + return (struct drbd_device *)idr_find(&drbd_devices, minor); +} + +static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device) +{ + return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); +} + +#define for_each_resource(resource, _resources) \ + list_for_each_entry(resource, _resources, resources) + +#define for_each_resource_rcu(resource, _resources) \ + list_for_each_entry_rcu(resource, _resources, resources) + +#define for_each_resource_safe(resource, tmp, _resources) \ + list_for_each_entry_safe(resource, tmp, _resources, resources) + +#define for_each_connection(connection, resource) \ + list_for_each_entry(connection, &resource->connections, connections) + +#define for_each_connection_rcu(connection, resource) \ + list_for_each_entry_rcu(connection, &resource->connections, connections) + +#define for_each_connection_safe(connection, tmp, resource) \ + list_for_each_entry_safe(connection, tmp, &resource->connections, connections) + +#define for_each_peer_device(peer_device, device) \ + list_for_each_entry(peer_device, &device->peer_devices, peer_devices) + +#define for_each_peer_device_rcu(peer_device, device) \ + list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices) + +#define for_each_peer_device_safe(peer_device, tmp, device) \ + list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices) + +static inline unsigned int device_to_minor(struct drbd_device *device) +{ + return device->minor; +} + +/* + * function declarations + *************************/ + +/* drbd_main.c */ + +enum dds_flags { + DDSF_FORCED = 1, + DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */ +}; + +extern void drbd_init_set_defaults(struct drbd_device *device); +extern int drbd_thread_start(struct drbd_thread *thi); +extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); +#ifdef CONFIG_SMP +extern void drbd_thread_current_set_cpu(struct drbd_thread *thi); +#else +#define drbd_thread_current_set_cpu(A) ({}) +#endif +extern void tl_release(struct drbd_connection *, unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(struct drbd_connection *); +extern void drbd_free_sock(struct drbd_connection *connection); +extern int drbd_send(struct drbd_connection *connection, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t, + unsigned); + +extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd); +extern int drbd_send_protocol(struct drbd_connection *connection); +extern int drbd_send_uuids(struct drbd_peer_device *); +extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *); +extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *); +extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags); +extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s); +extern int drbd_send_current_state(struct drbd_peer_device *); +extern int drbd_send_sync_param(struct drbd_peer_device *); +extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet, + struct drbd_peer_request *); +extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet, + struct p_block_req *rp); +extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet, + struct p_data *dp, int data_size); +extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet, + sector_t sector, int blksize, u64 block_id); +extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *); +extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet, + struct drbd_peer_request *); +extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req); +extern int drbd_send_drequest(struct drbd_peer_device *, int cmd, + sector_t sector, int size, u64 block_id); +extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector, + int size, void *digest, int digest_size, + enum drbd_packet cmd); +extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size); + +extern int drbd_send_bitmap(struct drbd_device *device); +extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); +extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); +extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_device_cleanup(struct drbd_device *device); +void drbd_print_uuids(struct drbd_device *device, const char *text); + +extern void conn_md_sync(struct drbd_connection *connection); +extern void drbd_md_write(struct drbd_device *device, void *buffer); +extern void drbd_md_sync(struct drbd_device *device); +extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev); +extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local); +extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local); +extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local); +extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local); +extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local); +extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local); +extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local); +extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local); +extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +extern void drbd_md_mark_dirty(struct drbd_device *device); +extern void drbd_queue_bitmap_io(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + void (*done)(struct drbd_device *, int), + char *why, enum bm_flag flags); +extern int drbd_bitmap_io(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags); +extern int drbd_bitmap_io_from_worker(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags); +extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local); +extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local); + +/* Meta data layout + * + * We currently have two possible layouts. + * Offsets in (512 byte) sectors. + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * Variants: + * old, indexed fixed size meta data: + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * [padding*] are zero or up to 7 unused 512 Byte sectors to the + * end of the device, so that the [4k superblock] will be 4k aligned. + * + * The activity log consists of 4k transaction blocks, + * which are written in a ring-buffer, or striped ring-buffer like fashion, + * which are writtensize used to be fixed 32kB, + * but is about to become configurable. + */ + +/* Our old fixed size meta data layout + * allows up to about 3.8TB, so if you want more, + * you need to use the "flexible" meta data format. */ +#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */ +#define MD_4kB_SECT 8 +#define MD_32kB_SECT 64 + +/* One activity log extent represents 4M of storage */ +#define AL_EXTENT_SHIFT 22 +#define AL_EXTENT_SIZE (1< we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. + */ + +#define SLEEP_TIME (HZ/10) + +/* We do bitmap IO in units of 4k blocks. + * We also still have a hardcoded 4k per bit relation. */ +#define BM_BLOCK_SHIFT 12 /* 4k per bit */ +#define BM_BLOCK_SIZE (1<>(BM_BLOCK_SHIFT-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) +#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +/* first storage sector a bitmap extent corresponds to */ +#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) +/* how much _storage_ sectors we have per bitmap extent */ +#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) +/* how many bits are covered by one bitmap extent (resync extent) */ +#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT)) + +#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1) + + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) + +/* the extent in "PER_EXTENT" below is an activity log extent + * we need that many (long words/bytes) to store the bitmap + * of one AL_EXTENT_SIZE chunk of storage. + * we can store the bitmap for that many AL_EXTENTS within + * one sector of the _on_disk_ bitmap: + * bit 0 bit 37 bit 38 bit (512*8)-1 + * ...|........|........|.. // ..|........| + * sect. 0 `296 `304 ^(512*8*8)-1 + * +#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) +#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 +#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 + */ + +#define DRBD_MAX_SECTORS_32 (0xffffffffLU) +/* we have a certain meta data variant that has a fixed on-disk size of 128 + * MiB, of which 4k are our "superblock", and 32k are the fixed size activity + * log, leaving this many sectors for the bitmap. + */ + +#define DRBD_MAX_SECTORS_FIXED_BM \ + ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9))) +#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 +#else +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM +/* 16 TB in units of sectors */ +#if BITS_PER_LONG == 32 +/* adjust by one page worth of bitmap, + * so we won't wrap around in drbd_bm_find_next_bit. + * you should use 64bit OS for that much storage, anyways. */ +#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff) +#else +/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */ +#define DRBD_MAX_SECTORS_FLEX (1UL << 51) +/* corresponds to (1UL << 38) bits right now. */ +#endif +#endif + +/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE, + * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte. + * Since we may live in a mixed-platform cluster, + * we limit us to a platform agnostic constant here for now. + * A followup commit may allow even bigger BIO sizes, + * once we thought that through. */ +#define DRBD_MAX_BIO_SIZE (1U << 20) +#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#endif +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ + +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ +#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ + +/* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ +#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE +#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) + +extern int drbd_bm_init(struct drbd_device *device); +extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); +extern void drbd_bm_cleanup(struct drbd_device *device); +extern void drbd_bm_set_all(struct drbd_device *device); +extern void drbd_bm_clear_all(struct drbd_device *device); +/* set/clear/test only a few bits at a time */ +extern int drbd_bm_set_bits( + struct drbd_device *device, unsigned long s, unsigned long e); +extern int drbd_bm_clear_bits( + struct drbd_device *device, unsigned long s, unsigned long e); +extern int drbd_bm_count_bits( + struct drbd_device *device, const unsigned long s, const unsigned long e); +/* bm_set_bits variant for use while holding drbd_bm_lock, + * may process the whole bitmap in one go */ +extern void _drbd_bm_set_bits(struct drbd_device *device, + const unsigned long s, const unsigned long e); +extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr); +extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); +extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); +extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); +extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local); +extern size_t drbd_bm_words(struct drbd_device *device); +extern unsigned long drbd_bm_bits(struct drbd_device *device); +extern sector_t drbd_bm_capacity(struct drbd_device *device); + +#define DRBD_END_OF_BITMAP (~(unsigned long)0) +extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo); +/* bm_find_next variants for use while you hold drbd_bm_lock() */ +extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo); +extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo); +extern unsigned long _drbd_bm_total_weight(struct drbd_device *device); +extern unsigned long drbd_bm_total_weight(struct drbd_device *device); +/* for receive_bitmap */ +extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, + size_t number, unsigned long *buffer); +/* for _drbd_send_bitmap */ +extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset, + size_t number, unsigned long *buffer); + +extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags); +extern void drbd_bm_unlock(struct drbd_device *device); +/* drbd_main.c */ + +extern struct kmem_cache *drbd_request_cache; +extern struct kmem_cache *drbd_ee_cache; /* peer requests */ +extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +extern mempool_t *drbd_request_mempool; +extern mempool_t *drbd_ee_mempool; + +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; +extern spinlock_t drbd_pp_lock; +extern int drbd_pp_vacant; +extern wait_queue_head_t drbd_pp_wait; + +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. + */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + +extern rwlock_t global_state_lock; + +extern int conn_lowest_minor(struct drbd_connection *connection); +extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); +extern void drbd_destroy_device(struct kref *kref); +extern void drbd_delete_device(struct drbd_device *device); + +extern struct drbd_resource *drbd_create_resource(const char *name); +extern void drbd_free_resource(struct drbd_resource *resource); + +extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts); +extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts); +extern void drbd_destroy_connection(struct kref *kref); +extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len); +extern struct drbd_resource *drbd_find_resource(const char *name); +extern void drbd_destroy_resource(struct kref *kref); +extern void conn_free_crypto(struct drbd_connection *connection); + +extern int proc_details; + +/* drbd_req */ +extern void do_submit(struct work_struct *ws); +extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); +extern void drbd_make_request(struct request_queue *q, struct bio *bio); +extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); +extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); +extern int is_valid_ar_handle(struct drbd_request *, sector_t); + + +/* drbd_nl.c */ +extern void drbd_suspend_io(struct drbd_device *device); +extern void drbd_resume_io(struct drbd_device *device); +extern char *ppsize(char *buf, unsigned long long size); +extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int); +enum determine_dev_size { + DS_ERROR_SHRINK = -3, + DS_ERROR_SPACE_MD = -2, + DS_ERROR = -1, + DS_UNCHANGED = 0, + DS_SHRUNK = 1, + DS_GREW = 2, + DS_GREW_FROM_ZERO = 3, +}; +extern enum determine_dev_size +drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); +extern void resync_after_online_grow(struct drbd_device *); +extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); +extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, + enum drbd_role new_role, + int force); +extern bool conn_try_outdate_peer(struct drbd_connection *connection); +extern void conn_try_outdate_peer_async(struct drbd_connection *connection); +extern int drbd_khelper(struct drbd_device *device, char *cmd); + +/* drbd_worker.c */ +/* bi_end_io handlers */ +extern void drbd_md_endio(struct bio *bio, int error); +extern void drbd_peer_request_endio(struct bio *bio, int error); +extern void drbd_request_endio(struct bio *bio, int error); +extern int drbd_worker(struct drbd_thread *thi); +enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); +void drbd_resync_after_changed(struct drbd_device *device); +extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side); +extern void resume_next_sg(struct drbd_device *device); +extern void suspend_other_sg(struct drbd_device *device); +extern int drbd_resync_finished(struct drbd_device *device); +/* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); +extern void drbd_md_put_buffer(struct drbd_device *device); +extern int drbd_md_sync_page_io(struct drbd_device *device, + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); +extern void wait_until_done_or_force_detached(struct drbd_device *device, + struct drbd_backing_dev *bdev, unsigned int *done); +extern void drbd_rs_controller_reset(struct drbd_device *device); + +static inline void ov_out_of_sync_print(struct drbd_device *device) +{ + if (device->ov_last_oos_size) { + drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n", + (unsigned long long)device->ov_last_oos_start, + (unsigned long)device->ov_last_oos_size); + } + device->ov_last_oos_size = 0; +} + + +extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *); +extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *); +/* worker callbacks */ +extern int w_e_end_data_req(struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_work *, int); +extern int w_ov_finished(struct drbd_work *, int); +extern int w_resync_timer(struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_work *, int); +extern int w_send_dblock(struct drbd_work *, int); +extern int w_send_read_req(struct drbd_work *, int); +extern int w_e_reissue(struct drbd_work *, int); +extern int w_restart_disk_io(struct drbd_work *, int); +extern int w_send_out_of_sync(struct drbd_work *, int); +extern int w_start_resync(struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); +extern void start_resync_timer_fn(unsigned long data); + +extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); + +/* drbd_receiver.c */ +extern int drbd_receiver(struct drbd_thread *thi); +extern int drbd_asender(struct drbd_thread *thi); +extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); +extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting); +extern int drbd_submit_peer_request(struct drbd_device *, + struct drbd_peer_request *, const unsigned, + const int); +extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); +extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, + sector_t, unsigned int, + bool, + gfp_t) __must_hold(local); +extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, + int); +#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) +#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) +extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); +extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); +extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); +extern int drbd_connected(struct drbd_peer_device *); + +static inline void drbd_tcp_cork(struct socket *sock) +{ + int val = 1; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_uncork(struct socket *sock) +{ + int val = 0; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_nodelay(struct socket *sock) +{ + int val = 1; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char*)&val, sizeof(val)); +} + +static inline void drbd_tcp_quickack(struct socket *sock) +{ + int val = 2; + (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char*)&val, sizeof(val)); +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_device *device, + sector_t size) +{ + /* set_capacity(device->this_bdev->bd_disk, size); */ + set_capacity(device->vdisk, size); + device->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_device *device, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + bio_endio(bio, -ENODEV); + return; + } + + if (drbd_insert_fault(device, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo); + +/* drbd_proc.c */ +extern struct proc_dir_entry *drbd_proc; +extern const struct file_operations drbd_proc_fops; +extern const char *drbd_conn_str(enum drbd_conns s); +extern const char *drbd_role_str(enum drbd_role s); + +/* drbd_actlog.c */ +extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); +extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_begin_io_commit(struct drbd_device *device); +extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i); +extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector); +extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector); +extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector); +extern void drbd_rs_cancel_all(struct drbd_device *device); +extern int drbd_rs_del_all(struct drbd_device *device); +extern void drbd_rs_failed_io(struct drbd_device *device, + sector_t sector, int size); +extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go); + +enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; +extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, + enum update_sync_bits_mode mode); +#define drbd_set_in_sync(device, sector, size) \ + __drbd_change_sync(device, sector, size, SET_IN_SYNC) +#define drbd_set_out_of_sync(device, sector, size) \ + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) +#define drbd_rs_failed_io(device, sector, size) \ + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) +extern void drbd_al_shrink(struct drbd_device *device); +extern int drbd_initialize_al(struct drbd_device *, void *); + +/* drbd_nl.c */ +/* state info broadcast */ +struct sib_info { + enum drbd_state_info_bcast_reason sib_reason; + union { + struct { + char *helper_name; + unsigned helper_exit_code; + }; + struct { + union drbd_state os; + union drbd_state ns; + }; + }; +}; +void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); + +/* + * inline helper functions + *************************/ + +/* see also page_chain_add and friends in drbd_receiver.c */ +static inline struct page *page_chain_next(struct page *page) +{ + return (struct page *)page_private(page); +} +#define page_chain_for_each(page) \ + for (; page && ({ prefetch(page_chain_next(page)); 1; }); \ + page = page_chain_next(page)) +#define page_chain_for_each_safe(page, n) \ + for (; page && ({ n = page_chain_next(page); 1; }); page = n) + + +static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + page_chain_for_each(page) { + if (page_count(page) > 1) + return 1; + } + return 0; +} + +static inline enum drbd_state_rv +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + enum drbd_state_rv rv; + + read_lock(&global_state_lock); + rv = __drbd_set_state(device, ns, flags, done); + read_unlock(&global_state_lock); + + return rv; +} + +static inline union drbd_state drbd_read_state(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + union drbd_state rv; + + rv.i = device->state.i; + rv.susp = resource->susp; + rv.susp_nod = resource->susp_nod; + rv.susp_fen = resource->susp_fen; + + return rv; +} + +enum drbd_force_detach_flags { + DRBD_READ_ERROR, + DRBD_WRITE_ERROR, + DRBD_META_IO_ERROR, + DRBD_FORCE_DETACH, +}; + +#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) +static inline void __drbd_chk_io_error_(struct drbd_device *device, + enum drbd_force_detach_flags df, + const char *where) +{ + enum drbd_io_error_p ep; + + rcu_read_lock(); + ep = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + switch (ep) { + case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ + if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Local IO failed in %s.\n", where); + if (device->state.disk > D_INCONSISTENT) + _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL); + break; + } + /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */ + case EP_DETACH: + case EP_CALL_HELPER: + /* Remember whether we saw a READ or WRITE error. + * + * Recovery of the affected area for WRITE failure is covered + * by the activity log. + * READ errors may fall outside that area though. Certain READ + * errors can be "healed" by writing good data to the affected + * blocks, which triggers block re-allocation in lower layers. + * + * If we can not write the bitmap after a READ error, + * we may need to trigger a full sync (see w_go_diskless()). + * + * Force-detach is not really an IO error, but rather a + * desperate measure to try to deal with a completely + * unresponsive lower level IO stack. + * Still it should be treated as a WRITE error. + * + * Meta IO error is always WRITE error: + * we read meta data only once during attach, + * which will fail in case of errors. + */ + set_bit(WAS_IO_ERROR, &device->flags); + if (df == DRBD_READ_ERROR) + set_bit(WAS_READ_ERROR, &device->flags); + if (df == DRBD_FORCE_DETACH) + set_bit(FORCE_DETACH, &device->flags); + if (device->state.disk > D_FAILED) { + _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL); + drbd_err(device, + "Local IO failed in %s. Detaching...\n", where); + } + break; + } +} + +/** + * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers + * @device: DRBD device. + * @error: Error code passed to the IO completion callback + * @forcedetach: Force detach. I.e. the error happened while accessing the meta data + * + * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) + */ +#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) +static inline void drbd_chk_io_error_(struct drbd_device *device, + int error, enum drbd_force_detach_flags forcedetach, const char *where) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&device->resource->req_lock, flags); + __drbd_chk_io_error_(device, forcedetach, where); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + } +} + + +/** + * drbd_md_first_sector() - Returns the first sector number of the meta data area + * @bdev: Meta data block device. + * + * BTW, for internal meta data, this happens to be the maximum capacity + * we could agree upon with our peer node. + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/** + * drbd_md_last_sector() - Return the last sector number of the meta data area + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_4kB_SECT -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect -1; + } +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0; +} + +/** + * drbd_get_max_capacity() - Returns the capacity we announce to out peer + * @bdev: Meta data block device. + * + * returns the capacity we announce to out peer. we clip ourselves at the + * various MAX_SECTORS, because if we don't, current implementation will + * oops sooner or later + */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + sector_t s; + + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + s = drbd_get_capacity(bdev->backing_bdev) + ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_md_first_sector(bdev)) + : 0; + break; + case DRBD_MD_INDEX_FLEX_EXT: + s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_get_capacity(bdev->backing_bdev)); + /* clip at maximum size the meta device can support */ + s = min_t(sector_t, s, + BM_EXT_TO_SECT(bdev->md.md_size_sect + - bdev->md.bm_offset)); + break; + default: + s = min_t(sector_t, DRBD_MAX_SECTORS, + drbd_get_capacity(bdev->backing_bdev)); + } + return s; +} + +/** + * drbd_md_ss() - Return the sector number of our meta data super block + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev) +{ + const int meta_dev_idx = bdev->md.meta_dev_idx; + + if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT) + return 0; + + /* Since drbd08, internal meta data is always "flexible". + * position: last 4k aligned block of 4k size */ + if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + meta_dev_idx == DRBD_MD_INDEX_FLEX_INT) + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8; + + /* external, some index; this is the old fixed size layout */ + return MD_128MB_SECT * bdev->md.meta_dev_idx; +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add_tail(&w->list, &q->q); + spin_unlock_irqrestore(&q->q_lock, flags); + wake_up(&q->q_wait); +} + +static inline void +drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + if (list_empty_careful(&w->list)) + list_add_tail(&w->list, &q->q); + spin_unlock_irqrestore(&q->q_lock, flags); + wake_up(&q->q_wait); +} + +static inline void +drbd_device_post_work(struct drbd_device *device, int work_bit) +{ + if (!test_and_set_bit(work_bit, &device->flags)) { + struct drbd_connection *connection = + first_peer_device(device)->connection; + struct drbd_work_queue *q = &connection->sender_work; + if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags)) + wake_up(&q->q_wait); + } +} + +extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); + +static inline void wake_asender(struct drbd_connection *connection) +{ + if (test_bit(SIGNAL_ASENDER, &connection->flags)) + force_sig(DRBD_SIG, connection->asender.task); +} + +static inline void request_ping(struct drbd_connection *connection) +{ + set_bit(SEND_PING, &connection->flags); + wake_asender(connection); +} + +extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); +extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *); +extern int conn_send_command(struct drbd_connection *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); +extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *, + enum drbd_packet, unsigned int, void *, + unsigned int); + +extern int drbd_send_ping(struct drbd_connection *connection); +extern int drbd_send_ping_ack(struct drbd_connection *connection); +extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state); +extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state); + +static inline void drbd_thread_stop(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, false, true); +} + +static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, false, false); +} + +static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, true, false); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, DATA_RECEIVED) + * [from receive_DataReply] + * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED) + * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, SEND_FAILED or SEND_CANCELED) + * _req_mod(req, CONNECTION_LOST_WHILE_PENDING) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(struct drbd_device *device) +{ + atomic_inc(&device->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \ + if (atomic_read(&device->which) < 0) \ + drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \ + func, line, \ + atomic_read(&device->which)) + +#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__) +static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line) +{ + if (atomic_dec_and_test(&device->ap_pending_cnt)) + wake_up(&device->misc_wait); + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line); +} + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER) + * (or P_NEG_ACK with ID_SYNCER) + */ +static inline void inc_rs_pending(struct drbd_device *device) +{ + atomic_inc(&device->rs_pending_cnt); +} + +#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__) +static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line) +{ + atomic_dec(&device->rs_pending_cnt); + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line); +} + +/* counts how many answers we still need to send to the peer. + * increased on + * receive_Data unless protocol A; + * we need to send a P_RECV_ACK (proto B) + * or P_WRITE_ACK (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK + * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA + * receive_Barrier_* we need to send a P_BARRIER_ACK + */ +static inline void inc_unacked(struct drbd_device *device) +{ + atomic_inc(&device->unacked_cnt); +} + +#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__) +static inline void _dec_unacked(struct drbd_device *device, const char *func, int line) +{ + atomic_dec(&device->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); +} + +#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__) +static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line) +{ + atomic_sub(n, &device->unacked_cnt); + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); +} + +static inline bool is_sync_state(enum drbd_conns connection_state) +{ + return + (connection_state == C_SYNC_SOURCE + || connection_state == C_SYNC_TARGET + || connection_state == C_PAUSED_SYNC_S + || connection_state == C_PAUSED_SYNC_T); +} + +/** + * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev + * @_device: DRBD device. + * @_min_state: Minimum device state required for success. + * + * You have to call put_ldev() when finished working with device->ldev. + */ +#define get_ldev_if_state(_device, _min_state) \ + (_get_ldev_if_state((_device), (_min_state)) ? \ + ({ __acquire(x); true; }) : false) +#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT) + +static inline void put_ldev(struct drbd_device *device) +{ + enum drbd_disk_state disk_state = device->state.disk; + /* We must check the state *before* the atomic_dec becomes visible, + * or we have a theoretical race where someone hitting zero, + * while state still D_FAILED, will then see D_DISKLESS in the + * condition below and calling into destroy, where he must not, yet. */ + int i = atomic_dec_return(&device->local_cnt); + + /* This may be called from some endio handler, + * so we must not sleep here. */ + + __release(local); + D_ASSERT(device, i >= 0); + if (i == 0) { + if (disk_state == D_DISKLESS) + /* even internal references gone, safe to destroy */ + drbd_device_post_work(device, DESTROY_DISK); + if (disk_state == D_FAILED) + /* all application IO references gone. */ + if (!test_and_set_bit(GOING_DISKLESS, &device->flags)) + drbd_device_post_work(device, GO_DISKLESS); + wake_up(&device->misc_wait); + } +} + +#ifndef __CHECKER__ +static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins) +{ + int io_allowed; + + /* never get a reference while D_DISKLESS */ + if (device->state.disk == D_DISKLESS) + return 0; + + atomic_inc(&device->local_cnt); + io_allowed = (device->state.disk >= mins); + if (!io_allowed) + put_ldev(device); + return io_allowed; +} +#else +extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins); +#endif + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(struct drbd_device *device) +{ + struct net_conf *nc; + int mxb; + + rcu_read_lock(); + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */ + rcu_read_unlock(); + + return mxb; +} + +static inline int drbd_state_is_stable(struct drbd_device *device) +{ + union drbd_dev_state s = device->state; + + /* DO NOT add a default clause, we want the compiler to warn us + * for any newly introduced state we may have forgotten to add here */ + + switch ((enum drbd_conns)s.conn) { + /* new io only accepted when there is no connection, ... */ + case C_STANDALONE: + case C_WF_CONNECTION: + /* ... or there is a well established connection. */ + case C_CONNECTED: + case C_SYNC_SOURCE: + case C_SYNC_TARGET: + case C_VERIFY_S: + case C_VERIFY_T: + case C_PAUSED_SYNC_S: + case C_PAUSED_SYNC_T: + case C_AHEAD: + case C_BEHIND: + /* transitional states, IO allowed */ + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_REPORT_PARAMS: + case C_STARTING_SYNC_S: + case C_STARTING_SYNC_T: + break; + + /* Allow IO in BM exchange states with new protocols */ + case C_WF_BITMAP_S: + if (first_peer_device(device)->connection->agreed_pro_version < 96) + return 0; + break; + + /* no new io accepted in these states */ + case C_WF_BITMAP_T: + case C_WF_SYNC_UUID: + case C_MASK: + /* not "stable" */ + return 0; + } + + switch ((enum drbd_disk_state)s.disk) { + case D_DISKLESS: + case D_INCONSISTENT: + case D_OUTDATED: + case D_CONSISTENT: + case D_UP_TO_DATE: + case D_FAILED: + /* disk state is stable as well. */ + break; + + /* no new io accepted during transitional states */ + case D_ATTACHING: + case D_NEGOTIATING: + case D_UNKNOWN: + case D_MASK: + /* not "stable" */ + return 0; + } + + return 1; +} + +static inline int drbd_suspended(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + + return resource->susp || resource->susp_fen || resource->susp_nod; +} + +static inline bool may_inc_ap_bio(struct drbd_device *device) +{ + int mxb = drbd_get_max_buffers(device); + + if (drbd_suspended(device)) + return false; + if (test_bit(SUSPEND_IO, &device->flags)) + return false; + + /* to avoid potential deadlock or bitmap corruption, + * in various places, we only allow new application io + * to start during "stable" states. */ + + /* no new io accepted when attaching or detaching the disk */ + if (!drbd_state_is_stable(device)) + return false; + + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&device->ap_bio_cnt) > mxb) + return false; + if (test_bit(BITMAP_IO, &device->flags)) + return false; + return true; +} + +static inline bool inc_ap_bio_cond(struct drbd_device *device) +{ + bool rv = false; + + spin_lock_irq(&device->resource->req_lock); + rv = may_inc_ap_bio(device); + if (rv) + atomic_inc(&device->ap_bio_cnt); + spin_unlock_irq(&device->resource->req_lock); + + return rv; +} + +static inline void inc_ap_bio(struct drbd_device *device) +{ + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection + * handshake as long as we would exceed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. */ + + wait_event(device->misc_wait, inc_ap_bio_cond(device)); +} + +static inline void dec_ap_bio(struct drbd_device *device) +{ + int mxb = drbd_get_max_buffers(device); + int ap_bio = atomic_dec_return(&device->ap_bio_cnt); + + D_ASSERT(device, ap_bio >= 0); + + if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) + drbd_queue_work(&first_peer_device(device)-> + connection->sender_work, + &device->bm_io_work.w); + } + + /* this currently does wake_up for every dec_ap_bio! + * maybe rather introduce some type of hysteresis? + * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ + if (ap_bio < mxb) + wake_up(&device->misc_wait); +} + +static inline bool verify_can_do_stop_sector(struct drbd_device *device) +{ + return first_peer_device(device)->connection->agreed_pro_version >= 97 && + first_peer_device(device)->connection->agreed_pro_version != 100; +} + +static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val) +{ + int changed = device->ed_uuid != val; + device->ed_uuid = val; + return changed; +} + +static inline int drbd_queue_order_type(struct drbd_device *device) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ stuff */ +#ifndef QUEUE_ORDERED_NONE +#define QUEUE_ORDERED_NONE 0 +#endif + return QUEUE_ORDERED_NONE; +} + +static inline struct drbd_connection *first_connection(struct drbd_resource *resource) +{ + return list_first_entry_or_null(&resource->connections, + struct drbd_connection, connections); +} + +#endif diff --git a/kernel/drivers/block/drbd/drbd_interval.c b/kernel/drivers/block/drbd/drbd_interval.c new file mode 100644 index 000000000..51b25ad85 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_interval.c @@ -0,0 +1,179 @@ +#include +#include +#include "drbd_interval.h" + +/** + * interval_end - return end of @node + */ +static inline +sector_t interval_end(struct rb_node *node) +{ + struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb); + return this->end; +} + +/** + * compute_subtree_last - compute end of @node + * + * The end of an interval is the highest (start + (size >> 9)) value of this + * node and of its children. Called for @node and its parents whenever the end + * may have changed. + */ +static inline sector_t +compute_subtree_last(struct drbd_interval *node) +{ + sector_t max = node->sector + (node->size >> 9); + + if (node->rb.rb_left) { + sector_t left = interval_end(node->rb.rb_left); + if (left > max) + max = left; + } + if (node->rb.rb_right) { + sector_t right = interval_end(node->rb.rb_right); + if (right > max) + max = right; + } + return max; +} + +RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb, + sector_t, end, compute_subtree_last); + +/** + * drbd_insert_interval - insert a new interval into a tree + */ +bool +drbd_insert_interval(struct rb_root *root, struct drbd_interval *this) +{ + struct rb_node **new = &root->rb_node, *parent = NULL; + sector_t this_end = this->sector + (this->size >> 9); + + BUG_ON(!IS_ALIGNED(this->size, 512)); + + while (*new) { + struct drbd_interval *here = + rb_entry(*new, struct drbd_interval, rb); + + parent = *new; + if (here->end < this_end) + here->end = this_end; + if (this->sector < here->sector) + new = &(*new)->rb_left; + else if (this->sector > here->sector) + new = &(*new)->rb_right; + else if (this < here) + new = &(*new)->rb_left; + else if (this > here) + new = &(*new)->rb_right; + else + return false; + } + + this->end = this_end; + rb_link_node(&this->rb, parent, new); + rb_insert_augmented(&this->rb, root, &augment_callbacks); + return true; +} + +/** + * drbd_contains_interval - check if a tree contains a given interval + * @sector: start sector of @interval + * @interval: may not be a valid pointer + * + * Returns if the tree contains the node @interval with start sector @start. + * Does not dereference @interval until @interval is known to be a valid object + * in @tree. Returns %false if @interval is in the tree but with a different + * sector number. + */ +bool +drbd_contains_interval(struct rb_root *root, sector_t sector, + struct drbd_interval *interval) +{ + struct rb_node *node = root->rb_node; + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (sector < here->sector) + node = node->rb_left; + else if (sector > here->sector) + node = node->rb_right; + else if (interval < here) + node = node->rb_left; + else if (interval > here) + node = node->rb_right; + else + return true; + } + return false; +} + +/** + * drbd_remove_interval - remove an interval from a tree + */ +void +drbd_remove_interval(struct rb_root *root, struct drbd_interval *this) +{ + rb_erase_augmented(&this->rb, root, &augment_callbacks); +} + +/** + * drbd_find_overlap - search for an interval overlapping with [sector, sector + size) + * @sector: start sector + * @size: size, aligned to 512 bytes + * + * Returns an interval overlapping with [sector, sector + size), or NULL if + * there is none. When there is more than one overlapping interval in the + * tree, the interval with the lowest start sector is returned, and all other + * overlapping intervals will be on the right side of the tree, reachable with + * rb_next(). + */ +struct drbd_interval * +drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size) +{ + struct rb_node *node = root->rb_node; + struct drbd_interval *overlap = NULL; + sector_t end = sector + (size >> 9); + + BUG_ON(!IS_ALIGNED(size, 512)); + + while (node) { + struct drbd_interval *here = + rb_entry(node, struct drbd_interval, rb); + + if (node->rb_left && + sector < interval_end(node->rb_left)) { + /* Overlap if any must be on left side */ + node = node->rb_left; + } else if (here->sector < end && + sector < here->sector + (here->size >> 9)) { + overlap = here; + break; + } else if (sector >= here->sector) { + /* Overlap if any must be on right side */ + node = node->rb_right; + } else + break; + } + return overlap; +} + +struct drbd_interval * +drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size) +{ + sector_t end = sector + (size >> 9); + struct rb_node *node; + + for (;;) { + node = rb_next(&i->rb); + if (!node) + return NULL; + i = rb_entry(node, struct drbd_interval, rb); + if (i->sector >= end) + return NULL; + if (sector < i->sector + (i->size >> 9)) + return i; + } +} diff --git a/kernel/drivers/block/drbd/drbd_interval.h b/kernel/drivers/block/drbd/drbd_interval.h new file mode 100644 index 000000000..f210543f0 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_interval.h @@ -0,0 +1,42 @@ +#ifndef __DRBD_INTERVAL_H +#define __DRBD_INTERVAL_H + +#include +#include + +struct drbd_interval { + struct rb_node rb; + sector_t sector; /* start sector of the interval */ + unsigned int size; /* size in bytes */ + sector_t end; /* highest interval end in subtree */ + int local:1 /* local or remote request? */; + int waiting:1; /* someone is waiting for this to complete */ + int completed:1; /* this has been completed already; + * ignore for conflict detection */ +}; + +static inline void drbd_clear_interval(struct drbd_interval *i) +{ + RB_CLEAR_NODE(&i->rb); +} + +static inline bool drbd_interval_empty(struct drbd_interval *i) +{ + return RB_EMPTY_NODE(&i->rb); +} + +extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *); +extern bool drbd_contains_interval(struct rb_root *, sector_t, + struct drbd_interval *); +extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *); +extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t, + unsigned int); +extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t, + unsigned int); + +#define drbd_for_each_overlap(i, root, sector, size) \ + for (i = drbd_find_overlap(root, sector, size); \ + i; \ + i = drbd_next_overlap(i, sector, size)) + +#endif /* __DRBD_INTERVAL_H */ diff --git a/kernel/drivers/block/drbd/drbd_main.c b/kernel/drivers/block/drbd/drbd_main.c new file mode 100644 index 000000000..81fde9ef7 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_main.c @@ -0,0 +1,3844 @@ +/* + drbd.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include + +#include +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ +#include "drbd_vli.h" +#include "drbd_debugfs.h" + +static DEFINE_MUTEX(drbd_main_mutex); +static int drbd_open(struct block_device *bdev, fmode_t mode); +static void drbd_release(struct gendisk *gd, fmode_t mode); +static void md_sync_timer_fn(unsigned long data); +static int w_bitmap_io(struct drbd_work *w, int unused); + +MODULE_AUTHOR("Philipp Reisner , " + "Lars Ellenberg "); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_VERSION(REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices (" + __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")"); +MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); + +#include +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, uint, 0444); +module_param(disable_sendpage, bool, 0644); +module_param(allow_oos, bool, 0); +module_param(proc_details, int, 0644); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +int enable_faults; +int fault_rate; +static int fault_count; +int fault_devs; +/* bitmap of enabled faults */ +module_param(enable_faults, int, 0664); +/* fault rate % value - applies to all enabled faults */ +module_param(fault_rate, int, 0664); +/* count of faults inserted */ +module_param(fault_count, int, 0664); +/* bitmap of devices to insert faults on */ +module_param(fault_devs, int, 0644); +#endif + +/* module parameter, defined */ +unsigned int minor_count = DRBD_MINOR_COUNT_DEF; +bool disable_sendpage; +bool allow_oos; +int proc_details; /* Detail level in proc drbd*/ + +/* Module parameter for setting the user mode helper program + * to run. Default is /sbin/drbdadm */ +char usermode_helper[80] = "/sbin/drbdadm"; + +module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct idr drbd_devices; +struct list_head drbd_resources; + +struct kmem_cache *drbd_request_cache; +struct kmem_cache *drbd_ee_cache; /* peer requests */ +struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; + +/* I do not use a standard mempool, because: + 1) I want to hand out the pre-allocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. + */ +struct page *drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); + +static const struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_release, +}; + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + return bio; +} + +#ifdef __CHECKER__ +/* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. + */ +int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&device->local_cnt); + io_allowed = (device->state.disk >= mins); + if (!io_allowed) { + if (atomic_dec_and_test(&device->local_cnt)) + wake_up(&device->misc_wait); + } + return io_allowed; +} + +#endif + +/** + * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch + * @connection: DRBD connection. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * epoch of not yet barrier-acked requests, this function will cause a + * termination of the connection. + */ +void tl_release(struct drbd_connection *connection, unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_request *r; + struct drbd_request *req = NULL; + int expect_epoch = 0; + int expect_size = 0; + + spin_lock_irq(&connection->resource->req_lock); + + /* find oldest not yet barrier-acked write request, + * count writes in its epoch. */ + list_for_each_entry(r, &connection->transfer_log, tl_requests) { + const unsigned s = r->rq_state; + if (!req) { + if (!(s & RQ_WRITE)) + continue; + if (!(s & RQ_NET_MASK)) + continue; + if (s & RQ_NET_DONE) + continue; + req = r; + expect_epoch = req->epoch; + expect_size ++; + } else { + if (r->epoch != expect_epoch) + break; + if (!(s & RQ_WRITE)) + continue; + /* if (s & RQ_DONE): not expected */ + /* if (!(s & RQ_NET_MASK)): not expected */ + expect_size++; + } + } + + /* first some paranoia code */ + if (req == NULL) { + drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; + } + if (expect_epoch != barrier_nr) { + drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n", + barrier_nr, expect_epoch); + goto bail; + } + + if (expect_size != set_size) { + drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n", + barrier_nr, set_size, expect_size); + goto bail; + } + + /* Clean up list of requests processed during current epoch. */ + /* this extra list walk restart is paranoia, + * to catch requests being barrier-acked "unexpectedly". + * It usually should find the same req again, or some READ preceding it. */ + list_for_each_entry(req, &connection->transfer_log, tl_requests) + if (req->epoch == expect_epoch) + break; + list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) { + if (req->epoch != expect_epoch) + break; + _req_mod(req, BARRIER_ACKED); + } + spin_unlock_irq(&connection->resource->req_lock); + + return; + +bail: + spin_unlock_irq(&connection->resource->req_lock); + conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); +} + + +/** + * _tl_restart() - Walks the transfer log, and applies an action to all requests + * @connection: DRBD connection to operate on. + * @what: The action/event to perform with all request objects + * + * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO, + * RESTART_FROZEN_DISK_IO. + */ +/* must hold resource->req_lock */ +void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what) +{ + struct drbd_request *req, *r; + + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) + _req_mod(req, what); +} + +void tl_restart(struct drbd_connection *connection, enum drbd_req_event what) +{ + spin_lock_irq(&connection->resource->req_lock); + _tl_restart(connection, what); + spin_unlock_irq(&connection->resource->req_lock); +} + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @device: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_connection *connection) +{ + tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); +} + +/** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL + * @device: DRBD device. + */ +void tl_abort_disk_io(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_request *req, *r; + + spin_lock_irq(&connection->resource->req_lock); + list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) { + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + if (req->device != device) + continue; + _req_mod(req, ABORT_DISK_IO); + } + spin_unlock_irq(&connection->resource->req_lock); +} + +static int drbd_thread_setup(void *arg) +{ + struct drbd_thread *thi = (struct drbd_thread *) arg; + struct drbd_resource *resource = thi->resource; + unsigned long flags; + int retval; + + snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s", + thi->name[0], + resource->name); + +restart: + retval = thi->function(thi); + + spin_lock_irqsave(&thi->t_lock, flags); + + /* if the receiver has been "EXITING", the last thing it did + * was set the conn state to "StandAlone", + * if now a re-connect request comes in, conn state goes C_UNCONNECTED, + * and receiver thread will be "started". + * drbd_thread_start needs to set "RESTARTING" in that case. + * t_state check and assignment needs to be within the same spinlock, + * so either thread_start sees EXITING, and can remap to RESTARTING, + * or thread_start see NONE, and can proceed as normal. + */ + + if (thi->t_state == RESTARTING) { + drbd_info(resource, "Restarting %s thread\n", thi->name); + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + goto restart; + } + + thi->task = NULL; + thi->t_state = NONE; + smp_mb(); + complete_all(&thi->stop); + spin_unlock_irqrestore(&thi->t_lock, flags); + + drbd_info(resource, "Terminating %s\n", current->comm); + + /* Release mod reference taken when thread was started */ + + if (thi->connection) + kref_put(&thi->connection->kref, drbd_destroy_connection); + kref_put(&resource->kref, drbd_destroy_resource); + module_put(THIS_MODULE); + return retval; +} + +static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi, + int (*func) (struct drbd_thread *), const char *name) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = NONE; + thi->function = func; + thi->resource = resource; + thi->connection = NULL; + thi->name = name; +} + +int drbd_thread_start(struct drbd_thread *thi) +{ + struct drbd_resource *resource = thi->resource; + struct task_struct *nt; + unsigned long flags; + + /* is used from state engine doing drbd_thread_stop_nowait, + * while holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + switch (thi->t_state) { + case NONE: + drbd_info(resource, "Starting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); + + /* Get ref on module for thread - this is released when thread exits */ + if (!try_module_get(THIS_MODULE)) { + drbd_err(resource, "Failed to get module reference in drbd_thread_start\n"); + spin_unlock_irqrestore(&thi->t_lock, flags); + return false; + } + + kref_get(&resource->kref); + if (thi->connection) + kref_get(&thi->connection->kref); + + init_completion(&thi->stop); + thi->reset_cpu_mask = 1; + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ + + nt = kthread_create(drbd_thread_setup, (void *) thi, + "drbd_%c_%s", thi->name[0], thi->resource->name); + + if (IS_ERR(nt)) { + drbd_err(resource, "Couldn't start thread\n"); + + if (thi->connection) + kref_put(&thi->connection->kref, drbd_destroy_connection); + kref_put(&resource->kref, drbd_destroy_resource); + module_put(THIS_MODULE); + return false; + } + spin_lock_irqsave(&thi->t_lock, flags); + thi->task = nt; + thi->t_state = RUNNING; + spin_unlock_irqrestore(&thi->t_lock, flags); + wake_up_process(nt); + break; + case EXITING: + thi->t_state = RESTARTING; + drbd_info(resource, "Restarting %s thread (from %s [%d])\n", + thi->name, current->comm, current->pid); + /* fall through */ + case RUNNING: + case RESTARTING: + default: + spin_unlock_irqrestore(&thi->t_lock, flags); + break; + } + + return true; +} + + +void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) +{ + unsigned long flags; + + enum drbd_thread_state ns = restart ? RESTARTING : EXITING; + + /* may be called from state engine, holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + if (thi->t_state == NONE) { + spin_unlock_irqrestore(&thi->t_lock, flags); + if (restart) + drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock_irqrestore(&thi->t_lock, flags); + return; + } + + thi->t_state = ns; + smp_mb(); + init_completion(&thi->stop); + if (thi->task != current) + force_sig(DRBD_SIGKILL, thi->task); + } + + spin_unlock_irqrestore(&thi->t_lock, flags); + + if (wait) + wait_for_completion(&thi->stop); +} + +int conn_lowest_minor(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr = 0, minor = -1; + + rcu_read_lock(); + peer_device = idr_get_next(&connection->peer_devices, &vnr); + if (peer_device) + minor = device_to_minor(peer_device->device); + rcu_read_unlock(); + + return minor; +} + +#ifdef CONFIG_SMP +/** + * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs + * + * Forces all threads of a resource onto the same CPU. This is beneficial for + * DRBD's performance. May be overwritten by user's configuration. + */ +static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask) +{ + unsigned int *resources_per_cpu, min_index = ~0; + + resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL); + if (resources_per_cpu) { + struct drbd_resource *resource; + unsigned int cpu, min = ~0; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + for_each_cpu(cpu, resource->cpu_mask) + resources_per_cpu[cpu]++; + } + rcu_read_unlock(); + for_each_online_cpu(cpu) { + if (resources_per_cpu[cpu] < min) { + min = resources_per_cpu[cpu]; + min_index = cpu; + } + } + kfree(resources_per_cpu); + } + if (min_index == ~0) { + cpumask_setall(*cpu_mask); + return; + } + cpumask_set_cpu(min_index, *cpu_mask); +} + +/** + * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread + * @device: DRBD device. + * @thi: drbd_thread object + * + * call in the "main loop" of _all_ threads, no need for any mutex, current won't die + * prematurely. + */ +void drbd_thread_current_set_cpu(struct drbd_thread *thi) +{ + struct drbd_resource *resource = thi->resource; + struct task_struct *p = current; + + if (!thi->reset_cpu_mask) + return; + thi->reset_cpu_mask = 0; + set_cpus_allowed_ptr(p, resource->cpu_mask); +} +#else +#define drbd_calc_cpu_mask(A) ({}) +#endif + +/** + * drbd_header_size - size of a packet header + * + * The header size is a multiple of 8, so any payload following the header is + * word aligned on 64-bit architectures. (The bitmap send and receive code + * relies on this.) + */ +unsigned int drbd_header_size(struct drbd_connection *connection) +{ + if (connection->agreed_pro_version >= 100) { + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8)); + return sizeof(struct p_header100); + } else { + BUILD_BUG_ON(sizeof(struct p_header80) != + sizeof(struct p_header95)); + BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8)); + return sizeof(struct p_header80); + } +} + +static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be32(DRBD_MAGIC); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size); + return sizeof(struct p_header80); +} + +static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size) +{ + h->magic = cpu_to_be16(DRBD_MAGIC_BIG); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + return sizeof(struct p_header95); +} + +static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd, + int size, int vnr) +{ + h->magic = cpu_to_be32(DRBD_MAGIC_100); + h->volume = cpu_to_be16(vnr); + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be32(size); + h->pad = 0; + return sizeof(struct p_header100); +} + +static unsigned int prepare_header(struct drbd_connection *connection, int vnr, + void *buffer, enum drbd_packet cmd, int size) +{ + if (connection->agreed_pro_version >= 100) + return prepare_header100(buffer, cmd, size, vnr); + else if (connection->agreed_pro_version >= 95 && + size > DRBD_MAX_SIZE_H80_PACKET) + return prepare_header95(buffer, cmd, size); + else + return prepare_header80(buffer, cmd, size); +} + +static void *__conn_prepare_command(struct drbd_connection *connection, + struct drbd_socket *sock) +{ + if (!sock->socket) + return NULL; + return sock->sbuf + drbd_header_size(connection); +} + +void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock) +{ + void *p; + + mutex_lock(&sock->mutex); + p = __conn_prepare_command(connection, sock); + if (!p) + mutex_unlock(&sock->mutex); + + return p; +} + +void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock) +{ + return conn_prepare_command(peer_device->connection, sock); +} + +static int __send_command(struct drbd_connection *connection, int vnr, + struct drbd_socket *sock, enum drbd_packet cmd, + unsigned int header_size, void *data, + unsigned int size) +{ + int msg_flags; + int err; + + /* + * Called with @data == NULL and the size of the data blocks in @size + * for commands that send data blocks. For those commands, omit the + * MSG_MORE flag: this will increase the likelihood that data blocks + * which are page aligned on the sender will end up page aligned on the + * receiver. + */ + msg_flags = data ? MSG_MORE : 0; + + header_size += prepare_header(connection, vnr, sock->sbuf, cmd, + header_size + size); + err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size, + msg_flags); + if (data && !err) + err = drbd_send_all(connection, sock->socket, data, size, 0); + /* DRBD protocol "pings" are latency critical. + * This is supposed to trigger tcp_push_pending_frames() */ + if (!err && (cmd == P_PING || cmd == P_PING_ACK)) + drbd_tcp_nodelay(sock->socket); + + return err; +} + +static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + return __send_command(connection, 0, sock, cmd, header_size, data, size); +} + +int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; + + err = __conn_send_command(connection, sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock, + enum drbd_packet cmd, unsigned int header_size, + void *data, unsigned int size) +{ + int err; + + err = __send_command(peer_device->connection, peer_device->device->vnr, + sock, cmd, header_size, data, size); + mutex_unlock(&sock->mutex); + return err; +} + +int drbd_send_ping(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + + sock = &connection->meta; + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, P_PING, 0, NULL, 0); +} + +int drbd_send_ping_ack(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + + sock = &connection->meta; + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0); +} + +int drbd_send_sync_param(struct drbd_peer_device *peer_device) +{ + struct drbd_socket *sock; + struct p_rs_param_95 *p; + int size; + const int apv = peer_device->connection->agreed_pro_version; + enum drbd_packet cmd; + struct net_conf *nc; + struct disk_conf *dc; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + + size = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + strlen(nc->verify_alg) + 1 + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); + + cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + if (get_ldev(peer_device->device)) { + dc = rcu_dereference(peer_device->device->ldev->disk_conf); + p->resync_rate = cpu_to_be32(dc->resync_rate); + p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead); + p->c_delay_target = cpu_to_be32(dc->c_delay_target); + p->c_fill_target = cpu_to_be32(dc->c_fill_target); + p->c_max_rate = cpu_to_be32(dc->c_max_rate); + put_ldev(peer_device->device); + } else { + p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF); + p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF); + p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF); + p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF); + p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF); + } + + if (apv >= 88) + strcpy(p->verify_alg, nc->verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, nc->csums_alg); + rcu_read_unlock(); + + return drbd_send_command(peer_device, sock, cmd, size, NULL, 0); +} + +int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_protocol *p; + struct net_conf *nc; + int size, cf; + + sock = &connection->data; + p = __conn_prepare_command(connection, sock); + if (!p) + return -EIO; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + if (nc->tentative && connection->agreed_pro_version < 92) { + rcu_read_unlock(); + mutex_unlock(&sock->mutex); + drbd_err(connection, "--dry-run is not supported by peer"); + return -EOPNOTSUPP; + } + + size = sizeof(*p); + if (connection->agreed_pro_version >= 87) + size += strlen(nc->integrity_alg) + 1; + + p->protocol = cpu_to_be32(nc->wire_protocol); + p->after_sb_0p = cpu_to_be32(nc->after_sb_0p); + p->after_sb_1p = cpu_to_be32(nc->after_sb_1p); + p->after_sb_2p = cpu_to_be32(nc->after_sb_2p); + p->two_primaries = cpu_to_be32(nc->two_primaries); + cf = 0; + if (nc->discard_my_data) + cf |= CF_DISCARD_MY_DATA; + if (nc->tentative) + cf |= CF_DRY_RUN; + p->conn_flags = cpu_to_be32(cf); + + if (connection->agreed_pro_version >= 87) + strcpy(p->integrity_alg, nc->integrity_alg); + rcu_read_unlock(); + + return __conn_send_command(connection, sock, cmd, size, NULL, 0); +} + +int drbd_send_protocol(struct drbd_connection *connection) +{ + int err; + + mutex_lock(&connection->data.mutex); + err = __drbd_send_protocol(connection, P_PROTOCOL); + mutex_unlock(&connection->data.mutex); + + return err; +} + +static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_uuids *p; + int i; + + if (!get_ldev_if_state(device, D_NEGOTIATING)) + return 0; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) { + put_ldev(device); + return -EIO; + } + spin_lock_irq(&device->ldev->md.uuid_lock); + for (i = UI_CURRENT; i < UI_SIZE; i++) + p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + device->comm_bm_set = drbd_bm_total_weight(device); + p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set); + rcu_read_lock(); + uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0; + rcu_read_unlock(); + uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0; + uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; + p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + + put_ldev(device); + return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0); +} + +int drbd_send_uuids(struct drbd_peer_device *peer_device) +{ + return _drbd_send_uuids(peer_device, 0); +} + +int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device) +{ + return _drbd_send_uuids(peer_device, 8); +} + +void drbd_print_uuids(struct drbd_device *device, const char *text) +{ + if (get_ldev_if_state(device, D_NEGOTIATING)) { + u64 *uuid = device->ldev->md.uuid; + drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END]); + put_ldev(device); + } else { + drbd_info(device, "%s effective data uuid: %016llX\n", + text, + (unsigned long long)device->ed_uuid); + } +} + +void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_rs_uuid *p; + u64 uuid; + + D_ASSERT(device, device->state.disk == D_UP_TO_DATE); + + uuid = device->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(device, UI_BITMAP, uuid); + drbd_print_uuids(device, "updated sync UUID"); + drbd_md_sync(device); + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (p) { + p->uuid = cpu_to_be64(uuid); + drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0); + } +} + +int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_sizes *p; + sector_t d_size, u_size; + int q_order_type; + unsigned int max_bio_size; + + if (get_ldev_if_state(device, D_NEGOTIATING)) { + D_ASSERT(device, device->ldev->backing_bdev); + d_size = drbd_get_max_capacity(device->ldev); + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + q_order_type = drbd_queue_order_type(device); + max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); + put_ldev(device); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ + } + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + + if (peer_device->connection->agreed_pro_version <= 94) + max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + else if (peer_device->connection->agreed_pro_version < 100) + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95); + + p->d_size = cpu_to_be64(d_size); + p->u_size = cpu_to_be64(u_size); + p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev)); + p->max_bio_size = cpu_to_be32(max_bio_size); + p->queue_order_type = cpu_to_be16(q_order_type); + p->dds_flags = cpu_to_be16(flags); + return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0); +} + +/** + * drbd_send_current_state() - Sends the drbd state to the peer + * @peer_device: DRBD peer device. + */ +int drbd_send_current_state(struct drbd_peer_device *peer_device) +{ + struct drbd_socket *sock; + struct p_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */ + return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0); +} + +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @peer_device: DRBD peer device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state) +{ + struct drbd_socket *sock; + struct p_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->state = cpu_to_be32(state.i); /* Within the send mutex */ + return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0); +} + +int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val) +{ + struct drbd_socket *sock; + struct p_req_state *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0); +} + +int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +{ + enum drbd_packet cmd; + struct drbd_socket *sock; + struct p_req_state *p; + + cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ; + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + p->mask = cpu_to_be32(mask.i); + p->val = cpu_to_be32(val.i); + return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0); +} + +void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + + sock = &peer_device->connection->meta; + p = drbd_prepare_command(peer_device, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0); + } +} + +void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode) +{ + struct drbd_socket *sock; + struct p_req_state_reply *p; + enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY; + + sock = &connection->meta; + p = conn_prepare_command(connection, sock); + if (p) { + p->retcode = cpu_to_be32(retcode); + conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0); + } +} + +static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static void dcbp_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); +} + +static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +static int fill_bitmap_rle_bits(struct drbd_device *device, + struct p_compressed_bm *p, + unsigned int size, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + unsigned long plain_bits; + unsigned long tmp; + unsigned long rl; + unsigned len; + unsigned toggle; + int bits, use_rle; + + /* may we use this feature? */ + rcu_read_lock(); + use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle; + rcu_read_unlock(); + if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90) + return 0; + + if (c->bit_offset >= c->bm_bits) + return 0; /* nothing to do. */ + + /* use at most thus many bytes */ + bitstream_init(&bs, p->code, size, 0); + memset(p->code, 0, size); + /* plain bits covered in this code string */ + plain_bits = 0; + + /* p->encoding & 0x80 stores whether the first run length is set. + * bit offset is implicit. + * start with toggle == 2 to be able to tell the first iteration */ + toggle = 2; + + /* see how much plain bits we can stuff into one packet + * using RLE and VLI. */ + do { + tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset) + : _drbd_bm_find_next(device, c->bit_offset); + if (tmp == -1UL) + tmp = c->bm_bits; + rl = tmp - c->bit_offset; + + if (toggle == 2) { /* first iteration */ + if (rl == 0) { + /* the first checked bit was set, + * store start value, */ + dcbp_set_start(p, 1); + /* but skip encoding of zero run length */ + toggle = !toggle; + continue; + } + dcbp_set_start(p, 0); + } + + /* paranoia: catch zero runlength. + * can only happen if bitmap is modified while we scan it. */ + if (rl == 0) { + drbd_err(device, "unexpected zero runlength while encoding bitmap " + "t:%u bo:%lu\n", toggle, c->bit_offset); + return -1; + } + + bits = vli_encode_bits(&bs, rl); + if (bits == -ENOBUFS) /* buffer full */ + break; + if (bits <= 0) { + drbd_err(device, "error while encoding bitmap: %d\n", bits); + return 0; + } + + toggle = !toggle; + plain_bits += rl; + c->bit_offset = tmp; + } while (c->bit_offset < c->bm_bits); + + len = bs.cur.b - p->code + !!bs.cur.bit; + + if (plain_bits < (len << 3)) { + /* incompressible with this method. + * we need to rewind both word and bit position. */ + c->bit_offset -= plain_bits; + bm_xfer_ctx_bit_to_word_offset(c); + c->bit_offset = c->word_offset * BITS_PER_LONG; + return 0; + } + + /* RLE + VLI was able to compress it just fine. + * update c->word_offset. */ + bm_xfer_ctx_bit_to_word_offset(c); + + /* store pad_bits */ + dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + + return len; +} + +/** + * send_bitmap_rle_or_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c) +{ + struct drbd_socket *sock = &first_peer_device(device)->connection->data; + unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + struct p_compressed_bm *p = sock->sbuf + header_size; + int len, err; + + len = fill_bitmap_rle_bits(device, p, + DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c); + if (len < 0) + return -EIO; + + if (len) { + dcbp_set_code(p, RLE_VLI_Bits); + err = __send_command(first_peer_device(device)->connection, device->vnr, sock, + P_COMPRESSED_BITMAP, sizeof(*p) + len, + NULL, 0); + c->packets[0]++; + c->bytes[0] += header_size + sizeof(*p) + len; + + if (c->bit_offset >= c->bm_bits) + len = 0; /* DONE */ + } else { + /* was not compressible. + * send a buffer full of plain text bits instead. */ + unsigned int data_size; + unsigned long num_words; + unsigned long *p = sock->sbuf + header_size; + + data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + len = num_words * sizeof(*p); + if (len) + drbd_bm_get_lel(device, c->word_offset, num_words, p); + err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0); + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + + c->packets[1]++; + c->bytes[1] += header_size + len; + + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + } + if (!err) { + if (len == 0) { + INFO_bm_xfer_stats(device, "send", c); + return 0; + } else + return 1; + } + return -EIO; +} + +/* See the comment at receive_bitmap() */ +static int _drbd_send_bitmap(struct drbd_device *device) +{ + struct bm_xfer_ctx c; + int err; + + if (!expect(device->bitmap)) + return false; + + if (get_ldev(device)) { + if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) { + drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n"); + drbd_bm_set_all(device); + if (drbd_bm_write(device)) { + /* write_bm did fail! Leave full sync flag set in Meta P_DATA + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + drbd_err(device, "Failed to write bitmap to disk!\n"); + } else { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + put_ldev(device); + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(device), + .bm_words = drbd_bm_words(device), + }; + + do { + err = send_bitmap_rle_or_plain(device, &c); + } while (err > 0); + + return err == 0; +} + +int drbd_send_bitmap(struct drbd_device *device) +{ + struct drbd_socket *sock = &first_peer_device(device)->connection->data; + int err = -1; + + mutex_lock(&sock->mutex); + if (sock->socket) + err = !_drbd_send_bitmap(device); + mutex_unlock(&sock->mutex); + return err; +} + +void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size) +{ + struct drbd_socket *sock; + struct p_barrier_ack *p; + + if (connection->cstate < C_WF_REPORT_PARAMS) + return; + + sock = &connection->meta; + p = conn_prepare_command(connection, sock); + if (!p) + return; + p->barrier = barrier_nr; + p->set_size = cpu_to_be32(set_size); + conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0); +} + +/** + * _drbd_send_ack() - Sends an ack packet + * @device: DRBD device. + * @cmd: Packet command code. + * @sector: sector, needs to be in big endian byte order + * @blksize: size in byte, needs to be in big endian byte order + * @block_id: Id, big endian byte order + */ +static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + u64 sector, u32 blksize, u64 block_id) +{ + struct drbd_socket *sock; + struct p_block_ack *p; + + if (peer_device->device->state.conn < C_CONNECTED) + return -EIO; + + sock = &peer_device->connection->meta; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = sector; + p->block_id = block_id; + p->blksize = blksize; + p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq)); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0); +} + +/* dp->sector and dp->block_id already/still in network byte order, + * data_size is payload size according to dp->head, + * and may need to be corrected for digest size. */ +void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct p_data *dp, int data_size) +{ + if (peer_device->connection->peer_integrity_tfm) + data_size -= crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); +} + +void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct p_block_req *rp) +{ + _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id); +} + +/** + * drbd_send_ack() - Sends an ack packet + * @device: DRBD device + * @cmd: packet command code + * @peer_req: peer request + */ +int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) +{ + return _drbd_send_ack(peer_device, cmd, + cpu_to_be64(peer_req->i.sector), + cpu_to_be32(peer_req->i.size), + peer_req->block_id); +} + +/* This function misuses the block_id field to signal if the blocks + * are is sync or not. */ +int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + sector_t sector, int blksize, u64 block_id) +{ + return _drbd_send_ack(peer_device, cmd, + cpu_to_be64(sector), + cpu_to_be32(blksize), + cpu_to_be64(block_id)); +} + +int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, + sector_t sector, int size, u64 block_id) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = block_id; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0); +} + +int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size, + void *digest, int digest_size, enum drbd_packet cmd) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + /* FIXME: Put the digest into the preallocated socket buffer. */ + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size); +} + +int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size) +{ + struct drbd_socket *sock; + struct p_block_req *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(sector); + p->block_id = ID_SYNCER /* unused */; + p->blksize = cpu_to_be32(size); + return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0); +} + +/* called on sndtimeo + * returns false if we should retry, + * true if we think connection is dead + */ +static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock) +{ + int drop_it; + /* long elapsed = (long)(jiffies - device->last_received); */ + + drop_it = connection->meta.socket == sock + || !connection->asender.task + || get_t_state(&connection->asender) != RUNNING + || connection->cstate < C_WF_REPORT_PARAMS; + + if (drop_it) + return true; + + drop_it = !--connection->ko_count; + if (!drop_it) { + drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, connection->ko_count); + request_ping(connection); + } + + return drop_it; /* && (device->state == R_PRIMARY) */; +} + +static void drbd_update_congested(struct drbd_connection *connection) +{ + struct sock *sk = connection->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &connection->flags); +} + +/* The idea of sendpage seems to be to put some kind of reference + * to the page into the skb, and to hand it over to the NIC. In + * this process get_page() gets called. + * + * As soon as the page was really sent over the network put_page() + * gets called by some part of the network layer. [ NIC driver? ] + * + * [ get_page() / put_page() increment/decrement the count. If count + * reaches 0 the page will be freed. ] + * + * This works nicely with pages from FSs. + * But this means that in protocol A we might signal IO completion too early! + * + * In order not to corrupt data during a resync we must make sure + * that we do not reuse our own buffer pages (EEs) to early, therefore + * we have the net_ee list. + * + * XFS seems to have problems, still, it submits pages with page_count == 0! + * As a workaround, we disable sendpage on pages + * with page_count == 0 or PageSlab. + */ +static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page, + int offset, size_t size, unsigned msg_flags) +{ + struct socket *socket; + void *addr; + int err; + + socket = peer_device->connection->data.socket; + addr = kmap(page) + offset; + err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags); + kunmap(page); + if (!err) + peer_device->device->send_cnt += size >> 9; + return err; +} + +static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page, + int offset, size_t size, unsigned msg_flags) +{ + struct socket *socket = peer_device->connection->data.socket; + mm_segment_t oldfs = get_fs(); + int len = size; + int err = -EIO; + + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set. + * we cannot use send_page for those, as that does get_page(); + * put_page(); and would cause either a VM_BUG directly, or + * __page_cache_release a page that would actually still be referenced + * by someone, leading to some obscure delayed Oops somewhere else. */ + if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) + return _drbd_no_send_page(peer_device, page, offset, size, msg_flags); + + msg_flags |= MSG_NOSIGNAL; + drbd_update_congested(peer_device->connection); + set_fs(KERNEL_DS); + do { + int sent; + + sent = socket->ops->sendpage(socket, page, offset, len, msg_flags); + if (sent <= 0) { + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(peer_device->connection, socket)) + break; + continue; + } + drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n", + __func__, (int)size, len, sent); + if (sent < 0) + err = sent; + break; + } + len -= sent; + offset += sent; + } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/); + set_fs(oldfs); + clear_bit(NET_CONGESTED, &peer_device->connection->flags); + + if (len == 0) { + err = 0; + peer_device->device->send_cnt += size >> 9; + } + return err; +} + +static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + /* hint all but last page with MSG_MORE */ + bio_for_each_segment(bvec, bio, iter) { + int err; + + err = _drbd_no_send_page(peer_device, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bvec, iter) + ? 0 : MSG_MORE); + if (err) + return err; + } + return 0; +} + +static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio) +{ + struct bio_vec bvec; + struct bvec_iter iter; + + /* hint all but last page with MSG_MORE */ + bio_for_each_segment(bvec, bio, iter) { + int err; + + err = _drbd_send_page(peer_device, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, + bio_iter_last(bvec, iter) ? 0 : MSG_MORE); + if (err) + return err; + } + return 0; +} + +static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, + struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + unsigned len = peer_req->i.size; + int err; + + /* hint all but last page with MSG_MORE */ + page_chain_for_each(page) { + unsigned l = min_t(unsigned, len, PAGE_SIZE); + + err = _drbd_send_page(peer_device, page, 0, l, + page_chain_next(page) ? MSG_MORE : 0); + if (err) + return err; + len -= l; + } + return 0; +} + +static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw) +{ + if (connection->agreed_pro_version >= 95) + return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | + (bi_rw & REQ_FUA ? DP_FUA : 0) | + (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | + (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); + else + return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; +} + +/* Used to send write or TRIM aka REQ_DISCARD requests + * R_PRIMARY -> Peer (P_DATA, P_TRIM) + */ +int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_data *p; + unsigned int dp_flags = 0; + int digest_size; + int err; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + digest_size = peer_device->connection->integrity_tfm ? + crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->block_id = (unsigned long)req; + p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq)); + dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw); + if (device->state.conn >= C_SYNC_SOURCE && + device->state.conn <= C_PAUSED_SYNC_T) + dp_flags |= DP_MAY_SET_IN_SYNC; + if (peer_device->connection->agreed_pro_version >= 100) { + if (req->rq_state & RQ_EXP_RECEIVE_ACK) + dp_flags |= DP_SEND_RECEIVE_ACK; + /* During resync, request an explicit write ack, + * even in protocol != C */ + if (req->rq_state & RQ_EXP_WRITE_ACK + || (dp_flags & DP_MAY_SET_IN_SYNC)) + dp_flags |= DP_SEND_WRITE_ACK; + } + p->dp_flags = cpu_to_be32(dp_flags); + + if (dp_flags & DP_DISCARD) { + struct p_trim *t = (struct p_trim*)p; + t->size = cpu_to_be32(req->i.size); + err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); + goto out; + } + + /* our digest is still only over the payload. + * TRIM does not carry any payload. */ + if (digest_size) + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); + err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); + if (!err) { + /* For protocol A, we have to memcpy the payload into + * socket buffers, as we may complete right away + * as soon as we handed it over to tcp, at which point the data + * pages may become invalid. + * + * For data-integrity enabled, we copy it as well, so we can be + * sure that even if the bio pages may still be modified, it + * won't change the data on the wire, thus if the digest checks + * out ok after sending on this side, but does not fit on the + * receiving side, we sure have detected corruption elsewhere. + */ + if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size) + err = _drbd_send_bio(peer_device, req->master_bio); + else + err = _drbd_send_zc_bio(peer_device, req->master_bio); + + /* double check digest, sometimes buffers have been modified in flight. */ + if (digest_size > 0 && digest_size <= 64) { + /* 64 byte, 512 bit, is the largest digest size + * currently supported in kernel crypto. */ + unsigned char digest[64]; + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest); + if (memcmp(p + 1, digest, digest_size)) { + drbd_warn(device, + "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n", + (unsigned long long)req->i.sector, req->i.size); + } + } /* else if (digest_size > 64) { + ... Be noisy about digest too large ... + } */ + } +out: + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) + * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) + */ +int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd, + struct drbd_peer_request *peer_req) +{ + struct drbd_device *device = peer_device->device; + struct drbd_socket *sock; + struct p_data *p; + int err; + int digest_size; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + + digest_size = peer_device->connection->integrity_tfm ? + crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0; + + if (!p) + return -EIO; + p->sector = cpu_to_be64(peer_req->i.sector); + p->block_id = peer_req->block_id; + p->seq_num = 0; /* unused */ + p->dp_flags = 0; + if (digest_size) + drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1); + err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size); + if (!err) + err = _drbd_send_zc_ee(peer_device, peer_req); + mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */ + + return err; +} + +int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_socket *sock; + struct p_block_desc *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(req->i.sector); + p->blksize = cpu_to_be32(req->i.size); + return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0); +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(struct drbd_connection *connection, struct socket *sock, + void *buf, size_t size, unsigned msg_flags) +{ + struct kvec iov; + struct msghdr msg; + int rv, sent = 0; + + if (!sock) + return -EBADR; + + /* THINK if (signal_pending) return ... ? */ + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + + if (sock == connection->data.socket) { + rcu_read_lock(); + connection->ko_count = rcu_dereference(connection->net_conf)->ko_count; + rcu_read_unlock(); + drbd_update_congested(connection); + } + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(connection, sock)) + break; + else + continue; + } + if (rv == -EINTR) { + flush_signals(current); + rv = 0; + } + if (rv < 0) + break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while (sent < size); + + if (sock == connection->data.socket) + clear_bit(NET_CONGESTED, &connection->flags); + + if (rv <= 0) { + if (rv != -EAGAIN) { + drbd_err(connection, "%s_sendmsg returned %d\n", + sock == connection->meta.socket ? "msock" : "sock", + rv); + conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); + } else + conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); + } + + return sent; +} + +/** + * drbd_send_all - Send an entire buffer + * + * Returns 0 upon success and a negative error value otherwise. + */ +int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer, + size_t size, unsigned msg_flags) +{ + int err; + + err = drbd_send(connection, sock, buffer, size, msg_flags); + if (err < 0) + return err; + if (err != size) + return -EIO; + return 0; +} + +static int drbd_open(struct block_device *bdev, fmode_t mode) +{ + struct drbd_device *device = bdev->bd_disk->private_data; + unsigned long flags; + int rv = 0; + + mutex_lock(&drbd_main_mutex); + spin_lock_irqsave(&device->resource->req_lock, flags); + /* to have a stable device->state.role + * and no race with updating open_cnt */ + + if (device->state.role != R_PRIMARY) { + if (mode & FMODE_WRITE) + rv = -EROFS; + else if (!allow_oos) + rv = -EMEDIUMTYPE; + } + + if (!rv) + device->open_cnt++; + spin_unlock_irqrestore(&device->resource->req_lock, flags); + mutex_unlock(&drbd_main_mutex); + + return rv; +} + +static void drbd_release(struct gendisk *gd, fmode_t mode) +{ + struct drbd_device *device = gd->private_data; + mutex_lock(&drbd_main_mutex); + device->open_cnt--; + mutex_unlock(&drbd_main_mutex); +} + +static void drbd_set_defaults(struct drbd_device *device) +{ + /* Beware! The actual layout differs + * between big endian and little endian */ + device->state = (union drbd_dev_state) { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = C_STANDALONE, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; +} + +void drbd_init_set_defaults(struct drbd_device *device) +{ + /* the memset(,0,) did most of this. + * note: only assignments, no allocation in here */ + + drbd_set_defaults(device); + + atomic_set(&device->ap_bio_cnt, 0); + atomic_set(&device->ap_actlog_cnt, 0); + atomic_set(&device->ap_pending_cnt, 0); + atomic_set(&device->rs_pending_cnt, 0); + atomic_set(&device->unacked_cnt, 0); + atomic_set(&device->local_cnt, 0); + atomic_set(&device->pp_in_use_by_net, 0); + atomic_set(&device->rs_sect_in, 0); + atomic_set(&device->rs_sect_ev, 0); + atomic_set(&device->ap_in_flight, 0); + atomic_set(&device->md_io.in_use, 0); + + mutex_init(&device->own_state_mutex); + device->state_mutex = &device->own_state_mutex; + + spin_lock_init(&device->al_lock); + spin_lock_init(&device->peer_seq_lock); + + INIT_LIST_HEAD(&device->active_ee); + INIT_LIST_HEAD(&device->sync_ee); + INIT_LIST_HEAD(&device->done_ee); + INIT_LIST_HEAD(&device->read_ee); + INIT_LIST_HEAD(&device->net_ee); + INIT_LIST_HEAD(&device->resync_reads); + INIT_LIST_HEAD(&device->resync_work.list); + INIT_LIST_HEAD(&device->unplug_work.list); + INIT_LIST_HEAD(&device->bm_io_work.w.list); + INIT_LIST_HEAD(&device->pending_master_completion[0]); + INIT_LIST_HEAD(&device->pending_master_completion[1]); + INIT_LIST_HEAD(&device->pending_completion[0]); + INIT_LIST_HEAD(&device->pending_completion[1]); + + device->resync_work.cb = w_resync_timer; + device->unplug_work.cb = w_send_write_hint; + device->bm_io_work.w.cb = w_bitmap_io; + + init_timer(&device->resync_timer); + init_timer(&device->md_sync_timer); + init_timer(&device->start_resync_timer); + init_timer(&device->request_timer); + device->resync_timer.function = resync_timer_fn; + device->resync_timer.data = (unsigned long) device; + device->md_sync_timer.function = md_sync_timer_fn; + device->md_sync_timer.data = (unsigned long) device; + device->start_resync_timer.function = start_resync_timer_fn; + device->start_resync_timer.data = (unsigned long) device; + device->request_timer.function = request_timer_fn; + device->request_timer.data = (unsigned long) device; + + init_waitqueue_head(&device->misc_wait); + init_waitqueue_head(&device->state_wait); + init_waitqueue_head(&device->ee_wait); + init_waitqueue_head(&device->al_wait); + init_waitqueue_head(&device->seq_wait); + + device->resync_wenr = LC_FREE; + device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; + device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; +} + +void drbd_device_cleanup(struct drbd_device *device) +{ + int i; + if (first_peer_device(device)->connection->receiver.t_state != NONE) + drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n", + first_peer_device(device)->connection->receiver.t_state); + + device->al_writ_cnt = + device->bm_writ_cnt = + device->read_cnt = + device->recv_cnt = + device->send_cnt = + device->writ_cnt = + device->p_size = + device->rs_start = + device->rs_total = + device->rs_failed = 0; + device->rs_last_events = 0; + device->rs_last_sect_ev = 0; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = 0; + device->rs_mark_time[i] = 0; + } + D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL); + + drbd_set_my_capacity(device, 0); + if (device->bitmap) { + /* maybe never allocated. */ + drbd_bm_resize(device, 0, 1); + drbd_bm_cleanup(device); + } + + drbd_free_ldev(device->ldev); + device->ldev = NULL; + + clear_bit(AL_SUSPENDED, &device->flags); + + D_ASSERT(device, list_empty(&device->active_ee)); + D_ASSERT(device, list_empty(&device->sync_ee)); + D_ASSERT(device, list_empty(&device->done_ee)); + D_ASSERT(device, list_empty(&device->read_ee)); + D_ASSERT(device, list_empty(&device->net_ee)); + D_ASSERT(device, list_empty(&device->resync_reads)); + D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q)); + D_ASSERT(device, list_empty(&device->resync_work.list)); + D_ASSERT(device, list_empty(&device->unplug_work.list)); + + drbd_set_defaults(device); +} + + +static void drbd_destroy_mempools(void) +{ + struct page *page; + + while (drbd_pp_pool) { + page = drbd_pp_pool; + drbd_pp_pool = (struct page *)page_private(page); + __free_page(page); + drbd_pp_vacant--; + } + + /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */ + + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); + if (drbd_ee_mempool) + mempool_destroy(drbd_ee_mempool); + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache) + kmem_cache_destroy(drbd_ee_cache); + if (drbd_request_cache) + kmem_cache_destroy(drbd_request_cache); + if (drbd_bm_ext_cache) + kmem_cache_destroy(drbd_bm_ext_cache); + if (drbd_al_ext_cache) + kmem_cache_destroy(drbd_al_ext_cache); + + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; + drbd_ee_mempool = NULL; + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + + return; +} + +static int drbd_create_mempools(void) +{ + struct page *page; + const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count; + int i; + + /* prepare our caches and mempools */ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; + + /* caches */ + drbd_request_cache = kmem_cache_create( + "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + drbd_bm_ext_cache = kmem_cache_create( + "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); + if (drbd_bm_ext_cache == NULL) + goto Enomem; + + drbd_al_ext_cache = kmem_cache_create( + "drbd_al", sizeof(struct lc_element), 0, 0, NULL); + if (drbd_al_ext_cache == NULL) + goto Enomem; + + /* mempools */ + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + + drbd_request_mempool = mempool_create_slab_pool(number, + drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache); + if (drbd_ee_mempool == NULL) + goto Enomem; + + /* drbd's page pool */ + spin_lock_init(&drbd_pp_lock); + + for (i = 0; i < number; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto Enomem; + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + } + drbd_pp_vacant = number; + + return 0; + +Enomem: + drbd_destroy_mempools(); /* in case we allocated some */ + return -ENOMEM; +} + +static void drbd_release_all_peer_reqs(struct drbd_device *device) +{ + int rr; + + rr = drbd_free_peer_reqs(device, &device->active_ee); + if (rr) + drbd_err(device, "%d EEs in active list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->sync_ee); + if (rr) + drbd_err(device, "%d EEs in sync list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->read_ee); + if (rr) + drbd_err(device, "%d EEs in read list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->done_ee); + if (rr) + drbd_err(device, "%d EEs in done list found!\n", rr); + + rr = drbd_free_peer_reqs(device, &device->net_ee); + if (rr) + drbd_err(device, "%d EEs in net list found!\n", rr); +} + +/* caution. no locking. */ +void drbd_destroy_device(struct kref *kref) +{ + struct drbd_device *device = container_of(kref, struct drbd_device, kref); + struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device, *tmp_peer_device; + + del_timer_sync(&device->request_timer); + + /* paranoia asserts */ + D_ASSERT(device, device->open_cnt == 0); + /* end paranoia asserts */ + + /* cleanup stuff that may have been allocated during + * device (re-)configuration or state changes */ + + if (device->this_bdev) + bdput(device->this_bdev); + + drbd_free_ldev(device->ldev); + device->ldev = NULL; + + drbd_release_all_peer_reqs(device); + + lc_destroy(device->act_log); + lc_destroy(device->resync); + + kfree(device->p_uuid); + /* device->p_uuid = NULL; */ + + if (device->bitmap) /* should no longer be there. */ + drbd_bm_cleanup(device); + __free_page(device->md_io.page); + put_disk(device->vdisk); + blk_cleanup_queue(device->rq_queue); + kfree(device->rs_plan_s); + + /* not for_each_connection(connection, resource): + * those may have been cleaned up and disassociated already. + */ + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + kref_put(&peer_device->connection->kref, drbd_destroy_connection); + kfree(peer_device); + } + memset(device, 0xfd, sizeof(*device)); + kfree(device); + kref_put(&resource->kref, drbd_destroy_resource); +} + +/* One global retry thread, if we need to push back some bio and have it + * reinserted through our make request function. + */ +static struct retry_worker { + struct workqueue_struct *wq; + struct work_struct worker; + + spinlock_t lock; + struct list_head writes; +} retry; + +static void do_retry(struct work_struct *ws) +{ + struct retry_worker *retry = container_of(ws, struct retry_worker, worker); + LIST_HEAD(writes); + struct drbd_request *req, *tmp; + + spin_lock_irq(&retry->lock); + list_splice_init(&retry->writes, &writes); + spin_unlock_irq(&retry->lock); + + list_for_each_entry_safe(req, tmp, &writes, tl_requests) { + struct drbd_device *device = req->device; + struct bio *bio = req->master_bio; + unsigned long start_jif = req->start_jif; + bool expected; + + expected = + expect(atomic_read(&req->completion_ref) == 0) && + expect(req->rq_state & RQ_POSTPONED) && + expect((req->rq_state & RQ_LOCAL_PENDING) == 0 || + (req->rq_state & RQ_LOCAL_ABORTED) != 0); + + if (!expected) + drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n", + req, atomic_read(&req->completion_ref), + req->rq_state); + + /* We still need to put one kref associated with the + * "completion_ref" going zero in the code path that queued it + * here. The request object may still be referenced by a + * frozen local req->private_bio, in case we force-detached. + */ + kref_put(&req->kref, drbd_req_destroy); + + /* A single suspended or otherwise blocking device may stall + * all others as well. Fortunately, this code path is to + * recover from a situation that "should not happen": + * concurrent writes in multi-primary setup. + * In a "normal" lifecycle, this workqueue is supposed to be + * destroyed without ever doing anything. + * If it turns out to be an issue anyways, we can do per + * resource (replication group) or per device (minor) retry + * workqueues instead. + */ + + /* We are not just doing generic_make_request(), + * as we want to keep the start_time information. */ + inc_ap_bio(device); + __drbd_make_request(device, bio, start_jif); + } +} + +/* called via drbd_req_put_completion_ref(), + * holds resource->req_lock */ +void drbd_restart_request(struct drbd_request *req) +{ + unsigned long flags; + spin_lock_irqsave(&retry.lock, flags); + list_move_tail(&req->tl_requests, &retry.writes); + spin_unlock_irqrestore(&retry.lock, flags); + + /* Drop the extra reference that would otherwise + * have been dropped by complete_master_bio. + * do_retry() needs to grab a new one. */ + dec_ap_bio(req->device); + + queue_work(retry.wq, &retry.worker); +} + +void drbd_destroy_resource(struct kref *kref) +{ + struct drbd_resource *resource = + container_of(kref, struct drbd_resource, kref); + + idr_destroy(&resource->devices); + free_cpumask_var(resource->cpu_mask); + kfree(resource->name); + memset(resource, 0xf2, sizeof(*resource)); + kfree(resource); +} + +void drbd_free_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection, *tmp; + + for_each_connection_safe(connection, tmp, resource) { + list_del(&connection->connections); + drbd_debugfs_connection_cleanup(connection); + kref_put(&connection->kref, drbd_destroy_connection); + } + drbd_debugfs_resource_cleanup(resource); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static void drbd_cleanup(void) +{ + unsigned int i; + struct drbd_device *device; + struct drbd_resource *resource, *tmp; + + /* first remove proc, + * drbdsetup uses it's presence to detect + * whether DRBD is loaded. + * If we would get stuck in proc removal, + * but have netlink already deregistered, + * some drbdsetup commands may wait forever + * for an answer. + */ + if (drbd_proc) + remove_proc_entry("drbd", NULL); + + if (retry.wq) + destroy_workqueue(retry.wq); + + drbd_genl_unregister(); + drbd_debugfs_cleanup(); + + idr_for_each_entry(&drbd_devices, device, i) + drbd_delete_device(device); + + /* not _rcu since, no other updater anymore. Genl already unregistered */ + for_each_resource_safe(resource, tmp, &drbd_resources) { + list_del(&resource->resources); + drbd_free_resource(resource); + } + + drbd_destroy_mempools(); + unregister_blkdev(DRBD_MAJOR, "drbd"); + + idr_destroy(&drbd_devices); + + pr_info("module cleanup done.\n"); +} + +/** + * drbd_congested() - Callback for the flusher thread + * @congested_data: User data + * @bdi_bits: Bits the BDI flusher thread is currently interested in + * + * Returns 1<connection->flags)) { + r |= (1 << BDI_async_congested); + /* Without good local data, we would need to read from remote, + * and that would need the worker thread as well, which is + * currently blocked waiting for that usermode helper to + * finish. + */ + if (!get_ldev_if_state(device, D_UP_TO_DATE)) + r |= (1 << BDI_sync_congested); + else + put_ldev(device); + r &= bdi_bits; + reason = 'c'; + goto out; + } + + if (get_ldev(device)) { + q = bdev_get_queue(device->ldev->backing_bdev); + r = bdi_congested(&q->backing_dev_info, bdi_bits); + put_ldev(device); + if (r) + reason = 'b'; + } + + if (bdi_bits & (1 << BDI_async_congested) && + test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { + r |= (1 << BDI_async_congested); + reason = reason == 'b' ? 'a' : 'n'; + } + +out: + device->congestion_reason = reason; + return r; +} + +static void drbd_init_workqueue(struct drbd_work_queue* wq) +{ + spin_lock_init(&wq->q_lock); + INIT_LIST_HEAD(&wq->q); + init_waitqueue_head(&wq->q_wait); +} + +struct completion_work { + struct drbd_work w; + struct completion done; +}; + +static int w_complete(struct drbd_work *w, int cancel) +{ + struct completion_work *completion_work = + container_of(w, struct completion_work, w); + + complete(&completion_work->done); + return 0; +} + +void drbd_flush_workqueue(struct drbd_work_queue *work_queue) +{ + struct completion_work completion_work; + + completion_work.w.cb = w_complete; + init_completion(&completion_work.done); + drbd_queue_work(work_queue, &completion_work.w); + wait_for_completion(&completion_work.done); +} + +struct drbd_resource *drbd_find_resource(const char *name) +{ + struct drbd_resource *resource; + + if (!name || !name[0]) + return NULL; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + if (!strcmp(resource->name, name)) { + kref_get(&resource->kref); + goto found; + } + } + resource = NULL; +found: + rcu_read_unlock(); + return resource; +} + +struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len, + void *peer_addr, int peer_addr_len) +{ + struct drbd_resource *resource; + struct drbd_connection *connection; + + rcu_read_lock(); + for_each_resource_rcu(resource, &drbd_resources) { + for_each_connection_rcu(connection, resource) { + if (connection->my_addr_len == my_addr_len && + connection->peer_addr_len == peer_addr_len && + !memcmp(&connection->my_addr, my_addr, my_addr_len) && + !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) { + kref_get(&connection->kref); + goto found; + } + } + } + connection = NULL; +found: + rcu_read_unlock(); + return connection; +} + +static int drbd_alloc_socket(struct drbd_socket *socket) +{ + socket->rbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->rbuf) + return -ENOMEM; + socket->sbuf = (void *) __get_free_page(GFP_KERNEL); + if (!socket->sbuf) + return -ENOMEM; + return 0; +} + +static void drbd_free_socket(struct drbd_socket *socket) +{ + free_page((unsigned long) socket->sbuf); + free_page((unsigned long) socket->rbuf); +} + +void conn_free_crypto(struct drbd_connection *connection) +{ + drbd_free_sock(connection); + + crypto_free_hash(connection->csums_tfm); + crypto_free_hash(connection->verify_tfm); + crypto_free_hash(connection->cram_hmac_tfm); + crypto_free_hash(connection->integrity_tfm); + crypto_free_hash(connection->peer_integrity_tfm); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + + connection->csums_tfm = NULL; + connection->verify_tfm = NULL; + connection->cram_hmac_tfm = NULL; + connection->integrity_tfm = NULL; + connection->peer_integrity_tfm = NULL; + connection->int_dig_in = NULL; + connection->int_dig_vv = NULL; +} + +int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts) +{ + struct drbd_connection *connection; + cpumask_var_t new_cpu_mask; + int err; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { + err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE, + cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err == -EOVERFLOW) { + /* So what. mask it out. */ + cpumask_var_t tmp_cpu_mask; + if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) { + cpumask_setall(tmp_cpu_mask); + cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask); + drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n", + res_opts->cpu_mask, + strlen(res_opts->cpu_mask) > 12 ? "..." : "", + nr_cpu_ids); + free_cpumask_var(tmp_cpu_mask); + err = 0; + } + } + if (err) { + drbd_warn(resource, "bitmap_parse() failed with %d\n", err); + /* retcode = ERR_CPU_MASK_PARSE; */ + goto fail; + } + } + resource->res_opts = *res_opts; + if (cpumask_empty(new_cpu_mask)) + drbd_calc_cpu_mask(&new_cpu_mask); + if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) { + cpumask_copy(resource->cpu_mask, new_cpu_mask); + for_each_connection_rcu(connection, resource) { + connection->receiver.reset_cpu_mask = 1; + connection->asender.reset_cpu_mask = 1; + connection->worker.reset_cpu_mask = 1; + } + } + err = 0; + +fail: + free_cpumask_var(new_cpu_mask); + return err; + +} + +struct drbd_resource *drbd_create_resource(const char *name) +{ + struct drbd_resource *resource; + + resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL); + if (!resource) + goto fail; + resource->name = kstrdup(name, GFP_KERNEL); + if (!resource->name) + goto fail_free_resource; + if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL)) + goto fail_free_name; + kref_init(&resource->kref); + idr_init(&resource->devices); + INIT_LIST_HEAD(&resource->connections); + resource->write_ordering = WO_bdev_flush; + list_add_tail_rcu(&resource->resources, &drbd_resources); + mutex_init(&resource->conf_update); + mutex_init(&resource->adm_mutex); + spin_lock_init(&resource->req_lock); + drbd_debugfs_resource_add(resource); + return resource; + +fail_free_name: + kfree(resource->name); +fail_free_resource: + kfree(resource); +fail: + return NULL; +} + +/* caller must be under adm_mutex */ +struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) +{ + struct drbd_resource *resource; + struct drbd_connection *connection; + + connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL); + if (!connection) + return NULL; + + if (drbd_alloc_socket(&connection->data)) + goto fail; + if (drbd_alloc_socket(&connection->meta)) + goto fail; + + connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!connection->current_epoch) + goto fail; + + INIT_LIST_HEAD(&connection->transfer_log); + + INIT_LIST_HEAD(&connection->current_epoch->list); + connection->epochs = 1; + spin_lock_init(&connection->epoch_lock); + + connection->send.seen_any_write_yet = false; + connection->send.current_epoch_nr = 0; + connection->send.current_epoch_writes = 0; + + resource = drbd_create_resource(name); + if (!resource) + goto fail; + + connection->cstate = C_STANDALONE; + mutex_init(&connection->cstate_mutex); + init_waitqueue_head(&connection->ping_wait); + idr_init(&connection->peer_devices); + + drbd_init_workqueue(&connection->sender_work); + mutex_init(&connection->data.mutex); + mutex_init(&connection->meta.mutex); + + drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver"); + connection->receiver.connection = connection; + drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); + connection->worker.connection = connection; + drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); + connection->asender.connection = connection; + + kref_init(&connection->kref); + + connection->resource = resource; + + if (set_resource_options(resource, res_opts)) + goto fail_resource; + + kref_get(&resource->kref); + list_add_tail_rcu(&connection->connections, &resource->connections); + drbd_debugfs_connection_add(connection); + return connection; + +fail_resource: + list_del(&resource->resources); + drbd_free_resource(resource); +fail: + kfree(connection->current_epoch); + drbd_free_socket(&connection->meta); + drbd_free_socket(&connection->data); + kfree(connection); + return NULL; +} + +void drbd_destroy_connection(struct kref *kref) +{ + struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref); + struct drbd_resource *resource = connection->resource; + + if (atomic_read(&connection->current_epoch->epoch_size) != 0) + drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size)); + kfree(connection->current_epoch); + + idr_destroy(&connection->peer_devices); + + drbd_free_socket(&connection->meta); + drbd_free_socket(&connection->data); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + memset(connection, 0xfc, sizeof(*connection)); + kfree(connection); + kref_put(&resource->kref, drbd_destroy_resource); +} + +static int init_submitter(struct drbd_device *device) +{ + /* opencoded create_singlethread_workqueue(), + * to be able to say "drbd%d", ..., minor */ + device->submit.wq = alloc_workqueue("drbd%u_submit", + WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + if (!device->submit.wq) + return -ENOMEM; + + INIT_WORK(&device->submit.worker, do_submit); + INIT_LIST_HEAD(&device->submit.writes); + return 0; +} + +enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor) +{ + struct drbd_resource *resource = adm_ctx->resource; + struct drbd_connection *connection; + struct drbd_device *device; + struct drbd_peer_device *peer_device, *tmp_peer_device; + struct gendisk *disk; + struct request_queue *q; + int id; + int vnr = adm_ctx->volume; + enum drbd_ret_code err = ERR_NOMEM; + + device = minor_to_device(minor); + if (device) + return ERR_MINOR_OR_VOLUME_EXISTS; + + /* GFP_KERNEL, we are outside of all write-out paths */ + device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); + if (!device) + return ERR_NOMEM; + kref_init(&device->kref); + + kref_get(&resource->kref); + device->resource = resource; + device->minor = minor; + device->vnr = vnr; + + drbd_init_set_defaults(device); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + goto out_no_q; + device->rq_queue = q; + q->queuedata = device; + + disk = alloc_disk(1); + if (!disk) + goto out_no_disk; + device->vdisk = disk; + + set_disk_ro(disk, true); + + disk->queue = q; + disk->major = DRBD_MAJOR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, "drbd%d", minor); + disk->private_data = device; + + device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); + /* we have no partitions. we contain only ourselves. */ + device->this_bdev->bd_contains = device->this_bdev; + + q->backing_dev_info.congested_fn = drbd_congested; + q->backing_dev_info.congested_data = device; + + blk_queue_make_request(q, drbd_make_request); + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + /* Setting the max_hw_sectors to an odd value of 8kibyte here + This triggers a max_bio_size message upon first attach or connect */ + blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &resource->req_lock; + + device->md_io.page = alloc_page(GFP_KERNEL); + if (!device->md_io.page) + goto out_no_io_page; + + if (drbd_bm_init(device)) + goto out_no_bitmap; + device->read_requests = RB_ROOT; + device->write_requests = RB_ROOT; + + id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; + goto out_no_minor_idr; + } + kref_get(&device->kref); + + id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; + goto out_idr_remove_minor; + } + kref_get(&device->kref); + + INIT_LIST_HEAD(&device->peer_devices); + INIT_LIST_HEAD(&device->pending_bitmap_io); + for_each_connection(connection, resource) { + peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL); + if (!peer_device) + goto out_idr_remove_from_resource; + peer_device->connection = connection; + peer_device->device = device; + + list_add(&peer_device->peer_devices, &device->peer_devices); + kref_get(&device->kref); + + id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); + if (id < 0) { + if (id == -ENOSPC) + err = ERR_INVALID_REQUEST; + goto out_idr_remove_from_resource; + } + kref_get(&connection->kref); + } + + if (init_submitter(device)) { + err = ERR_NOMEM; + goto out_idr_remove_vol; + } + + add_disk(disk); + + /* inherit the connection state */ + device->state.conn = first_connection(resource)->cstate; + if (device->state.conn == C_WF_REPORT_PARAMS) { + for_each_peer_device(peer_device, device) + drbd_connected(peer_device); + } + /* move to create_peer_device() */ + for_each_peer_device(peer_device, device) + drbd_debugfs_peer_device_add(peer_device); + drbd_debugfs_device_add(device); + return NO_ERROR; + +out_idr_remove_vol: + idr_remove(&connection->peer_devices, vnr); +out_idr_remove_from_resource: + for_each_connection(connection, resource) { + peer_device = idr_find(&connection->peer_devices, vnr); + if (peer_device) { + idr_remove(&connection->peer_devices, vnr); + kref_put(&connection->kref, drbd_destroy_connection); + } + } + for_each_peer_device_safe(peer_device, tmp_peer_device, device) { + list_del(&peer_device->peer_devices); + kfree(peer_device); + } + idr_remove(&resource->devices, vnr); +out_idr_remove_minor: + idr_remove(&drbd_devices, minor); + synchronize_rcu(); +out_no_minor_idr: + drbd_bm_cleanup(device); +out_no_bitmap: + __free_page(device->md_io.page); +out_no_io_page: + put_disk(disk); +out_no_disk: + blk_cleanup_queue(q); +out_no_q: + kref_put(&resource->kref, drbd_destroy_resource); + kfree(device); + return err; +} + +void drbd_delete_device(struct drbd_device *device) +{ + struct drbd_resource *resource = device->resource; + struct drbd_connection *connection; + struct drbd_peer_device *peer_device; + int refs = 3; + + /* move to free_peer_device() */ + for_each_peer_device(peer_device, device) + drbd_debugfs_peer_device_cleanup(peer_device); + drbd_debugfs_device_cleanup(device); + for_each_connection(connection, resource) { + idr_remove(&connection->peer_devices, device->vnr); + refs++; + } + idr_remove(&resource->devices, device->vnr); + idr_remove(&drbd_devices, device_to_minor(device)); + del_gendisk(device->vdisk); + synchronize_rcu(); + kref_sub(&device->kref, refs, drbd_destroy_device); +} + +static int __init drbd_init(void) +{ + int err; + + if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) { + pr_err("invalid minor_count (%d)\n", minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = DRBD_MINOR_COUNT_DEF; +#endif + } + + err = register_blkdev(DRBD_MAJOR, "drbd"); + if (err) { + pr_err("unable to register block device major %d\n", + DRBD_MAJOR); + return err; + } + + /* + * allocate all necessary structs + */ + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; /* play safe for drbd_cleanup */ + idr_init(&drbd_devices); + + rwlock_init(&global_state_lock); + INIT_LIST_HEAD(&drbd_resources); + + err = drbd_genl_register(); + if (err) { + pr_err("unable to register generic netlink family\n"); + goto fail; + } + + err = drbd_create_mempools(); + if (err) + goto fail; + + err = -ENOMEM; + drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL); + if (!drbd_proc) { + pr_err("unable to register proc file\n"); + goto fail; + } + + retry.wq = create_singlethread_workqueue("drbd-reissue"); + if (!retry.wq) { + pr_err("unable to create retry workqueue\n"); + goto fail; + } + INIT_WORK(&retry.worker, do_retry); + spin_lock_init(&retry.lock); + INIT_LIST_HEAD(&retry.writes); + + if (drbd_debugfs_init()) + pr_notice("failed to initialize debugfs -- will not be available\n"); + + pr_info("initialized. " + "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); + pr_info("%s\n", drbd_buildtag()); + pr_info("registered as block device major %d\n", DRBD_MAJOR); + return 0; /* Success! */ + +fail: + drbd_cleanup(); + if (err == -ENOMEM) + pr_err("ran out of memory\n"); + else + pr_err("initialization failure\n"); + return err; +} + +void drbd_free_ldev(struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + + kfree(ldev->disk_conf); + kfree(ldev); +} + +static void drbd_free_one_sock(struct drbd_socket *ds) +{ + struct socket *s; + mutex_lock(&ds->mutex); + s = ds->socket; + ds->socket = NULL; + mutex_unlock(&ds->mutex); + if (s) { + /* so debugfs does not need to mutex_lock() */ + synchronize_rcu(); + kernel_sock_shutdown(s, SHUT_RDWR); + sock_release(s); + } +} + +void drbd_free_sock(struct drbd_connection *connection) +{ + if (connection->data.socket) + drbd_free_one_sock(&connection->data); + if (connection->meta.socket) + drbd_free_one_sock(&connection->meta); +} + +/* meta data management */ + +void conn_md_sync(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_md_sync(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +/* aligned 4kByte */ +struct meta_data_on_disk { + u64 la_size_sect; /* last agreed size. */ + u64 uuid[UI_SIZE]; /* UUIDs. */ + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; /* MDF */ + u32 magic; + u32 md_size_sect; + u32 al_offset; /* offset to this block */ + u32 al_nr_extents; /* important for restoring the AL (userspace) */ + /* `-- act_log->nr_elements <-- ldev->dc.al_extents */ + u32 bm_offset; /* offset to the bitmap, from here */ + u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ + u32 la_peer_max_bio_size; /* last peer max_bio_size */ + + /* see al_tr_number_to_on_disk_sector() */ + u32 al_stripes; + u32 al_stripe_size_4k; + + u8 reserved_u8[4096 - (7*8 + 10*4)]; +} __packed; + + + +void drbd_md_write(struct drbd_device *device, void *b) +{ + struct meta_data_on_disk *buffer = b; + sector_t sector; + int i; + + memset(buffer, 0, sizeof(*buffer)); + + buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev)); + for (i = UI_CURRENT; i < UI_SIZE; i++) + buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]); + buffer->flags = cpu_to_be32(device->ldev->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN); + + buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect); + buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset); + buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size); + + buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes); + buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k); + + D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset); + sector = device->ldev->md.md_offset; + + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + /* this was a try anyways ... */ + drbd_err(device, "meta data update failed!\n"); + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } +} + +/** + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set + * @device: DRBD device. + */ +void drbd_md_sync(struct drbd_device *device) +{ + struct meta_data_on_disk *buffer; + + /* Don't accidentally change the DRBD meta data layout. */ + BUILD_BUG_ON(UI_SIZE != 4); + BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096); + + del_timer(&device->md_sync_timer); + /* timer may be rearmed by drbd_md_mark_dirty() now. */ + if (!test_and_clear_bit(MD_DIRTY, &device->flags)) + return; + + /* We use here D_FAILED and not D_ATTACHING because we try to write + * metadata even if we detach due to a disk failure! */ + if (!get_ldev_if_state(device, D_FAILED)) + return; + + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) + goto out; + + drbd_md_write(device, buffer); + + /* Update device->ldev->md.la_size_sect, + * since we updated it on metadata. */ + device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev); + + drbd_md_put_buffer(device); +out: + put_ldev(device); +} + +static int check_activity_log_stripe_size(struct drbd_device *device, + struct meta_data_on_disk *on_disk, + struct drbd_md *in_core) +{ + u32 al_stripes = be32_to_cpu(on_disk->al_stripes); + u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k); + u64 al_size_4k; + + /* both not set: default to old fixed size activity log */ + if (al_stripes == 0 && al_stripe_size_4k == 0) { + al_stripes = 1; + al_stripe_size_4k = MD_32kB_SECT/8; + } + + /* some paranoia plausibility checks */ + + /* we need both values to be set */ + if (al_stripes == 0 || al_stripe_size_4k == 0) + goto err; + + al_size_4k = (u64)al_stripes * al_stripe_size_4k; + + /* Upper limit of activity log area, to avoid potential overflow + * problems in al_tr_number_to_on_disk_sector(). As right now, more + * than 72 * 4k blocks total only increases the amount of history, + * limiting this arbitrarily to 16 GB is not a real limitation ;-) */ + if (al_size_4k > (16 * 1024 * 1024/4)) + goto err; + + /* Lower limit: we need at least 8 transaction slots (32kB) + * to not break existing setups */ + if (al_size_4k < MD_32kB_SECT/8) + goto err; + + in_core->al_stripe_size_4k = al_stripe_size_4k; + in_core->al_stripes = al_stripes; + in_core->al_size_4k = al_size_4k; + + return 0; +err: + drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n", + al_stripes, al_stripe_size_4k); + return -EINVAL; +} + +static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + sector_t capacity = drbd_get_capacity(bdev->md_bdev); + struct drbd_md *in_core = &bdev->md; + s32 on_disk_al_sect; + s32 on_disk_bm_sect; + + /* The on-disk size of the activity log, calculated from offsets, and + * the size of the activity log calculated from the stripe settings, + * should match. + * Though we could relax this a bit: it is ok, if the striped activity log + * fits in the available on-disk activity log size. + * Right now, that would break how resize is implemented. + * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware + * of possible unused padding space in the on disk layout. */ + if (in_core->al_offset < 0) { + if (in_core->bm_offset > in_core->al_offset) + goto err; + on_disk_al_sect = -in_core->al_offset; + on_disk_bm_sect = in_core->al_offset - in_core->bm_offset; + } else { + if (in_core->al_offset != MD_4kB_SECT) + goto err; + if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT) + goto err; + + on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT; + on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset; + } + + /* old fixed size meta data is exactly that: fixed. */ + if (in_core->meta_dev_idx >= 0) { + if (in_core->md_size_sect != MD_128MB_SECT + || in_core->al_offset != MD_4kB_SECT + || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT + || in_core->al_stripes != 1 + || in_core->al_stripe_size_4k != MD_32kB_SECT/8) + goto err; + } + + if (capacity < in_core->md_size_sect) + goto err; + if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev)) + goto err; + + /* should be aligned, and at least 32k */ + if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT)) + goto err; + + /* should fit (for now: exactly) into the available on-disk space; + * overflow prevention is in check_activity_log_stripe_size() above. */ + if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT) + goto err; + + /* again, should be aligned */ + if (in_core->bm_offset & 7) + goto err; + + /* FIXME check for device grow with flex external meta data? */ + + /* can the available bitmap space cover the last agreed device size? */ + if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512) + goto err; + + return 0; + +err: + drbd_err(device, "meta data offsets don't make sense: idx=%d " + "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, " + "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n", + in_core->meta_dev_idx, + in_core->al_stripes, in_core->al_stripe_size_4k, + in_core->al_offset, in_core->bm_offset, in_core->md_size_sect, + (unsigned long long)in_core->la_size_sect, + (unsigned long long)capacity); + + return -EINVAL; +} + + +/** + * drbd_md_read() - Reads in the meta data super block + * @device: DRBD device. + * @bdev: Device from which the meta data should be read in. + * + * Return NO_ERROR on success, and an enum drbd_ret_code in case + * something goes wrong. + * + * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS, + * even before @bdev is assigned to @device->ldev. + */ +int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + struct meta_data_on_disk *buffer; + u32 magic, flags; + int i, rv = NO_ERROR; + + if (device->state.disk != D_DISKLESS) + return ERR_DISK_CONFIGURED; + + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) + return ERR_NOMEM; + + /* First, figure out where our meta data superblock is located, + * and read it. */ + bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; + bdev->md.md_offset = drbd_md_ss(bdev); + + if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { + /* NOTE: can't do normal error processing here as this is + called BEFORE disk is attached */ + drbd_err(device, "Error while reading metadata.\n"); + rv = ERR_IO_MD_DISK; + goto err; + } + + magic = be32_to_cpu(buffer->magic); + flags = be32_to_cpu(buffer->flags); + if (magic == DRBD_MD_MAGIC_84_UNCLEAN || + (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) { + /* btw: that's Activity Log clean, not "all" clean. */ + drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n"); + rv = ERR_MD_UNCLEAN; + goto err; + } + + rv = ERR_MD_INVALID; + if (magic != DRBD_MD_MAGIC_08) { + if (magic == DRBD_MD_MAGIC_07) + drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n"); + else + drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n"); + goto err; + } + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); + goto err; + } + + + /* convert to in_core endian */ + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect); + bdev->md.al_offset = be32_to_cpu(buffer->al_offset); + bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset); + + if (check_activity_log_stripe_size(device, buffer, &bdev->md)) + goto err; + if (check_offsets_and_sizes(device, bdev)) + goto err; + + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { + drbd_err(device, "unexpected bm_offset: %d (expected %d)\n", + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); + goto err; + } + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { + drbd_err(device, "unexpected md_size: %u (expected %u)\n", + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); + goto err; + } + + rv = NO_ERROR; + + spin_lock_irq(&device->resource->req_lock); + if (device->state.conn < C_CONNECTED) { + unsigned int peer; + peer = be32_to_cpu(buffer->la_peer_max_bio_size); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); + device->peer_max_bio_size = peer; + } + spin_unlock_irq(&device->resource->req_lock); + + err: + drbd_md_put_buffer(device); + + return rv; +} + +/** + * drbd_md_mark_dirty() - Mark meta data super block as dirty + * @device: DRBD device. + * + * Call this function if you change anything that should be written to + * the meta-data super block. This function sets MD_DIRTY, and starts a + * timer that ensures that within five seconds you have to call drbd_md_sync(). + */ +#ifdef DEBUG +void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func) +{ + if (!test_and_set_bit(MD_DIRTY, &device->flags)) { + mod_timer(&device->md_sync_timer, jiffies + HZ); + device->last_md_mark_dirty.line = line; + device->last_md_mark_dirty.func = func; + } +} +#else +void drbd_md_mark_dirty(struct drbd_device *device) +{ + if (!test_and_set_bit(MD_DIRTY, &device->flags)) + mod_timer(&device->md_sync_timer, jiffies + 5*HZ); +} +#endif + +void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local) +{ + int i; + + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) + device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i]; +} + +void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + if (idx == UI_CURRENT) { + if (device->state.role == R_PRIMARY) + val |= 1; + else + val &= ~((u64)1); + + drbd_set_ed_uuid(device, val); + } + + device->ldev->md.uuid[idx] = val; + drbd_md_mark_dirty(device); +} + +void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + unsigned long flags; + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + __drbd_uuid_set(device, idx, val); + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); +} + +void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local) +{ + unsigned long flags; + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (device->ldev->md.uuid[idx]) { + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx]; + } + __drbd_uuid_set(device, idx, val); + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); +} + +/** + * drbd_uuid_new_current() - Creates a new current UUID + * @device: DRBD device. + * + * Creates a new current UUID, and rotates the old current UUID into + * the bitmap slot. Causes an incremental resync upon next connect. + */ +void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) +{ + u64 val; + unsigned long long bm_uuid; + + get_random_bytes(&val, sizeof(u64)); + + spin_lock_irq(&device->ldev->md.uuid_lock); + bm_uuid = device->ldev->md.uuid[UI_BITMAP]; + + if (bm_uuid) + drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid); + + device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT]; + __drbd_uuid_set(device, UI_CURRENT, val); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + drbd_print_uuids(device, "new current UUID"); + /* get it to stable storage _now_ */ + drbd_md_sync(device); +} + +void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local) +{ + unsigned long flags; + if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) + return; + + spin_lock_irqsave(&device->ldev->md.uuid_lock, flags); + if (val == 0) { + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; + device->ldev->md.uuid[UI_BITMAP] = 0; + } else { + unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP]; + if (bm_uuid) + drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid); + + device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1); + } + spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags); + + drbd_md_mark_dirty(device); +} + +/** + * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @device: DRBD device. + * + * Sets all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local) +{ + int rv = -EIO; + + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + drbd_bm_set_all(device); + + rv = drbd_bm_write(device); + + if (!rv) { + drbd_md_clear_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + + return rv; +} + +/** + * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @device: DRBD device. + * + * Clears all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local) +{ + drbd_resume_al(device); + drbd_bm_clear_all(device); + return drbd_bm_write(device); +} + +static int w_bitmap_io(struct drbd_work *w, int unused) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, bm_io_work.w); + struct bm_io_work *work = &device->bm_io_work; + int rv = -EIO; + + D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); + + if (get_ldev(device)) { + drbd_bm_lock(device, work->why, work->flags); + rv = work->io_fn(device); + drbd_bm_unlock(device); + put_ldev(device); + } + + clear_bit_unlock(BITMAP_IO, &device->flags); + wake_up(&device->misc_wait); + + if (work->done) + work->done(device, rv); + + clear_bit(BITMAP_IO_QUEUED, &device->flags); + work->why = NULL; + work->flags = 0; + + return 0; +} + +/** + * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap + * @device: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @done: callback to be called after the bitmap IO was performed + * @why: Descriptive text of the reason for doing the IO + * + * While IO on the bitmap happens we freeze application IO thus we ensure + * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be + * called from worker context. It MUST NOT be used while a previous such + * work is still pending! + * + * Its worker function encloses the call of io_fn() by get_ldev() and + * put_ldev(). + */ +void drbd_queue_bitmap_io(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + void (*done)(struct drbd_device *, int), + char *why, enum bm_flag flags) +{ + D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + + D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags)); + D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags)); + D_ASSERT(device, list_empty(&device->bm_io_work.w.list)); + if (device->bm_io_work.why) + drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n", + why, device->bm_io_work.why); + + device->bm_io_work.io_fn = io_fn; + device->bm_io_work.done = done; + device->bm_io_work.why = why; + device->bm_io_work.flags = flags; + + spin_lock_irq(&device->resource->req_lock); + set_bit(BITMAP_IO, &device->flags); + if (atomic_read(&device->ap_bio_cnt) == 0) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) + drbd_queue_work(&first_peer_device(device)->connection->sender_work, + &device->bm_io_work.w); + } + spin_unlock_irq(&device->resource->req_lock); +} + +/** + * drbd_bitmap_io() - Does an IO operation on the whole bitmap + * @device: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @why: Descriptive text of the reason for doing the IO + * + * freezes application IO while that the actual IO operations runs. This + * functions MAY NOT be called from worker context. + */ +int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); + + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_suspend_io(device); + + drbd_bm_lock(device, why, flags); + rv = io_fn(device); + drbd_bm_unlock(device); + + if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + drbd_resume_io(device); + + return rv; +} + +void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local) +{ + if ((device->ldev->md.flags & flag) != flag) { + drbd_md_mark_dirty(device); + device->ldev->md.flags |= flag; + } +} + +void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local) +{ + if ((device->ldev->md.flags & flag) != 0) { + drbd_md_mark_dirty(device); + device->ldev->md.flags &= ~flag; + } +} +int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) +{ + return (bdev->md.flags & flag) != 0; +} + +static void md_sync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + drbd_device_post_work(device, MD_SYNC); +} + +const char *cmdname(enum drbd_packet cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_SUPERSEDED] = "Superseded", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_CSUM_RS_REQUEST] = "CsumRSRequest", + [P_RS_IS_IN_SYNC] = "CsumRSIsInSync", + [P_COMPRESSED_BITMAP] = "CBitmap", + [P_DELAY_PROBE] = "DelayProbe", + [P_OUT_OF_SYNC] = "OutOfSync", + [P_RETRY_WRITE] = "RetryWrite", + [P_RS_CANCEL] = "RSCancel", + [P_CONN_ST_CHG_REQ] = "conn_st_chg_req", + [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", + [P_RETRY_WRITE] = "retry_write", + [P_PROTOCOL_UPDATE] = "protocol_update", + + /* enum drbd_packet, but not commands - obsoleted flags: + * P_MAY_IGNORE + * P_MAX_OPT_CMD + */ + }; + + /* too big for the array: 0xfffX */ + if (cmd == P_INITIAL_META) + return "InitialMeta"; + if (cmd == P_INITIAL_DATA) + return "InitialData"; + if (cmd == P_CONNECTION_FEATURES) + return "ConnectionFeatures"; + if (cmd >= ARRAY_SIZE(cmdnames)) + return "Unknown"; + return cmdnames[cmd]; +} + +/** + * drbd_wait_misc - wait for a request to make progress + * @device: device associated with the request + * @i: the struct drbd_interval embedded in struct drbd_request or + * struct drbd_peer_request + */ +int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) +{ + struct net_conf *nc; + DEFINE_WAIT(wait); + long timeout; + + rcu_read_lock(); + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -ETIMEDOUT; + } + timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT; + rcu_read_unlock(); + + /* Indicate to wake up device->misc_wait on progress. */ + i->waiting = true; + prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock_irq(&device->resource->req_lock); + timeout = schedule_timeout(timeout); + finish_wait(&device->misc_wait, &wait); + spin_lock_irq(&device->resource->req_lock); + if (!timeout || device->state.conn < C_CONNECTED) + return -ETIMEDOUT; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +#ifdef CONFIG_DRBD_FAULT_INJECTION +/* Fault insertion support including random number generator shamelessly + * stolen from kernel/rcutorture.c */ +struct fault_random_state { + unsigned long state; + unsigned long count; +}; + +#define FAULT_RANDOM_MULT 39916801 /* prime */ +#define FAULT_RANDOM_ADD 479001701 /* prime */ +#define FAULT_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (!rsp->count--) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +static char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + [DRBD_FAULT_MD_WR] = "Meta-data write", + [DRBD_FAULT_MD_RD] = "Meta-data read", + [DRBD_FAULT_RS_WR] = "Resync write", + [DRBD_FAULT_RS_RD] = "Resync read", + [DRBD_FAULT_DT_WR] = "Data write", + [DRBD_FAULT_DT_RD] = "Data read", + [DRBD_FAULT_DT_RA] = "Data read ahead", + [DRBD_FAULT_BM_ALLOC] = "BM allocation", + [DRBD_FAULT_AL_EE] = "EE allocation", + [DRBD_FAULT_RECEIVE] = "receive data corruption", + }; + + return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(struct drbd_device *device, unsigned int type) +{ + static struct fault_random_state rrs = {0, 0}; + + unsigned int ret = ( + (fault_devs == 0 || + ((1 << device_to_minor(device)) & fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + + if (ret) { + fault_count++; + + if (__ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "***Simulating %s failure\n", + _drbd_fault_str(type)); + } + + return ret; +} +#endif + +const char *drbd_buildtag(void) +{ + /* DRBD built from external sources has here a reference to the + git hash of the source code. */ + + static char buildtag[38] = "\0uilt-in"; + + if (buildtag[0] == 0) { +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; +#endif + } + + return buildtag; +} + +module_init(drbd_init) +module_exit(drbd_cleanup) + +EXPORT_SYMBOL(drbd_conn_str); +EXPORT_SYMBOL(drbd_role_str); +EXPORT_SYMBOL(drbd_disk_str); +EXPORT_SYMBOL(drbd_set_st_err_str); diff --git a/kernel/drivers/block/drbd/drbd_nl.c b/kernel/drivers/block/drbd/drbd_nl.c new file mode 100644 index 000000000..74df8cfad --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_nl.c @@ -0,0 +1,3674 @@ +/* + drbd_nl.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" +#include +#include +#include + +#include + +/* .doit */ +// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info); +// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info); + +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); +/* .dumpit */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); + +#include +#include "drbd_nla.h" +#include + +/* used blkdev_get_by_path, to claim our meta data device(s) */ +static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; + +static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) +{ + genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb)))); + if (genlmsg_reply(skb, info)) + pr_err("error sending genl reply\n"); +} + +/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only + * reason it could fail was no space in skb, and there are 4k available. */ +static int drbd_msg_put_info(struct sk_buff *skb, const char *info) +{ + struct nlattr *nla; + int err = -EMSGSIZE; + + if (!info || !info[0]) + return 0; + + nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY); + if (!nla) + return err; + + err = nla_put_string(skb, T_info_text, info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } else + nla_nest_end(skb, nla); + return 0; +} + +/* This would be a good candidate for a "pre_doit" hook, + * and per-family private info->pointers. + * But we need to stay compatible with older kernels. + * If it returns successfully, adm_ctx members are valid. + * + * At this point, we still rely on the global genl_lock(). + * If we want to avoid that, and allow "genl_family.parallel_ops", we may need + * to add additional synchronization against object destruction/modification. + */ +#define DRBD_ADM_NEED_MINOR 1 +#define DRBD_ADM_NEED_RESOURCE 2 +#define DRBD_ADM_NEED_CONNECTION 4 +static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, + struct sk_buff *skb, struct genl_info *info, unsigned flags) +{ + struct drbd_genlmsghdr *d_in = info->userhdr; + const u8 cmd = info->genlhdr->cmd; + int err; + + memset(adm_ctx, 0, sizeof(*adm_ctx)); + + /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ + if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) + return -EPERM; + + adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!adm_ctx->reply_skb) { + err = -ENOMEM; + goto fail; + } + + adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb, + info, &drbd_genl_family, 0, cmd); + /* put of a few bytes into a fresh skb of >= 4k will always succeed. + * but anyways */ + if (!adm_ctx->reply_dh) { + err = -ENOMEM; + goto fail; + } + + adm_ctx->reply_dh->minor = d_in->minor; + adm_ctx->reply_dh->ret_code = NO_ERROR; + + adm_ctx->volume = VOLUME_UNSPECIFIED; + if (info->attrs[DRBD_NLA_CFG_CONTEXT]) { + struct nlattr *nla; + /* parse and validate only */ + err = drbd_cfg_context_from_attrs(NULL, info); + if (err) + goto fail; + + /* It was present, and valid, + * copy it over to the reply skb. */ + err = nla_put_nohdr(adm_ctx->reply_skb, + info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len, + info->attrs[DRBD_NLA_CFG_CONTEXT]); + if (err) + goto fail; + + /* and assign stuff to the adm_ctx */ + nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + if (nla) + adm_ctx->volume = nla_get_u32(nla); + nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + if (nla) + adm_ctx->resource_name = nla_data(nla); + adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; + adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + if ((adm_ctx->my_addr && + nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || + (adm_ctx->peer_addr && + nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) { + err = -EINVAL; + goto fail; + } + } + + adm_ctx->minor = d_in->minor; + adm_ctx->device = minor_to_device(d_in->minor); + + /* We are protected by the global genl_lock(). + * But we may explicitly drop it/retake it in drbd_adm_set_role(), + * so make sure this object stays around. */ + if (adm_ctx->device) + kref_get(&adm_ctx->device->kref); + + if (adm_ctx->resource_name) { + adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name); + } + + if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor"); + return ERR_MINOR_INVALID; + } + if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource"); + if (adm_ctx->resource_name) + return ERR_RES_NOT_KNOWN; + return ERR_INVALID_REQUEST; + } + + if (flags & DRBD_ADM_NEED_CONNECTION) { + if (adm_ctx->resource) { + drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->device) { + drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->my_addr && adm_ctx->peer_addr) + adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr), + nla_len(adm_ctx->my_addr), + nla_data(adm_ctx->peer_addr), + nla_len(adm_ctx->peer_addr)); + if (!adm_ctx->connection) { + drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection"); + return ERR_INVALID_REQUEST; + } + } + + /* some more paranoia, if the request was over-determined */ + if (adm_ctx->device && adm_ctx->resource && + adm_ctx->device->resource != adm_ctx->resource) { + pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n", + adm_ctx->minor, adm_ctx->resource->name, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource"); + return ERR_INVALID_REQUEST; + } + if (adm_ctx->device && + adm_ctx->volume != VOLUME_UNSPECIFIED && + adm_ctx->volume != adm_ctx->device->vnr) { + pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n", + adm_ctx->minor, adm_ctx->volume, + adm_ctx->device->vnr, + adm_ctx->device->resource->name); + drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume"); + return ERR_INVALID_REQUEST; + } + + /* still, provide adm_ctx->resource always, if possible. */ + if (!adm_ctx->resource) { + adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource + : adm_ctx->connection ? adm_ctx->connection->resource : NULL; + if (adm_ctx->resource) + kref_get(&adm_ctx->resource->kref); + } + + return NO_ERROR; + +fail: + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; + return err; +} + +static int drbd_adm_finish(struct drbd_config_context *adm_ctx, + struct genl_info *info, int retcode) +{ + if (adm_ctx->device) { + kref_put(&adm_ctx->device->kref, drbd_destroy_device); + adm_ctx->device = NULL; + } + if (adm_ctx->connection) { + kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection); + adm_ctx->connection = NULL; + } + if (adm_ctx->resource) { + kref_put(&adm_ctx->resource->kref, drbd_destroy_resource); + adm_ctx->resource = NULL; + } + + if (!adm_ctx->reply_skb) + return -ENOMEM; + + adm_ctx->reply_dh->ret_code = retcode; + drbd_adm_send_reply(adm_ctx->reply_skb, info); + return 0; +} + +static void setup_khelper_env(struct drbd_connection *connection, char **envp) +{ + char *afs; + + /* FIXME: A future version will not allow this case. */ + if (connection->my_addr_len == 0 || connection->peer_addr_len == 0) + return; + + switch (((struct sockaddr *)&connection->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr); + break; + case AF_INET: + afs = "ipv4"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&connection->peer_addr)->sin_addr); + break; + default: + afs = "ssocks"; + snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)&connection->peer_addr)->sin_addr); + } + snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs); +} + +int drbd_khelper(struct drbd_device *device, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char mb[12]; + char *argv[] = {usermode_helper, cmd, mb, NULL }; + struct drbd_connection *connection = first_peer_device(device)->connection; + struct sib_info sib; + int ret; + + if (current == connection->worker.task) + set_bit(CALLBACK_PENDING, &connection->flags); + + snprintf(mb, 12, "minor-%d", device_to_minor(device)); + setup_khelper_env(connection, envp); + + /* The helper may take some time. + * write out any unsynced meta data changes now */ + drbd_md_sync(device); + + drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb); + sib.sib_reason = SIB_HELPER_PRE; + sib.helper_name = cmd; + drbd_bcast_event(device, &sib); + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + else + drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + sib.sib_reason = SIB_HELPER_POST; + sib.helper_exit_code = ret; + drbd_bcast_event(device, &sib); + + if (current == connection->worker.task) + clear_bit(CALLBACK_PENDING, &connection->flags); + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +static int conn_khelper(struct drbd_connection *connection, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + (char[20]) { }, /* address family */ + (char[60]) { }, /* address */ + NULL }; + char *resource_name = connection->resource->name; + char *argv[] = {usermode_helper, cmd, resource_name, NULL }; + int ret; + + setup_khelper_env(connection, envp); + conn_md_sync(connection); + + drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); + /* TODO: conn_bcast_event() ?? */ + + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); + if (ret) + drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, resource_name, + (ret >> 8) & 0xff, ret); + else + drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, resource_name, + (ret >> 8) & 0xff, ret); + /* TODO: conn_bcast_event() ?? */ + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection) +{ + enum drbd_fencing_p fp = FP_NOT_AVAIL; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (get_ldev_if_state(device, D_CONSISTENT)) { + struct disk_conf *disk_conf = + rcu_dereference(peer_device->device->ldev->disk_conf); + fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing); + put_ldev(device); + } + } + rcu_read_unlock(); + + if (fp == FP_NOT_AVAIL) { + /* IO Suspending works on the whole resource. + Do it only for one device. */ + vnr = 0; + peer_device = idr_get_next(&connection->peer_devices, &vnr); + drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); + } + + return fp; +} + +bool conn_try_outdate_peer(struct drbd_connection *connection) +{ + unsigned int connect_cnt; + union drbd_state mask = { }; + union drbd_state val = { }; + enum drbd_fencing_p fp; + char *ex_to_string; + int r; + + spin_lock_irq(&connection->resource->req_lock); + if (connection->cstate >= C_WF_REPORT_PARAMS) { + drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); + spin_unlock_irq(&connection->resource->req_lock); + return false; + } + + connect_cnt = connection->connect_cnt; + spin_unlock_irq(&connection->resource->req_lock); + + fp = highest_fencing_policy(connection); + switch (fp) { + case FP_NOT_AVAIL: + drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n"); + goto out; + case FP_DONT_CARE: + return true; + default: ; + } + + r = conn_khelper(connection, "fence-peer"); + + switch ((r>>8) & 0xff) { + case 3: /* peer is inconsistent */ + ex_to_string = "peer is inconsistent or worse"; + mask.pdsk = D_MASK; + val.pdsk = D_INCONSISTENT; + break; + case 4: /* peer got outdated, or was already outdated */ + ex_to_string = "peer was fenced"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + break; + case 5: /* peer was down */ + if (conn_highest_disk(connection) == D_UP_TO_DATE) { + /* we will(have) create(d) a new UUID anyways... */ + ex_to_string = "peer is unreachable, assumed to be dead"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + } else { + ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; + } + break; + case 6: /* Peer is primary, voluntarily outdate myself. + * This is useful when an unconnected R_SECONDARY is asked to + * become R_PRIMARY, but finds the other peer being active. */ + ex_to_string = "peer is active"; + drbd_warn(connection, "Peer is primary, outdating myself.\n"); + mask.disk = D_MASK; + val.disk = D_OUTDATED; + break; + case 7: + if (fp != FP_STONITH) + drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n"); + ex_to_string = "peer was stonithed"; + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + break; + default: + /* The script is broken ... */ + drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return false; /* Eventually leave IO frozen */ + } + + drbd_info(connection, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); + + out: + + /* Not using + conn_request_state(connection, mask, val, CS_VERBOSE); + here, because we might were able to re-establish the connection in the + meantime. */ + spin_lock_irq(&connection->resource->req_lock); + if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { + if (connection->connect_cnt != connect_cnt) + /* In case the connection was established and droped + while the fence-peer handler was running, ignore it */ + drbd_info(connection, "Ignoring fence-peer exit code\n"); + else + _conn_request_state(connection, mask, val, CS_VERBOSE); + } + spin_unlock_irq(&connection->resource->req_lock); + + return conn_highest_pdsk(connection) <= D_OUTDATED; +} + +static int _try_outdate_peer_async(void *data) +{ + struct drbd_connection *connection = (struct drbd_connection *)data; + + conn_try_outdate_peer(connection); + + kref_put(&connection->kref, drbd_destroy_connection); + return 0; +} + +void conn_try_outdate_peer_async(struct drbd_connection *connection) +{ + struct task_struct *opa; + + kref_get(&connection->kref); + /* We may just have force_sig()'ed this thread + * to get it out of some blocking network function. + * Clear signals; otherwise kthread_run(), which internally uses + * wait_on_completion_killable(), will mistake our pending signal + * for a new fatal signal and fail. */ + flush_signals(current); + opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h"); + if (IS_ERR(opa)) { + drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n"); + kref_put(&connection->kref, drbd_destroy_connection); + } +} + +enum drbd_state_rv +drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + const int max_tries = 4; + enum drbd_state_rv rv = SS_UNKNOWN_ERROR; + struct net_conf *nc; + int try = 0; + int forced = 0; + union drbd_state mask, val; + + if (new_role == R_PRIMARY) { + struct drbd_connection *connection; + + /* Detect dead peers as soon as possible. */ + + rcu_read_lock(); + for_each_connection(connection, device->resource) + request_ping(connection); + rcu_read_unlock(); + } + + mutex_lock(device->state_mutex); + + mask.i = 0; mask.role = R_MASK; + val.i = 0; val.role = new_role; + + while (try++ < max_tries) { + rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE); + + /* in case we first succeeded to outdate, + * but now suddenly could establish a connection */ + if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { + val.pdsk = 0; + mask.pdsk = 0; + continue; + } + + if (rv == SS_NO_UP_TO_DATE_DISK && force && + (device->state.disk < D_UP_TO_DATE && + device->state.disk >= D_INCONSISTENT)) { + mask.disk = D_MASK; + val.disk = D_UP_TO_DATE; + forced = 1; + continue; + } + + if (rv == SS_NO_UP_TO_DATE_DISK && + device->state.disk == D_CONSISTENT && mask.pdsk == 0) { + D_ASSERT(device, device->state.pdsk == D_UNKNOWN); + + if (conn_try_outdate_peer(connection)) { + val.disk = D_UP_TO_DATE; + mask.disk = D_MASK; + } + continue; + } + + if (rv == SS_NOTHING_TO_DO) + goto out; + if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) { + if (!conn_try_outdate_peer(connection) && force) { + drbd_warn(device, "Forced into split brain situation!\n"); + mask.pdsk = D_MASK; + val.pdsk = D_OUTDATED; + + } + continue; + } + if (rv == SS_TWO_PRIMARIES) { + /* Maybe the peer is detected as dead very soon... + retry at most once more in this case. */ + int timeo; + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + if (try < max_tries) + try = max_tries - 1; + continue; + } + if (rv < SS_SUCCESS) { + rv = _drbd_request_state(device, mask, val, + CS_VERBOSE + CS_WAIT_COMPLETE); + if (rv < SS_SUCCESS) + goto out; + } + break; + } + + if (rv < SS_SUCCESS) + goto out; + + if (forced) + drbd_warn(device, "Forced to consider local data as UpToDate!\n"); + + /* Wait until nothing is on the fly :) */ + wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0); + + /* FIXME also wait for all pending P_BARRIER_ACK? */ + + if (new_role == R_SECONDARY) { + if (get_ldev(device)) { + device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + put_ldev(device); + } + } else { + mutex_lock(&device->resource->conf_update); + nc = connection->net_conf; + if (nc) + nc->discard_my_data = 0; /* without copy; single bit op is atomic */ + mutex_unlock(&device->resource->conf_update); + + if (get_ldev(device)) { + if (((device->state.conn < C_CONNECTED || + device->state.pdsk <= D_FAILED) + && device->ldev->md.uuid[UI_BITMAP] == 0) || forced) + drbd_uuid_new_current(device); + + device->ldev->md.uuid[UI_CURRENT] |= (u64)1; + put_ldev(device); + } + } + + /* writeout of activity log covered areas of the bitmap + * to stable storage done in after state change already */ + + if (device->state.conn >= C_WF_REPORT_PARAMS) { + /* if this was forced, we should consider sync */ + if (forced) + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); + } + + drbd_md_sync(device); + set_disk_ro(device->vdisk, new_role == R_SECONDARY); + kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); +out: + mutex_unlock(device->state_mutex); + return rv; +} + +static const char *from_attrs_err_to_txt(int err) +{ + return err == -ENOMSG ? "required attribute missing" : + err == -EOPNOTSUPP ? "unknown mandatory attribute" : + err == -EEXIST ? "can not change invariant setting" : + "invalid attribute value"; +} + +int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct set_role_parms parms; + int err; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) { + err = set_role_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + genl_unlock(); + mutex_lock(&adm_ctx.resource->adm_mutex); + + if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) + retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); + else + retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + + mutex_unlock(&adm_ctx.resource->adm_mutex); + genl_lock(); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +/* Initializes the md.*_offset members, so we are able to find + * the on disk meta data. + * + * We currently have two possible layouts: + * external: + * |----------- md_size_sect ------------------| + * [ 4k superblock ][ activity log ][ Bitmap ] + * | al_offset == 8 | + * | bm_offset = al_offset + X | + * ==> bitmap sectors = md_size_sect - bm_offset + * + * internal: + * |----------- md_size_sect ------------------| + * [data.....][ Bitmap ][ activity log ][ 4k superblock ] + * | al_offset < 0 | + * | bm_offset = al_offset - Y | + * ==> bitmap sectors = Y = al_offset - bm_offset + * + * Activity log size used to be fixed 32kB, + * but is about to become configurable. + */ +static void drbd_md_set_sector_offsets(struct drbd_device *device, + struct drbd_backing_dev *bdev) +{ + sector_t md_size_sect = 0; + unsigned int al_size_sect = bdev->md.al_size_4k * 8; + + bdev->md.md_offset = drbd_md_ss(bdev); + + switch (bdev->md.meta_dev_idx) { + default: + /* v07 style fixed size indexed meta data */ + bdev->md.md_size_sect = MD_128MB_SECT; + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; + break; + case DRBD_MD_INDEX_FLEX_EXT: + /* just occupy the full device; unit: sectors */ + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); + bdev->md.al_offset = MD_4kB_SECT; + bdev->md.bm_offset = MD_4kB_SECT + al_size_sect; + break; + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + /* al size is still fixed */ + bdev->md.al_offset = -al_size_sect; + /* we need (slightly less than) ~ this much bitmap sectors: */ + md_size_sect = drbd_get_capacity(bdev->backing_bdev); + md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); + md_size_sect = BM_SECT_TO_EXT(md_size_sect); + md_size_sect = ALIGN(md_size_sect, 8); + + /* plus the "drbd meta data super block", + * and the activity log; */ + md_size_sect += MD_4kB_SECT + al_size_sect; + + bdev->md.md_size_sect = md_size_sect; + /* bitmap offset is adjusted by 'super' block size */ + bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT; + break; + } +} + +/* input size is expected to be in KB */ +char *ppsize(char *buf, unsigned long long size) +{ + /* Needs 9 bytes at max including trailing NUL: + * -1ULL ==> "16384 EB" */ + static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; + int base = 0; + while (size >= 10000 && base < sizeof(units)-1) { + /* shift + round */ + size = (size >> 10) + !!(size & (1<<9)); + base++; + } + sprintf(buf, "%u %cB", (unsigned)size, units[base]); + + return buf; +} + +/* there is still a theoretical deadlock when called from receiver + * on an D_INCONSISTENT R_PRIMARY: + * remote READ does inc_ap_bio, receiver would need to receive answer + * packet from remote to dec_ap_bio again. + * receiver receive_sizes(), comes here, + * waits for ap_bio_cnt == 0. -> deadlock. + * but this cannot happen, actually, because: + * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable + * (not connected, or bad/no disk on peer): + * see drbd_fail_request_early, ap_bio_cnt is zero. + * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: + * peer may not initiate a resize. + */ +/* Note these are not to be confused with + * drbd_adm_suspend_io/drbd_adm_resume_io, + * which are (sub) state changes triggered by admin (drbdsetup), + * and can be long lived. + * This changes an device->flag, is triggered by drbd internals, + * and should be short-lived. */ +void drbd_suspend_io(struct drbd_device *device) +{ + set_bit(SUSPEND_IO, &device->flags); + if (drbd_suspended(device)) + return; + wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); +} + +void drbd_resume_io(struct drbd_device *device) +{ + clear_bit(SUSPEND_IO, &device->flags); + wake_up(&device->misc_wait); +} + +/** + * drbd_determine_dev_size() - Sets the right device size obeying all constraints + * @device: DRBD device. + * + * Returns 0 on success, negative return values indicate errors. + * You should call drbd_md_sync() after calling this function. + */ +enum determine_dev_size +drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) +{ + sector_t prev_first_sect, prev_size; /* previous meta location */ + sector_t la_size_sect, u_size; + struct drbd_md *md = &device->ldev->md; + u32 prev_al_stripe_size_4k; + u32 prev_al_stripes; + sector_t size; + char ppb[10]; + void *buffer; + + int md_moved, la_size_changed; + enum determine_dev_size rv = DS_UNCHANGED; + + /* race: + * application request passes inc_ap_bio, + * but then cannot get an AL-reference. + * this function later may wait on ap_bio_cnt == 0. -> deadlock. + * + * to avoid that: + * Suspend IO right here. + * still lock the act_log to not trigger ASSERTs there. + */ + drbd_suspend_io(device); + buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ + if (!buffer) { + drbd_resume_io(device); + return DS_ERROR; + } + + /* no wait necessary anymore, actually we could assert that */ + wait_event(device->al_wait, lc_try_lock(device->act_log)); + + prev_first_sect = drbd_md_first_sector(device->ldev); + prev_size = device->ldev->md.md_size_sect; + la_size_sect = device->ldev->md.la_size_sect; + + if (rs) { + /* rs is non NULL if we should change the AL layout only */ + + prev_al_stripes = md->al_stripes; + prev_al_stripe_size_4k = md->al_stripe_size_4k; + + md->al_stripes = rs->al_stripes; + md->al_stripe_size_4k = rs->al_stripe_size / 4; + md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; + } + + drbd_md_set_sector_offsets(device, device->ldev); + + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); + + if (size < la_size_sect) { + if (rs && u_size == 0) { + /* Remove "rs &&" later. This check should always be active, but + right now the receiver expects the permissive behavior */ + drbd_warn(device, "Implicit shrink not allowed. " + "Use --size=%llus for explicit shrink.\n", + (unsigned long long)size); + rv = DS_ERROR_SHRINK; + } + if (u_size > size) + rv = DS_ERROR_SPACE_MD; + if (rv != DS_UNCHANGED) + goto err_out; + } + + if (drbd_get_capacity(device->this_bdev) != size || + drbd_bm_capacity(device) != size) { + int err; + err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(device)>>1; + if (size == 0) { + drbd_err(device, "OUT OF MEMORY! " + "Could not allocate bitmap!\n"); + } else { + drbd_err(device, "BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = DS_ERROR; + } + /* racy, see comments above. */ + drbd_set_my_capacity(device, size); + device->ldev->md.la_size_sect = size; + drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), + (unsigned long long)size>>1); + } + if (rv <= DS_ERROR) + goto err_out; + + la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + + md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) + || prev_size != device->ldev->md.md_size_sect; + + if (la_size_changed || md_moved || rs) { + u32 prev_flags; + + /* We do some synchronous IO below, which may take some time. + * Clear the timer, to avoid scary "timer expired!" messages, + * "Superblock" is written out at least twice below, anyways. */ + del_timer(&device->md_sync_timer); + drbd_al_shrink(device); /* All extents inactive. */ + + prev_flags = md->flags; + md->flags &= ~MDF_PRIMARY_IND; + drbd_md_write(device, buffer); + + drbd_info(device, "Writing the whole bitmap, %s\n", + la_size_changed && md_moved ? "size changed and md moved" : + la_size_changed ? "size changed" : "md moved"); + /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ + drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); + drbd_initialize_al(device, buffer); + + md->flags = prev_flags; + drbd_md_write(device, buffer); + + if (rs) + drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n", + md->al_stripes, md->al_stripe_size_4k * 4); + } + + if (size > la_size_sect) + rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < la_size_sect) + rv = DS_SHRUNK; + + if (0) { + err_out: + if (rs) { + md->al_stripes = prev_al_stripes; + md->al_stripe_size_4k = prev_al_stripe_size_4k; + md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; + + drbd_md_set_sector_offsets(device, device->ldev); + } + } + lc_unlock(device->act_log); + wake_up(&device->al_wait); + drbd_md_put_buffer(device); + drbd_resume_io(device); + + return rv; +} + +sector_t +drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev, + sector_t u_size, int assume_peer_has_space) +{ + sector_t p_size = device->p_size; /* partner's disk size. */ + sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */ + sector_t m_size; /* my size */ + sector_t size = 0; + + m_size = drbd_get_max_capacity(bdev); + + if (device->state.conn < C_CONNECTED && assume_peer_has_space) { + drbd_warn(device, "Resize while not connected was forced by the user!\n"); + p_size = m_size; + } + + if (p_size && m_size) { + size = min_t(sector_t, p_size, m_size); + } else { + if (la_size_sect) { + size = la_size_sect; + if (m_size && m_size < size) + size = m_size; + if (p_size && p_size < size) + size = p_size; + } else { + if (m_size) + size = m_size; + if (p_size) + size = p_size; + } + } + + if (size == 0) + drbd_err(device, "Both nodes diskless!\n"); + + if (u_size) { + if (u_size > size) + drbd_err(device, "Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size>>1, (unsigned long)size>>1); + else + size = u_size; + } + + return size; +} + +/** + * drbd_check_al_size() - Ensures that the AL is of the right size + * @device: DRBD device. + * + * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation + * failed, and 0 on success. You should call drbd_md_sync() after you called + * this function. + */ +static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) +{ + struct lru_cache *n, *t; + struct lc_element *e; + unsigned int in_use; + int i; + + if (device->act_log && + device->act_log->nr_elements == dc->al_extents) + return 0; + + in_use = 0; + t = device->act_log; + n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION, + dc->al_extents, sizeof(struct lc_element), 0); + + if (n == NULL) { + drbd_err(device, "Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&device->al_lock); + if (t) { + for (i = 0; i < t->nr_elements; i++) { + e = lc_element_by_index(t, i); + if (e->refcnt) + drbd_err(device, "refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) + device->act_log = n; + spin_unlock_irq(&device->al_lock); + if (in_use) { + drbd_err(device, "Activity log still in use!\n"); + lc_destroy(n); + return -EBUSY; + } else { + if (t) + lc_destroy(t); + } + drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ + return 0; +} + +static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, + unsigned int max_bio_size) +{ + struct request_queue * const q = device->rq_queue; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; + struct request_queue *b = NULL; + + if (bdev) { + b = bdev->backing_bdev->bd_disk->queue; + + max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + rcu_read_lock(); + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); + + blk_set_stacking_limits(&q->limits); + blk_queue_max_write_same_sectors(q, 0); + } + + blk_queue_logical_block_size(q, 512); + blk_queue_max_hw_sectors(q, max_hw_sectors); + /* This is the workaround for "bio would need to, but cannot, be split" */ + blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); + blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); + + if (b) { + struct drbd_connection *connection = first_peer_device(device)->connection; + + if (blk_queue_discard(b) && + (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { + /* For now, don't allow more than one activity log extent worth of data + * to be discarded in one go. We may need to rework drbd_al_begin_io() + * to allow for even larger discard ranges */ + q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + /* REALLY? Is stacking secdiscard "legal"? */ + if (blk_queue_secdiscard(b)) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); + } else { + q->limits.max_discard_sectors = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + } + + blk_queue_stack_limits(q, b); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + } +} + +void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) +{ + unsigned int now, new, local, peer; + + now = queue_max_hw_sectors(device->rq_queue) << 9; + local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ + peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ + + if (bdev) { + local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; + device->local_max_bio_size = local; + } + local = min(local, DRBD_MAX_BIO_SIZE); + + /* We may ignore peer limits if the peer is modern enough. + Because new from 8.3.8 onwards the peer can use multiple + BIOs for a single peer_request */ + if (device->state.conn >= C_WF_REPORT_PARAMS) { + if (first_peer_device(device)->connection->agreed_pro_version < 94) + peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + else if (first_peer_device(device)->connection->agreed_pro_version == 94) + peer = DRBD_MAX_SIZE_H80_PACKET; + else if (first_peer_device(device)->connection->agreed_pro_version < 100) + peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ + else + peer = DRBD_MAX_BIO_SIZE; + + /* We may later detach and re-attach on a disconnected Primary. + * Avoid this setting to jump back in that case. + * We want to store what we know the peer DRBD can handle, + * not what the peer IO backend can handle. */ + if (peer > device->peer_max_bio_size) + device->peer_max_bio_size = peer; + } + new = min(local, peer); + + if (device->state.role == R_PRIMARY && new < now) + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); + + if (new != now) + drbd_info(device, "max BIO size = %u\n", new); + + drbd_setup_queue_param(device, bdev, new); +} + +/* Starts the worker thread */ +static void conn_reconfig_start(struct drbd_connection *connection) +{ + drbd_thread_start(&connection->worker); + drbd_flush_workqueue(&connection->sender_work); +} + +/* if still unconfigured, stops worker again. */ +static void conn_reconfig_done(struct drbd_connection *connection) +{ + bool stop_threads; + spin_lock_irq(&connection->resource->req_lock); + stop_threads = conn_all_vols_unconf(connection) && + connection->cstate == C_STANDALONE; + spin_unlock_irq(&connection->resource->req_lock); + if (stop_threads) { + /* asender is implicitly stopped by receiver + * in conn_disconnect() */ + drbd_thread_stop(&connection->receiver); + drbd_thread_stop(&connection->worker); + } +} + +/* Make sure IO is suspended before calling this function(). */ +static void drbd_suspend_al(struct drbd_device *device) +{ + int s = 0; + + if (!lc_try_lock(device->act_log)) { + drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n"); + return; + } + + drbd_al_shrink(device); + spin_lock_irq(&device->resource->req_lock); + if (device->state.conn < C_CONNECTED) + s = !test_and_set_bit(AL_SUSPENDED, &device->flags); + spin_unlock_irq(&device->resource->req_lock); + lc_unlock(device->act_log); + + if (s) + drbd_info(device, "Suspended AL updates\n"); +} + + +static bool should_set_defaults(struct genl_info *info) +{ + unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags; + return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); +} + +static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) +{ + /* This is limited by 16 bit "slot" numbers, + * and by available on-disk context storage. + * + * Also (u16)~0 is special (denotes a "free" extent). + * + * One transaction occupies one 4kB on-disk block, + * we have n such blocks in the on disk ring buffer, + * the "current" transaction may fail (n-1), + * and there is 919 slot numbers context information per transaction. + * + * 72 transaction blocks amounts to more than 2**16 context slots, + * so cap there first. + */ + const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX; + const unsigned int sufficient_on_disk = + (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1) + /AL_CONTEXT_PER_TRANSACTION; + + unsigned int al_size_4k = bdev->md.al_size_4k; + + if (al_size_4k > sufficient_on_disk) + return max_al_nr; + + return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION; +} + +static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) +{ + return a->disk_barrier != b->disk_barrier || + a->disk_flushes != b->disk_flushes || + a->disk_drain != b->disk_drain; +} + +int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct drbd_device *device; + struct disk_conf *new_disk_conf, *old_disk_conf; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int err, fifo_size; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* we also need a disk + * to change the options on */ + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + mutex_lock(&device->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + if (should_set_defaults(info)) + set_disk_conf_defaults(new_disk_conf); + + err = disk_conf_from_attrs_for_change(new_disk_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail_unlock; + } + + if (!expect(new_disk_conf->resync_rate >= 1)) + new_disk_conf->resync_rate = 1; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev)) + new_disk_conf->al_extents = drbd_al_extents_max(device->ldev); + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != device->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + drbd_err(device, "kmalloc of fifo_buffer failed"); + retcode = ERR_NOMEM; + goto fail_unlock; + } + } + + drbd_suspend_io(device); + wait_event(device->al_wait, lc_try_lock(device->act_log)); + drbd_al_shrink(device); + err = drbd_check_al_size(device, new_disk_conf); + lc_unlock(device->act_log); + wake_up(&device->al_wait); + drbd_resume_io(device); + + if (err) { + retcode = ERR_NOMEM; + goto fail_unlock; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode == NO_ERROR) { + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + drbd_resync_after_changed(device); + } + write_unlock_irq(&global_state_lock); + + if (retcode != NO_ERROR) + goto fail_unlock; + + if (new_plan) { + old_plan = device->rs_plan_s; + rcu_assign_pointer(device->rs_plan_s, new_plan); + } + + mutex_unlock(&device->resource->conf_update); + + if (new_disk_conf->al_updates) + device->ldev->md.flags &= ~MDF_AL_DISABLED; + else + device->ldev->md.flags |= MDF_AL_DISABLED; + + if (new_disk_conf->md_flushes) + clear_bit(MD_NO_FUA, &device->flags); + else + set_bit(MD_NO_FUA, &device->flags); + + if (write_ordering_changed(old_disk_conf, new_disk_conf)) + drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + + drbd_md_sync(device); + + if (device->state.conn >= C_CONNECTED) { + struct drbd_peer_device *peer_device; + + for_each_peer_device(peer_device, device) + drbd_send_sync_param(peer_device); + } + + synchronize_rcu(); + kfree(old_disk_conf); + kfree(old_plan); + mod_timer(&device->request_timer, jiffies + HZ); + goto success; + +fail_unlock: + mutex_unlock(&device->resource->conf_update); + fail: + kfree(new_disk_conf); + kfree(new_plan); +success: + put_ldev(device); + out: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct drbd_connection *connection; + int err; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ + struct disk_conf *new_disk_conf = NULL; + struct block_device *bdev; + struct lru_cache *resync_lru = NULL; + struct fifo_buffer *new_plan = NULL; + union drbd_state ns, os; + enum drbd_state_rv rv; + struct net_conf *nc; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + device = adm_ctx.device; + mutex_lock(&adm_ctx.resource->adm_mutex); + peer_device = first_peer_device(device); + connection = peer_device ? peer_device->connection : NULL; + conn_reconfig_start(connection); + + /* if you want to reconfigure, please tear down first */ + if (device->state.disk > D_DISKLESS) { + retcode = ERR_DISK_CONFIGURED; + goto fail; + } + /* It may just now have detached because of IO error. Make sure + * drbd_ldev_destroy is done already, we may end up here very fast, + * e.g. if someone calls attach from the on-io-error handler, + * to realize a "hot spare" feature (not that I'd recommend that) */ + wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags)); + + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &device->flags); + clear_bit(WAS_IO_ERROR, &device->flags); + clear_bit(WAS_READ_ERROR, &device->flags); + + /* and no leftover from previously aborted resync or verify, either */ + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + + /* allocation not in the IO path, drbdsetup context */ + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); + if (!nbc) { + retcode = ERR_NOMEM; + goto fail; + } + spin_lock_init(&nbc->md.uuid_lock); + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail; + } + nbc->disk_conf = new_disk_conf; + + set_disk_conf_defaults(new_disk_conf); + err = disk_conf_from_attrs(new_disk_conf, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) + new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; + + new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ); + if (!new_plan) { + retcode = ERR_NOMEM; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + write_lock_irq(&global_state_lock); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + write_unlock_irq(&global_state_lock); + if (retcode != NO_ERROR) + goto fail; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (nc) { + if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) { + rcu_read_unlock(); + retcode = ERR_STONITH_AND_PROT_A; + goto fail; + } + } + rcu_read_unlock(); + + bdev = blkdev_get_by_path(new_disk_conf->backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_DISK; + goto fail; + } + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = blkdev_get_by_path(new_disk_conf->meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (new_disk_conf->meta_dev_idx < 0) ? + (void *)device : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, + PTR_ERR(bdev)); + retcode = ERR_OPEN_MD_DISK; + goto fail; + } + nbc->md_bdev = bdev; + + if ((nbc->backing_bdev == nbc->md_bdev) != + (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + resync_lru = lc_create("resync", drbd_bm_ext_cache, + 1, 61, sizeof(struct bm_extent), + offsetof(struct bm_extent, lce)); + if (!resync_lru) { + retcode = ERR_NOMEM; + goto fail; + } + + /* Read our meta data super block early. + * This also sets other on-disk offsets. */ + retcode = drbd_md_read(device, nbc); + if (retcode != NO_ERROR) + goto fail; + + if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) + new_disk_conf->al_extents = drbd_al_extents_max(nbc); + + if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { + drbd_err(device, "max capacity %llu smaller than disk size %llu\n", + (unsigned long long) drbd_get_max_capacity(nbc), + (unsigned long long) new_disk_conf->disk_size); + retcode = ERR_DISK_TOO_SMALL; + goto fail; + } + + if (new_disk_conf->meta_dev_idx < 0) { + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; + /* at least one MB, otherwise it does not make sense */ + min_md_device_sectors = (2<<10); + } else { + max_possible_sectors = DRBD_MAX_SECTORS; + min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1); + } + + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { + retcode = ERR_MD_DISK_TOO_SMALL; + drbd_warn(device, "refusing attach: md-device too small, " + "at least %llu sectors needed for this meta-disk type\n", + (unsigned long long) min_md_device_sectors); + goto fail; + } + + /* Make sure the new disk is big enough + * (we may currently be R_PRIMARY with no local disk...) */ + if (drbd_get_max_capacity(nbc) < + drbd_get_capacity(device->this_bdev)) { + retcode = ERR_DISK_TOO_SMALL; + goto fail; + } + + nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + + if (nbc->known_size > max_possible_sectors) { + drbd_warn(device, "==> truncating very big lower level device " + "to currently maximum possible %llu sectors <==\n", + (unsigned long long) max_possible_sectors); + if (new_disk_conf->meta_dev_idx >= 0) + drbd_warn(device, "==>> using internal or flexible " + "meta data may help <<==\n"); + } + + drbd_suspend_io(device); + /* also wait for the last barrier ack. */ + /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171 + * We need a way to either ignore barrier acks for barriers sent before a device + * was attached, or a way to wait for all pending barrier acks to come in. + * As barriers are counted per resource, + * we'd need to suspend io on all devices of a resource. + */ + wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device)); + /* and for any other previously queued work */ + drbd_flush_workqueue(&connection->sender_work); + + rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE); + retcode = rv; /* FIXME: Type mismatch. */ + drbd_resume_io(device); + if (rv < SS_SUCCESS) + goto fail; + + if (!get_ldev_if_state(device, D_ATTACHING)) + goto force_diskless; + + if (!device->bitmap) { + if (drbd_bm_init(device)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + } + + if (device->state.conn < C_CONNECTED && + device->state.role == R_PRIMARY && device->ed_uuid && + (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { + drbd_err(device, "Can only attach to data with current UUID=%016llX\n", + (unsigned long long)device->ed_uuid); + retcode = ERR_DATA_NOT_CURRENT; + goto force_diskless_dec; + } + + /* Since we are diskless, fix the activity log first... */ + if (drbd_check_al_size(device, new_disk_conf)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + + /* Prevent shrinking of consistent devices ! */ + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && + drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) { + drbd_warn(device, "refusing to truncate a consistent device\n"); + retcode = ERR_DISK_TOO_SMALL; + goto force_diskless_dec; + } + + /* Reset the "barriers don't work" bits here, then force meta data to + * be written, to ensure we determine if barriers are supported. */ + if (new_disk_conf->md_flushes) + clear_bit(MD_NO_FUA, &device->flags); + else + set_bit(MD_NO_FUA, &device->flags); + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now device takes over responsibility, and the state engine should + * clean it up somewhere. */ + D_ASSERT(device, device->ldev == NULL); + device->ldev = nbc; + device->resync = resync_lru; + device->rs_plan_s = new_plan; + nbc = NULL; + resync_lru = NULL; + new_disk_conf = NULL; + new_plan = NULL; + + drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + + if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) + set_bit(CRASHED_PRIMARY, &device->flags); + else + clear_bit(CRASHED_PRIMARY, &device->flags); + + if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) && + !(device->state.role == R_PRIMARY && device->resource->susp_nod)) + set_bit(CRASHED_PRIMARY, &device->flags); + + device->send_cnt = 0; + device->recv_cnt = 0; + device->read_cnt = 0; + device->writ_cnt = 0; + + drbd_reconsider_max_bio_size(device, device->ldev); + + /* If I am currently not R_PRIMARY, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been R_PRIMARY before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded R_PRIMARY), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T, &device->flags); + if (device->state.role != R_PRIMARY && + drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) && + !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND)) + set_bit(USE_DEGR_WFC_T, &device->flags); + + dd = drbd_determine_dev_size(device, 0, NULL); + if (dd <= DS_ERROR) { + retcode = ERR_NOMEM_BITMAP; + goto force_diskless_dec; + } else if (dd == DS_GREW) + set_bit(RESYNC_AFTER_NEG, &device->flags); + + if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) || + (test_bit(CRASHED_PRIMARY, &device->flags) && + drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) { + drbd_info(device, "Assuming that all blocks are out of sync " + "(aka FullSync)\n"); + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, + "set_n_write from attaching", BM_LOCKED_MASK)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } else { + if (drbd_bitmap_io(device, &drbd_bm_read, + "read from attaching", BM_LOCKED_MASK)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } + + if (_drbd_bm_total_weight(device) == drbd_bm_bits(device)) + drbd_suspend_al(device); /* IO is still suspended here... */ + + spin_lock_irq(&device->resource->req_lock); + os = drbd_read_state(device); + ns = os; + /* If MDF_CONSISTENT is not set go into inconsistent state, + otherwise investigate MDF_WasUpToDate... + If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, + otherwise into D_CONSISTENT state. + */ + if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) { + if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE)) + ns.disk = D_CONSISTENT; + else + ns.disk = D_OUTDATED; + } else { + ns.disk = D_INCONSISTENT; + } + + if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED)) + ns.pdsk = D_OUTDATED; + + rcu_read_lock(); + if (ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE)) + ns.disk = D_UP_TO_DATE; + + /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, + MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before + this point, because drbd_request_state() modifies these + flags. */ + + if (rcu_dereference(device->ldev->disk_conf)->al_updates) + device->ldev->md.flags &= ~MDF_AL_DISABLED; + else + device->ldev->md.flags |= MDF_AL_DISABLED; + + rcu_read_unlock(); + + /* In case we are C_CONNECTED postpone any decision on the new disk + state after the negotiation phase. */ + if (device->state.conn == C_CONNECTED) { + device->new_state_tmp.i = ns.i; + ns.i = os.i; + ns.disk = D_NEGOTIATING; + + /* We expect to receive up-to-date UUIDs soon. + To avoid a race in receive_state, free p_uuid while + holding req_lock. I.e. atomic with the state change */ + kfree(device->p_uuid); + device->p_uuid = NULL; + } + + rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL); + spin_unlock_irq(&device->resource->req_lock); + + if (rv < SS_SUCCESS) + goto force_diskless_dec; + + mod_timer(&device->request_timer, jiffies + HZ); + + if (device->state.role == R_PRIMARY) + device->ldev->md.uuid[UI_CURRENT] |= (u64)1; + else + device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + + drbd_md_mark_dirty(device); + drbd_md_sync(device); + + kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); + put_ldev(device); + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + + force_diskless_dec: + put_ldev(device); + force_diskless: + drbd_force_state(device, NS(disk, D_DISKLESS)); + drbd_md_sync(device); + fail: + conn_reconfig_done(connection); + if (nbc) { + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + kfree(nbc); + } + kfree(new_disk_conf); + lc_destroy(resync_lru); + kfree(new_plan); + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int adm_detach(struct drbd_device *device, int force) +{ + enum drbd_state_rv retcode; + int ret; + + if (force) { + set_bit(FORCE_DETACH, &device->flags); + drbd_force_state(device, NS(disk, D_FAILED)); + retcode = SS_SUCCESS; + goto out; + } + + drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + /* D_FAILED will transition to DISKLESS. */ + ret = wait_event_interruptible(device->misc_wait, + device->state.disk != D_FAILED); + drbd_resume_io(device); + if ((int)retcode == (int)SS_IS_DISKLESS) + retcode = SS_NOTHING_TO_DO; + if (ret) + retcode = ERR_INTR; +out: + return retcode; +} + +/* Detaching the disk is a process in multiple stages. First we need to lock + * out application IO, in-flight IO, IO stuck in drbd_al_begin_io. + * Then we transition to D_DISKLESS, and wait for put_ldev() to return all + * internal references as well. + * Only then we have finally detached. */ +int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct detach_parms parms = { }; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (info->attrs[DRBD_NLA_DETACH_PARMS]) { + err = detach_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = adm_detach(adm_ctx.device, parms.force_detach); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static bool conn_resync_running(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.conn == C_SYNC_SOURCE || + device->state.conn == C_SYNC_TARGET || + device->state.conn == C_PAUSED_SYNC_S || + device->state.conn == C_PAUSED_SYNC_T) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static bool conn_ov_running(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = false; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.conn == C_VERIFY_S || + device->state.conn == C_VERIFY_T) { + rv = true; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static enum drbd_ret_code +_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf) +{ + struct drbd_peer_device *peer_device; + int i; + + if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) { + if (new_net_conf->wire_protocol != old_net_conf->wire_protocol) + return ERR_NEED_APV_100; + + if (new_net_conf->two_primaries != old_net_conf->two_primaries) + return ERR_NEED_APV_100; + + if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg)) + return ERR_NEED_APV_100; + } + + if (!new_net_conf->two_primaries && + conn_highest_role(connection) == R_PRIMARY && + conn_highest_peer(connection) == R_PRIMARY) + return ERR_NEED_ALLOW_TWO_PRI; + + if (new_net_conf->two_primaries && + (new_net_conf->wire_protocol != DRBD_PROT_C)) + return ERR_NOT_PROTO_C; + + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + if (get_ldev(device)) { + enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing; + put_ldev(device); + if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) + return ERR_STONITH_AND_PROT_A; + } + if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data) + return ERR_DISCARD_IMPOSSIBLE; + } + + if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A) + return ERR_CONG_NOT_PROTO_A; + + return NO_ERROR; +} + +static enum drbd_ret_code +check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf) +{ + static enum drbd_ret_code rv; + struct drbd_peer_device *peer_device; + int i; + + rcu_read_lock(); + rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); + rcu_read_unlock(); + + /* connection->peer_devices protected by genl_lock() here */ + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + if (!device->bitmap) { + if (drbd_bm_init(device)) + return ERR_NOMEM; + } + } + + return rv; +} + +struct crypto { + struct crypto_hash *verify_tfm; + struct crypto_hash *csums_tfm; + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_tfm; +}; + +static int +alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg) +{ + if (!tfm_name[0]) + return NO_ERROR; + + *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(*tfm)) { + *tfm = NULL; + return err_alg; + } + + return NO_ERROR; +} + +static enum drbd_ret_code +alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf) +{ + char hmac_name[CRYPTO_MAX_ALG_NAME]; + enum drbd_ret_code rv; + + rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg, + ERR_CSUMS_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg, + ERR_VERIFY_ALG); + if (rv != NO_ERROR) + return rv; + rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg, + ERR_INTEGRITY_ALG); + if (rv != NO_ERROR) + return rv; + if (new_net_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + new_net_conf->cram_hmac_alg); + + rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name, + ERR_AUTH_ALG); + } + + return rv; +} + +static void free_crypto(struct crypto *crypto) +{ + crypto_free_hash(crypto->cram_hmac_tfm); + crypto_free_hash(crypto->integrity_tfm); + crypto_free_hash(crypto->csums_tfm); + crypto_free_hash(crypto->verify_tfm); +} + +int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct drbd_connection *connection; + struct net_conf *old_net_conf, *new_net_conf = NULL; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto crypto = { }; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + connection = adm_ctx.connection; + mutex_lock(&adm_ctx.resource->adm_mutex); + + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + retcode = ERR_NOMEM; + goto out; + } + + conn_reconfig_start(connection); + + mutex_lock(&connection->data.mutex); + mutex_lock(&connection->resource->conf_update); + old_net_conf = connection->net_conf; + + if (!old_net_conf) { + drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); + retcode = ERR_INVALID_REQUEST; + goto fail; + } + + *new_net_conf = *old_net_conf; + if (should_set_defaults(info)) + set_net_conf_defaults(new_net_conf); + + err = net_conf_from_attrs_for_change(new_net_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + retcode = check_net_options(connection, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + /* re-sync running */ + rsr = conn_resync_running(connection); + if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; + goto fail; + } + + /* online verify running */ + ovr = conn_ov_running(connection); + if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) { + retcode = ERR_VERIFY_RUNNING; + goto fail; + } + + retcode = alloc_crypto(&crypto, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + rcu_assign_pointer(connection->net_conf, new_net_conf); + + if (!rsr) { + crypto_free_hash(connection->csums_tfm); + connection->csums_tfm = crypto.csums_tfm; + crypto.csums_tfm = NULL; + } + if (!ovr) { + crypto_free_hash(connection->verify_tfm); + connection->verify_tfm = crypto.verify_tfm; + crypto.verify_tfm = NULL; + } + + crypto_free_hash(connection->integrity_tfm); + connection->integrity_tfm = crypto.integrity_tfm; + if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100) + /* Do this without trying to take connection->data.mutex again. */ + __drbd_send_protocol(connection, P_PROTOCOL_UPDATE); + + crypto_free_hash(connection->cram_hmac_tfm); + connection->cram_hmac_tfm = crypto.cram_hmac_tfm; + + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + synchronize_rcu(); + kfree(old_net_conf); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + struct drbd_peer_device *peer_device; + int vnr; + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + drbd_send_sync_param(peer_device); + } + + goto done; + + fail: + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + free_crypto(&crypto); + kfree(new_net_conf); + done: + conn_reconfig_done(connection); + out: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_peer_device *peer_device; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct crypto crypto = { }; + struct drbd_resource *resource; + struct drbd_connection *connection; + enum drbd_ret_code retcode; + int i; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) { + drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* No need for _rcu here. All reconfiguration is + * strictly serialized on genl_lock(). We are protected against + * concurrent reconfiguration/addition/deletion */ + for_each_resource(resource, &drbd_resources) { + for_each_connection(connection, resource) { + if (nla_len(adm_ctx.my_addr) == connection->my_addr_len && + !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr, + connection->my_addr_len)) { + retcode = ERR_LOCAL_ADDR; + goto out; + } + + if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len && + !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr, + connection->peer_addr_len)) { + retcode = ERR_PEER_ADDR; + goto out; + } + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + connection = first_connection(adm_ctx.resource); + conn_reconfig_start(connection); + + if (connection->cstate > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, drbdsetup / netlink process context */ + new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL); + if (!new_net_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + set_net_conf_defaults(new_net_conf); + + err = net_conf_from_attrs(new_net_conf, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + retcode = check_net_options(connection, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + retcode = alloc_crypto(&crypto, new_net_conf); + if (retcode != NO_ERROR) + goto fail; + + ((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(&adm_ctx.resource->conf_update); + old_net_conf = connection->net_conf; + if (old_net_conf) { + retcode = ERR_NET_CONFIGURED; + mutex_unlock(&adm_ctx.resource->conf_update); + goto fail; + } + rcu_assign_pointer(connection->net_conf, new_net_conf); + + conn_free_crypto(connection); + connection->cram_hmac_tfm = crypto.cram_hmac_tfm; + connection->integrity_tfm = crypto.integrity_tfm; + connection->csums_tfm = crypto.csums_tfm; + connection->verify_tfm = crypto.verify_tfm; + + connection->my_addr_len = nla_len(adm_ctx.my_addr); + memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len); + connection->peer_addr_len = nla_len(adm_ctx.peer_addr); + memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + + mutex_unlock(&adm_ctx.resource->conf_update); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct drbd_device *device = peer_device->device; + device->send_cnt = 0; + device->recv_cnt = 0; + } + rcu_read_unlock(); + + retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + +fail: + free_crypto(&crypto); + kfree(new_net_conf); + + conn_reconfig_done(connection); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force) +{ + enum drbd_state_rv rv; + + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), + force ? CS_HARD : 0); + + switch (rv) { + case SS_NOTHING_TO_DO: + break; + case SS_ALREADY_STANDALONE: + return SS_SUCCESS; + case SS_PRIMARY_NOP: + /* Our state checking code wants to see the peer outdated. */ + rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); + + if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE); + + break; + case SS_CW_FAILED_BY_PEER: + /* The peer probably wants to see us outdated. */ + rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), 0); + if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) { + rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), + CS_HARD); + } + break; + default:; + /* no special handling necessary */ + } + + if (rv >= SS_SUCCESS) { + enum drbd_state_rv rv2; + /* No one else can reconfigure the network while I am here. + * The state handling only uses drbd_thread_stop_nowait(), + * we want to really wait here until the receiver is no more. + */ + drbd_thread_stop(&connection->receiver); + + /* Race breaker. This additional state change request may be + * necessary, if this was a forced disconnect during a receiver + * restart. We may have "killed" the receiver thread just + * after drbd_receiver() returned. Typically, we should be + * C_STANDALONE already, now, and this becomes a no-op. + */ + rv2 = conn_request_state(connection, NS(conn, C_STANDALONE), + CS_VERBOSE | CS_HARD); + if (rv2 < SS_SUCCESS) + drbd_err(connection, + "unexpected rv2=%d in conn_try_disconnect()\n", + rv2); + } + return rv; +} + +int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct disconnect_parms parms; + struct drbd_connection *connection; + enum drbd_state_rv rv; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + connection = adm_ctx.connection; + memset(&parms, 0, sizeof(parms)); + if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { + err = disconnect_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + rv = conn_try_disconnect(connection, parms.force_disconnect); + if (rv < SS_SUCCESS) + retcode = rv; /* FIXME: Type mismatch. */ + else + retcode = NO_ERROR; + mutex_unlock(&adm_ctx.resource->adm_mutex); + fail: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +void resync_after_online_grow(struct drbd_device *device) +{ + int iass; /* I am sync source */ + + drbd_info(device, "Resync of new storage after online grow\n"); + if (device->state.role != device->state.peer) + iass = (device->state.role == R_PRIMARY); + else + iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags); + + if (iass) + drbd_start_resync(device, C_SYNC_SOURCE); + else + _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + struct resize_parms rs; + struct drbd_device *device; + enum drbd_ret_code retcode; + enum determine_dev_size dd; + bool change_al_layout = false; + enum dds_flags ddsf; + sector_t u_size; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto fail; + } + + memset(&rs, 0, sizeof(struct resize_parms)); + rs.al_stripes = device->ldev->md.al_stripes; + rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4; + if (info->attrs[DRBD_NLA_RESIZE_PARMS]) { + err = resize_parms_from_attrs(&rs, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail_ldev; + } + } + + if (device->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; + goto fail_ldev; + } + + if (device->state.role == R_SECONDARY && + device->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; + goto fail_ldev; + } + + if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) { + retcode = ERR_NEED_APV_93; + goto fail_ldev; + } + + rcu_read_lock(); + u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + if (u_size != (sector_t)rs.resize_size) { + new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + retcode = ERR_NOMEM; + goto fail_ldev; + } + } + + if (device->ldev->md.al_stripes != rs.al_stripes || + device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) { + u32 al_size_k = rs.al_stripes * rs.al_stripe_size; + + if (al_size_k > (16 * 1024 * 1024)) { + retcode = ERR_MD_LAYOUT_TOO_BIG; + goto fail_ldev; + } + + if (al_size_k < MD_32kB_SECT/2) { + retcode = ERR_MD_LAYOUT_TOO_SMALL; + goto fail_ldev; + } + + if (device->state.conn != C_CONNECTED && !rs.resize_force) { + retcode = ERR_MD_LAYOUT_CONNECTED; + goto fail_ldev; + } + + change_al_layout = true; + } + + if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) + device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); + + if (new_disk_conf) { + mutex_lock(&device->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = (sector_t)rs.resize_size; + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + mutex_unlock(&device->resource->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + } + + ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); + dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL); + drbd_md_sync(device); + put_ldev(device); + if (dd == DS_ERROR) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } else if (dd == DS_ERROR_SPACE_MD) { + retcode = ERR_MD_LAYOUT_NO_FIT; + goto fail; + } else if (dd == DS_ERROR_SHRINK) { + retcode = ERR_IMPLICIT_SHRINK; + goto fail; + } + + if (device->state.conn == C_CONNECTED) { + if (dd == DS_GREW) + set_bit(RESIZE_PENDING, &device->flags); + + drbd_send_uuids(first_peer_device(device)); + drbd_send_sizes(first_peer_device(device), 1, ddsf); + } + + fail: + mutex_unlock(&adm_ctx.resource->adm_mutex); + finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; + + fail_ldev: + put_ldev(device); + goto fail; +} + +int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto fail; + + res_opts = adm_ctx.resource->res_opts; + if (should_set_defaults(info)) + set_res_opts_defaults(&res_opts); + + err = res_opts_from_attrs(&res_opts, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto fail; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + err = set_resource_options(adm_ctx.resource, &res_opts); + if (err) { + retcode = ERR_INVALID_REQUEST; + if (err == -ENOMEM) + retcode = ERR_NOMEM; + } + mutex_unlock(&adm_ctx.resource->adm_mutex); + +fail: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + + /* If we happen to be C_STANDALONE R_SECONDARY, just change to + * D_INCONSISTENT, and set all bits in the bitmap. Otherwise, + * try to start a resync handshake as sync target for full sync. + */ + if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) { + retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, + "set_n_write from invalidate", BM_LOCKED_MASK)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, + union drbd_state mask, union drbd_state val) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_request_state(adm_ctx.device, mask, val); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local) +{ + int rv; + + rv = drbd_bmio_set_n_write(device); + drbd_suspend_al(device); + return rv; +} + +int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + int retcode; /* drbd_ret_code, drbd_state_rv */ + struct drbd_device *device; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* If there is still bitmap IO pending, probably because of a previous + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work); + + /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits + * in the bitmap. Otherwise, try to start a resync handshake + * as sync source for full sync. + */ + if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) { + /* The peer will get a resync upon connect anyways. Just make that + into a full resync. */ + retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT)); + if (retcode >= SS_SUCCESS) { + if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al, + "set_n_write from invalidate_peer", + BM_LOCKED_SET_ALLOWED)) + retcode = ERR_IO_MD_DISK; + } + } else + retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); + put_ldev(device); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_SET; + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + union drbd_dev_state s; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = adm_ctx.device->state; + if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { + retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : + s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; + } else { + retcode = ERR_PAUSE_IS_CLEAR; + } + } + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(susp, 1)); +} + +int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx.device; + if (test_bit(NEW_CUR_UUID, &device->flags)) { + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + } + drbd_suspend_io(device); + retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0)); + if (retcode == SS_SUCCESS) { + if (device->state.conn < C_CONNECTED) + tl_clear(first_peer_device(device)->connection); + if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED) + tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); + } + drbd_resume_io(device); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info) +{ + return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED)); +} + +static int nla_put_drbd_cfg_context(struct sk_buff *skb, + struct drbd_resource *resource, + struct drbd_connection *connection, + struct drbd_device *device) +{ + struct nlattr *nla; + nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT); + if (!nla) + goto nla_put_failure; + if (device && + nla_put_u32(skb, T_ctx_volume, device->vnr)) + goto nla_put_failure; + if (nla_put_string(skb, T_ctx_resource_name, resource->name)) + goto nla_put_failure; + if (connection) { + if (connection->my_addr_len && + nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr)) + goto nla_put_failure; + if (connection->peer_addr_len && + nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr)) + goto nla_put_failure; + } + nla_nest_end(skb, nla); + return 0; + +nla_put_failure: + if (nla) + nla_nest_cancel(skb, nla); + return -EMSGSIZE; +} + +/* + * Return the connection of @resource if @resource has exactly one connection. + */ +static struct drbd_connection *the_only_connection(struct drbd_resource *resource) +{ + struct list_head *connections = &resource->connections; + + if (list_empty(connections) || connections->next->next != connections) + return NULL; + return list_first_entry(&resource->connections, struct drbd_connection, connections); +} + +static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device, + const struct sib_info *sib) +{ + struct drbd_resource *resource = device->resource; + struct state_info *si = NULL; /* for sizeof(si->member); */ + struct nlattr *nla; + int got_ldev; + int err = 0; + int exclude_sensitive; + + /* If sib != NULL, this is drbd_bcast_event, which anyone can listen + * to. So we better exclude_sensitive information. + * + * If sib == NULL, this is drbd_adm_get_status, executed synchronously + * in the context of the requesting user process. Exclude sensitive + * information, unless current has superuser. + * + * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and + * relies on the current implementation of netlink_dump(), which + * executes the dump callback successively from netlink_recvmsg(), + * always in the context of the receiving process */ + exclude_sensitive = sib || !capable(CAP_SYS_ADMIN); + + got_ldev = get_ldev(device); + + /* We need to add connection name and volume number information still. + * Minor number is in drbd_genlmsghdr. */ + if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device)) + goto nla_put_failure; + + if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive)) + goto nla_put_failure; + + rcu_read_lock(); + if (got_ldev) { + struct disk_conf *disk_conf; + + disk_conf = rcu_dereference(device->ldev->disk_conf); + err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive); + } + if (!err) { + struct net_conf *nc; + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (nc) + err = net_conf_to_skb(skb, nc, exclude_sensitive); + } + rcu_read_unlock(); + if (err) + goto nla_put_failure; + + nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO); + if (!nla) + goto nla_put_failure; + if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) || + nla_put_u32(skb, T_current_state, device->state.i) || + nla_put_u64(skb, T_ed_uuid, device->ed_uuid) || + nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) || + nla_put_u64(skb, T_send_cnt, device->send_cnt) || + nla_put_u64(skb, T_recv_cnt, device->recv_cnt) || + nla_put_u64(skb, T_read_cnt, device->read_cnt) || + nla_put_u64(skb, T_writ_cnt, device->writ_cnt) || + nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) || + nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) || + nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) || + nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) || + nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt))) + goto nla_put_failure; + + if (got_ldev) { + int err; + + spin_lock_irq(&device->ldev->md.uuid_lock); + err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + if (err) + goto nla_put_failure; + + if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) || + nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) || + nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device))) + goto nla_put_failure; + if (C_SYNC_SOURCE <= device->state.conn && + C_PAUSED_SYNC_T >= device->state.conn) { + if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) || + nla_put_u64(skb, T_bits_rs_failed, device->rs_failed)) + goto nla_put_failure; + } + } + + if (sib) { + switch(sib->sib_reason) { + case SIB_SYNC_PROGRESS: + case SIB_GET_STATUS_REPLY: + break; + case SIB_STATE_CHANGE: + if (nla_put_u32(skb, T_prev_state, sib->os.i) || + nla_put_u32(skb, T_new_state, sib->ns.i)) + goto nla_put_failure; + break; + case SIB_HELPER_POST: + if (nla_put_u32(skb, T_helper_exit_code, + sib->helper_exit_code)) + goto nla_put_failure; + /* fall through */ + case SIB_HELPER_PRE: + if (nla_put_string(skb, T_helper, sib->helper_name)) + goto nla_put_failure; + break; + } + } + nla_nest_end(skb, nla); + + if (0) +nla_put_failure: + err = -EMSGSIZE; + if (got_ldev) + put_ldev(device); + return err; +} + +int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_device *device; + struct drbd_genlmsghdr *dh; + struct drbd_resource *pos = (struct drbd_resource *)cb->args[0]; + struct drbd_resource *resource = NULL; + struct drbd_resource *tmp; + unsigned volume = cb->args[1]; + + /* Open coded, deferred, iteration: + * for_each_resource_safe(resource, tmp, &drbd_resources) { + * connection = "first connection of resource or undefined"; + * idr_for_each_entry(&resource->devices, device, i) { + * ... + * } + * } + * where resource is cb->args[0]; + * and i is cb->args[1]; + * + * cb->args[2] indicates if we shall loop over all resources, + * or just dump all volumes of a single resource. + * + * This may miss entries inserted after this dump started, + * or entries deleted before they are reached. + * + * We need to make sure the device won't disappear while + * we are looking at it, and revalidate our iterators + * on each iteration. + */ + + /* synchronize with conn_create()/drbd_destroy_connection() */ + rcu_read_lock(); + /* revalidate iterator position */ + for_each_resource_rcu(tmp, &drbd_resources) { + if (pos == NULL) { + /* first iteration */ + pos = tmp; + resource = pos; + break; + } + if (tmp == pos) { + resource = pos; + break; + } + } + if (resource) { +next_resource: + device = idr_get_next(&resource->devices, &volume); + if (!device) { + /* No more volumes to dump on this resource. + * Advance resource iterator. */ + pos = list_entry_rcu(resource->resources.next, + struct drbd_resource, resources); + /* Did we dump any volume of this resource yet? */ + if (volume != 0) { + /* If we reached the end of the list, + * or only a single resource dump was requested, + * we are done. */ + if (&pos->resources == &drbd_resources || cb->args[2]) + goto out; + volume = 0; + resource = pos; + goto next_resource; + } + } + + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_STATUS); + if (!dh) + goto out; + + if (!device) { + /* This is a connection without a single volume. + * Suprisingly enough, it may have a network + * configuration. */ + struct drbd_connection *connection; + + dh->minor = -1U; + dh->ret_code = NO_ERROR; + connection = the_only_connection(resource); + if (nla_put_drbd_cfg_context(skb, resource, connection, NULL)) + goto cancel; + if (connection) { + struct net_conf *nc; + + nc = rcu_dereference(connection->net_conf); + if (nc && net_conf_to_skb(skb, nc, 1) != 0) + goto cancel; + } + goto done; + } + + D_ASSERT(device, device->vnr == volume); + D_ASSERT(device, device->resource == resource); + + dh->minor = device_to_minor(device); + dh->ret_code = NO_ERROR; + + if (nla_put_status_info(skb, device, NULL)) { +cancel: + genlmsg_cancel(skb, dh); + goto out; + } +done: + genlmsg_end(skb, dh); + } + +out: + rcu_read_unlock(); + /* where to start the next iteration */ + cb->args[0] = (long)pos; + cb->args[1] = (pos == resource) ? volume + 1 : 0; + + /* No more resources/volumes/minors found results in an empty skb. + * Which will terminate the dump. */ + return skb->len; +} + +/* + * Request status of all resources, or of all volumes within a single resource. + * + * This is a dump, as the answer may not fit in a single reply skb otherwise. + * Which means we cannot use the family->attrbuf or other such members, because + * dump is NOT protected by the genl_lock(). During dump, we only have access + * to the incoming skb, and need to opencode "parsing" of the nlattr payload. + * + * Once things are setup properly, we call into get_one_status(). + */ +int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + struct nlattr *nla; + const char *resource_name; + struct drbd_resource *resource; + int maxtype; + + /* Is this a followup call? */ + if (cb->args[0]) { + /* ... of a single resource dump, + * and the resource iterator has been advanced already? */ + if (cb->args[2] && cb->args[2] != cb->args[0]) + return 0; /* DONE. */ + goto dump; + } + + /* First call (from netlink_dump_start). We need to figure out + * which resource(s) the user wants us to dump. */ + nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen), + nlmsg_attrlen(cb->nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + + /* No explicit context given. Dump all. */ + if (!nla) + goto dump; + maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); + if (IS_ERR(nla)) + return PTR_ERR(nla); + /* context given, but no name present? */ + if (!nla) + return -EINVAL; + resource_name = nla_data(nla); + if (!*resource_name) + return -ENODEV; + resource = drbd_find_resource(resource_name); + if (!resource) + return -ENODEV; + + kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */ + + /* prime iterators, and set "filter" mode mark: + * only dump this connection. */ + cb->args[0] = (long)resource; + /* cb->args[1] = 0; passed in this way. */ + cb->args[2] = (long)resource; + +dump: + return get_one_status(skb, cb); +} + +int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct timeout_parms tp; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + tp.timeout_type = + adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED : + UT_DEFAULT; + + err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + if (err) { + nlmsg_free(adm_ctx.reply_skb); + return err; + } +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + enum drbd_ret_code retcode; + struct start_ov_parms parms; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + device = adm_ctx.device; + + /* resume from last known position, if possible */ + parms.ov_start_sector = device->ov_start_sector; + parms.ov_stop_sector = ULLONG_MAX; + if (info->attrs[DRBD_NLA_START_OV_PARMS]) { + int err = start_ov_parms_from_attrs(&parms, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + } + mutex_lock(&adm_ctx.resource->adm_mutex); + + /* w_make_ov_request expects position to be aligned */ + device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); + device->ov_stop_sector = parms.ov_stop_sector; + + /* If there is still bitmap IO pending, e.g. previous resync or verify + * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(device); + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); + drbd_resume_io(device); + + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + + +int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_device *device; + enum drbd_ret_code retcode; + int skip_initial_sync = 0; + int err; + struct new_c_uuid_parms args; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out_nolock; + + device = adm_ctx.device; + memset(&args, 0, sizeof(args)); + if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { + err = new_c_uuid_parms_from_attrs(&args, info); + if (err) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out_nolock; + } + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */ + + if (!get_ldev(device)) { + retcode = ERR_NO_DISK; + goto out; + } + + /* this is "skip initial sync", assume to be clean */ + if (device->state.conn == C_CONNECTED && + first_peer_device(device)->connection->agreed_pro_version >= 90 && + device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { + drbd_info(device, "Preparing to skip initial sync\n"); + skip_initial_sync = 1; + } else if (device->state.conn != C_STANDALONE) { + retcode = ERR_CONNECTED; + goto out_dec; + } + + drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ + drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */ + + if (args.clear_bm) { + err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write, + "clear_n_write from new_c_uuid", BM_LOCKED_MASK); + if (err) { + drbd_err(device, "Writing bitmap failed with %d\n", err); + retcode = ERR_IO_MD_DISK; + } + if (skip_initial_sync) { + drbd_send_uuids_skip_initial_sync(first_peer_device(device)); + _drbd_uuid_set(device, UI_BITMAP, 0); + drbd_print_uuids(device, "cleared bitmap UUID"); + spin_lock_irq(&device->resource->req_lock); + _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + spin_unlock_irq(&device->resource->req_lock); + } + } + + drbd_md_sync(device); +out_dec: + put_ldev(device); +out: + mutex_unlock(device->state_mutex); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out_nolock: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_ret_code +drbd_check_resource_name(struct drbd_config_context *adm_ctx) +{ + const char *name = adm_ctx->resource_name; + if (!name || !name[0]) { + drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing"); + return ERR_MANDATORY_TAG; + } + /* if we want to use these in sysfs/configfs/debugfs some day, + * we must not allow slashes */ + if (strchr(name, '/')) { + drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name"); + return ERR_INVALID_REQUEST; + } + return NO_ERROR; +} + +int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + struct res_opts res_opts; + int err; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + set_res_opts_defaults(&res_opts); + err = res_opts_from_attrs(&res_opts, info); + if (err && err != -ENOMSG) { + retcode = ERR_MANDATORY_TAG; + drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + goto out; + } + + retcode = drbd_check_resource_name(&adm_ctx); + if (retcode != NO_ERROR) + goto out; + + if (adm_ctx.resource) { + if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { + retcode = ERR_INVALID_REQUEST; + drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); + } + /* else: still NO_ERROR */ + goto out; + } + + /* not yet safe for genl_family.parallel_ops */ + if (!conn_create(adm_ctx.resource_name, &res_opts)) + retcode = ERR_NOMEM; +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_genlmsghdr *dh = info->userhdr; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + if (dh->minor > MINORMASK) { + drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + if (adm_ctx.volume > DRBD_VOLUME_MAX) { + drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); + retcode = ERR_INVALID_REQUEST; + goto out; + } + + /* drbd_adm_prepare made sure already + * that first_peer_device(device)->connection and device->vnr match the request. */ + if (adm_ctx.device) { + if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) + retcode = ERR_MINOR_OR_VOLUME_EXISTS; + /* else: still NO_ERROR */ + goto out; + } + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static enum drbd_ret_code adm_del_minor(struct drbd_device *device) +{ + if (device->state.disk == D_DISKLESS && + /* no need to be device->state.conn == C_STANDALONE && + * we may want to delete a minor from a live replication group. + */ + device->state.role == R_SECONDARY) { + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), + CS_VERBOSE + CS_WAIT_COMPLETE); + drbd_delete_device(device); + return NO_ERROR; + } else + return ERR_MINOR_CONFIGURED; +} + +int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto out; + + mutex_lock(&adm_ctx.resource->adm_mutex); + retcode = adm_del_minor(adm_ctx.device); + mutex_unlock(&adm_ctx.resource->adm_mutex); +out: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). */ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + +int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_resource *resource; + struct drbd_connection *connection; + struct drbd_device *device; + int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ + unsigned i; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + + resource = adm_ctx.resource; + mutex_lock(&resource->adm_mutex); + /* demote */ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); + goto out; + } + } + + retcode = conn_try_disconnect(connection, 0); + if (retcode < SS_SUCCESS) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); + goto out; + } + } + + /* detach */ + idr_for_each_entry(&resource->devices, device, i) { + retcode = adm_detach(device, 0); + if (retcode < SS_SUCCESS || retcode > NO_ERROR) { + drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); + goto out; + } + } + + /* delete volumes */ + idr_for_each_entry(&resource->devices, device, i) { + retcode = adm_del_minor(device); + if (retcode != NO_ERROR) { + /* "can not happen" */ + drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); + goto out; + } + } + + retcode = adm_del_resource(resource); +out: + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context adm_ctx; + struct drbd_resource *resource; + enum drbd_ret_code retcode; + + retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); + if (!adm_ctx.reply_skb) + return retcode; + if (retcode != NO_ERROR) + goto finish; + resource = adm_ctx.resource; + + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); + mutex_unlock(&resource->adm_mutex); +finish: + drbd_adm_finish(&adm_ctx, info, retcode); + return 0; +} + +void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) +{ + static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ + struct sk_buff *msg; + struct drbd_genlmsghdr *d_out; + unsigned seq; + int err = -ENOMEM; + + seq = atomic_inc_return(&drbd_genl_seq); + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + if (!msg) + goto failed; + + err = -EMSGSIZE; + d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT); + if (!d_out) /* cannot happen, but anyways. */ + goto nla_put_failure; + d_out->minor = device_to_minor(device); + d_out->ret_code = NO_ERROR; + + if (nla_put_status_info(msg, device, sib)) + goto nla_put_failure; + genlmsg_end(msg, d_out); + err = drbd_genl_multicast_events(msg, 0); + /* msg has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + + return; + +nla_put_failure: + nlmsg_free(msg); +failed: + drbd_err(device, "Error %d while broadcasting event. " + "Event seq:%u sib_reason:%u\n", + err, seq, sib->sib_reason); +} diff --git a/kernel/drivers/block/drbd/drbd_nla.c b/kernel/drivers/block/drbd/drbd_nla.c new file mode 100644 index 000000000..b2d479149 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_nla.c @@ -0,0 +1,54 @@ +#include +#include +#include +#include "drbd_nla.h" + +static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) +{ + struct nlattr *head = nla_data(nla); + int len = nla_len(nla); + int rem; + + /* + * validate_nla (called from nla_parse_nested) ignores attributes + * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. + * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY + * flag set also, check and remove that flag before calling + * nla_parse_nested. + */ + + nla_for_each_attr(nla, head, len, rem) { + if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { + nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; + if (nla_type(nla) > maxtype) + return -EOPNOTSUPP; + } + } + return 0; +} + +int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy) +{ + int err; + + err = drbd_nla_check_mandatory(maxtype, nla); + if (!err) + err = nla_parse_nested(tb, maxtype, nla, policy); + + return err; +} + +struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) +{ + int err; + /* + * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and + * we don't know about that attribute, reject all the nested + * attributes. + */ + err = drbd_nla_check_mandatory(maxtype, nla); + if (err) + return ERR_PTR(err); + return nla_find_nested(nla, attrtype); +} diff --git a/kernel/drivers/block/drbd/drbd_nla.h b/kernel/drivers/block/drbd/drbd_nla.h new file mode 100644 index 000000000..679c2d5b4 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_nla.h @@ -0,0 +1,8 @@ +#ifndef __DRBD_NLA_H +#define __DRBD_NLA_H + +extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, + const struct nla_policy *policy); +extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); + +#endif /* __DRBD_NLA_H */ diff --git a/kernel/drivers/block/drbd/drbd_proc.c b/kernel/drivers/block/drbd/drbd_proc.c new file mode 100644 index 000000000..3b10fa6cb --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_proc.c @@ -0,0 +1,368 @@ +/* + drbd_proc.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include + +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" + +static int drbd_proc_open(struct inode *inode, struct file *file); +static int drbd_proc_release(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +const struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = drbd_proc_release, +}; + +static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) +{ + /* v is in kB/sec. We don't expect TiByte/sec yet. */ + if (unlikely(v >= 1000000)) { + /* cool: > GiByte/s */ + seq_printf(seq, "%ld,", v / 1000000); + v %= 1000000; + seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000); + } else if (likely(v >= 1000)) + seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); + else + seq_printf(seq, "%ld", v); +} + +static void drbd_get_syncer_progress(struct drbd_device *device, + union drbd_dev_state state, unsigned long *rs_total, + unsigned long *bits_left, unsigned int *per_mil_done) +{ + /* this is to break it at compile time when we change that, in case we + * want to support more than (1<<32) bits on a 32bit arch. */ + typecheck(unsigned long, device->rs_total); + *rs_total = device->rs_total; + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + *bits_left = device->ov_left; + else + *bits_left = drbd_bm_total_weight(device) - device->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (*bits_left > *rs_total) { + /* D'oh. Maybe a logic bug somewhere. More likely just a race + * between state change and reset of rs_total. + */ + *bits_left = *rs_total; + *per_mil_done = *rs_total ? 0 : 1000; + } else { + /* Make sure the division happens in long context. + * We allow up to one petabyte storage right now, + * at a granularity of 4k per bit that is 2**38 bits. + * After shift right and multiplication by 1000, + * this should still fit easily into a 32bit long, + * so we don't need a 64bit division on 32bit arch. + * Note: currently we don't support such large bitmaps on 32bit + * arch anyways, but no harm done to be prepared for it here. + */ + unsigned int shift = *rs_total > UINT_MAX ? 16 : 10; + unsigned long left = *bits_left >> shift; + unsigned long total = 1UL + (*rs_total >> shift); + unsigned long tmp = 1000UL - left * 1000UL/total; + *per_mil_done = tmp; + } +} + + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq, + union drbd_dev_state state) +{ + unsigned long db, dt, dbdt, rt, rs_total, rs_left; + unsigned int res; + int i, x, y; + int stalled = 0; + + drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res); + + x = res/50; + y = 20-x; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + + if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) + seq_printf(seq, "verified:"); + else + seq_printf(seq, "sync'ed:"); + seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); + + /* if more than a few GB, display in MB */ + if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT))) + seq_printf(seq, "(%lu/%lu)M", + (unsigned long) Bit2KB(rs_left >> 10), + (unsigned long) Bit2KB(rs_total >> 10)); + else + seq_printf(seq, "(%lu/%lu)K", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(rs_total)); + + seq_printf(seq, "\n\t"); + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is + * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at + * least DRBD_SYNC_MARK_STEP time before it will be modified. */ + /* ------------------------ ~18s average ------------------------ */ + i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS; + dt = (jiffies - device->rs_mark_time[i]) / HZ; + if (dt > 180) + stalled = 1; + + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + dbdt = Bit2KB(db/dt); + seq_printf(seq, " speed: "); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " ("); + /* ------------------------- ~3s average ------------------------ */ + if (proc_details >= 1) { + /* this is what drbd_rs_should_slow_down() uses */ + i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + dt = (jiffies - device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, " -- "); + } + + /* --------------------- long term average ---------------------- */ + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - device->rs_start - device->rs_paused) / HZ; + if (dt == 0) + dt = 1; + db = rs_total - rs_left; + dbdt = Bit2KB(db/dt); + seq_printf_with_thousands_grouping(seq, dbdt); + seq_printf(seq, ")"); + + if (state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S) { + seq_printf(seq, " want: "); + seq_printf_with_thousands_grouping(seq, device->c_sync_rate); + } + seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); + + if (proc_details >= 1) { + /* 64 bit: + * we convert to sectors in the display below. */ + unsigned long bm_bits = drbd_bm_bits(device); + unsigned long bit_pos; + unsigned long long stop_sector = 0; + if (state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) { + bit_pos = bm_bits - device->ov_left; + if (verify_can_do_stop_sector(device)) + stop_sector = device->ov_stop_sector; + } else + bit_pos = device->bm_resync_fo; + /* Total sectors may be slightly off for oddly + * sized devices. So what. */ + seq_printf(seq, + "\t%3d%% sector pos: %llu/%llu", + (int)(bit_pos / (bm_bits/100+1)), + (unsigned long long)bit_pos * BM_SECT_PER_BIT, + (unsigned long long)bm_bits * BM_SECT_PER_BIT); + if (stop_sector != 0 && stop_sector != ULLONG_MAX) + seq_printf(seq, " stop sector: %llu", stop_sector); + seq_printf(seq, "\n"); + } +} + +static int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i, prev_i = -1; + const char *sn; + struct drbd_device *device; + struct net_conf *nc; + union drbd_dev_state state; + char wp; + + static char write_ordering_chars[] = { + [WO_none] = 'n', + [WO_drain_io] = 'd', + [WO_bdev_flush] = 'f', + }; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); + + /* + cs .. connection state + ro .. node role (local/remote) + ds .. disk state (local/remote) + protocol + various flags + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + al .. activity log write count + bm .. bitmap update write count + pe .. pending (waiting for ack or data reply) + ua .. unack'd (still need to send ack or data reply) + ap .. application requests accepted, but not yet completed + ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending + wo .. write ordering mode currently in use + oos .. known out-of-sync kB + */ + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, device, i) { + if (prev_i != i - 1) + seq_printf(seq, "\n"); + prev_i = i; + + state = device->state; + sn = drbd_conn_str(state.conn); + + if (state.conn == C_STANDALONE && + state.disk == D_DISKLESS && + state.role == R_SECONDARY) { + seq_printf(seq, "%2d: cs:Unconfigured\n", i); + } else { + /* reset device->congestion_reason */ + bdi_rw_congested(&device->rq_queue->backing_dev_info); + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; + seq_printf(seq, + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", + i, sn, + drbd_role_str(state.role), + drbd_role_str(state.peer), + drbd_disk_str(state.disk), + drbd_disk_str(state.pdsk), + wp, + drbd_suspended(device) ? 's' : 'r', + state.aftr_isp ? 'a' : '-', + state.peer_isp ? 'p' : '-', + state.user_isp ? 'u' : '-', + device->congestion_reason ?: '-', + test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-', + device->send_cnt/2, + device->recv_cnt/2, + device->writ_cnt/2, + device->read_cnt/2, + device->al_writ_cnt, + device->bm_writ_cnt, + atomic_read(&device->local_cnt), + atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt), + atomic_read(&device->unacked_cnt), + atomic_read(&device->ap_bio_cnt), + first_peer_device(device)->connection->epochs, + write_ordering_chars[device->resource->write_ordering] + ); + seq_printf(seq, " oos:%llu\n", + Bit2KB((unsigned long long) + drbd_bm_total_weight(device))); + } + if (state.conn == C_SYNC_SOURCE || + state.conn == C_SYNC_TARGET || + state.conn == C_VERIFY_S || + state.conn == C_VERIFY_T) + drbd_syncer_progress(device, seq, state); + + if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) { + lc_seq_printf_stats(seq, device->resync); + lc_seq_printf_stats(seq, device->act_log); + put_ldev(device); + } + + if (proc_details >= 2) + seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt)); + } + rcu_read_unlock(); + + return 0; +} + +static int drbd_proc_open(struct inode *inode, struct file *file) +{ + int err; + + if (try_module_get(THIS_MODULE)) { + err = single_open(file, drbd_seq_show, NULL); + if (err) + module_put(THIS_MODULE); + return err; + } + return -ENODEV; +} + +static int drbd_proc_release(struct inode *inode, struct file *file) +{ + module_put(THIS_MODULE); + return single_release(inode, file); +} + +/* PROC FS stuff end */ diff --git a/kernel/drivers/block/drbd/drbd_protocol.h b/kernel/drivers/block/drbd/drbd_protocol.h new file mode 100644 index 000000000..2da9104a3 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_protocol.h @@ -0,0 +1,307 @@ +#ifndef __DRBD_PROTOCOL_H +#define __DRBD_PROTOCOL_H + +enum drbd_packet { + /* receiver (data socket) */ + P_DATA = 0x00, + P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */ + P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */ + P_BARRIER = 0x03, + P_BITMAP = 0x04, + P_BECOME_SYNC_TARGET = 0x05, + P_BECOME_SYNC_SOURCE = 0x06, + P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */ + P_DATA_REQUEST = 0x08, /* Used to ask for a data block */ + P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */ + P_SYNC_PARAM = 0x0a, + P_PROTOCOL = 0x0b, + P_UUIDS = 0x0c, + P_SIZES = 0x0d, + P_STATE = 0x0e, + P_SYNC_UUID = 0x0f, + P_AUTH_CHALLENGE = 0x10, + P_AUTH_RESPONSE = 0x11, + P_STATE_CHG_REQ = 0x12, + + /* asender (meta socket */ + P_PING = 0x13, + P_PING_ACK = 0x14, + P_RECV_ACK = 0x15, /* Used in protocol B */ + P_WRITE_ACK = 0x16, /* Used in protocol C */ + P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */ + P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */ + P_NEG_ACK = 0x19, /* Sent if local disk is unusable */ + P_NEG_DREPLY = 0x1a, /* Local disk is broken... */ + P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */ + P_BARRIER_ACK = 0x1c, + P_STATE_CHG_REPLY = 0x1d, + + /* "new" commands, no longer fitting into the ordering scheme above */ + + P_OV_REQUEST = 0x1e, /* data socket */ + P_OV_REPLY = 0x1f, + P_OV_RESULT = 0x20, /* meta socket */ + P_CSUM_RS_REQUEST = 0x21, /* data socket */ + P_RS_IS_IN_SYNC = 0x22, /* meta socket */ + P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */ + P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */ + /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */ + /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */ + P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */ + P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */ + P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */ + P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */ + P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */ + P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */ + P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */ + /* 0x2e to 0x30 reserved, used in drbd 9 */ + + /* REQ_DISCARD. We used "discard" in different contexts before, + * which is why I chose TRIM here, to disambiguate. */ + P_TRIM = 0x31, + + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ + P_MAX_OPT_CMD = 0x101, + + /* special command ids for handshake */ + + P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */ + P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */ + + P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */ +}; + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +struct p_header80 { + u32 magic; + u16 command; + u16 length; /* bytes of data after this header */ +} __packed; + +/* Header for big packets, Used for data packets exceeding 64kB */ +struct p_header95 { + u16 magic; /* use DRBD_MAGIC_BIG here */ + u16 command; + u32 length; +} __packed; + +struct p_header100 { + u32 magic; + u16 volume; + u16 command; + u32 length; + u32 pad; +} __packed; + +/* these defines must not be changed without changing the protocol version */ +#define DP_HARDBARRIER 1 /* depricated */ +#define DP_RW_SYNC 2 /* equals REQ_SYNC */ +#define DP_MAY_SET_IN_SYNC 4 +#define DP_UNPLUG 8 /* not used anymore */ +#define DP_FUA 16 /* equals REQ_FUA */ +#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_DISCARD 64 /* equals REQ_DISCARD */ +#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ +#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ + +struct p_data { + u64 sector; /* 64 bits sector number */ + u64 block_id; /* to identify the request in protocol B&C */ + u32 seq_num; + u32 dp_flags; +} __packed; + +struct p_trim { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + +/* + * commands which share a struct: + * p_block_ack: + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), + * P_SUPERSEDED (proto C, two-primaries conflict detection) + * p_block_req: + * P_DATA_REQUEST, P_RS_DATA_REQUEST + */ +struct p_block_ack { + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __packed; + +struct p_block_req { + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* + * commands with their own struct for additional fields: + * P_CONNECTION_FEATURES + * P_BARRIER + * P_BARRIER_ACK + * P_SYNC_PARAM + * ReportParams + */ + +#define FF_TRIM 1 + +struct p_connection_features { + u32 protocol_min; + u32 feature_flags; + u32 protocol_max; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserved array shall be zero. + */ + + u32 _pad; + u64 reserved[7]; +} __packed; + +struct p_barrier { + u32 barrier; /* barrier number _handle_ only */ + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +struct p_barrier_ack { + u32 barrier; + u32 set_size; +} __packed; + +struct p_rs_param { + u32 resync_rate; + + /* Since protocol version 88 and higher. */ + char verify_alg[0]; +} __packed; + +struct p_rs_param_89 { + u32 resync_rate; + /* protocol version 89: */ + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; +} __packed; + +struct p_rs_param_95 { + u32 resync_rate; + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; + u32 c_plan_ahead; + u32 c_delay_target; + u32 c_fill_target; + u32 c_max_rate; +} __packed; + +enum drbd_conn_flags { + CF_DISCARD_MY_DATA = 1, + CF_DRY_RUN = 2, +}; + +struct p_protocol { + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 conn_flags; + u32 two_primaries; + + /* Since protocol version 87 and higher. */ + char integrity_alg[0]; + +} __packed; + +struct p_uuids { + u64 uuid[UI_EXTENDED_SIZE]; +} __packed; + +struct p_rs_uuid { + u64 uuid; +} __packed; + +struct p_sizes { + u64 d_size; /* size of disk */ + u64 u_size; /* user requested size */ + u64 c_size; /* current exported size */ + u32 max_bio_size; /* Maximal size of a BIO */ + u16 queue_order_type; /* not yet implemented in DRBD*/ + u16 dds_flags; /* use enum dds_flags here. */ +} __packed; + +struct p_state { + u32 state; +} __packed; + +struct p_req_state { + u32 mask; + u32 val; +} __packed; + +struct p_req_state_reply { + u32 retcode; +} __packed; + +struct p_drbd06_param { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __packed; + +struct p_block_desc { + u64 sector; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* Valid values for the encoding field. + * Bump proto version when changing this. */ +enum drbd_bitmap_code { + /* RLE_VLI_Bytes = 0, + * and other bit variants had been defined during + * algorithm evaluation. */ + RLE_VLI_Bits = 2, +}; + +struct p_compressed_bm { + /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code + * (encoding & 0x80): polarity (set/unset) of first runlength + * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits + * used to pad up to head.length bytes + */ + u8 encoding; + + u8 code[0]; +} __packed; + +struct p_delay_probe93 { + u32 seq_num; /* sequence number to match the two probe packets */ + u32 offset; /* usecs the probe got sent after the reference time point */ +} __packed; + +/* + * Bitmap packets need to fit within a single page on the sender and receiver, + * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger). + */ +#define DRBD_SOCKET_BUFFER_SIZE 4096 + +#endif /* __DRBD_PROTOCOL_H */ diff --git a/kernel/drivers/block/drbd/drbd_receiver.c b/kernel/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 000000000..cee20354a --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_receiver.c @@ -0,0 +1,5661 @@ +/* + drbd_receiver.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include +#include +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" +#include "drbd_vli.h" + +#define PRO_FEATURES (FF_TRIM) + +struct packet_info { + enum drbd_packet cmd; + unsigned int size; + unsigned int vnr; + void *data; +}; + +enum finish_epoch { + FE_STILL_LIVE, + FE_DESTROYED, + FE_RECYCLED, +}; + +static int drbd_do_features(struct drbd_connection *connection); +static int drbd_do_auth(struct drbd_connection *connection); +static int drbd_disconnected(struct drbd_peer_device *); +static void conn_wait_active_ee_empty(struct drbd_connection *connection); +static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event); +static int e_end_block(struct drbd_work *, int); + + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +/* + * some helper functions to deal with single linked page lists, + * page->private being our "next" pointer. + */ + +/* If at least n pages are linked at head, get n pages off. + * Otherwise, don't modify head, and return NULL. + * Locking is the responsibility of the caller. + */ +static struct page *page_chain_del(struct page **head, int n) +{ + struct page *page; + struct page *tmp; + + BUG_ON(!n); + BUG_ON(!head); + + page = *head; + + if (!page) + return NULL; + + while (page) { + tmp = page_chain_next(page); + if (--n == 0) + break; /* found sufficient pages */ + if (tmp == NULL) + /* insufficient pages, don't use any of them. */ + return NULL; + page = tmp; + } + + /* add end of list marker for the returned list */ + set_page_private(page, 0); + /* actual return value, and adjustment of head */ + page = *head; + *head = tmp; + return page; +} + +/* may be used outside of locks to find the tail of a (usually short) + * "private" page chain, before adding it back to a global chain head + * with page_chain_add() under a spinlock. */ +static struct page *page_chain_tail(struct page *page, int *len) +{ + struct page *tmp; + int i = 1; + while ((tmp = page_chain_next(page))) + ++i, page = tmp; + if (len) + *len = i; + return page; +} + +static int page_chain_free(struct page *page) +{ + struct page *tmp; + int i = 0; + page_chain_for_each_safe(page, tmp) { + put_page(page); + ++i; + } + return i; +} + +static void page_chain_add(struct page **head, + struct page *chain_first, struct page *chain_last) +{ +#if 1 + struct page *tmp; + tmp = page_chain_tail(chain_first, NULL); + BUG_ON(tmp != chain_last); +#endif + + /* add chain to head */ + set_page_private(chain_last, (unsigned long)*head); + *head = chain_first; +} + +static struct page *__drbd_alloc_pages(struct drbd_device *device, + unsigned int number) +{ + struct page *page = NULL; + struct page *tmp = NULL; + unsigned int i = 0; + + /* Yes, testing drbd_pp_vacant outside the lock is racy. + * So what. It saves a spin_lock. */ + if (drbd_pp_vacant >= number) { + spin_lock(&drbd_pp_lock); + page = page_chain_del(&drbd_pp_pool, number); + if (page) + drbd_pp_vacant -= number; + spin_unlock(&drbd_pp_lock); + if (page) + return page; + } + + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + for (i = 0; i < number; i++) { + tmp = alloc_page(GFP_TRY); + if (!tmp) + break; + set_page_private(tmp, (unsigned long)page); + page = tmp; + } + + if (i == number) + return page; + + /* Not enough pages immediately available this time. + * No need to jump around here, drbd_alloc_pages will retry this + * function "soon". */ + if (page) { + tmp = page_chain_tail(page, NULL); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + return NULL; +} + +static void reclaim_finished_net_peer_reqs(struct drbd_device *device, + struct list_head *to_be_freed) +{ + struct drbd_peer_request *peer_req, *tmp; + + /* The EEs are always appended to the end of the list. Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) { + if (drbd_peer_req_has_active_page(peer_req)) + break; + list_move(&peer_req->w.list, to_be_freed); + } +} + +static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +{ + LIST_HEAD(reclaimed); + struct drbd_peer_request *peer_req, *t; + + spin_lock_irq(&device->resource->req_lock); + reclaim_finished_net_peer_reqs(device, &reclaimed); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(device, peer_req); +} + +/** + * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) + * @device: DRBD device. + * @number: number of pages requested + * @retry: whether to retry, if not enough pages are available right now + * + * Tries to allocate number pages, first from our own page pool, then from + * the kernel. + * Possibly retry until DRBD frees sufficient pages somewhere else. + * + * If this allocation would exceed the max_buffers setting, we throttle + * allocation (schedule_timeout) to give the system some room to breathe. + * + * We do not use max-buffers as hard limit, because it could lead to + * congestion and further to a distributed deadlock during online-verify or + * (checksum based) resync, if the max-buffers, socket buffer sizes and + * resync-rate settings are mis-configured. + * + * Returns a page chain linked via page->private. + */ +struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number, + bool retry) +{ + struct drbd_device *device = peer_device->device; + struct page *page = NULL; + struct net_conf *nc; + DEFINE_WAIT(wait); + unsigned int mxb; + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + mxb = nc ? nc->max_buffers : 1000000; + rcu_read_unlock(); + + if (atomic_read(&device->pp_in_use) < mxb) + page = __drbd_alloc_pages(device, number); + + while (page == NULL) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + drbd_kick_lo_and_reclaim_net(device); + + if (atomic_read(&device->pp_in_use) < mxb) { + page = __drbd_alloc_pages(device, number); + if (page) + break; + } + + if (!retry) + break; + + if (signal_pending(current)) { + drbd_warn(device, "drbd_alloc_pages interrupted!\n"); + break; + } + + if (schedule_timeout(HZ/10) == 0) + mxb = UINT_MAX; + } + finish_wait(&drbd_pp_wait, &wait); + + if (page) + atomic_add(number, &device->pp_in_use); + return page; +} + +/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages. + * Is also used from inside an other spin_lock_irq(&resource->req_lock); + * Either links the page chain back to the global pool, + * or returns all pages to the system. */ +static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net) +{ + atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use; + int i; + + if (page == NULL) + return; + + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count) + i = page_chain_free(page); + else { + struct page *tmp; + tmp = page_chain_tail(page, &i); + spin_lock(&drbd_pp_lock); + page_chain_add(&drbd_pp_pool, page, tmp); + drbd_pp_vacant += i; + spin_unlock(&drbd_pp_lock); + } + i = atomic_sub_return(i, a); + if (i < 0) + drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n", + is_net ? "pp_in_use_by_net" : "pp_in_use", i); + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_free_peer_req() + drbd_alloc_peer_req() + drbd_free_peer_reqs() + drbd_ee_fix_bhs() + drbd_finish_peer_reqs() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct drbd_peer_request * +drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, + unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + struct page *page = NULL; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + + if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) + return NULL; + + peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!peer_req) { + if (!(gfp_mask & __GFP_NOWARN)) + drbd_err(device, "%s: allocation failed\n", __func__); + return NULL; + } + + if (has_payload && data_size) { + page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; + } + + memset(peer_req, 0, sizeof(*peer_req)); + INIT_LIST_HEAD(&peer_req->w.list); + drbd_clear_interval(&peer_req->i); + peer_req->i.size = data_size; + peer_req->i.sector = sector; + peer_req->submit_jif = jiffies; + peer_req->peer_device = peer_device; + peer_req->pages = page; + /* + * The block_id is opaque to the receiver. It is not endianness + * converted, and sent back to the sender unchanged. + */ + peer_req->block_id = id; + + return peer_req; + + fail: + mempool_free(peer_req, drbd_ee_mempool); + return NULL; +} + +void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req, + int is_net) +{ + might_sleep(); + if (peer_req->flags & EE_HAS_DIGEST) + kfree(peer_req->digest); + drbd_free_pages(device, peer_req->pages, is_net); + D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0); + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } + mempool_free(peer_req, drbd_ee_mempool); +} + +int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list) +{ + LIST_HEAD(work_list); + struct drbd_peer_request *peer_req, *t; + int count = 0; + int is_net = list == &device->net_ee; + + spin_lock_irq(&device->resource->req_lock); + list_splice_init(list, &work_list); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + __drbd_free_peer_req(device, peer_req, is_net); + count++; + } + return count; +} + +/* + * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier. + */ +static int drbd_finish_peer_reqs(struct drbd_device *device) +{ + LIST_HEAD(work_list); + LIST_HEAD(reclaimed); + struct drbd_peer_request *peer_req, *t; + int err = 0; + + spin_lock_irq(&device->resource->req_lock); + reclaim_finished_net_peer_reqs(device, &reclaimed); + list_splice_init(&device->done_ee, &work_list); + spin_unlock_irq(&device->resource->req_lock); + + list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) + drbd_free_net_peer_req(device, peer_req); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_superseded. + * all ignore the last argument. + */ + list_for_each_entry_safe(peer_req, t, &work_list, w.list) { + int err2; + + /* list_del not necessary, next/prev members not touched */ + err2 = peer_req->w.cb(&peer_req->w, !!err); + if (!err) + err = err2; + drbd_free_peer_req(device, peer_req); + } + wake_up(&device->ee_wait); + + return err; +} + +static void _drbd_wait_ee_list_empty(struct drbd_device *device, + struct list_head *head) +{ + DEFINE_WAIT(wait); + + /* avoids spin_lock/unlock + * and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&device->resource->req_lock); + io_schedule(); + finish_wait(&device->ee_wait, &wait); + spin_lock_irq(&device->resource->req_lock); + } +} + +static void drbd_wait_ee_list_empty(struct drbd_device *device, + struct list_head *head) +{ + spin_lock_irq(&device->resource->req_lock); + _drbd_wait_ee_list_empty(device, head); + spin_unlock_irq(&device->resource->req_lock); +} + +static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) +{ + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) + }; + return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); +} + +static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size) +{ + int rv; + + rv = drbd_recv_short(connection->data.socket, buf, size, 0); + + if (rv < 0) { + if (rv == -ECONNRESET) + drbd_info(connection, "sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + drbd_err(connection, "sock_recvmsg returned %d\n", rv); + } else if (rv == 0) { + if (test_bit(DISCONNECT_SENT, &connection->flags)) { + long t; + rcu_read_lock(); + t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; + rcu_read_unlock(); + + t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t); + + if (t) + goto out; + } + drbd_info(connection, "sock was shut down by peer\n"); + } + + if (rv != size) + conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD); + +out: + return rv; +} + +static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size) +{ + int err; + + err = drbd_recv(connection, buf, size); + if (err != size) { + if (err >= 0) + err = -EIO; + } else + err = 0; + return err; +} + +static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size) +{ + int err; + + err = drbd_recv_all(connection, buf, size); + if (err && !signal_pending(current)) + drbd_warn(connection, "short read (expected size %d)\n", (int)size); + return err; +} + +/* quoting tcp(7): + * On individual connections, the socket buffer size must be set prior to the + * listen(2) or connect(2) calls in order to have it take effect. + * This is our wrapper to do so. + */ +static void drbd_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) +{ + /* open coded SO_SNDBUF, SO_RCVBUF */ + if (snd) { + sock->sk->sk_sndbuf = snd; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + if (rcv) { + sock->sk->sk_rcvbuf = rcv; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } +} + +static struct socket *drbd_try_connect(struct drbd_connection *connection) +{ + const char *what; + struct socket *sock; + struct sockaddr_in6 src_in6; + struct sockaddr_in6 peer_in6; + struct net_conf *nc; + int err, peer_addr_len, my_addr_len; + int sndbuf_size, rcvbuf_size, connect_int; + int disconnect_on_error = 1; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return NULL; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + connect_int = nc->connect_int; + rcu_read_unlock(); + + my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6)); + memcpy(&src_in6, &connection->my_addr, my_addr_len); + + if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + + peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6)); + memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + sock = NULL; + goto out; + } + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = connect_int * HZ; + drbd_setbufsize(sock, sndbuf_size, rcvbuf_size); + + /* explicitly bind to the configured IP as source IP + * for the outgoing connections. + * This is needed for multihomed hosts and to be + * able to use lo: interfaces for drbd. + * Make sure to use 0 as port number, so linux selects + * a free one dynamically. + */ + what = "bind before connect"; + err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len); + if (err < 0) + goto out; + + /* connect may fail, peer not yet available. + * stay C_WF_CONNECTION, don't go Disconnecting! */ + disconnect_on_error = 0; + what = "connect"; + err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0); + +out: + if (err < 0) { + if (sock) { + sock_release(sock); + sock = NULL; + } + switch (-err) { + /* timeout, busy, signal pending */ + case ETIMEDOUT: case EAGAIN: case EINPROGRESS: + case EINTR: case ERESTARTSYS: + /* peer not (yet) available, network problem */ + case ECONNREFUSED: case ENETUNREACH: + case EHOSTDOWN: case EHOSTUNREACH: + disconnect_on_error = 0; + break; + default: + drbd_err(connection, "%s failed, err = %d\n", what, err); + } + if (disconnect_on_error) + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + + return sock; +} + +struct accept_wait_data { + struct drbd_connection *connection; + struct socket *s_listen; + struct completion door_bell; + void (*original_sk_state_change)(struct sock *sk); + +}; + +static void drbd_incoming_connection(struct sock *sk) +{ + struct accept_wait_data *ad = sk->sk_user_data; + void (*state_change)(struct sock *sk); + + state_change = ad->original_sk_state_change; + if (sk->sk_state == TCP_ESTABLISHED) + complete(&ad->door_bell); + state_change(sk); +} + +static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad) +{ + int err, sndbuf_size, rcvbuf_size, my_addr_len; + struct sockaddr_in6 my_addr; + struct socket *s_listen; + struct net_conf *nc; + const char *what; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sndbuf_size = nc->sndbuf_size; + rcvbuf_size = nc->rcvbuf_size; + rcu_read_unlock(); + + my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6)); + memcpy(&my_addr, &connection->my_addr, my_addr_len); + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &s_listen); + if (err) { + s_listen = NULL; + goto out; + } + + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size); + + what = "bind before listen"; + err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len); + if (err < 0) + goto out; + + ad->s_listen = s_listen; + write_lock_bh(&s_listen->sk->sk_callback_lock); + ad->original_sk_state_change = s_listen->sk->sk_state_change; + s_listen->sk->sk_state_change = drbd_incoming_connection; + s_listen->sk->sk_user_data = ad; + write_unlock_bh(&s_listen->sk->sk_callback_lock); + + what = "listen"; + err = s_listen->ops->listen(s_listen, 5); + if (err < 0) + goto out; + + return 0; +out: + if (s_listen) + sock_release(s_listen); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + drbd_err(connection, "%s failed, err = %d\n", what, err); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } + + return -EIO; +} + +static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = ad->original_sk_state_change; + sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); +} + +static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad) +{ + int timeo, connect_int, err = 0; + struct socket *s_estab = NULL; + struct net_conf *nc; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return NULL; + } + connect_int = nc->connect_int; + rcu_read_unlock(); + + timeo = connect_int * HZ; + /* 28.5% random jitter */ + timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7; + + err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo); + if (err <= 0) + return NULL; + + err = kernel_accept(ad->s_listen, &s_estab, 0); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + drbd_err(connection, "accept failed, err = %d\n", err); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } + + if (s_estab) + unregister_state_change(s_estab->sk, ad); + + return s_estab; +} + +static int decode_header(struct drbd_connection *, void *, struct packet_info *); + +static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock, + enum drbd_packet cmd) +{ + if (!conn_prepare_command(connection, sock)) + return -EIO; + return conn_send_command(connection, sock, cmd, 0, NULL, 0); +} + +static int receive_first_packet(struct drbd_connection *connection, struct socket *sock) +{ + unsigned int header_size = drbd_header_size(connection); + struct packet_info pi; + struct net_conf *nc; + int err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (!nc) { + rcu_read_unlock(); + return -EIO; + } + sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10; + rcu_read_unlock(); + + err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0); + if (err != header_size) { + if (err >= 0) + err = -EIO; + return err; + } + err = decode_header(connection, connection->data.rbuf, &pi); + if (err) + return err; + return pi.cmd; +} + +/** + * drbd_socket_okay() - Free the socket if its connection is not okay + * @sock: pointer to the pointer to the socket. + */ +static bool drbd_socket_okay(struct socket **sock) +{ + int rr; + char tb[4]; + + if (!*sock) + return false; + + rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + + if (rr > 0 || rr == -EAGAIN) { + return true; + } else { + sock_release(*sock); + *sock = NULL; + return false; + } +} + +static bool connection_established(struct drbd_connection *connection, + struct socket **sock1, + struct socket **sock2) +{ + struct net_conf *nc; + int timeout; + bool ok; + + if (!*sock1 || !*sock2) + return false; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10; + rcu_read_unlock(); + schedule_timeout_interruptible(timeout); + + ok = drbd_socket_okay(sock1); + ok = drbd_socket_okay(sock2) && ok; + + return ok; +} + +/* Gets called if a connection is established, or if a new minor gets created + in a connection */ +int drbd_connected(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + int err; + + atomic_set(&device->packet_seq, 0); + device->peer_seq = 0; + + device->state_mutex = peer_device->connection->agreed_pro_version < 100 ? + &peer_device->connection->cstate_mutex : + &device->own_state_mutex; + + err = drbd_send_sync_param(peer_device); + if (!err) + err = drbd_send_sizes(peer_device, 0, 0); + if (!err) + err = drbd_send_uuids(peer_device); + if (!err) + err = drbd_send_current_state(peer_device); + clear_bit(USE_DEGR_WFC_T, &device->flags); + clear_bit(RESIZE_PENDING, &device->flags); + atomic_set(&device->ap_in_flight, 0); + mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */ + return err; +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + * -2 We do not have a network config... + */ +static int conn_connect(struct drbd_connection *connection) +{ + struct drbd_socket sock, msock; + struct drbd_peer_device *peer_device; + struct net_conf *nc; + int vnr, timeout, h; + bool discard_my_data, ok; + enum drbd_state_rv rv; + struct accept_wait_data ad = { + .connection = connection, + .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell), + }; + + clear_bit(DISCONNECT_SENT, &connection->flags); + if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS) + return -2; + + mutex_init(&sock.mutex); + sock.sbuf = connection->data.sbuf; + sock.rbuf = connection->data.rbuf; + sock.socket = NULL; + mutex_init(&msock.mutex); + msock.sbuf = connection->meta.sbuf; + msock.rbuf = connection->meta.rbuf; + msock.socket = NULL; + + /* Assume that the peer only understands protocol 80 until we know better. */ + connection->agreed_pro_version = 80; + + if (prepare_listen_socket(connection, &ad)) + return 0; + + do { + struct socket *s; + + s = drbd_try_connect(connection); + if (s) { + if (!sock.socket) { + sock.socket = s; + send_first_packet(connection, &sock, P_INITIAL_DATA); + } else if (!msock.socket) { + clear_bit(RESOLVE_CONFLICTS, &connection->flags); + msock.socket = s; + send_first_packet(connection, &msock, P_INITIAL_META); + } else { + drbd_err(connection, "Logic error in conn_connect()\n"); + goto out_release_sockets; + } + } + + if (connection_established(connection, &sock.socket, &msock.socket)) + break; + +retry: + s = drbd_wait_for_connect(connection, &ad); + if (s) { + int fp = receive_first_packet(connection, s); + drbd_socket_okay(&sock.socket); + drbd_socket_okay(&msock.socket); + switch (fp) { + case P_INITIAL_DATA: + if (sock.socket) { + drbd_warn(connection, "initial packet S crossed\n"); + sock_release(sock.socket); + sock.socket = s; + goto randomize; + } + sock.socket = s; + break; + case P_INITIAL_META: + set_bit(RESOLVE_CONFLICTS, &connection->flags); + if (msock.socket) { + drbd_warn(connection, "initial packet M crossed\n"); + sock_release(msock.socket); + msock.socket = s; + goto randomize; + } + msock.socket = s; + break; + default: + drbd_warn(connection, "Error receiving initial packet\n"); + sock_release(s); +randomize: + if (prandom_u32() & 1) + goto retry; + } + } + + if (connection->cstate <= C_DISCONNECTING) + goto out_release_sockets; + if (signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&connection->receiver) == EXITING) + goto out_release_sockets; + } + + ok = connection_established(connection, &sock.socket, &msock.socket); + } while (!ok); + + if (ad.s_listen) + sock_release(ad.s_listen); + + sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + + sock.socket->sk->sk_allocation = GFP_NOIO; + msock.socket->sk->sk_allocation = GFP_NOIO; + + sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE; + + /* NOT YET ... + * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10; + * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_CONNECTION_FEATURES timeout, + * which we set to 4x the configured ping_timeout. */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + sock.socket->sk->sk_sndtimeo = + sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10; + + msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ; + timeout = nc->timeout * HZ / 10; + discard_my_data = nc->discard_my_data; + rcu_read_unlock(); + + msock.socket->sk->sk_sndtimeo = timeout; + + /* we don't want delays. + * we use TCP_CORK where appropriate, though */ + drbd_tcp_nodelay(sock.socket); + drbd_tcp_nodelay(msock.socket); + + connection->data.socket = sock.socket; + connection->meta.socket = msock.socket; + connection->last_received = jiffies; + + h = drbd_do_features(connection); + if (h <= 0) + return h; + + if (connection->cram_hmac_tfm) { + /* drbd_request_state(device, NS(conn, WFAuth)); */ + switch (drbd_do_auth(connection)) { + case -1: + drbd_err(connection, "Authentication of peer failed\n"); + return -1; + case 0: + drbd_err(connection, "Authentication of peer failed, trying again.\n"); + return 0; + } + } + + connection->data.socket->sk->sk_sndtimeo = timeout; + connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + if (drbd_send_protocol(connection) == -EOPNOTSUPP) + return -1; + + /* Prevent a race between resync-handshake and + * being promoted to Primary. + * + * Grab and release the state mutex, so we know that any current + * drbd_set_role() is finished, and any incoming drbd_set_role + * will see the STATE_SENT flag, and wait for it to be cleared. + */ + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_lock(peer_device->device->state_mutex); + + set_bit(STATE_SENT, &connection->flags); + + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + mutex_unlock(peer_device->device->state_mutex); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + + if (discard_my_data) + set_bit(DISCARD_MY_DATA, &device->flags); + else + clear_bit(DISCARD_MY_DATA, &device->flags); + + drbd_connected(peer_device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE); + if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &connection->flags); + return 0; + } + + drbd_thread_start(&connection->asender); + + mutex_lock(&connection->resource->conf_update); + /* The discard_my_data flag is a single-shot modifier to the next + * connection attempt, the handshake of which is now well underway. + * No need for rcu style copying of the whole struct + * just to clear a single value. */ + connection->net_conf->discard_my_data = 0; + mutex_unlock(&connection->resource->conf_update); + + return h; + +out_release_sockets: + if (ad.s_listen) + sock_release(ad.s_listen); + if (sock.socket) + sock_release(sock.socket); + if (msock.socket) + sock_release(msock.socket); + return -1; +} + +static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi) +{ + unsigned int header_size = drbd_header_size(connection); + + if (header_size == sizeof(struct p_header100) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) { + struct p_header100 *h = header; + if (h->pad != 0) { + drbd_err(connection, "Header padding is not zero\n"); + return -EINVAL; + } + pi->vnr = be16_to_cpu(h->volume); + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + } else if (header_size == sizeof(struct p_header95) && + *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) { + struct p_header95 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be32_to_cpu(h->length); + pi->vnr = 0; + } else if (header_size == sizeof(struct p_header80) && + *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) { + struct p_header80 *h = header; + pi->cmd = be16_to_cpu(h->command); + pi->size = be16_to_cpu(h->length); + pi->vnr = 0; + } else { + drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n", + be32_to_cpu(*(__be32 *)header), + connection->agreed_pro_version); + return -EINVAL; + } + pi->data = header + header_size; + return 0; +} + +static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + int err; + + err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection)); + if (err) + return err; + + err = decode_header(connection, buffer, pi); + connection->last_received = jiffies; + + return err; +} + +static void drbd_flush(struct drbd_connection *connection) +{ + int rv; + struct drbd_peer_device *peer_device; + int vnr; + + if (connection->resource->write_ordering >= WO_bdev_flush) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + if (!get_ldev(device)) + continue; + kref_get(&device->kref); + rcu_read_unlock(); + + /* Right now, we have only this one synchronous code path + * for flushes between request epochs. + * We may want to make those asynchronous, + * or at least parallelize the flushes to the volume devices. + */ + device->flush_jif = jiffies; + set_bit(FLUSH_PENDING, &device->flags); + rv = blkdev_issue_flush(device->ldev->backing_bdev, + GFP_NOIO, NULL); + clear_bit(FLUSH_PENDING, &device->flags); + if (rv) { + drbd_info(device, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + } + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + + rcu_read_lock(); + if (rv) + break; + } + rcu_read_unlock(); + } +} + +/** + * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. + * @device: DRBD device. + * @epoch: Epoch object. + * @ev: Epoch event. + */ +static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection, + struct drbd_epoch *epoch, + enum epoch_event ev) +{ + int epoch_size; + struct drbd_epoch *next_epoch; + enum finish_epoch rv = FE_STILL_LIVE; + + spin_lock(&connection->epoch_lock); + do { + next_epoch = NULL; + + epoch_size = atomic_read(&epoch->epoch_size); + + switch (ev & ~EV_CLEANUP) { + case EV_PUT: + atomic_dec(&epoch->active); + break; + case EV_GOT_BARRIER_NR: + set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); + break; + case EV_BECAME_LAST: + /* nothing to do*/ + break; + } + + if (epoch_size != 0 && + atomic_read(&epoch->active) == 0 && + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { + if (!(ev & EV_CLEANUP)) { + spin_unlock(&connection->epoch_lock); + drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size); + spin_lock(&connection->epoch_lock); + } +#if 0 + /* FIXME: dec unacked on connection, once we have + * something to count pending connection packets in. */ + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(epoch->connection); +#endif + + if (connection->current_epoch != epoch) { + next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); + list_del(&epoch->list); + ev = EV_BECAME_LAST | (ev & EV_CLEANUP); + connection->epochs--; + kfree(epoch); + + if (rv == FE_STILL_LIVE) + rv = FE_DESTROYED; + } else { + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + /* atomic_set(&epoch->active, 0); is already zero */ + if (rv == FE_STILL_LIVE) + rv = FE_RECYCLED; + } + } + + if (!next_epoch) + break; + + epoch = next_epoch; + } while (1); + + spin_unlock(&connection->epoch_lock); + + return rv; +} + +static enum write_ordering_e +max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) +{ + struct disk_conf *dc; + + dc = rcu_dereference(bdev->disk_conf); + + if (wo == WO_bdev_flush && !dc->disk_flushes) + wo = WO_drain_io; + if (wo == WO_drain_io && !dc->disk_drain) + wo = WO_none; + + return wo; +} + +/** + * drbd_bump_write_ordering() - Fall back to an other write ordering method + * @connection: DRBD connection. + * @wo: Write ordering method to try. + */ +void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, + enum write_ordering_e wo) +{ + struct drbd_device *device; + enum write_ordering_e pwo; + int vnr; + static char *write_ordering_str[] = { + [WO_none] = "none", + [WO_drain_io] = "drain", + [WO_bdev_flush] = "flush", + }; + + pwo = resource->write_ordering; + if (wo != WO_bdev_flush) + wo = min(pwo, wo); + rcu_read_lock(); + idr_for_each_entry(&resource->devices, device, vnr) { + if (get_ldev(device)) { + wo = max_allowed_wo(device->ldev, wo); + if (device->ldev == bdev) + bdev = NULL; + put_ldev(device); + } + } + + if (bdev) + wo = max_allowed_wo(bdev, wo); + + rcu_read_unlock(); + + resource->write_ordering = wo; + if (pwo != resource->write_ordering || wo == WO_bdev_flush) + drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); +} + +/** + * drbd_submit_peer_request() + * @device: DRBD device. + * @peer_req: peer request + * @rw: flag field, see bio->bi_rw + * + * May spread the pages to multiple bios, + * depending on bio_add_page restrictions. + * + * Returns 0 if all bios have been submitted, + * -ENOMEM if we could not allocate enough bios, + * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a + * single page to an empty bio (which should never happen and likely indicates + * that the lower level IO stack is in some way broken). This has been observed + * on certain Xen deployments. + */ +/* TODO allocate from our own bio_set. */ +int drbd_submit_peer_request(struct drbd_device *device, + struct drbd_peer_request *peer_req, + const unsigned rw, const int fault_type) +{ + struct bio *bios = NULL; + struct bio *bio; + struct page *page = peer_req->pages; + sector_t sector = peer_req->i.sector; + unsigned data_size = peer_req->i.size; + unsigned n_bios = 0; + unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + int err = -ENOMEM; + + if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* wait for all pending IO completions, before we start + * zeroing things out. */ + conn_wait_active_ee_empty(first_peer_device(device)->connection); + /* add it to the active list now, + * so we can find it to present it in debugfs */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + if (blkdev_issue_zeroout(device->ldev->backing_bdev, + sector, data_size >> 9, GFP_NOIO, false)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); + return 0; + } + + /* Discards don't have any payload. + * But the scsi layer still expects a bio_vec it can use internally, + * see sd_setup_discard_cmnd() and blk_add_request_payload(). */ + if (peer_req->flags & EE_IS_TRIM) + nr_pages = 1; + + /* In most cases, we will only need one bio. But in case the lower + * level restrictions happen to be different at this offset on this + * side than those of the sending peer, we may need to submit the + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. + */ +next_bio: + bio = bio_alloc(GFP_NOIO, nr_pages); + if (!bio) { + drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages); + goto fail; + } + /* > peer_req->i.sector, unless this is the first bio */ + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = device->ldev->backing_bdev; + bio->bi_rw = rw; + bio->bi_private = peer_req; + bio->bi_end_io = drbd_peer_request_endio; + + bio->bi_next = bios; + bios = bio; + ++n_bios; + + if (rw & REQ_DISCARD) { + bio->bi_iter.bi_size = data_size; + goto submit; + } + + page_chain_for_each(page) { + unsigned len = min_t(unsigned, data_size, PAGE_SIZE); + if (!bio_add_page(bio, page, len, 0)) { + /* A single page must always be possible! + * But in case it fails anyways, + * we deal with it, and complain (below). */ + if (bio->bi_vcnt == 0) { + drbd_err(device, + "bio_add_page failed for len=%u, " + "bi_vcnt=0 (bi_sector=%llu)\n", + len, (uint64_t)bio->bi_iter.bi_sector); + err = -ENOSPC; + goto fail; + } + goto next_bio; + } + data_size -= len; + sector += len >> 9; + --nr_pages; + } + D_ASSERT(device, data_size == 0); +submit: + D_ASSERT(device, page == NULL); + + atomic_set(&peer_req->pending_bios, n_bios); + /* for debugfs: update timestamp, mark as submitted */ + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_SUBMITTED; + do { + bio = bios; + bios = bios->bi_next; + bio->bi_next = NULL; + + drbd_generic_make_request(device, fault_type, bio); + } while (bios); + return 0; + +fail: + while (bios) { + bio = bios; + bios = bios->bi_next; + bio_put(bio); + } + return err; +} + +static void drbd_remove_epoch_entry_interval(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct drbd_interval *i = &peer_req->i; + + drbd_remove_interval(&device->write_requests, i); + drbd_clear_interval(i); + + /* Wake up any processes waiting for this peer request to complete. */ + if (i->waiting) + wake_up(&device->misc_wait); +} + +static void conn_wait_active_ee_empty(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_wait_ee_list_empty(device, &device->active_ee); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + +static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) +{ + int rv; + struct p_barrier *p = pi->data; + struct drbd_epoch *epoch; + + /* FIXME these are unacked on connection, + * not a specific (peer)device. + */ + connection->current_epoch->barrier_nr = p->barrier; + connection->current_epoch->connection = connection; + rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR); + + /* P_BARRIER_ACK may imply that the corresponding extent is dropped from + * the activity log, which means it would not be resynced in case the + * R_PRIMARY crashes now. + * Therefore we must send the barrier_ack after the barrier request was + * completed. */ + switch (connection->resource->write_ordering) { + case WO_none: + if (rv == FE_RECYCLED) + return 0; + + /* receiver context, in the writeout path of the other node. + * avoid potential distributed deadlock */ + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + else + drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); + /* Fall through */ + + case WO_bdev_flush: + case WO_drain_io: + conn_wait_active_ee_empty(connection); + drbd_flush(connection); + + if (atomic_read(&connection->current_epoch->epoch_size)) { + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (epoch) + break; + } + + return 0; + default: + drbd_err(connection, "Strangeness in connection->write_ordering %d\n", + connection->resource->write_ordering); + return -EIO; + } + + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + atomic_set(&epoch->active, 0); + + spin_lock(&connection->epoch_lock); + if (atomic_read(&connection->current_epoch->epoch_size)) { + list_add(&epoch->list, &connection->current_epoch->list); + connection->current_epoch = epoch; + connection->epochs++; + } else { + /* The current_epoch got recycled while we allocated this one... */ + kfree(epoch); + } + spin_unlock(&connection->epoch_lock); + + return 0; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +static struct drbd_peer_request * +read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, + struct packet_info *pi) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + struct drbd_peer_request *peer_req; + struct page *page; + int digest_size, err; + unsigned int data_size = pi->size, ds; + void *dig_in = peer_device->connection->int_dig_in; + void *dig_vv = peer_device->connection->int_dig_vv; + unsigned long *data; + struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + + digest_size = 0; + if (!trim && peer_device->connection->peer_integrity_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + /* + * FIXME: Receive the incoming digest into the receive buffer + * here, together with its struct p_data? + */ + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); + if (err) + return NULL; + data_size -= digest_size; + } + + if (trim) { + D_ASSERT(peer_device, data_size == 0); + data_size = be32_to_cpu(trim->size); + } + + if (!expect(IS_ALIGNED(data_size, 512))) + return NULL; + /* prepare for larger trim requests. */ + if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) + return NULL; + + /* even though we trust out peer, + * we sometimes have to double check. */ + if (sector + (data_size>>9) > capacity) { + drbd_err(device, "request from peer beyond end of local disk: " + "capacity: %llus < sector: %llus + size: %u\n", + (unsigned long long)capacity, + (unsigned long long)sector, data_size); + return NULL; + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); + if (!peer_req) + return NULL; + + peer_req->flags |= EE_WRITE; + if (trim) + return peer_req; + + ds = data_size; + page = peer_req->pages; + page_chain_for_each(page) { + unsigned len = min_t(int, ds, PAGE_SIZE); + data = kmap(page); + err = drbd_recv_all_warn(peer_device->connection, data, len); + if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) { + drbd_err(device, "Fault injection: Corrupting data on receive\n"); + data[0] = data[0] ^ (unsigned long)-1; + } + kunmap(page); + if (err) { + drbd_free_peer_req(device, peer_req); + return NULL; + } + ds -= len; + } + + if (digest_size) { + drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); + if (memcmp(dig_in, dig_vv, digest_size)) { + drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", + (unsigned long long)sector, data_size); + drbd_free_peer_req(device, peer_req); + return NULL; + } + } + device->recv_cnt += data_size >> 9; + return peer_req; +} + +/* drbd_drain_block() just takes a data block + * out of the socket input buffer, and discards it. + */ +static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size) +{ + struct page *page; + int err = 0; + void *data; + + if (!data_size) + return 0; + + page = drbd_alloc_pages(peer_device, 1, 1); + + data = kmap(page); + while (data_size) { + unsigned int len = min_t(int, data_size, PAGE_SIZE); + + err = drbd_recv_all_warn(peer_device->connection, data, len); + if (err) + break; + data_size -= len; + } + kunmap(page); + drbd_free_pages(peer_device->device, page, 0); + return err; +} + +static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req, + sector_t sector, int data_size) +{ + struct bio_vec bvec; + struct bvec_iter iter; + struct bio *bio; + int digest_size, err, expect; + void *dig_in = peer_device->connection->int_dig_in; + void *dig_vv = peer_device->connection->int_dig_vv; + + digest_size = 0; + if (peer_device->connection->peer_integrity_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm); + err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size); + if (err) + return err; + data_size -= digest_size; + } + + /* optimistically update recv_cnt. if receiving fails below, + * we disconnect anyways, and counters will be reset. */ + peer_device->device->recv_cnt += data_size>>9; + + bio = req->master_bio; + D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); + + bio_for_each_segment(bvec, bio, iter) { + void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + expect = min_t(int, data_size, bvec.bv_len); + err = drbd_recv_all_warn(peer_device->connection, mapped, expect); + kunmap(bvec.bv_page); + if (err) + return err; + data_size -= expect; + } + + if (digest_size) { + drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, digest_size)) { + drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n"); + return -EINVAL; + } + } + + D_ASSERT(peer_device->device, data_size == 0); + return 0; +} + +/* + * e_end_resync_block() is called in asender context via + * drbd_finish_peer_reqs(). + */ +static int e_end_resync_block(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + int err; + + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + drbd_set_in_sync(device, sector, peer_req->i.size); + err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req); + } else { + /* Record failure to sync */ + drbd_rs_failed_io(device, sector, peer_req->i.size); + + err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); + } + dec_unacked(device); + + return err; +} + +static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector, + struct packet_info *pi) __releases(local) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + + peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi); + if (!peer_req) + goto fail; + + dec_rs_pending(device); + + inc_unacked(device); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + peer_req->w.cb = e_end_resync_block; + peer_req->submit_jif = jiffies; + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->sync_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(pi->size >> 9, &device->rs_sect_ev); + if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); +fail: + put_ldev(device); + return -EIO; +} + +static struct drbd_request * +find_request(struct drbd_device *device, struct rb_root *root, u64 id, + sector_t sector, bool missing_ok, const char *func) +{ + struct drbd_request *req; + + /* Request object according to our peer */ + req = (struct drbd_request *)(unsigned long)id; + if (drbd_contains_interval(root, sector, &req->i) && req->i.local) + return req; + if (!missing_ok) { + drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func, + (unsigned long)id, (unsigned long long)sector); + } + return NULL; +} + +static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct drbd_request *req; + sector_t sector; + int err; + struct p_data *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&device->resource->req_lock); + req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__); + spin_unlock_irq(&device->resource->req_lock); + if (unlikely(!req)) + return -EIO; + + /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + err = recv_dless_read(peer_device, req, sector, pi->size); + if (!err) + req_mod(req, DATA_RECEIVED); + /* else: nothing. handled from drbd_disconnect... + * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return err; +} + +static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + int err; + struct p_data *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + D_ASSERT(device, p->block_id == ID_SYNCER); + + if (get_ldev(device)) { + /* data is submitted to disk within recv_resync_read. + * corresponding put_ldev done below on error, + * or in drbd_peer_request_endio. */ + err = recv_resync_read(peer_device, sector, pi); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Can not write resync data to local disk.\n"); + + err = drbd_drain_block(peer_device, pi->size); + + drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); + } + + atomic_add(pi->size >> 9, &device->rs_sect_in); + + return err; +} + +static void restart_conflicting_writes(struct drbd_device *device, + sector_t sector, int size) +{ + struct drbd_interval *i; + struct drbd_request *req; + + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) + continue; + /* as it is RQ_POSTPONED, this will cause it to + * be queued on the retry workqueue. */ + __req_mod(req, CONFLICT_RESOLVED, NULL); + } +} + +/* + * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + */ +static int e_end_block(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + int err = 0, pcmd; + + if (peer_req->flags & EE_SEND_WRITE_ACK) { + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + pcmd = (device->state.conn >= C_SYNC_SOURCE && + device->state.conn <= C_PAUSED_SYNC_T && + peer_req->flags & EE_MAY_SET_IN_SYNC) ? + P_RS_WRITE_ACK : P_WRITE_ACK; + err = drbd_send_ack(peer_device, pcmd, peer_req); + if (pcmd == P_RS_WRITE_ACK) + drbd_set_in_sync(device, sector, peer_req->i.size); + } else { + err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req); + /* we expect it to be marked out of sync anyways... + * maybe assert this? */ + } + dec_unacked(device); + } + + /* we delete from the conflict detection hash _after_ we sent out the + * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ + if (peer_req->flags & EE_IN_INTERVAL_TREE) { + spin_lock_irq(&device->resource->req_lock); + D_ASSERT(device, !drbd_interval_empty(&peer_req->i)); + drbd_remove_epoch_entry_interval(device, peer_req); + if (peer_req->flags & EE_RESTART_REQUESTS) + restart_conflicting_writes(device, sector, peer_req->i.size); + spin_unlock_irq(&device->resource->req_lock); + } else + D_ASSERT(device, drbd_interval_empty(&peer_req->i)); + + drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + + return err; +} + +static int e_send_ack(struct drbd_work *w, enum drbd_packet ack) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + int err; + + err = drbd_send_ack(peer_device, ack, peer_req); + dec_unacked(peer_device->device); + + return err; +} + +static int e_send_superseded(struct drbd_work *w, int unused) +{ + return e_send_ack(w, P_SUPERSEDED); +} + +static int e_send_retry_write(struct drbd_work *w, int unused) +{ + struct drbd_peer_request *peer_req = + container_of(w, struct drbd_peer_request, w); + struct drbd_connection *connection = peer_req->peer_device->connection; + + return e_send_ack(w, connection->agreed_pro_version >= 100 ? + P_RETRY_WRITE : P_SUPERSEDED); +} + +static bool seq_greater(u32 a, u32 b) +{ + /* + * We assume 32-bit wrap-around here. + * For 24-bit wrap-around, we would have to shift: + * a <<= 8; b <<= 8; + */ + return (s32)a - (s32)b > 0; +} + +static u32 seq_max(u32 a, u32 b) +{ + return seq_greater(a, b) ? a : b; +} + +static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq) +{ + struct drbd_device *device = peer_device->device; + unsigned int newest_peer_seq; + + if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) { + spin_lock(&device->peer_seq_lock); + newest_peer_seq = seq_max(device->peer_seq, peer_seq); + device->peer_seq = newest_peer_seq; + spin_unlock(&device->peer_seq_lock); + /* wake up only if we actually changed device->peer_seq */ + if (peer_seq == newest_peer_seq) + wake_up(&device->seq_wait); + } +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); +} + +/* maybe change sync_ee into interval trees as well? */ +static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + struct drbd_peer_request *rs_req; + bool rv = 0; + + spin_lock_irq(&device->resource->req_lock); + list_for_each_entry(rs_req, &device->sync_ee, w.list) { + if (overlaps(peer_req->i.sector, peer_req->i.size, + rs_req->i.sector, rs_req->i.size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&device->resource->req_lock); + + return rv; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a P_DATA packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking P_DATA packets. + * + * In case packet_seq is larger than device->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update device->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). */ +static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq) +{ + struct drbd_device *device = peer_device->device; + DEFINE_WAIT(wait); + long timeout; + int ret = 0, tp; + + if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) + return 0; + + spin_lock(&device->peer_seq_lock); + for (;;) { + if (!seq_greater(peer_seq - 1, device->peer_seq)) { + device->peer_seq = seq_max(device->peer_seq, peer_seq); + break; + } + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + rcu_read_lock(); + tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + rcu_read_unlock(); + + if (!tp) + break; + + /* Only need to wait if two_primaries is enabled */ + prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE); + spin_unlock(&device->peer_seq_lock); + rcu_read_lock(); + timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10; + rcu_read_unlock(); + timeout = schedule_timeout(timeout); + spin_lock(&device->peer_seq_lock); + if (!timeout) { + ret = -ETIMEDOUT; + drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n"); + break; + } + } + spin_unlock(&device->peer_seq_lock); + finish_wait(&device->seq_wait, &wait); + return ret; +} + +/* see also bio_flags_to_wire() + * DRBD_REQ_*, because we need to semantically map the flags to data packet + * flags and back. We may replicate to other kernel versions. */ +static unsigned long wire_flags_to_bio(u32 dpf) +{ + return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | + (dpf & DP_FUA ? REQ_FUA : 0) | + (dpf & DP_FLUSH ? REQ_FLUSH : 0) | + (dpf & DP_DISCARD ? REQ_DISCARD : 0); +} + +static void fail_postponed_requests(struct drbd_device *device, sector_t sector, + unsigned int size) +{ + struct drbd_interval *i; + + repeat: + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + struct drbd_request *req; + struct bio_and_error m; + + if (!i->local) + continue; + req = container_of(i, struct drbd_request, i); + if (!(req->rq_state & RQ_POSTPONED)) + continue; + req->rq_state &= ~RQ_POSTPONED; + __req_mod(req, NEG_ACKED, &m); + spin_unlock_irq(&device->resource->req_lock); + if (m.bio) + complete_master_bio(device, &m); + spin_lock_irq(&device->resource->req_lock); + goto repeat; + } +} + +static int handle_write_conflicts(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct drbd_connection *connection = peer_req->peer_device->connection; + bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags); + sector_t sector = peer_req->i.sector; + const unsigned int size = peer_req->i.size; + struct drbd_interval *i; + bool equal; + int err; + + /* + * Inserting the peer request into the write_requests tree will prevent + * new conflicting local requests from being added. + */ + drbd_insert_interval(&device->write_requests, &peer_req->i); + + repeat: + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + if (i == &peer_req->i) + continue; + if (i->completed) + continue; + + if (!i->local) { + /* + * Our peer has sent a conflicting remote request; this + * should not happen in a two-node setup. Wait for the + * earlier peer request to complete. + */ + err = drbd_wait_misc(device, i); + if (err) + goto out; + goto repeat; + } + + equal = i->sector == sector && i->size == size; + if (resolve_conflicts) { + /* + * If the peer request is fully contained within the + * overlapping request, it can be considered overwritten + * and thus superseded; otherwise, it will be retried + * once all overlapping requests have completed. + */ + bool superseded = i->sector <= sector && i->sector + + (i->size >> 9) >= sector + (size >> 9); + + if (!equal) + drbd_alert(device, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u, " + "assuming %s came first\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size, + superseded ? "local" : "remote"); + + peer_req->w.cb = superseded ? e_send_superseded : + e_send_retry_write; + list_add_tail(&peer_req->w.list, &device->done_ee); + wake_asender(connection); + + err = -ENOENT; + goto out; + } else { + struct drbd_request *req = + container_of(i, struct drbd_request, i); + + if (!equal) + drbd_alert(device, "Concurrent writes detected: " + "local=%llus +%u, remote=%llus +%u\n", + (unsigned long long)i->sector, i->size, + (unsigned long long)sector, size); + + if (req->rq_state & RQ_LOCAL_PENDING || + !(req->rq_state & RQ_POSTPONED)) { + /* + * Wait for the node with the discard flag to + * decide if this request has been superseded + * or needs to be retried. + * Requests that have been superseded will + * disappear from the write_requests tree. + * + * In addition, wait for the conflicting + * request to finish locally before submitting + * the conflicting peer request. + */ + err = drbd_wait_misc(device, &req->i); + if (err) { + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); + fail_postponed_requests(device, sector, size); + goto out; + } + goto repeat; + } + /* + * Remember to restart the conflicting requests after + * the new peer request has completed. + */ + peer_req->flags |= EE_RESTART_REQUESTS; + } + } + err = 0; + + out: + if (err) + drbd_remove_epoch_entry_interval(device, peer_req); + return err; +} + +/* mirrored write */ +static int receive_Data(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct net_conf *nc; + sector_t sector; + struct drbd_peer_request *peer_req; + struct p_data *p = pi->data; + u32 peer_seq = be32_to_cpu(p->seq_num); + int rw = WRITE; + u32 dp_flags; + int err, tp; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + if (!get_ldev(device)) { + int err2; + + err = wait_for_and_update_peer_seq(peer_device, peer_seq); + drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size); + atomic_inc(&connection->current_epoch->epoch_size); + err2 = drbd_drain_block(peer_device, pi->size); + if (!err) + err = err2; + return err; + } + + /* + * Corresponding put_ldev done either below (on various errors), or in + * drbd_peer_request_endio, if we successfully submit the data at the + * end of this function. + */ + + sector = be64_to_cpu(p->sector); + peer_req = read_in_block(peer_device, p->block_id, sector, pi); + if (!peer_req) { + put_ldev(device); + return -EIO; + } + + peer_req->w.cb = e_end_block; + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_APPLICATION; + + dp_flags = be32_to_cpu(p->dp_flags); + rw |= wire_flags_to_bio(dp_flags); + if (pi->cmd == P_TRIM) { + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + peer_req->flags |= EE_IS_TRIM; + if (!blk_queue_discard(q)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + } else if (peer_req->pages == NULL) { + D_ASSERT(device, peer_req->i.size == 0); + D_ASSERT(device, dp_flags & DP_FLUSH); + } + + if (dp_flags & DP_MAY_SET_IN_SYNC) + peer_req->flags |= EE_MAY_SET_IN_SYNC; + + spin_lock(&connection->epoch_lock); + peer_req->epoch = connection->current_epoch; + atomic_inc(&peer_req->epoch->epoch_size); + atomic_inc(&peer_req->epoch->active); + spin_unlock(&connection->epoch_lock); + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + tp = nc->two_primaries; + if (peer_device->connection->agreed_pro_version < 100) { + switch (nc->wire_protocol) { + case DRBD_PROT_C: + dp_flags |= DP_SEND_WRITE_ACK; + break; + case DRBD_PROT_B: + dp_flags |= DP_SEND_RECEIVE_ACK; + break; + } + } + rcu_read_unlock(); + + if (dp_flags & DP_SEND_WRITE_ACK) { + peer_req->flags |= EE_SEND_WRITE_ACK; + inc_unacked(device); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + } + + if (dp_flags & DP_SEND_RECEIVE_ACK) { + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + } + + if (tp) { + /* two primaries implies protocol C */ + D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK); + peer_req->flags |= EE_IN_INTERVAL_TREE; + err = wait_for_and_update_peer_seq(peer_device, peer_seq); + if (err) + goto out_interrupted; + spin_lock_irq(&device->resource->req_lock); + err = handle_write_conflicts(device, peer_req); + if (err) { + spin_unlock_irq(&device->resource->req_lock); + if (err == -ENOENT) { + put_ldev(device); + return 0; + } + goto out_interrupted; + } + } else { + update_peer_seq(peer_device, peer_seq); + spin_lock_irq(&device->resource->req_lock); + } + /* if we use the zeroout fallback code, we process synchronously + * and we wait for all pending requests, respectively wait for + * active_ee to become empty in drbd_submit_peer_request(); + * better not add ourselves here. */ + if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + + if (device->state.conn == C_SYNC_TARGET) + wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req)); + + if (device->state.pdsk < D_INCONSISTENT) { + /* In case we have the only disk of the cluster, */ + drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size); + peer_req->flags &= ~EE_MAY_SET_IN_SYNC; + drbd_al_begin_io(device, &peer_req->i); + peer_req->flags |= EE_CALL_AL_COMPLETE_IO; + } + + err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); + if (!err) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + drbd_remove_epoch_entry_interval(device, peer_req); + spin_unlock_irq(&device->resource->req_lock); + if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + drbd_al_complete_io(device, &peer_req->i); + } + +out_interrupted: + drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); + put_ldev(device); + drbd_free_peer_req(device, peer_req); + return err; +} + +/* We may throttle resync, if the lower device seems to be busy, + * and current sync rate is above c_min_rate. + * + * To decide whether or not the lower device is busy, we use a scheme similar + * to MD RAID is_mddev_idle(): if the partition stats reveal "significant" + * (more than 64 sectors) of activity we cannot account for with our own resync + * activity, it obviously is "busy". + * + * The current sync rate used here uses only the most recent two step marks, + * to have a short time average so we can react faster. + */ +bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, + bool throttle_if_app_is_waiting) +{ + struct lc_element *tmp; + bool throttle = drbd_rs_c_min_rate_throttle(device); + + if (!throttle || throttle_if_app_is_waiting) + return throttle; + + spin_lock_irq(&device->al_lock); + tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector)); + if (tmp) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_PRIORITY, &bm_ext->flags)) + throttle = false; + /* Do not slow down if app IO is already waiting for this extent, + * and our progress is necessary for application IO to complete. */ + } + spin_unlock_irq(&device->al_lock); + + return throttle; +} + +bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + unsigned long db, dt, dbdt; + unsigned int c_min_rate; + int curr_events; + + rcu_read_lock(); + c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate; + rcu_read_unlock(); + + /* feature disabled? */ + if (c_min_rate == 0) + return false; + + curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]) - + atomic_read(&device->rs_sect_ev); + + if (atomic_read(&device->ap_actlog_cnt) + || curr_events - device->rs_last_events > 64) { + unsigned long rs_left; + int i; + + device->rs_last_events = curr_events; + + /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP, + * approx. */ + i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS; + + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + rs_left = device->ov_left; + else + rs_left = drbd_bm_total_weight(device) - device->rs_failed; + + dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ; + if (!dt) + dt++; + db = device->rs_mark_left[i] - rs_left; + dbdt = Bit2KB(db/dt); + + if (dbdt > c_min_rate) + return true; + } + return false; +} + +static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + sector_t capacity; + struct drbd_peer_request *peer_req; + struct digest_info *di = NULL; + int size, verb; + unsigned int fault_type; + struct p_block_req *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + capacity = drbd_get_capacity(device->this_bdev); + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { + drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return -EINVAL; + } + if (sector + (size>>9) > capacity) { + drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return -EINVAL; + } + + if (!get_ldev_if_state(device, D_UP_TO_DATE)) { + verb = 1; + switch (pi->cmd) { + case P_DATA_REQUEST: + drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); + break; + case P_RS_DATA_REQUEST: + case P_CSUM_RS_REQUEST: + case P_OV_REQUEST: + drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p); + break; + case P_OV_REPLY: + verb = 0; + dec_rs_pending(device); + drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC); + break; + default: + BUG(); + } + if (verb && __ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Can not satisfy peer's read request, " + "no local data.\n"); + + /* drain possibly payload */ + return drbd_drain_block(peer_device, pi->size); + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, + true /* has real payload */, GFP_NOIO); + if (!peer_req) { + put_ldev(device); + return -ENOMEM; + } + + switch (pi->cmd) { + case P_DATA_REQUEST: + peer_req->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + /* application IO, don't drbd_rs_begin_io */ + peer_req->flags |= EE_APPLICATION; + goto submit; + + case P_RS_DATA_REQUEST: + peer_req->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* used in the sector offset progress display */ + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + break; + + case P_OV_REPLY: + case P_CSUM_RS_REQUEST: + fault_type = DRBD_FAULT_RS_RD; + di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); + if (!di) + goto out_free_e; + + di->digest_size = pi->size; + di->digest = (((char *)di)+sizeof(struct digest_info)); + + peer_req->digest = di; + peer_req->flags |= EE_HAS_DIGEST; + + if (drbd_recv_all(peer_device->connection, di->digest, pi->size)) + goto out_free_e; + + if (pi->cmd == P_CSUM_RS_REQUEST) { + D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); + peer_req->w.cb = w_e_end_csum_rs_req; + /* used in the sector offset progress display */ + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + /* remember to report stats in drbd_resync_finished */ + device->use_csums = true; + } else if (pi->cmd == P_OV_REPLY) { + /* track progress, we may need to throttle */ + atomic_add(size >> 9, &device->rs_sect_in); + peer_req->w.cb = w_e_end_ov_reply; + dec_rs_pending(device); + /* drbd_rs_begin_io done when we sent this request, + * but accounting still needs to be done. */ + goto submit_for_resync; + } + break; + + case P_OV_REQUEST: + if (device->ov_start_sector == ~(sector_t)0 && + peer_device->connection->agreed_pro_version >= 90) { + unsigned long now = jiffies; + int i; + device->ov_start_sector = sector; + device->ov_position = sector; + device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector); + device->rs_total = device->ov_left; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = device->ov_left; + device->rs_mark_time[i] = now; + } + drbd_info(device, "Online Verify start sector: %llu\n", + (unsigned long long)sector); + } + peer_req->w.cb = w_e_end_ov_req; + fault_type = DRBD_FAULT_RS_RD; + break; + + default: + BUG(); + } + + /* Throttle, drbd_rs_begin_io and submit should become asynchronous + * wrt the receiver, but it is not as straightforward as it may seem. + * Various places in the resync start and stop logic assume resync + * requests are processed in order, requeuing this on the worker thread + * introduces a bunch of new code for synchronization between threads. + * + * Unlimited throttling before drbd_rs_begin_io may stall the resync + * "forever", throttling after drbd_rs_begin_io will lock that extent + * for application writes for the same time. For now, just throttle + * here, where the rest of the code expects the receiver to sleep for + * a while, anyways. + */ + + /* Throttle before drbd_rs_begin_io, as that locks out application IO; + * this defers syncer requests for some time, before letting at least + * on request through. The resync controller on the receiving side + * will adapt to the incoming rate accordingly. + * + * We cannot throttle here if remote is Primary/SyncTarget: + * we would also throttle its application reads. + * In that case, throttling is done on the SyncTarget only. + */ + + /* Even though this may be a resync request, we do add to "read_ee"; + * "sync_ee" is only used for resync WRITEs. + * Add to list early, so debugfs can find this request + * even if we have to sleep below. */ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + update_receiver_timing_details(connection, drbd_rs_should_slow_down); + if (device->state.peer != R_PRIMARY + && drbd_rs_should_slow_down(device, sector, false)) + schedule_timeout_uninterruptible(HZ/10); + update_receiver_timing_details(connection, drbd_rs_begin_io); + if (drbd_rs_begin_io(device, sector)) + goto out_free_e; + +submit_for_resync: + atomic_add(size >> 9, &device->rs_sect_ev); + +submit: + update_receiver_timing_details(connection, drbd_submit_peer_request); + inc_unacked(device); + if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) + return 0; + + /* don't care for the reason here */ + drbd_err(device, "submit failed, triggering re-connect\n"); + +out_free_e: + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + /* no drbd_rs_complete_io(), we are dropping the connection anyways */ + + put_ldev(device); + drbd_free_peer_req(device, peer_req); + return -EIO; +} + +/** + * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries + */ +static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int self, peer, rv = -100; + unsigned long ch_self, ch_peer; + enum drbd_after_sb_p after_sb_0p; + + self = device->ldev->md.uuid[UI_BITMAP] & 1; + peer = device->p_uuid[UI_BITMAP] & 1; + + ch_peer = device->p_uuid[UI_SIZE]; + ch_self = device->comm_bm_set; + + rcu_read_lock(); + after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p; + rcu_read_unlock(); + switch (after_sb_0p) { + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_CALL_HELPER: + case ASB_VIOLENTLY: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_DISCARD_YOUNGER_PRI: + if (self == 0 && peer == 1) { + rv = -1; + break; + } + if (self == 1 && peer == 0) { + rv = 1; + break; + } + /* Else fall through to one of the other strategies... */ + case ASB_DISCARD_OLDER_PRI: + if (self == 0 && peer == 1) { + rv = 1; + break; + } + if (self == 1 && peer == 0) { + rv = -1; + break; + } + /* Else fall through to one of the other strategies... */ + drbd_warn(device, "Discard younger/older primary did not find a decision\n" + "Using discard-least-changes instead\n"); + case ASB_DISCARD_ZERO_CHG: + if (ch_peer == 0 && ch_self == 0) { + rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) + ? -1 : 1; + break; + } else { + if (ch_peer == 0) { rv = 1; break; } + if (ch_self == 0) { rv = -1; break; } + } + if (after_sb_0p == ASB_DISCARD_ZERO_CHG) + break; + case ASB_DISCARD_LEAST_CHG: + if (ch_self < ch_peer) + rv = -1; + else if (ch_self > ch_peer) + rv = 1; + else /* ( ch_self == ch_peer ) */ + /* Well, then use something else. */ + rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) + ? -1 : 1; + break; + case ASB_DISCARD_LOCAL: + rv = -1; + break; + case ASB_DISCARD_REMOTE: + rv = 1; + } + + return rv; +} + +/** + * drbd_asb_recover_1p - Recover after split-brain with one remaining primary + */ +static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_1p; + + rcu_read_lock(); + after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p; + rcu_read_unlock(); + switch (after_sb_1p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_DISCARD_ZERO_CHG: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_CONSENSUS: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1 && device->state.role == R_SECONDARY) + rv = hg; + if (hg == 1 && device->state.role == R_PRIMARY) + rv = hg; + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(peer_device); + break; + case ASB_DISCARD_SECONDARY: + return device->state.role == R_PRIMARY ? 1 : -1; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1 && device->state.role == R_PRIMARY) { + enum drbd_state_rv rv2; + + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { + drbd_khelper(device, "pri-lost-after-sb"); + } else { + drbd_warn(device, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +/** + * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries + */ +static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + int hg, rv = -100; + enum drbd_after_sb_p after_sb_2p; + + rcu_read_lock(); + after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p; + rcu_read_unlock(); + switch (after_sb_2p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_DISCARD_ZERO_CHG: + drbd_err(device, "Configuration error.\n"); + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(peer_device); + break; + case ASB_DISCONNECT: + break; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(peer_device); + if (hg == -1) { + enum drbd_state_rv rv2; + + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY)); + if (rv2 != SS_SUCCESS) { + drbd_khelper(device, "pri-lost-after-sb"); + } else { + drbd_warn(device, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, + u64 bits, u64 flags) +{ + if (!uuid) { + drbd_info(device, "%s uuid info vanished while I was looking!\n", text); + return; + } + drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END], + (unsigned long long)bits, + (unsigned long long)flags); +} + +/* + 100 after split brain try auto recover + 2 C_SYNC_SOURCE set BitMap + 1 C_SYNC_SOURCE use BitMap + 0 no Sync + -1 C_SYNC_TARGET use BitMap + -2 C_SYNC_TARGET set BitMap + -100 after split brain, disconnect +-1000 unrelated data +-1091 requires proto 91 +-1096 requires proto 96 + */ +static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + u64 self, peer; + int i, j; + + self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + + *rule_nr = 10; + if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) + return 0; + + *rule_nr = 20; + if ((self == UUID_JUST_CREATED || self == (u64)0) && + peer != UUID_JUST_CREATED) + return -2; + + *rule_nr = 30; + if (self != UUID_JUST_CREATED && + (peer == UUID_JUST_CREATED || peer == (u64)0)) + return 2; + + if (self == peer) { + int rct, dc; /* roles at crash time */ + + if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) { + + if (connection->agreed_pro_version < 91) + return -1091; + + if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) && + (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { + drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n"); + drbd_uuid_move_history(device); + device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP]; + device->ldev->md.uuid[UI_BITMAP] = 0; + + drbd_uuid_dump(device, "self", device->ldev->md.uuid, + device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0); + *rule_nr = 34; + } else { + drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n"); + *rule_nr = 36; + } + + return 1; + } + + if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) { + + if (connection->agreed_pro_version < 91) + return -1091; + + if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) && + (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) { + drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); + + device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START]; + device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP]; + device->p_uuid[UI_BITMAP] = 0UL; + + drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + *rule_nr = 35; + } else { + drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n"); + *rule_nr = 37; + } + + return -1; + } + + /* Common power [off|failure] */ + rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) + + (device->p_uuid[UI_FLAGS] & 2); + /* lowest bit is set when we were primary, + * next bit (weight 2) is set when peer was primary */ + *rule_nr = 40; + + switch (rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(RESOLVE_CONFLICTS, &connection->flags); + return dc ? -1 : 1; + } + } + + *rule_nr = 50; + peer = device->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer) + return -1; + + *rule_nr = 51; + peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + if (connection->agreed_pro_version < 96 ? + (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == + (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) : + peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of the peer's UUIDs. */ + + if (connection->agreed_pro_version < 91) + return -1091; + + device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START]; + device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1]; + + drbd_info(device, "Lost last syncUUID packet, corrected:\n"); + drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + + return -1; + } + } + + *rule_nr = 60; + self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + peer = device->p_uuid[i] & ~((u64)1); + if (self == peer) + return -2; + } + + *rule_nr = 70; + self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + if (self == peer) + return 1; + + *rule_nr = 71; + self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + if (connection->agreed_pro_version < 96 ? + (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == + (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) : + self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of our UUIDs. */ + + if (connection->agreed_pro_version < 91) + return -1091; + + __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]); + __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]); + + drbd_info(device, "Last syncUUID did not get through, corrected:\n"); + drbd_uuid_dump(device, "self", device->ldev->md.uuid, + device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0); + + return 1; + } + } + + + *rule_nr = 80; + peer = device->p_uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = device->ldev->md.uuid[i] & ~((u64)1); + if (self == peer) + return 2; + } + + *rule_nr = 90; + self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = device->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer && self != ((u64)0)) + return 100; + + *rule_nr = 100; + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = device->ldev->md.uuid[i] & ~((u64)1); + for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { + peer = device->p_uuid[j] & ~((u64)1); + if (self == peer) + return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + CONN_MASK (-1) on failure. + */ +static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, + enum drbd_role peer_role, + enum drbd_disk_state peer_disk) __must_hold(local) +{ + struct drbd_device *device = peer_device->device; + enum drbd_conns rv = C_MASK; + enum drbd_disk_state mydisk; + struct net_conf *nc; + int hg, rule_nr, rr_conflict, tentative; + + mydisk = device->state.disk; + if (mydisk == D_NEGOTIATING) + mydisk = device->new_state_tmp.disk; + + drbd_info(device, "drbd_sync_handshake:\n"); + + spin_lock_irq(&device->ldev->md.uuid_lock); + drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0); + drbd_uuid_dump(device, "peer", device->p_uuid, + device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); + + hg = drbd_uuid_compare(device, &rule_nr); + spin_unlock_irq(&device->ldev->md.uuid_lock); + + drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); + + if (hg == -1000) { + drbd_alert(device, "Unrelated data, aborting!\n"); + return C_MASK; + } + if (hg < -1000) { + drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); + return C_MASK; + } + + if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || + (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { + int f = (hg == -100) || abs(hg) == 2; + hg = mydisk > D_INCONSISTENT ? 1 : -1; + if (f) + hg = hg*2; + drbd_info(device, "Becoming sync %s due to disk states.\n", + hg > 0 ? "source" : "target"); + } + + if (abs(hg) == 100) + drbd_khelper(device, "initial-split-brain"); + + rcu_read_lock(); + nc = rcu_dereference(peer_device->connection->net_conf); + + if (hg == 100 || (hg == -100 && nc->always_asbp)) { + int pcount = (device->state.role == R_PRIMARY) + + (peer_role == R_PRIMARY); + int forced = (hg == -100); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(peer_device); + break; + case 1: + hg = drbd_asb_recover_1p(peer_device); + break; + case 2: + hg = drbd_asb_recover_2p(peer_device); + break; + } + if (abs(hg) < 100) { + drbd_warn(device, "Split-Brain detected, %d primaries, " + "automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? "peer" : "this"); + if (forced) { + drbd_warn(device, "Doing a full sync, since" + " UUIDs where ambiguous.\n"); + hg = hg*2; + } + } + } + + if (hg == -100) { + if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1)) + hg = -1; + if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1)) + hg = 1; + + if (abs(hg) < 100) + drbd_warn(device, "Split-Brain detected, manually solved. " + "Sync from %s node\n", + (hg < 0) ? "peer" : "this"); + } + rr_conflict = nc->rr_conflict; + tentative = nc->tentative; + rcu_read_unlock(); + + if (hg == -100) { + /* FIXME this log message is not correct if we end up here + * after an attempted attach on a diskless node. + * We just refuse to attach -- well, we drop the "connection" + * to that disk, in a way... */ + drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n"); + drbd_khelper(device, "split-brain"); + return C_MASK; + } + + if (hg > 0 && mydisk <= D_INCONSISTENT) { + drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n"); + return C_MASK; + } + + if (hg < 0 && /* by intention we do not use mydisk here. */ + device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) { + switch (rr_conflict) { + case ASB_CALL_HELPER: + drbd_khelper(device, "pri-lost"); + /* fall through */ + case ASB_DISCONNECT: + drbd_err(device, "I shall become SyncTarget, but I am primary!\n"); + return C_MASK; + case ASB_VIOLENTLY: + drbd_warn(device, "Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) { + if (hg == 0) + drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n"); + else + drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.", + drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET), + abs(hg) >= 2 ? "full" : "bit-map based"); + return C_MASK; + } + + if (abs(hg) >= 2) { + drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); + if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake", + BM_LOCKED_SET_ALLOWED)) + return C_MASK; + } + + if (hg > 0) { /* become sync source. */ + rv = C_WF_BITMAP_S; + } else if (hg < 0) { /* become sync target */ + rv = C_WF_BITMAP_T; + } else { + rv = C_CONNECTED; + if (drbd_bm_total_weight(device)) { + drbd_info(device, "No resync, but %lu bits in bitmap!\n", + drbd_bm_total_weight(device)); + } + } + + return rv; +} + +static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer) +{ + /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ + if (peer == ASB_DISCARD_REMOTE) + return ASB_DISCARD_LOCAL; + + /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ + if (peer == ASB_DISCARD_LOCAL) + return ASB_DISCARD_REMOTE; + + /* everything else is valid if they are equal on both sides. */ + return peer; +} + +static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_protocol *p = pi->data; + enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_proto, p_discard_my_data, p_two_primaries, cf; + struct net_conf *nc, *old_net_conf, *new_net_conf = NULL; + char integrity_alg[SHARED_SECRET_MAX] = ""; + struct crypto_hash *peer_integrity_tfm = NULL; + void *int_dig_in = NULL, *int_dig_vv = NULL; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_two_primaries = be32_to_cpu(p->two_primaries); + cf = be32_to_cpu(p->conn_flags); + p_discard_my_data = cf & CF_DISCARD_MY_DATA; + + if (connection->agreed_pro_version >= 87) { + int err; + + if (pi->size > sizeof(integrity_alg)) + return -EIO; + err = drbd_recv_all(connection, integrity_alg, pi->size); + if (err) + return err; + integrity_alg[SHARED_SECRET_MAX - 1] = 0; + } + + if (pi->cmd != P_PROTOCOL_UPDATE) { + clear_bit(CONN_DRY_RUN, &connection->flags); + + if (cf & CF_DRY_RUN) + set_bit(CONN_DRY_RUN, &connection->flags); + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + + if (p_proto != nc->wire_protocol) { + drbd_err(connection, "incompatible %s settings\n", "protocol"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri"); + goto disconnect_rcu_unlock; + } + + if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) { + drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri"); + goto disconnect_rcu_unlock; + } + + if (p_discard_my_data && nc->discard_my_data) { + drbd_err(connection, "incompatible %s settings\n", "discard-my-data"); + goto disconnect_rcu_unlock; + } + + if (p_two_primaries != nc->two_primaries) { + drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries"); + goto disconnect_rcu_unlock; + } + + if (strcmp(integrity_alg, nc->integrity_alg)) { + drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg"); + goto disconnect_rcu_unlock; + } + + rcu_read_unlock(); + } + + if (integrity_alg[0]) { + int hash_size; + + /* + * We can only change the peer data integrity algorithm + * here. Changing our own data integrity algorithm + * requires that we send a P_PROTOCOL_UPDATE packet at + * the same time; otherwise, the peer has no way to + * tell between which packets the algorithm should + * change. + */ + + peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (!peer_integrity_tfm) { + drbd_err(connection, "peer data-integrity-alg %s not supported\n", + integrity_alg); + goto disconnect; + } + + hash_size = crypto_hash_digestsize(peer_integrity_tfm); + int_dig_in = kmalloc(hash_size, GFP_KERNEL); + int_dig_vv = kmalloc(hash_size, GFP_KERNEL); + if (!(int_dig_in && int_dig_vv)) { + drbd_err(connection, "Allocation of buffers for data integrity checking failed\n"); + goto disconnect; + } + } + + new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + drbd_err(connection, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + mutex_lock(&connection->data.mutex); + mutex_lock(&connection->resource->conf_update); + old_net_conf = connection->net_conf; + *new_net_conf = *old_net_conf; + + new_net_conf->wire_protocol = p_proto; + new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p); + new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p); + new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p); + new_net_conf->two_primaries = p_two_primaries; + + rcu_assign_pointer(connection->net_conf, new_net_conf); + mutex_unlock(&connection->resource->conf_update); + mutex_unlock(&connection->data.mutex); + + crypto_free_hash(connection->peer_integrity_tfm); + kfree(connection->int_dig_in); + kfree(connection->int_dig_vv); + connection->peer_integrity_tfm = peer_integrity_tfm; + connection->int_dig_in = int_dig_in; + connection->int_dig_vv = int_dig_vv; + + if (strcmp(old_net_conf->integrity_alg, integrity_alg)) + drbd_info(connection, "peer data-integrity-alg: %s\n", + integrity_alg[0] ? integrity_alg : "(none)"); + + synchronize_rcu(); + kfree(old_net_conf); + return 0; + +disconnect_rcu_unlock: + rcu_read_unlock(); +disconnect: + crypto_free_hash(peer_integrity_tfm); + kfree(int_dig_in); + kfree(int_dig_vv); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; +} + +/* helper function + * input: alg name, feature name + * return: NULL (alg name was "") + * ERR_PTR(error) if something goes wrong + * or the crypto hash ptr, if it worked out ok. */ +static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device, + const char *alg, const char *name) +{ + struct crypto_hash *tfm; + + if (!alg[0]) + return NULL; + + tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n", + alg, name, PTR_ERR(tfm)); + return tfm; + } + return tfm; +} + +static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi) +{ + void *buffer = connection->data.rbuf; + int size = pi->size; + + while (size) { + int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE); + s = drbd_recv(connection, buffer, s); + if (s <= 0) { + if (s < 0) + return s; + break; + } + size -= s; + } + if (size) + return -EIO; + return 0; +} + +/* + * config_unknown_volume - device configuration command for unknown volume + * + * When a device is added to an existing connection, the node on which the + * device is added first will send configuration commands to its peer but the + * peer will not know about the device yet. It will warn and ignore these + * commands. Once the device is added on the second node, the second node will + * send the same device configuration commands, but in the other direction. + * + * (We can also end up here if drbd is misconfigured.) + */ +static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi) +{ + drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n", + cmdname(pi->cmd), pi->vnr); + return ignore_remaining_packet(connection, pi); +} + +static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_rs_param_95 *p; + unsigned int header_size, data_size, exp_max_sz; + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + struct net_conf *old_net_conf, *new_net_conf = NULL; + struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL; + const int apv = connection->agreed_pro_version; + struct fifo_buffer *old_plan = NULL, *new_plan = NULL; + int fifo_size = 0; + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + SHARED_SECRET_MAX + : apv <= 94 ? sizeof(struct p_rs_param_89) + : /* apv >= 95 */ sizeof(struct p_rs_param_95); + + if (pi->size > exp_max_sz) { + drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n", + pi->size, exp_max_sz); + return -EIO; + } + + if (apv <= 88) { + header_size = sizeof(struct p_rs_param); + data_size = pi->size - header_size; + } else if (apv <= 94) { + header_size = sizeof(struct p_rs_param_89); + data_size = pi->size - header_size; + D_ASSERT(device, data_size == 0); + } else { + header_size = sizeof(struct p_rs_param_95); + data_size = pi->size - header_size; + D_ASSERT(device, data_size == 0); + } + + /* initialize verify_alg and csums_alg */ + p = pi->data; + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + err = drbd_recv_all(peer_device->connection, p, header_size); + if (err) + return err; + + mutex_lock(&connection->resource->conf_update); + old_net_conf = peer_device->connection->net_conf; + if (get_ldev(device)) { + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + put_ldev(device); + mutex_unlock(&connection->resource->conf_update); + drbd_err(device, "Allocation of new disk_conf failed\n"); + return -ENOMEM; + } + + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + + new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate); + } + + if (apv >= 88) { + if (apv == 88) { + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + drbd_err(device, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); + err = -EIO; + goto reconnect; + } + + err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size); + if (err) + goto reconnect; + /* we expect NUL terminated string */ + /* but just in case someone tries to be evil */ + D_ASSERT(device, p->verify_alg[data_size-1] == 0); + p->verify_alg[data_size-1] = 0; + + } else /* apv >= 89 */ { + /* we still expect NUL terminated strings */ + /* but just in case someone tries to be evil */ + D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0); + D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0); + p->verify_alg[SHARED_SECRET_MAX-1] = 0; + p->csums_alg[SHARED_SECRET_MAX-1] = 0; + } + + if (strcmp(old_net_conf->verify_alg, p->verify_alg)) { + if (device->state.conn == C_WF_REPORT_PARAMS) { + drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", + old_net_conf->verify_alg, p->verify_alg); + goto disconnect; + } + verify_tfm = drbd_crypto_alloc_digest_safe(device, + p->verify_alg, "verify-alg"); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + goto disconnect; + } + } + + if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) { + if (device->state.conn == C_WF_REPORT_PARAMS) { + drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", + old_net_conf->csums_alg, p->csums_alg); + goto disconnect; + } + csums_tfm = drbd_crypto_alloc_digest_safe(device, + p->csums_alg, "csums-alg"); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + goto disconnect; + } + } + + if (apv > 94 && new_disk_conf) { + new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead); + new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target); + new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target); + new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate); + + fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ; + if (fifo_size != device->rs_plan_s->size) { + new_plan = fifo_alloc(fifo_size); + if (!new_plan) { + drbd_err(device, "kmalloc of fifo_buffer failed"); + put_ldev(device); + goto disconnect; + } + } + } + + if (verify_tfm || csums_tfm) { + new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_net_conf) { + drbd_err(device, "Allocation of new net_conf failed\n"); + goto disconnect; + } + + *new_net_conf = *old_net_conf; + + if (verify_tfm) { + strcpy(new_net_conf->verify_alg, p->verify_alg); + new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(peer_device->connection->verify_tfm); + peer_device->connection->verify_tfm = verify_tfm; + drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg); + } + if (csums_tfm) { + strcpy(new_net_conf->csums_alg, p->csums_alg); + new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(peer_device->connection->csums_tfm); + peer_device->connection->csums_tfm = csums_tfm; + drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg); + } + rcu_assign_pointer(connection->net_conf, new_net_conf); + } + } + + if (new_disk_conf) { + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + put_ldev(device); + } + + if (new_plan) { + old_plan = device->rs_plan_s; + rcu_assign_pointer(device->rs_plan_s, new_plan); + } + + mutex_unlock(&connection->resource->conf_update); + synchronize_rcu(); + if (new_net_conf) + kfree(old_net_conf); + kfree(old_disk_conf); + kfree(old_plan); + + return 0; + +reconnect: + if (new_disk_conf) { + put_ldev(device); + kfree(new_disk_conf); + } + mutex_unlock(&connection->resource->conf_update); + return -EIO; + +disconnect: + kfree(new_plan); + if (new_disk_conf) { + put_ldev(device); + kfree(new_disk_conf); + } + mutex_unlock(&connection->resource->conf_update); + /* just for completeness: actually not needed, + * as this is not reached if csums_tfm was ok. */ + crypto_free_hash(csums_tfm); + /* but free the verify_tfm again, if csums_tfm did not work out */ + crypto_free_hash(verify_tfm); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(struct drbd_device *device, + const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) + return; + d = (a > b) ? (a - b) : (b - a); + if (d > (a>>3) || d > (b>>3)) + drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s, + (unsigned long long)a, (unsigned long long)b); +} + +static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_sizes *p = pi->data; + enum determine_dev_size dd = DS_UNCHANGED; + sector_t p_size, p_usize, p_csize, my_usize; + int ldsc = 0; /* local disk size changed */ + enum dds_flags ddsf; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + p_size = be64_to_cpu(p->d_size); + p_usize = be64_to_cpu(p->u_size); + p_csize = be64_to_cpu(p->c_size); + + /* just store the peer's disk size for now. + * we still need to figure out whether we accept that. */ + device->p_size = p_size; + + if (get_ldev(device)) { + rcu_read_lock(); + my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; + rcu_read_unlock(); + + warn_if_differ_considerably(device, "lower level device sizes", + p_size, drbd_get_max_capacity(device->ldev)); + warn_if_differ_considerably(device, "user requested size", + p_usize, my_usize); + + /* if this is the first connect, or an otherwise expected + * param exchange, choose the minimum */ + if (device->state.conn == C_WF_REPORT_PARAMS) + p_usize = min_not_zero(my_usize, p_usize); + + /* Never shrink a device with usable data during connect. + But allow online shrinking if we are connected. */ + if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < + drbd_get_capacity(device->this_bdev) && + device->state.disk >= D_OUTDATED && + device->state.conn < C_CONNECTED) { + drbd_err(device, "The peer's disk size is too small!\n"); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + put_ldev(device); + return -EIO; + } + + if (my_usize != p_usize) { + struct disk_conf *old_disk_conf, *new_disk_conf = NULL; + + new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL); + if (!new_disk_conf) { + drbd_err(device, "Allocation of new disk_conf failed\n"); + put_ldev(device); + return -ENOMEM; + } + + mutex_lock(&connection->resource->conf_update); + old_disk_conf = device->ldev->disk_conf; + *new_disk_conf = *old_disk_conf; + new_disk_conf->disk_size = p_usize; + + rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); + mutex_unlock(&connection->resource->conf_update); + synchronize_rcu(); + kfree(old_disk_conf); + + drbd_info(device, "Peer sets u_size to %lu sectors\n", + (unsigned long)my_usize); + } + + put_ldev(device); + } + + device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + In case we cleared the QUEUE_FLAG_DISCARD from our queue in + drbd_reconsider_max_bio_size(), we can be sure that after + drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ + + ddsf = be16_to_cpu(p->dds_flags); + if (get_ldev(device)) { + drbd_reconsider_max_bio_size(device, device->ldev); + dd = drbd_determine_dev_size(device, ddsf, NULL); + put_ldev(device); + if (dd == DS_ERROR) + return -EIO; + drbd_md_sync(device); + } else { + /* + * I am diskless, need to accept the peer's *current* size. + * I must NOT accept the peers backing disk size, + * it may have been larger than mine all along... + * + * At this point, the peer knows more about my disk, or at + * least about what we last agreed upon, than myself. + * So if his c_size is less than his d_size, the most likely + * reason is that *my* d_size was smaller last time we checked. + * + * However, if he sends a zero current size, + * take his (user-capped or) backing disk size anyways. + */ + drbd_reconsider_max_bio_size(device, NULL); + drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); + } + + if (get_ldev(device)) { + if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) { + device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev); + ldsc = 1; + } + + put_ldev(device); + } + + if (device->state.conn > C_WF_REPORT_PARAMS) { + if (be64_to_cpu(p->c_size) != + drbd_get_capacity(device->this_bdev) || ldsc) { + /* we have different sizes, probably peer + * needs to know my new size... */ + drbd_send_sizes(peer_device, 0, ddsf); + } + if (test_and_clear_bit(RESIZE_PENDING, &device->flags) || + (dd == DS_GREW && device->state.conn == C_CONNECTED)) { + if (device->state.pdsk >= D_INCONSISTENT && + device->state.disk >= D_INCONSISTENT) { + if (ddsf & DDSF_NO_RESYNC) + drbd_info(device, "Resync of new storage suppressed with --assume-clean\n"); + else + resync_after_online_grow(device); + } else + set_bit(RESYNC_AFTER_NEG, &device->flags); + } + } + + return 0; +} + +static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_uuids *p = pi->data; + u64 *p_uuid; + int i, updated_uuids = 0; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + if (!p_uuid) { + drbd_err(device, "kmalloc of p_uuid failed\n"); + return false; + } + + for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) + p_uuid[i] = be64_to_cpu(p->uuid[i]); + + kfree(device->p_uuid); + device->p_uuid = p_uuid; + + if (device->state.conn < C_CONNECTED && + device->state.disk < D_INCONSISTENT && + device->state.role == R_PRIMARY && + (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { + drbd_err(device, "Can only connect to data with current UUID=%016llX\n", + (unsigned long long)device->ed_uuid); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + + if (get_ldev(device)) { + int skip_initial_sync = + device->state.conn == C_CONNECTED && + peer_device->connection->agreed_pro_version >= 90 && + device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && + (p_uuid[UI_FLAGS] & 8); + if (skip_initial_sync) { + drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n"); + drbd_bitmap_io(device, &drbd_bmio_clear_n_write, + "clear_n_write from receive_uuids", + BM_LOCKED_TEST_ALLOWED); + _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]); + _drbd_uuid_set(device, UI_BITMAP, 0); + _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + drbd_md_sync(device); + updated_uuids = 1; + } + put_ldev(device); + } else if (device->state.disk < D_INCONSISTENT && + device->state.role == R_PRIMARY) { + /* I am a diskless primary, the peer just created a new current UUID + for me. */ + updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); + } + + /* Before we test for the disk state, we should wait until an eventually + ongoing cluster wide state change is finished. That is important if + we are primary and are detaching from our disk. We need to see the + new disk state... */ + mutex_lock(device->state_mutex); + mutex_unlock(device->state_mutex); + if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT) + updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]); + + if (updated_uuids) + drbd_print_uuids(device, "receiver updated UUIDs to"); + + return 0; +} + +/** + * convert_state() - Converts the peer's view of the cluster state to our point of view + * @ps: The state as seen by the peer. + */ +static union drbd_state convert_state(union drbd_state ps) +{ + union drbd_state ms; + + static enum drbd_conns c_tab[] = { + [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS, + [C_CONNECTED] = C_CONNECTED, + + [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, + [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, + [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ + [C_VERIFY_S] = C_VERIFY_T, + [C_MASK] = C_MASK, + }; + + ms.i = ps.i; + + ms.conn = c_tab[ps.conn]; + ms.peer = ps.role; + ms.role = ps.peer; + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = (ps.aftr_isp | ps.user_isp); + + return ms; +} + +static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) && + mutex_is_locked(device->state_mutex)) { + drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = drbd_change_state(device, CS_VERBOSE, mask, val); + drbd_send_sr_reply(peer_device, rv); + + drbd_md_sync(device); + + return 0; +} + +static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_req_state *p = pi->data; + union drbd_state mask, val; + enum drbd_state_rv rv; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(RESOLVE_CONFLICTS, &connection->flags) && + mutex_is_locked(&connection->cstate_mutex)) { + conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG); + return 0; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL); + conn_send_sr_reply(connection, rv); + + return 0; +} + +static int receive_state(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_state *p = pi->data; + union drbd_state os, ns, peer_state; + enum drbd_disk_state real_peer_disk; + enum chg_state_flags cs_flags; + int rv; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return config_unknown_volume(connection, pi); + device = peer_device->device; + + peer_state.i = be32_to_cpu(p->state); + + real_peer_disk = peer_state.disk; + if (peer_state.disk == D_NEGOTIATING) { + real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT; + drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); + } + + spin_lock_irq(&device->resource->req_lock); + retry: + os = ns = drbd_read_state(device); + spin_unlock_irq(&device->resource->req_lock); + + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return -ECONNRESET; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && + os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { + /* If we are (becoming) SyncSource, but peer is still in sync + * preparation, ignore its uptodate-ness to avoid flapping, it + * will change to inconsistent once the peer reaches active + * syncing states. + * It may have changed syncer-paused flags, however, so we + * cannot ignore this completely. */ + if (peer_state.conn > C_CONNECTED && + peer_state.conn < C_SYNC_SOURCE) + real_peer_disk = D_INCONSISTENT; + + /* if peer_state changes to connected at the same time, + * it explicitly notifies us that it finished resync. + * Maybe we should finish it up, too? */ + else if (os.conn >= C_SYNC_SOURCE && + peer_state.conn == C_CONNECTED) { + if (drbd_bm_total_weight(device) <= device->rs_failed) + drbd_resync_finished(device); + return 0; + } + } + + /* explicit verify finished notification, stop sector reached. */ + if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE && + peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) { + ov_out_of_sync_print(device); + drbd_resync_finished(device); + return 0; + } + + /* peer says his disk is inconsistent, while we think it is uptodate, + * and this happens while the peer still thinks we have a sync going on, + * but we think we are already done with the sync. + * We ignore this to avoid flapping pdsk. + * This should not happen, if the peer is a recent version of drbd. */ + if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT && + os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE) + real_peer_disk = D_UP_TO_DATE; + + if (ns.conn == C_WF_REPORT_PARAMS) + ns.conn = C_CONNECTED; + + if (peer_state.conn == C_AHEAD) + ns.conn = C_BEHIND; + + if (device->p_uuid && peer_state.disk >= D_NEGOTIATING && + get_ldev_if_state(device, D_NEGOTIATING)) { + int cr; /* consider resync */ + + /* if we established a new connection */ + cr = (os.conn < C_CONNECTED); + /* if we had an established connection + * and one of the nodes newly attaches a disk */ + cr |= (os.conn == C_CONNECTED && + (peer_state.disk == D_NEGOTIATING || + os.disk == D_NEGOTIATING)); + /* if we have both been inconsistent, and the peer has been + * forced to be UpToDate with --overwrite-data */ + cr |= test_bit(CONSIDER_RESYNC, &device->flags); + /* if we had been plain connected, and the admin requested to + * start a sync by "invalidate" or "invalidate-remote" */ + cr |= (os.conn == C_CONNECTED && + (peer_state.conn >= C_STARTING_SYNC_S && + peer_state.conn <= C_WF_BITMAP_T)); + + if (cr) + ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk); + + put_ldev(device); + if (ns.conn == C_MASK) { + ns.conn = C_CONNECTED; + if (device->state.disk == D_NEGOTIATING) { + drbd_force_state(device, NS(disk, D_FAILED)); + } else if (peer_state.disk == D_NEGOTIATING) { + drbd_err(device, "Disk attach process on the peer node was aborted.\n"); + peer_state.disk = D_DISKLESS; + real_peer_disk = D_DISKLESS; + } else { + if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags)) + return -EIO; + D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS); + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + } + } + + spin_lock_irq(&device->resource->req_lock); + if (os.i != drbd_read_state(device).i) + goto retry; + clear_bit(CONSIDER_RESYNC, &device->flags); + ns.peer = peer_state.role; + ns.pdsk = real_peer_disk; + ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); + if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + ns.disk = device->new_state_tmp.disk; + cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD); + if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED && + test_bit(NEW_CUR_UUID, &device->flags)) { + /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this + for temporal network outages! */ + spin_unlock_irq(&device->resource->req_lock); + drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n"); + tl_clear(peer_device->connection); + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD); + return -EIO; + } + rv = _drbd_set_state(device, ns, cs_flags, NULL); + ns = drbd_read_state(device); + spin_unlock_irq(&device->resource->req_lock); + + if (rv < SS_SUCCESS) { + conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); + return -EIO; + } + + if (os.conn > C_WF_REPORT_PARAMS) { + if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + peer_state.disk != D_NEGOTIATING ) { + /* we want resync, peer has not yet decided to sync... */ + /* Nowadays only used when forcing a node into primary role and + setting its disk to UpToDate with that */ + drbd_send_uuids(peer_device); + drbd_send_current_state(peer_device); + } + } + + clear_bit(DISCARD_MY_DATA, &device->flags); + + drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */ + + return 0; +} + +static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_rs_uuid *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + wait_event(device->misc_wait, + device->state.conn == C_WF_SYNC_UUID || + device->state.conn == C_BEHIND || + device->state.conn < C_CONNECTED || + device->state.disk < D_NEGOTIATING); + + /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */ + + /* Here the _drbd_uuid_ functions are right, current should + _not_ be rotated into the history */ + if (get_ldev_if_state(device, D_NEGOTIATING)) { + _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid)); + _drbd_uuid_set(device, UI_BITMAP, 0UL); + + drbd_print_uuids(device, "updated sync uuid"); + drbd_start_resync(device, C_SYNC_TARGET); + + put_ldev(device); + } else + drbd_err(device, "Ignoring SyncUUID packet!\n"); + + return 0; +} + +/** + * receive_bitmap_plain + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size, + unsigned long *p, struct bm_xfer_ctx *c) +{ + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - + drbd_header_size(peer_device->connection); + unsigned int num_words = min_t(size_t, data_size / sizeof(*p), + c->bm_words - c->word_offset); + unsigned int want = num_words * sizeof(*p); + int err; + + if (want != size) { + drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size); + return -EIO; + } + if (want == 0) + return 0; + err = drbd_recv_all(peer_device->connection, p, want); + if (err) + return err; + + drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p); + + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + + return 1; +} + +static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static int dcbp_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static int dcbp_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +/** + * recv_bm_rle_bits + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +recv_bm_rle_bits(struct drbd_peer_device *peer_device, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c, + unsigned int len) +{ + struct bitstream bs; + u64 look_ahead; + u64 rl; + u64 tmp; + unsigned long s = c->bit_offset; + unsigned long e; + int toggle = dcbp_get_start(p); + int have; + int bits; + + bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p)); + + bits = bitstream_get_bits(&bs, &look_ahead, 64); + if (bits < 0) + return -EIO; + + for (have = bits; have > 0; s += rl, toggle = !toggle) { + bits = vli_decode_bits(&rl, look_ahead); + if (bits <= 0) + return -EIO; + + if (toggle) { + e = s + rl -1; + if (e >= c->bm_bits) { + drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); + return -EIO; + } + _drbd_bm_set_bits(peer_device->device, s, e); + } + + if (have < bits) { + drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", + have, bits, look_ahead, + (unsigned int)(bs.cur.b - p->code), + (unsigned int)bs.buf_len); + return -EIO; + } + /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */ + if (likely(bits < 64)) + look_ahead >>= bits; + else + look_ahead = 0; + have -= bits; + + bits = bitstream_get_bits(&bs, &tmp, 64 - have); + if (bits < 0) + return -EIO; + look_ahead |= tmp << have; + have += bits; + } + + c->bit_offset = s; + bm_xfer_ctx_bit_to_word_offset(c); + + return (s != c->bm_bits); +} + +/** + * decode_bitmap_c + * + * Return 0 when done, 1 when another iteration is needed, and a negative error + * code upon failure. + */ +static int +decode_bitmap_c(struct drbd_peer_device *peer_device, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c, + unsigned int len) +{ + if (dcbp_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p)); + + /* other variants had been implemented for evaluation, + * but have been dropped as this one turned out to be "best" + * during all our tests. */ + + drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding); + conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); + return -EIO; +} + +void INFO_bm_xfer_stats(struct drbd_device *device, + const char *direction, struct bm_xfer_ctx *c) +{ + /* what would it take to transfer it "plaintext" */ + unsigned int header_size = drbd_header_size(first_peer_device(device)->connection); + unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size; + unsigned int plain = + header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) + + c->bm_words * sizeof(unsigned long); + unsigned int total = c->bytes[0] + c->bytes[1]; + unsigned int r; + + /* total can not be zero. but just in case: */ + if (total == 0) + return; + + /* don't report if not compressed */ + if (total >= plain) + return; + + /* total < plain. check for overflow, still */ + r = (total > UINT_MAX/1000) ? (total / (plain/1000)) + : (1000 * total / plain); + + if (r > 1000) + r = 1000; + + r = 1000 - r; + drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " + "total %u; compression: %u.%u%%\n", + direction, + c->bytes[1], c->packets[1], + c->bytes[0], c->packets[0], + total, r/10, r % 10); +} + +/* Since we are processing the bitfield from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we successfully received it. */ +static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct bm_xfer_ctx c; + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED); + /* you are supposed to send additional out-of-sync information + * if you actually set bits during this phase */ + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(device), + .bm_words = drbd_bm_words(device), + }; + + for(;;) { + if (pi->cmd == P_BITMAP) + err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c); + else if (pi->cmd == P_COMPRESSED_BITMAP) { + /* MAYBE: sanity check that we speak proto >= 90, + * and the feature is enabled! */ + struct p_compressed_bm *p = pi->data; + + if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) { + drbd_err(device, "ReportCBitmap packet too large\n"); + err = -EIO; + goto out; + } + if (pi->size <= sizeof(*p)) { + drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size); + err = -EIO; + goto out; + } + err = drbd_recv_all(peer_device->connection, p, pi->size); + if (err) + goto out; + err = decode_bitmap_c(peer_device, p, &c, pi->size); + } else { + drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd); + err = -EIO; + goto out; + } + + c.packets[pi->cmd == P_BITMAP]++; + c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size; + + if (err <= 0) { + if (err < 0) + goto out; + break; + } + err = drbd_recv_header(peer_device->connection, pi); + if (err) + goto out; + } + + INFO_bm_xfer_stats(device, "receive", &c); + + if (device->state.conn == C_WF_BITMAP_T) { + enum drbd_state_rv rv; + + err = drbd_send_bitmap(device); + if (err) + goto out; + /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ + rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + D_ASSERT(device, rv == SS_SUCCESS); + } else if (device->state.conn != C_WF_BITMAP_S) { + /* admin may have requested C_DISCONNECTING, + * other threads may have noticed network errors */ + drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n", + drbd_conn_str(device->state.conn)); + } + err = 0; + + out: + drbd_bm_unlock(device); + if (!err && device->state.conn == C_WF_BITMAP_S) + drbd_start_resync(device, C_SYNC_SOURCE); + return err; +} + +static int receive_skip(struct drbd_connection *connection, struct packet_info *pi) +{ + drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n", + pi->cmd, pi->size); + + return ignore_remaining_packet(connection, pi); +} + +static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi) +{ + /* Make sure we've acked all the TCP data associated + * with the data requests being unplugged */ + drbd_tcp_quickack(connection->data.socket); + + return 0; +} + +static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_desc *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + switch (device->state.conn) { + case C_WF_SYNC_UUID: + case C_WF_BITMAP_T: + case C_BEHIND: + break; + default: + drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n", + drbd_conn_str(device->state.conn)); + } + + drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize)); + + return 0; +} + +struct data_cmd { + int expect_payload; + size_t pkt_size; + int (*fn)(struct drbd_connection *, struct packet_info *); +}; + +static struct data_cmd drbd_cmd_handler[] = { + [P_DATA] = { 1, sizeof(struct p_data), receive_Data }, + [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply }, + [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } , + [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } , + [P_BITMAP] = { 1, 0, receive_bitmap } , + [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } , + [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote }, + [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_SYNC_PARAM] = { 1, 0, receive_SyncParam }, + [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam }, + [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids }, + [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes }, + [P_STATE] = { 0, sizeof(struct p_state), receive_state }, + [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state }, + [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid }, + [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, + [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, + [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, + [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, + [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, + [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, +}; + +static void drbdd(struct drbd_connection *connection) +{ + struct packet_info pi; + size_t shs; /* sub header size */ + int err; + + while (get_t_state(&connection->receiver) == RUNNING) { + struct data_cmd *cmd; + + drbd_thread_current_set_cpu(&connection->receiver); + update_receiver_timing_details(connection, drbd_recv_header); + if (drbd_recv_header(connection, &pi)) + goto err_out; + + cmd = &drbd_cmd_handler[pi.cmd]; + if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) { + drbd_err(connection, "Unexpected data packet %s (0x%04x)", + cmdname(pi.cmd), pi.cmd); + goto err_out; + } + + shs = cmd->pkt_size; + if (pi.size > shs && !cmd->expect_payload) { + drbd_err(connection, "No payload expected %s l:%d\n", + cmdname(pi.cmd), pi.size); + goto err_out; + } + + if (shs) { + update_receiver_timing_details(connection, drbd_recv_all_warn); + err = drbd_recv_all_warn(connection, pi.data, shs); + if (err) + goto err_out; + pi.size -= shs; + } + + update_receiver_timing_details(connection, cmd->fn); + err = cmd->fn(connection, &pi); + if (err) { + drbd_err(connection, "error receiving %s, e: %d l: %d!\n", + cmdname(pi.cmd), err, pi.size); + goto err_out; + } + } + return; + + err_out: + conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD); +} + +static void conn_disconnect(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + enum drbd_conns oc; + int vnr; + + if (connection->cstate == C_STANDALONE) + return; + + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + + /* asender does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->asender); + drbd_free_sock(connection); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + drbd_disconnected(peer_device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + if (!list_empty(&connection->current_epoch->list)) + drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n"); + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&connection->current_epoch->epoch_size, 0); + connection->send.seen_any_write_yet = false; + + drbd_info(connection, "Connection closed\n"); + + if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN) + conn_try_outdate_peer_async(connection); + + spin_lock_irq(&connection->resource->req_lock); + oc = connection->cstate; + if (oc >= C_UNCONNECTED) + _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + spin_unlock_irq(&connection->resource->req_lock); + + if (oc == C_DISCONNECTING) + conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD); +} + +static int drbd_disconnected(struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + unsigned int i; + + /* wait for current activity to cease. */ + spin_lock_irq(&device->resource->req_lock); + _drbd_wait_ee_list_empty(device, &device->active_ee); + _drbd_wait_ee_list_empty(device, &device->sync_ee); + _drbd_wait_ee_list_empty(device, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + /* We do not have data structures that would allow us to + * get the rs_pending_cnt down to 0 again. + * * On C_SYNC_TARGET we do not have any data structures describing + * the pending RSDataRequest's we have sent. + * * On C_SYNC_SOURCE there is no data structure that tracks + * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. + * And no, it is not the sum of the reference counts in the + * resync_LRU. The resync_LRU tracks the whole operation including + * the disk-IO, while the rs_pending_cnt only tracks the blocks + * on the fly. */ + drbd_rs_cancel_all(device); + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + wake_up(&device->misc_wait); + + del_timer_sync(&device->resync_timer); + resync_timer_fn((unsigned long)device); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + drbd_flush_workqueue(&peer_device->connection->sender_work); + + drbd_finish_peer_reqs(device); + + /* This second workqueue flush is necessary, since drbd_finish_peer_reqs() + might have issued a work again. The one before drbd_finish_peer_reqs() is + necessary to reclain net_ee in drbd_finish_peer_reqs(). */ + drbd_flush_workqueue(&peer_device->connection->sender_work); + + /* need to do it again, drbd_finish_peer_reqs() may have populated it + * again via drbd_try_clear_on_disk_bm(). */ + drbd_rs_cancel_all(device); + + kfree(device->p_uuid); + device->p_uuid = NULL; + + if (!drbd_suspended(device)) + tl_clear(peer_device->connection); + + drbd_md_sync(device); + + /* serialize with bitmap writeout triggered by the state change, + * if any. */ + wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + + /* tcp_close and release of sendpage pages can be deferred. I don't + * want to use SO_LINGER, because apparently it can be deferred for + * more than 20 seconds (longest time I checked). + * + * Actually we don't care for exactly when the network stack does its + * put_page(), but release our reference on these pages right here. + */ + i = drbd_free_peer_reqs(device, &device->net_ee); + if (i) + drbd_info(device, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&device->pp_in_use_by_net); + if (i) + drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i); + i = atomic_read(&device->pp_in_use); + if (i) + drbd_info(device, "pp_in_use = %d, expected 0\n", i); + + D_ASSERT(device, list_empty(&device->read_ee)); + D_ASSERT(device, list_empty(&device->active_ee)); + D_ASSERT(device, list_empty(&device->sync_ee)); + D_ASSERT(device, list_empty(&device->done_ee)); + + return 0; +} + +/* + * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version + * we can agree on is stored in agreed_pro_version. + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. + */ +static int drbd_send_features(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + struct p_connection_features *p; + + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + memset(p, 0, sizeof(*p)); + p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); + p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + p->feature_flags = cpu_to_be32(PRO_FEATURES); + return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0); +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +static int drbd_do_features(struct drbd_connection *connection) +{ + /* ASSERT current == connection->receiver ... */ + struct p_connection_features *p; + const int expect = sizeof(struct p_connection_features); + struct packet_info pi; + int err; + + err = drbd_send_features(connection); + if (err) + return 0; + + err = drbd_recv_header(connection, &pi); + if (err) + return 0; + + if (pi.cmd != P_CONNECTION_FEATURES) { + drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + return -1; + } + + if (pi.size != expect) { + drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n", + expect, pi.size); + return -1; + } + + p = pi.data; + err = drbd_recv_all_warn(connection, p, expect); + if (err) + return 0; + + p->protocol_min = be32_to_cpu(p->protocol_min); + p->protocol_max = be32_to_cpu(p->protocol_max); + if (p->protocol_max == 0) + p->protocol_max = p->protocol_min; + + if (PRO_VERSION_MAX < p->protocol_min || + PRO_VERSION_MIN > p->protocol_max) + goto incompat; + + connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags); + + drbd_info(connection, "Handshake successful: " + "Agreed network protocol version %d\n", connection->agreed_pro_version); + + drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", + connection->agreed_features & FF_TRIM ? " " : " not "); + + return 1; + + incompat: + drbd_err(connection, "incompatible DRBD dialects: " + "I support %d-%d, peer supports %d-%d\n", + PRO_VERSION_MIN, PRO_VERSION_MAX, + p->protocol_min, p->protocol_max); + return -1; +} + +#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) +static int drbd_do_auth(struct drbd_connection *connection) +{ + drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + return -1; +} +#else +#define CHALLENGE_LEN 64 + +/* Return value: + 1 - auth succeeded, + 0 - failed, try again (network error), + -1 - auth failed, don't try again. +*/ + +static int drbd_do_auth(struct drbd_connection *connection) +{ + struct drbd_socket *sock; + char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ + struct scatterlist sg; + char *response = NULL; + char *right_response = NULL; + char *peers_ch = NULL; + unsigned int key_len; + char secret[SHARED_SECRET_MAX]; /* 64 byte */ + unsigned int resp_size; + struct hash_desc desc; + struct packet_info pi; + struct net_conf *nc; + int err, rv; + + /* FIXME: Put the challenge/response into the preallocated socket buffer. */ + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + key_len = strlen(nc->shared_secret); + memcpy(secret, nc->shared_secret, key_len); + rcu_read_unlock(); + + desc.tfm = connection->cram_hmac_tfm; + desc.flags = 0; + + rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len); + if (rv) { + drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv); + rv = -1; + goto fail; + } + + get_random_bytes(my_challenge, CHALLENGE_LEN); + + sock = &connection->data; + if (!conn_prepare_command(connection, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0, + my_challenge, CHALLENGE_LEN); + if (!rv) + goto fail; + + err = drbd_recv_header(connection, &pi); + if (err) { + rv = 0; + goto fail; + } + + if (pi.cmd != P_AUTH_CHALLENGE) { + drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + rv = 0; + goto fail; + } + + if (pi.size > CHALLENGE_LEN * 2) { + drbd_err(connection, "expected AuthChallenge payload too big.\n"); + rv = -1; + goto fail; + } + + if (pi.size < CHALLENGE_LEN) { + drbd_err(connection, "AuthChallenge payload too small.\n"); + rv = -1; + goto fail; + } + + peers_ch = kmalloc(pi.size, GFP_NOIO); + if (peers_ch == NULL) { + drbd_err(connection, "kmalloc of peers_ch failed\n"); + rv = -1; + goto fail; + } + + err = drbd_recv_all_warn(connection, peers_ch, pi.size); + if (err) { + rv = 0; + goto fail; + } + + if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) { + drbd_err(connection, "Peer presented the same challenge!\n"); + rv = -1; + goto fail; + } + + resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm); + response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + drbd_err(connection, "kmalloc of response failed\n"); + rv = -1; + goto fail; + } + + sg_init_table(&sg, 1); + sg_set_buf(&sg, peers_ch, pi.size); + + rv = crypto_hash_digest(&desc, &sg, sg.length, response); + if (rv) { + drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); + rv = -1; + goto fail; + } + + if (!conn_prepare_command(connection, sock)) { + rv = 0; + goto fail; + } + rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0, + response, resp_size); + if (!rv) + goto fail; + + err = drbd_recv_header(connection, &pi); + if (err) { + rv = 0; + goto fail; + } + + if (pi.cmd != P_AUTH_RESPONSE) { + drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + rv = 0; + goto fail; + } + + if (pi.size != resp_size) { + drbd_err(connection, "expected AuthResponse payload of wrong size\n"); + rv = 0; + goto fail; + } + + err = drbd_recv_all_warn(connection, response , resp_size); + if (err) { + rv = 0; + goto fail; + } + + right_response = kmalloc(resp_size, GFP_NOIO); + if (right_response == NULL) { + drbd_err(connection, "kmalloc of right_response failed\n"); + rv = -1; + goto fail; + } + + sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); + + rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); + if (rv) { + drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv); + rv = -1; + goto fail; + } + + rv = !memcmp(response, right_response, resp_size); + + if (rv) + drbd_info(connection, "Peer authenticated using %d bytes HMAC\n", + resp_size); + else + rv = -1; + + fail: + kfree(peers_ch); + kfree(response); + kfree(right_response); + + return rv; +} +#endif + +int drbd_receiver(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + int h; + + drbd_info(connection, "receiver (re)started\n"); + + do { + h = conn_connect(connection); + if (h == 0) { + conn_disconnect(connection); + schedule_timeout_interruptible(HZ); + } + if (h == -1) { + drbd_warn(connection, "Discarding network configuration.\n"); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + } while (h == 0); + + if (h > 0) + drbdd(connection); + + conn_disconnect(connection); + + drbd_info(connection, "receiver terminated\n"); + return 0; +} + +/* ********* acknowledge sender ******** */ + +static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + if (retcode >= SS_SUCCESS) { + set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags); + } else { + set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags); + drbd_err(connection, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&connection->ping_wait); + + return 0; +} + +static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_req_state_reply *p = pi->data; + int retcode = be32_to_cpu(p->retcode); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) { + D_ASSERT(device, connection->agreed_pro_version < 100); + return got_conn_RqSReply(connection, pi); + } + + if (retcode >= SS_SUCCESS) { + set_bit(CL_ST_CHG_SUCCESS, &device->flags); + } else { + set_bit(CL_ST_CHG_FAIL, &device->flags); + drbd_err(device, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&device->state_wait); + + return 0; +} + +static int got_Ping(struct drbd_connection *connection, struct packet_info *pi) +{ + return drbd_send_ping_ack(connection); + +} + +static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi) +{ + /* restore idle timeout */ + connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ; + if (!test_and_set_bit(GOT_PING_ACK, &connection->flags)) + wake_up(&connection->ping_wait); + + return 0; +} + +static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (get_ldev(device)) { + drbd_rs_complete_io(device, sector); + drbd_set_in_sync(device, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + put_ldev(device); + } + dec_rs_pending(device); + atomic_add(blksize >> 9, &device->rs_sect_in); + + return 0; +} + +static int +validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector, + struct rb_root *root, const char *func, + enum drbd_req_event what, bool missing_ok) +{ + struct drbd_request *req; + struct bio_and_error m; + + spin_lock_irq(&device->resource->req_lock); + req = find_request(device, root, id, sector, missing_ok, func); + if (unlikely(!req)) { + spin_unlock_irq(&device->resource->req_lock); + return -EIO; + } + __req_mod(req, what, &m); + spin_unlock_irq(&device->resource->req_lock); + + if (m.bio) + complete_master_bio(device, &m); + return 0; +} + +static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + enum drbd_req_event what; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (p->block_id == ID_SYNCER) { + drbd_set_in_sync(device, sector, blksize); + dec_rs_pending(device); + return 0; + } + switch (pi->cmd) { + case P_RS_WRITE_ACK: + what = WRITE_ACKED_BY_PEER_AND_SIS; + break; + case P_WRITE_ACK: + what = WRITE_ACKED_BY_PEER; + break; + case P_RECV_ACK: + what = RECV_ACKED_BY_PEER; + break; + case P_SUPERSEDED: + what = CONFLICT_RESOLVED; + break; + case P_RETRY_WRITE: + what = POSTPONE_WRITE; + break; + default: + BUG(); + } + + return validate_req_change_req_state(device, p->block_id, sector, + &device->write_requests, __func__, + what, false); +} + +static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); + int err; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (p->block_id == ID_SYNCER) { + dec_rs_pending(device); + drbd_rs_failed_io(device, sector, size); + return 0; + } + + err = validate_req_change_req_state(device, p->block_id, sector, + &device->write_requests, __func__, + NEG_ACKED, true); + if (err) { + /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs. + The master bio might already be completed, therefore the + request is no longer in the collision hash. */ + /* In Protocol B we might already have got a P_RECV_ACK + but then get a P_NEG_ACK afterwards. */ + drbd_set_out_of_sync(device, sector, size); + } + return 0; +} + +static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + sector_t sector = be64_to_cpu(p->sector); + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n", + (unsigned long long)sector, be32_to_cpu(p->blksize)); + + return validate_req_change_req_state(device, p->block_id, sector, + &device->read_requests, __func__, + NEG_ACKED, false); +} + +static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + sector_t sector; + int size; + struct p_block_ack *p = pi->data; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + dec_rs_pending(device); + + if (get_ldev_if_state(device, D_FAILED)) { + drbd_rs_complete_io(device, sector); + switch (pi->cmd) { + case P_NEG_RS_DREPLY: + drbd_rs_failed_io(device, sector, size); + case P_RS_CANCEL: + break; + default: + BUG(); + } + put_ldev(device); + } + + return 0; +} + +static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi) +{ + struct p_barrier_ack *p = pi->data; + struct drbd_peer_device *peer_device; + int vnr; + + tl_release(connection, p->barrier, be32_to_cpu(p->set_size)); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + + if (device->state.conn == C_AHEAD && + atomic_read(&device->ap_in_flight) == 0 && + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) { + device->start_resync_timer.expires = jiffies + HZ; + add_timer(&device->start_resync_timer); + } + } + rcu_read_unlock(); + + return 0; +} + +static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct drbd_device *device; + struct p_block_ack *p = pi->data; + struct drbd_device_work *dw; + sector_t sector; + int size; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(peer_device, be32_to_cpu(p->seq_num)); + + if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) + drbd_ov_out_of_sync_found(device, sector, size); + else + ov_out_of_sync_print(device); + + if (!get_ldev(device)) + return 0; + + drbd_rs_complete_io(device, sector); + dec_rs_pending(device); + + --device->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((device->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(device, device->ov_left); + + if (device->ov_left == 0) { + dw = kmalloc(sizeof(*dw), GFP_NOIO); + if (dw) { + dw->w.cb = w_ov_finished; + dw->device = device; + drbd_queue_work(&peer_device->connection->sender_work, &dw->w); + } else { + drbd_err(device, "kmalloc(dw) failed."); + ov_out_of_sync_print(device); + drbd_resync_finished(device); + } + } + put_ldev(device); + return 0; +} + +static int got_skip(struct drbd_connection *connection, struct packet_info *pi) +{ + return 0; +} + +static int connection_finish_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr, not_empty = 0; + + do { + clear_bit(SIGNAL_ASENDER, &connection->flags); + flush_signals(current); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + kref_get(&device->kref); + rcu_read_unlock(); + if (drbd_finish_peer_reqs(device)) { + kref_put(&device->kref, drbd_destroy_device); + return 1; + } + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + set_bit(SIGNAL_ASENDER, &connection->flags); + + spin_lock_irq(&connection->resource->req_lock); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + not_empty = !list_empty(&device->done_ee); + if (not_empty) + break; + } + spin_unlock_irq(&connection->resource->req_lock); + rcu_read_unlock(); + } while (not_empty); + + return 0; +} + +struct asender_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static struct asender_cmd asender_tbl[] = { + [P_PING] = { 0, got_Ping }, + [P_PING_ACK] = { 0, got_PingAck }, + [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, + [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, + [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, + [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, + [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip }, + [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply }, + [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply }, + [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, +}; + +int drbd_asender(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + struct asender_cmd *cmd = NULL; + struct packet_info pi; + int rv; + void *buf = connection->meta.rbuf; + int received = 0; + unsigned int header_size = drbd_header_size(connection); + int expect = header_size; + bool ping_timeout_active = false; + struct net_conf *nc; + int ping_timeo, tcp_cork, ping_int; + struct sched_param param = { .sched_priority = 2 }; + + rv = sched_setscheduler(current, SCHED_RR, ¶m); + if (rv < 0) + drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + ping_timeo = nc->ping_timeo; + tcp_cork = nc->tcp_cork; + ping_int = nc->ping_int; + rcu_read_unlock(); + + if (test_and_clear_bit(SEND_PING, &connection->flags)) { + if (drbd_send_ping(connection)) { + drbd_err(connection, "drbd_send_ping has failed\n"); + goto reconnect; + } + connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + ping_timeout_active = true; + } + + /* TODO: conditionally cork; it may hurt latency if we cork without + much to send */ + if (tcp_cork) + drbd_tcp_cork(connection->meta.socket); + if (connection_finish_peer_reqs(connection)) { + drbd_err(connection, "connection_finish_peer_reqs() failed\n"); + goto reconnect; + } + /* but unconditionally uncork unless disabled */ + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + /* short circuit, recv_msg would return EINTR anyways. */ + if (signal_pending(current)) + continue; + + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &connection->flags); + + flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ +received_more: + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + if (test_bit(DISCONNECT_SENT, &connection->flags)) { + long t; + rcu_read_lock(); + t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10; + rcu_read_unlock(); + + t = wait_event_timeout(connection->ping_wait, + connection->cstate < C_WF_REPORT_PARAMS, + t); + if (t) + break; + } + drbd_err(connection, "meta connection shut down by peer.\n"); + goto reconnect; + } else if (rv == -EAGAIN) { + /* If the data socket received something meanwhile, + * that is good enough: peer is still alive. */ + if (time_after(connection->last_received, + jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + continue; + if (ping_timeout_active) { + drbd_err(connection, "PingAck did not arrive in time.\n"); + goto reconnect; + } + set_bit(SEND_PING, &connection->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + drbd_err(connection, "sock_recvmsg returned %d\n", rv); + goto reconnect; + } + + if (received == expect && cmd == NULL) { + if (decode_header(connection, connection->meta.rbuf, &pi)) + goto reconnect; + cmd = &asender_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", + cmdname(pi.cmd), pi.cmd); + goto disconnect; + } + expect = header_size + cmd->pkt_size; + if (pi.size != expect - header_size) { + drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n", + pi.cmd, pi.size); + goto reconnect; + } + } + if (received == expect) { + bool err; + + err = cmd->fn(connection, &pi); + if (err) { + drbd_err(connection, "%pf failed\n", cmd->fn); + goto reconnect; + } + + connection->last_received = jiffies; + + if (cmd == &asender_tbl[P_PING_ACK]) { + /* restore idle timeout */ + connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + ping_timeout_active = false; + } + + buf = connection->meta.rbuf; + received = 0; + expect = header_size; + cmd = NULL; + } + if (test_bit(SEND_PING, &connection->flags)) + continue; + rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); + if (rv > 0) + goto received_more; + } + + if (0) { +reconnect: + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + conn_md_sync(connection); + } + if (0) { +disconnect: + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + } + clear_bit(SIGNAL_ASENDER, &connection->flags); + + drbd_info(connection, "asender terminated\n"); + + return 0; +} diff --git a/kernel/drivers/block/drbd/drbd_req.c b/kernel/drivers/block/drbd/drbd_req.c new file mode 100644 index 000000000..3907202fb --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_req.c @@ -0,0 +1,1638 @@ +/* + drbd_req.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include + +#include +#include +#include "drbd_int.h" +#include "drbd_req.h" + + +static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size); + +/* Update disk stats at start of I/O request */ +static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); +} + +/* Update disk stats when completing request upwards */ +static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) +{ + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); +} + +static struct drbd_request *drbd_req_new(struct drbd_device *device, + struct bio *bio_src) +{ + struct drbd_request *req; + + req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (!req) + return NULL; + memset(req, 0, sizeof(*req)); + + drbd_req_make_private_bio(req, bio_src); + req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; + req->device = device; + req->master_bio = bio_src; + req->epoch = 0; + + drbd_clear_interval(&req->i); + req->i.sector = bio_src->bi_iter.bi_sector; + req->i.size = bio_src->bi_iter.bi_size; + req->i.local = true; + req->i.waiting = false; + + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + INIT_LIST_HEAD(&req->req_pending_master_completion); + INIT_LIST_HEAD(&req->req_pending_local); + + /* one reference to be put by __drbd_make_request */ + atomic_set(&req->completion_ref, 1); + /* one kref as long as completion_ref > 0 */ + kref_init(&req->kref); + return req; +} + +static void drbd_remove_request_interval(struct rb_root *root, + struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct drbd_interval *i = &req->i; + + drbd_remove_interval(root, i); + + /* Wake up any processes waiting for this request to complete. */ + if (i->waiting) + wake_up(&device->misc_wait); +} + +void drbd_req_destroy(struct kref *kref) +{ + struct drbd_request *req = container_of(kref, struct drbd_request, kref); + struct drbd_device *device = req->device; + const unsigned s = req->rq_state; + + if ((req->master_bio && !(s & RQ_POSTPONED)) || + atomic_read(&req->completion_ref) || + (s & RQ_LOCAL_PENDING) || + ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) { + drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n", + s, atomic_read(&req->completion_ref)); + return; + } + + /* If called from mod_rq_state (expected normal case) or + * drbd_send_and_submit (the less likely normal path), this holds the + * req_lock, and req->tl_requests will typicaly be on ->transfer_log, + * though it may be still empty (never added to the transfer log). + * + * If called from do_retry(), we do NOT hold the req_lock, but we are + * still allowed to unconditionally list_del(&req->tl_requests), + * because it will be on a local on-stack list only. */ + list_del_init(&req->tl_requests); + + /* finally remove the request from the conflict detection + * respective block_id verification interval tree. */ + if (!drbd_interval_empty(&req->i)) { + struct rb_root *root; + + if (s & RQ_WRITE) + root = &device->write_requests; + else + root = &device->read_requests; + drbd_remove_request_interval(root, req); + } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0) + drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n", + s, (unsigned long long)req->i.sector, req->i.size); + + /* if it was a write, we may have to set the corresponding + * bit(s) out-of-sync first. If it had a local part, we need to + * release the reference to the activity log. */ + if (s & RQ_WRITE) { + /* Set out-of-sync unless both OK flags are set + * (local only or remote failed). + * Other places where we set out-of-sync: + * READ with local io-error */ + + /* There is a special case: + * we may notice late that IO was suspended, + * and postpone, or schedule for retry, a write, + * before it even was submitted or sent. + * In that case we do not want to touch the bitmap at all. + */ + if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) { + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) + drbd_set_out_of_sync(device, req->i.sector, req->i.size); + + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) + drbd_set_in_sync(device, req->i.sector, req->i.size); + } + + /* one might be tempted to move the drbd_al_complete_io + * to the local io completion callback drbd_request_endio. + * but, if this was a mirror write, we may only + * drbd_al_complete_io after this is RQ_NET_DONE, + * otherwise the extent could be dropped from the al + * before it has actually been written on the peer. + * if we crash before our peer knows about the request, + * but after the extent has been dropped from the al, + * we would forget to resync the corresponding extent. + */ + if (s & RQ_IN_ACT_LOG) { + if (get_ldev_if_state(device, D_FAILED)) { + drbd_al_complete_io(device, &req->i); + put_ldev(device); + } else if (__ratelimit(&drbd_ratelimit_state)) { + drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), " + "but my Disk seems to have failed :(\n", + (unsigned long long) req->i.sector, req->i.size); + } + } + } + + mempool_free(req, drbd_request_mempool); +} + +static void wake_all_senders(struct drbd_connection *connection) +{ + wake_up(&connection->sender_work.q_wait); +} + +/* must hold resource->req_lock */ +void start_new_tl_epoch(struct drbd_connection *connection) +{ + /* no point closing an epoch, if it is empty, anyways. */ + if (connection->current_tle_writes == 0) + return; + + connection->current_tle_writes = 0; + atomic_inc(&connection->current_tle_nr); + wake_all_senders(connection); +} + +void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m) +{ + bio_endio(m->bio, m->error); + dec_ap_bio(device); +} + + +/* Helper for __req_mod(). + * Set m->bio to the master bio, if it is fit to be completed, + * or leave it alone (it is initialized to NULL in __req_mod), + * if it has already been completed, or cannot be completed yet. + * If m->bio is set, the error status to be returned is placed in m->error. + */ +static +void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) +{ + const unsigned s = req->rq_state; + struct drbd_device *device = req->device; + int rw; + int error, ok; + + /* we must not complete the master bio, while it is + * still being processed by _drbd_send_zc_bio (drbd_send_dblock) + * not yet acknowledged by the peer + * not yet completed by the local io subsystem + * these flags may get cleared in any order by + * the worker, + * the receiver, + * the bio_endio completion callbacks. + */ + if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) || + (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) || + (s & RQ_COMPLETION_SUSP)) { + drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s); + return; + } + + if (!req->master_bio) { + drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n"); + return; + } + + rw = bio_rw(req->master_bio); + + /* + * figure out whether to report success or failure. + * + * report success when at least one of the operations succeeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + * + * local completion error, if any, has been stored as ERR_PTR + * in private_bio within drbd_request_endio. + */ + ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + error = PTR_ERR(req->private_bio); + + /* Before we can signal completion to the upper layers, + * we may need to close the current transfer log epoch. + * We are within the request lock, so we can simply compare + * the request epoch number with the current transfer log + * epoch number. If they match, increase the current_tle_nr, + * and reset the transfer log epoch write_cnt. + */ + if (rw == WRITE && + req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) + start_new_tl_epoch(first_peer_device(device)->connection); + + /* Update disk stats */ + _drbd_end_io_acct(device, req); + + /* If READ failed, + * have it be pushed back to the retry work queue, + * so it will re-enter __drbd_make_request(), + * and be re-assigned to a suitable local or remote path, + * or failed if we do not have access to good data anymore. + * + * Unless it was failed early by __drbd_make_request(), + * because no path was available, in which case + * it was not even added to the transfer_log. + * + * READA may fail, and will not be retried. + * + * WRITE should have used all available paths already. + */ + if (!ok && rw == READ && !list_empty(&req->tl_requests)) + req->rq_state |= RQ_POSTPONED; + + if (!(req->rq_state & RQ_POSTPONED)) { + m->error = ok ? 0 : (error ?: -EIO); + m->bio = req->master_bio; + req->master_bio = NULL; + /* We leave it in the tree, to be able to verify later + * write-acks in protocol != C during resync. + * But we mark it as "complete", so it won't be counted as + * conflict in a multi-primary setup. */ + req->i.completed = true; + } + + if (req->i.waiting) + wake_up(&device->misc_wait); + + /* Either we are about to complete to upper layers, + * or we will restart this request. + * In either case, the request object will be destroyed soon, + * so better remove it from all lists. */ + list_del_init(&req->req_pending_master_completion); +} + +/* still holds resource->req_lock */ +static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put) +{ + struct drbd_device *device = req->device; + D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED)); + + if (!atomic_sub_and_test(put, &req->completion_ref)) + return 0; + + drbd_req_complete(req, m); + + if (req->rq_state & RQ_POSTPONED) { + /* don't destroy the req object just yet, + * but queue it for retry */ + drbd_restart_request(req); + return 0; + } + + return 1; +} + +static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next == NULL) + connection->req_next = req; +} + +static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_next != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if (s & RQ_NET_QUEUED) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_next = req; +} + +static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending == NULL) + connection->req_ack_pending = req; +} + +static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_ack_pending != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_ack_pending = req; +} + +static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done == NULL) + connection->req_not_net_done = req; +} + +static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req) +{ + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + if (!connection) + return; + if (connection->req_not_net_done != req) + return; + list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) { + const unsigned s = req->rq_state; + if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) + break; + } + if (&req->tl_requests == &connection->transfer_log) + req = NULL; + connection->req_not_net_done = req; +} + +/* I'd like this to be the only place that manipulates + * req->completion_ref and req->kref. */ +static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, + int clear, int set) +{ + struct drbd_device *device = req->device; + struct drbd_peer_device *peer_device = first_peer_device(device); + unsigned s = req->rq_state; + int c_put = 0; + int k_put = 0; + + if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP)) + set |= RQ_COMPLETION_SUSP; + + /* apply */ + + req->rq_state &= ~clear; + req->rq_state |= set; + + /* no change? */ + if (req->rq_state == s) + return; + + /* intent: get references */ + + if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING)) + atomic_inc(&req->completion_ref); + + if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) { + inc_ap_pending(device); + atomic_inc(&req->completion_ref); + } + + if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) { + atomic_inc(&req->completion_ref); + set_if_null_req_next(peer_device, req); + } + + if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK)) + kref_get(&req->kref); /* wait for the DONE */ + + if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { + /* potentially already completed in the asender thread */ + if (!(s & RQ_NET_DONE)) { + atomic_add(req->i.size >> 9, &device->ap_in_flight); + set_if_null_req_not_net_done(peer_device, req); + } + if (s & RQ_NET_PENDING) + set_if_null_req_ack_pending(peer_device, req); + } + + if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP)) + atomic_inc(&req->completion_ref); + + /* progress: put references */ + + if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP)) + ++c_put; + + if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) { + D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING); + /* local completion may still come in later, + * we need to keep the req object around. */ + kref_get(&req->kref); + ++c_put; + } + + if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) { + if (req->rq_state & RQ_LOCAL_ABORTED) + ++k_put; + else + ++c_put; + list_del_init(&req->req_pending_local); + } + + if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) { + dec_ap_pending(device); + ++c_put; + req->acked_jif = jiffies; + advance_conn_req_ack_pending(peer_device, req); + } + + if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) { + ++c_put; + advance_conn_req_next(peer_device, req); + } + + if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) { + if (s & RQ_NET_SENT) + atomic_sub(req->i.size >> 9, &device->ap_in_flight); + if (s & RQ_EXP_BARR_ACK) + ++k_put; + req->net_done_jif = jiffies; + + /* in ahead/behind mode, or just in case, + * before we finally destroy this request, + * the caching pointers must not reference it anymore */ + advance_conn_req_next(peer_device, req); + advance_conn_req_ack_pending(peer_device, req); + advance_conn_req_not_net_done(peer_device, req); + } + + /* potentially complete and destroy */ + + if (k_put || c_put) { + /* Completion does it's own kref_put. If we are going to + * kref_sub below, we need req to be still around then. */ + int at_least = k_put + !!c_put; + int refcount = atomic_read(&req->kref.refcount); + if (refcount < at_least) + drbd_err(device, + "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n", + s, req->rq_state, refcount, at_least); + } + + /* If we made progress, retry conflicting peer requests, if any. */ + if (req->i.waiting) + wake_up(&device->misc_wait); + + if (c_put) + k_put += drbd_req_put_completion_ref(req, m, c_put); + if (k_put) + kref_sub(&req->kref, k_put, drbd_req_destroy); +} + +static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) +{ + char b[BDEVNAME_SIZE]; + + if (!__ratelimit(&drbd_ratelimit_state)) + return; + + drbd_warn(device, "local %s IO error sector %llu+%u on %s\n", + (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", + (unsigned long long)req->i.sector, + req->i.size >> 9, + bdevname(device->ldev->backing_bdev, b)); +} + +/* Helper for HANDED_OVER_TO_NETWORK. + * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)? + * Is it also still "PENDING"? + * --> If so, clear PENDING and set NET_OK below. + * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster + * (and we must not set RQ_NET_OK) */ +static inline bool is_pending_write_protocol_A(struct drbd_request *req) +{ + return (req->rq_state & + (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK)) + == (RQ_WRITE|RQ_NET_PENDING); +} + +/* obviously this could be coded as many single functions + * instead of one huge switch, + * or by putting the code directly in the respective locations + * (as it has been before). + * + * but having it this way + * enforces that it is all in this one place, where it is easier to audit, + * it makes it obvious that whatever "event" "happens" to a request should + * happen "atomically" within the req_lock, + * and it enforces that we have to think in a very structured manner + * about the "events" that may happen to a request during its life time ... + */ +int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m) +{ + struct drbd_device *const device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + struct net_conf *nc; + int p, rv = 0; + + if (m) + m->bio = NULL; + + switch (what) { + default: + drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); + break; + + /* does not happen... + * initialization done in drbd_req_new + case CREATED: + break; + */ + + case TO_BE_SENT: /* via network */ + /* reached via __drbd_make_request + * and from w_read_retry_remote */ + D_ASSERT(device, !(req->rq_state & RQ_NET_MASK)); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + p = nc->wire_protocol; + rcu_read_unlock(); + req->rq_state |= + p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK : + p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0; + mod_rq_state(req, m, 0, RQ_NET_PENDING); + break; + + case TO_BE_SUBMITTED: /* locally */ + /* reached via __drbd_make_request */ + D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK)); + mod_rq_state(req, m, 0, RQ_LOCAL_PENDING); + break; + + case COMPLETED_OK: + if (req->rq_state & RQ_WRITE) + device->writ_cnt += req->i.size >> 9; + else + device->read_cnt += req->i.size >> 9; + + mod_rq_state(req, m, RQ_LOCAL_PENDING, + RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + break; + + case ABORT_DISK_IO: + mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); + break; + + case WRITE_COMPLETED_WITH_ERROR: + drbd_report_io_error(device, req); + __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case READ_COMPLETED_WITH_ERROR: + drbd_set_out_of_sync(device, req->i.sector, req->i.size); + drbd_report_io_error(device, req); + __drbd_chk_io_error(device, DRBD_READ_ERROR); + /* fall through. */ + case READ_AHEAD_COMPLETED_WITH_ERROR: + /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case DISCARD_COMPLETED_NOTSUPP: + case DISCARD_COMPLETED_WITH_ERROR: + /* I'd rather not detach from local disk just because it + * failed a REQ_DISCARD. */ + mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); + break; + + case QUEUE_FOR_NET_READ: + /* READ or READA, and + * no local disk, + * or target area marked as invalid, + * or just got an io-error. */ + /* from __drbd_make_request + * or from bio_endio during read io-error recovery */ + + /* So we can verify the handle in the answer packet. + * Corresponding drbd_remove_request_interval is in + * drbd_req_complete() */ + D_ASSERT(device, drbd_interval_empty(&req->i)); + drbd_insert_interval(&device->read_requests, &req->i); + + set_bit(UNPLUG_REMOTE, &device->flags); + + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0); + mod_rq_state(req, m, 0, RQ_NET_QUEUED); + req->w.cb = w_send_read_req; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case QUEUE_FOR_NET_WRITE: + /* assert something? */ + /* from __drbd_make_request only */ + + /* Corresponding drbd_remove_request_interval is in + * drbd_req_complete() */ + D_ASSERT(device, drbd_interval_empty(&req->i)); + drbd_insert_interval(&device->write_requests, &req->i); + + /* NOTE + * In case the req ended up on the transfer log before being + * queued on the worker, it could lead to this request being + * missed during cleanup after connection loss. + * So we have to do both operations here, + * within the same lock that protects the transfer log. + * + * _req_add_to_epoch(req); this has to be after the + * _maybe_start_new_epoch(req); which happened in + * __drbd_make_request, because we now may set the bit + * again ourselves to close the current epoch. + * + * Add req to the (now) current epoch (barrier). */ + + /* otherwise we may lose an unplug, which may cause some remote + * io-scheduler timeout to expire, increasing maximum latency, + * hurting performance. */ + set_bit(UNPLUG_REMOTE, &device->flags); + + /* queue work item to send data */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK); + req->w.cb = w_send_dblock; + drbd_queue_work(&connection->sender_work, + &req->w); + + /* close the epoch, in case it outgrew the limit */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + p = nc->max_epoch_size; + rcu_read_unlock(); + if (connection->current_tle_writes >= p) + start_new_tl_epoch(connection); + + break; + + case QUEUE_FOR_SEND_OOS: + mod_rq_state(req, m, 0, RQ_NET_QUEUED); + req->w.cb = w_send_out_of_sync; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case READ_RETRY_REMOTE_CANCELED: + case SEND_CANCELED: + case SEND_FAILED: + /* real cleanup will be done from tl_clear. just update flags + * so it is no longer marked as on the worker queue */ + mod_rq_state(req, m, RQ_NET_QUEUED, 0); + break; + + case HANDED_OVER_TO_NETWORK: + /* assert something? */ + if (is_pending_write_protocol_A(req)) + /* this is what is dangerous about protocol A: + * pretend it was successfully written on the peer. */ + mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING, + RQ_NET_SENT|RQ_NET_OK); + else + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT); + /* It is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss. */ + break; + + case OOS_HANDED_TO_NETWORK: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ + mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE); + break; + + case CONNECTION_LOST_WHILE_PENDING: + /* transfer log cleanup after connection loss */ + mod_rq_state(req, m, + RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP, + RQ_NET_DONE); + break; + + case CONFLICT_RESOLVED: + /* for superseded conflicting writes of multiple primaries, + * there is no need to keep anything in the tl, potential + * node crashes are covered by the activity log. + * + * If this request had been marked as RQ_POSTPONED before, + * it will actually not be completed, but "restarted", + * resubmitted from the retry worker context. */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK); + break; + + case WRITE_ACKED_BY_PEER_AND_SIS: + req->rq_state |= RQ_NET_SIS; + case WRITE_ACKED_BY_PEER: + /* Normal operation protocol C: successfully written on peer. + * During resync, even in protocol != C, + * we requested an explicit write ack anyways. + * Which means we cannot even assert anything here. + * Nothing more to do here. + * We want to keep the tl in place for all protocols, to cater + * for volatile write-back caches on lower level devices. */ + goto ack_common; + case RECV_ACKED_BY_PEER: + D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK); + /* protocol B; pretends to be successfully written on peer. + * see also notes above in HANDED_OVER_TO_NETWORK about + * protocol != C */ + ack_common: + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK); + break; + + case POSTPONE_WRITE: + D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK); + /* If this node has already detected the write conflict, the + * worker will be waiting on misc_wait. Wake it up once this + * request has completed locally. + */ + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_POSTPONED; + if (req->i.waiting) + wake_up(&device->misc_wait); + /* Do not clear RQ_NET_PENDING. This request will make further + * progress via restart_conflicting_writes() or + * fail_postponed_requests(). Hopefully. */ + break; + + case NEG_ACKED: + mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0); + break; + + case FAIL_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + break; + + case RESTART_FROZEN_DISK_IO: + if (!(req->rq_state & RQ_LOCAL_COMPLETED)) + break; + + mod_rq_state(req, m, + RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED, + RQ_LOCAL_PENDING); + + rv = MR_READ; + if (bio_data_dir(req->master_bio) == WRITE) + rv = MR_WRITE; + + get_ldev(device); /* always succeeds in this call path */ + req->w.cb = w_restart_disk_io; + drbd_queue_work(&connection->sender_work, + &req->w); + break; + + case RESEND: + /* Simply complete (local only) READs. */ + if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { + mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0); + break; + } + + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK + before the connection loss (B&C only); only P_BARRIER_ACK + (or the local completion?) was missing when we suspended. + Throwing them out of the TL here by pretending we got a BARRIER_ACK. + During connection handshake, we ensure that the peer was not rebooted. */ + if (!(req->rq_state & RQ_NET_OK)) { + /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync? + * in that case we must not set RQ_NET_PENDING. */ + + mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING); + if (req->w.cb) { + /* w.cb expected to be w_send_dblock, or w_send_read_req */ + drbd_queue_work(&connection->sender_work, + &req->w); + rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ; + } /* else: FIXME can this happen? */ + break; + } + /* else, fall through to BARRIER_ACKED */ + + case BARRIER_ACKED: + /* barrier ack for READ requests does not make sense */ + if (!(req->rq_state & RQ_WRITE)) + break; + + if (req->rq_state & RQ_NET_PENDING) { + /* barrier came in before all requests were acked. + * this is bad, because if the connection is lost now, + * we won't be able to clean them up... */ + drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n"); + } + /* Allowed to complete requests, even while suspended. + * As this is called for all requests within a matching epoch, + * we need to filter, and only set RQ_NET_DONE for those that + * have actually been on the wire. */ + mod_rq_state(req, m, RQ_COMPLETION_SUSP, + (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0); + break; + + case DATA_RECEIVED: + D_ASSERT(device, req->rq_state & RQ_NET_PENDING); + mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE); + break; + + case QUEUE_AS_DRBD_BARRIER: + start_new_tl_epoch(connection); + mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE); + break; + }; + + return rv; +} + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. + * since size may be bigger than BM_BLOCK_SIZE, + * we may need to check several bits. + */ +static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size) +{ + unsigned long sbnr, ebnr; + sector_t esector, nr_sectors; + + if (device->state.disk == D_UP_TO_DATE) + return true; + if (device->state.disk != D_INCONSISTENT) + return false; + esector = sector + (size >> 9) - 1; + nr_sectors = drbd_get_capacity(device->this_bdev); + D_ASSERT(device, sector < nr_sectors); + D_ASSERT(device, esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + return drbd_bm_count_bits(device, sbnr, ebnr) == 0; +} + +static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector, + enum drbd_read_balancing rbm) +{ + struct backing_dev_info *bdi; + int stripe_shift; + + switch (rbm) { + case RB_CONGESTED_REMOTE: + bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; + return bdi_read_congested(bdi); + case RB_LEAST_PENDING: + return atomic_read(&device->local_cnt) > + atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt); + case RB_32K_STRIPING: /* stripe_shift = 15 */ + case RB_64K_STRIPING: + case RB_128K_STRIPING: + case RB_256K_STRIPING: + case RB_512K_STRIPING: + case RB_1M_STRIPING: /* stripe_shift = 20 */ + stripe_shift = (rbm - RB_32K_STRIPING + 15); + return (sector >> (stripe_shift - 9)) & 1; + case RB_ROUND_ROBIN: + return test_and_change_bit(READ_BALANCE_RR, &device->flags); + case RB_PREFER_REMOTE: + return true; + case RB_PREFER_LOCAL: + default: + return false; + } +} + +/* + * complete_conflicting_writes - wait for any conflicting write requests + * + * The write_requests tree contains all active write requests which we + * currently know about. Wait for any requests to complete which conflict with + * the new one. + * + * Only way out: remove the conflicting intervals from the tree. + */ +static void complete_conflicting_writes(struct drbd_request *req) +{ + DEFINE_WAIT(wait); + struct drbd_device *device = req->device; + struct drbd_interval *i; + sector_t sector = req->i.sector; + int size = req->i.size; + + i = drbd_find_overlap(&device->write_requests, sector, size); + if (!i) + return; + + for (;;) { + prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); + i = drbd_find_overlap(&device->write_requests, sector, size); + if (!i) + break; + /* Indicate to wake up device->misc_wait on progress. */ + i->waiting = true; + spin_unlock_irq(&device->resource->req_lock); + schedule(); + spin_lock_irq(&device->resource->req_lock); + } + finish_wait(&device->misc_wait, &wait); +} + +/* called within req_lock and rcu_read_lock() */ +static void maybe_pull_ahead(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + struct net_conf *nc; + bool congested = false; + enum drbd_on_congestion on_congestion; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + on_congestion = nc ? nc->on_congestion : OC_BLOCK; + rcu_read_unlock(); + if (on_congestion == OC_BLOCK || + connection->agreed_pro_version < 96) + return; + + if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD) + return; /* nothing to do ... */ + + /* If I don't even have good local storage, we can not reasonably try + * to pull ahead of the peer. We also need the local reference to make + * sure device->act_log is there. + */ + if (!get_ldev_if_state(device, D_UP_TO_DATE)) + return; + + if (nc->cong_fill && + atomic_read(&device->ap_in_flight) >= nc->cong_fill) { + drbd_info(device, "Congestion-fill threshold reached\n"); + congested = true; + } + + if (device->act_log->used >= nc->cong_extents) { + drbd_info(device, "Congestion-extents threshold reached\n"); + congested = true; + } + + if (congested) { + /* start a new epoch for non-mirrored writes */ + start_new_tl_epoch(first_peer_device(device)->connection); + + if (on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL); + else /*nc->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL); + } + put_ldev(device); +} + +/* If this returns false, and req->private_bio is still set, + * this should be submitted locally. + * + * If it returns false, but req->private_bio is not set, + * we do not have access to good data :( + * + * Otherwise, this destroys req->private_bio, if any, + * and returns true. + */ +static bool do_remote_read(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + enum drbd_read_balancing rbm; + + if (req->private_bio) { + if (!drbd_may_do_local_read(device, + req->i.sector, req->i.size)) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + } + + if (device->state.pdsk != D_UP_TO_DATE) + return false; + + if (req->private_bio == NULL) + return true; + + /* TODO: improve read balancing decisions, take into account drbd + * protocol, pending requests etc. */ + + rcu_read_lock(); + rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing; + rcu_read_unlock(); + + if (rbm == RB_PREFER_LOCAL && req->private_bio) + return false; /* submit locally */ + + if (remote_due_to_read_balancing(device, req->i.sector, rbm)) { + if (req->private_bio) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + return true; + } + + return false; +} + +/* returns number of connections (== 1, for drbd 8.4) + * expected to actually write this data, + * which does NOT include those that we are L_AHEAD for. */ +static int drbd_process_write_request(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + int remote, send_oos; + + remote = drbd_should_do_remote(device->state); + send_oos = drbd_should_send_out_of_sync(device->state); + + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. + * If this was a flush, just start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(req->i.size == 0)) { + /* The only size==0 bios we expect are empty flushes. */ + D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH); + if (remote) + _req_mod(req, QUEUE_AS_DRBD_BARRIER); + return remote; + } + + if (!remote && !send_oos) + return 0; + + D_ASSERT(device, !(remote && send_oos)); + + if (remote) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_WRITE); + } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size)) + _req_mod(req, QUEUE_FOR_SEND_OOS); + + return remote; +} + +static void +drbd_submit_req_private_bio(struct drbd_request *req) +{ + struct drbd_device *device = req->device; + struct bio *bio = req->private_bio; + const int rw = bio_rw(bio); + + bio->bi_bdev = device->ldev->backing_bdev; + + /* State may have changed since we grabbed our reference on the + * ->ldev member. Double check, and short-circuit to endio. + * In case the last activity log transaction failed to get on + * stable storage, and this is a WRITE, we may not even submit + * this bio. */ + if (get_ldev(device)) { + req->pre_submit_jif = jiffies; + if (drbd_insert_fault(device, + rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); + put_ldev(device); + } else + bio_endio(bio, -EIO); +} + +static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) +{ + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&req->tl_requests, &device->submit.writes); + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[1 /* WRITE */]); + spin_unlock_irq(&device->resource->req_lock); + queue_work(device->submit.wq, &device->submit.worker); + /* do_submit() may sleep internally on al_wait, too */ + wake_up(&device->al_wait); +} + +/* returns the new drbd_request pointer, if the caller is expected to + * drbd_send_and_submit() it (to save latency), or NULL if we queued the + * request on the submitter thread. + * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request. + */ +static struct drbd_request * +drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif) +{ + const int rw = bio_data_dir(bio); + struct drbd_request *req; + + /* allocate outside of all locks; */ + req = drbd_req_new(device, bio); + if (!req) { + dec_ap_bio(device); + /* only pass the error to the upper layers. + * if user cannot handle io errors, that's not our business. */ + drbd_err(device, "could not kmalloc() req\n"); + bio_endio(bio, -ENOMEM); + return ERR_PTR(-ENOMEM); + } + req->start_jif = start_jif; + + if (!get_ldev(device)) { + bio_put(req->private_bio); + req->private_bio = NULL; + } + + /* Update disk stats */ + _drbd_start_io_acct(device, req); + + if (rw == WRITE && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &device->flags)) { + if (!drbd_al_begin_io_fastpath(device, &req->i)) { + atomic_inc(&device->ap_actlog_cnt); + drbd_queue_write(device, req); + return NULL; + } + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + } + + return req; +} + +static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) +{ + struct drbd_resource *resource = device->resource; + const int rw = bio_rw(req->master_bio); + struct bio_and_error m = { NULL, }; + bool no_remote = false; + bool submit_private_bio = false; + + spin_lock_irq(&resource->req_lock); + if (rw == WRITE) { + /* This may temporarily give up the req_lock, + * but will re-aquire it before it returns here. + * Needs to be before the check on drbd_suspended() */ + complete_conflicting_writes(req); + /* no more giving up req_lock from now on! */ + + /* check for congestion, and potentially stop sending + * full data updates, but start sending "dirty bits" only. */ + maybe_pull_ahead(device); + } + + + if (drbd_suspended(device)) { + /* push back and retry: */ + req->rq_state |= RQ_POSTPONED; + if (req->private_bio) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + } + goto out; + } + + /* We fail READ/READA early, if we can not serve it. + * We must do this before req is registered on any lists. + * Otherwise, drbd_req_complete() will queue failed READ for retry. */ + if (rw != WRITE) { + if (!do_remote_read(req) && !req->private_bio) + goto nodata; + } + + /* which transfer log epoch does this belong to? */ + req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr); + + /* no point in adding empty flushes to the transfer log, + * they are mapped to drbd barriers already. */ + if (likely(req->i.size!=0)) { + if (rw == WRITE) + first_peer_device(device)->connection->current_tle_writes++; + + list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log); + } + + if (rw == WRITE) { + if (!drbd_process_write_request(req)) + no_remote = true; + } else { + /* We either have a private_bio, or we can read from remote. + * Otherwise we had done the goto nodata above. */ + if (req->private_bio == NULL) { + _req_mod(req, TO_BE_SENT); + _req_mod(req, QUEUE_FOR_NET_READ); + } else + no_remote = true; + } + + /* If it took the fast path in drbd_request_prepare, add it here. + * The slow path has added it already. */ + if (list_empty(&req->req_pending_master_completion)) + list_add_tail(&req->req_pending_master_completion, + &device->pending_master_completion[rw == WRITE]); + if (req->private_bio) { + /* needs to be marked within the same spinlock */ + list_add_tail(&req->req_pending_local, + &device->pending_completion[rw == WRITE]); + _req_mod(req, TO_BE_SUBMITTED); + /* but we need to give up the spinlock to submit */ + submit_private_bio = true; + } else if (no_remote) { +nodata: + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n", + (unsigned long long)req->i.sector, req->i.size >> 9); + /* A write may have been queued for send_oos, however. + * So we can not simply free it, we must go through drbd_req_put_completion_ref() */ + } + +out: + if (drbd_req_put_completion_ref(req, &m, 1)) + kref_put(&req->kref, drbd_req_destroy); + spin_unlock_irq(&resource->req_lock); + + /* Even though above is a kref_put(), this is safe. + * As long as we still need to submit our private bio, + * we hold a completion ref, and the request cannot disappear. + * If however this request did not even have a private bio to submit + * (e.g. remote read), req may already be invalid now. + * That's why we cannot check on req->private_bio. */ + if (submit_private_bio) + drbd_submit_req_private_bio(req); + if (m.bio) + complete_master_bio(device, &m); +} + +void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif) +{ + struct drbd_request *req = drbd_request_prepare(device, bio, start_jif); + if (IS_ERR_OR_NULL(req)) + return; + drbd_send_and_submit(device, req); +} + +static void submit_fast_path(struct drbd_device *device, struct list_head *incoming) +{ + struct drbd_request *req, *tmp; + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + const int rw = bio_data_dir(req->master_bio); + + if (rw == WRITE /* rw != WRITE should not even end up here! */ + && req->private_bio && req->i.size + && !test_bit(AL_SUSPENDED, &device->flags)) { + if (!drbd_al_begin_io_fastpath(device, &req->i)) + continue; + + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + } + + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + +static bool prepare_al_transaction_nonblock(struct drbd_device *device, + struct list_head *incoming, + struct list_head *pending, + struct list_head *later) +{ + struct drbd_request *req, *tmp; + int wake = 0; + int err; + + spin_lock_irq(&device->al_lock); + list_for_each_entry_safe(req, tmp, incoming, tl_requests) { + err = drbd_al_begin_io_nonblock(device, &req->i); + if (err == -ENOBUFS) + break; + if (err == -EBUSY) + wake = 1; + if (err) + list_move_tail(&req->tl_requests, later); + else + list_move_tail(&req->tl_requests, pending); + } + spin_unlock_irq(&device->al_lock); + if (wake) + wake_up(&device->al_wait); + return !list_empty(pending); +} + +void send_and_submit_pending(struct drbd_device *device, struct list_head *pending) +{ + struct drbd_request *req, *tmp; + + list_for_each_entry_safe(req, tmp, pending, tl_requests) { + req->rq_state |= RQ_IN_ACT_LOG; + req->in_actlog_jif = jiffies; + atomic_dec(&device->ap_actlog_cnt); + list_del_init(&req->tl_requests); + drbd_send_and_submit(device, req); + } +} + +void do_submit(struct work_struct *ws) +{ + struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker); + LIST_HEAD(incoming); /* from drbd_make_request() */ + LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */ + LIST_HEAD(busy); /* blocked by resync requests */ + + /* grab new incoming requests */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + + for (;;) { + DEFINE_WAIT(wait); + + /* move used-to-be-busy back to front of incoming */ + list_splice_init(&busy, &incoming); + submit_fast_path(device, &incoming); + if (list_empty(&incoming)) + break; + + for (;;) { + prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE); + + list_splice_init(&busy, &incoming); + prepare_al_transaction_nonblock(device, &incoming, &pending, &busy); + if (!list_empty(&pending)) + break; + + schedule(); + + /* If all currently "hot" activity log extents are kept busy by + * incoming requests, we still must not totally starve new + * requests to "cold" extents. + * Something left on &incoming means there had not been + * enough update slots available, and the activity log + * has been marked as "starving". + * + * Try again now, without looking for new requests, + * effectively blocking all new requests until we made + * at least _some_ progress with what we currently have. + */ + if (!list_empty(&incoming)) + continue; + + /* Nothing moved to pending, but nothing left + * on incoming: all moved to busy! + * Grab new and iterate. */ + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &incoming); + spin_unlock_irq(&device->resource->req_lock); + } + finish_wait(&device->al_wait, &wait); + + /* If the transaction was full, before all incoming requests + * had been processed, skip ahead to commit, and iterate + * without splicing in more incoming requests from upper layers. + * + * Else, if all incoming have been processed, + * they have become either "pending" (to be submitted after + * next transaction commit) or "busy" (blocked by resync). + * + * Maybe more was queued, while we prepared the transaction? + * Try to stuff those into this transaction as well. + * Be strictly non-blocking here, + * we already have something to commit. + * + * Commit if we don't make any more progres. + */ + + while (list_empty(&incoming)) { + LIST_HEAD(more_pending); + LIST_HEAD(more_incoming); + bool made_progress; + + /* It is ok to look outside the lock, + * it's only an optimization anyways */ + if (list_empty(&device->submit.writes)) + break; + + spin_lock_irq(&device->resource->req_lock); + list_splice_tail_init(&device->submit.writes, &more_incoming); + spin_unlock_irq(&device->resource->req_lock); + + if (list_empty(&more_incoming)) + break; + + made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy); + + list_splice_tail_init(&more_pending, &pending); + list_splice_tail_init(&more_incoming, &incoming); + if (!made_progress) + break; + } + + drbd_al_begin_io_commit(device); + send_and_submit_pending(device, &pending); + } +} + +void drbd_make_request(struct request_queue *q, struct bio *bio) +{ + struct drbd_device *device = (struct drbd_device *) q->queuedata; + unsigned long start_jif; + + start_jif = jiffies; + + /* + * what we "blindly" assume: + */ + D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512)); + + inc_ap_bio(device); + __drbd_make_request(device, bio, start_jif); +} + +/* This is called by bio_add_page(). + * + * q->max_hw_sectors and other global limits are already enforced there. + * + * We need to call down to our lower level device, + * in case it has special restrictions. + * + * We also may need to enforce configured max-bio-bvecs limits. + * + * As long as the BIO is empty we have to allow at least one bvec, + * regardless of size and offset, so no need to ask lower levels. + */ +int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) +{ + struct drbd_device *device = (struct drbd_device *) q->queuedata; + unsigned int bio_size = bvm->bi_size; + int limit = DRBD_MAX_BIO_SIZE; + int backing_limit; + + if (bio_size && get_ldev(device)) { + unsigned int max_hw_sectors = queue_max_hw_sectors(q); + struct request_queue * const b = + device->ldev->backing_bdev->bd_disk->queue; + if (b->merge_bvec_fn) { + bvm->bi_bdev = device->ldev->backing_bdev; + backing_limit = b->merge_bvec_fn(b, bvm, bvec); + limit = min(limit, backing_limit); + } + put_ldev(device); + if ((limit >> 9) > max_hw_sectors) + limit = max_hw_sectors << 9; + } + return limit; +} + +void request_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + struct drbd_connection *connection = first_peer_device(device)->connection; + struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */ + struct net_conf *nc; + unsigned long oldest_submit_jif; + unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) + ent = nc->timeout * HZ/10 * nc->ko_count; + + if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; + put_ldev(device); + } + rcu_read_unlock(); + + et = min_not_zero(dt, ent); + + if (!et) + return; /* Recurring timer stopped */ + + now = jiffies; + nt = now + et; + + spin_lock_irq(&device->resource->req_lock); + req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); + req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); + req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still + * blocking in tcp sendmsg */ + if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) + req_peer = connection->req_next; + + /* evaluate the oldest peer request only in one timer! */ + if (req_peer && req_peer->device != device) + req_peer = NULL; + + /* do we have something to evaluate? */ + if (req_peer == NULL && req_write == NULL && req_read == NULL) + goto out; + + oldest_submit_jif = + (req_write && req_read) + ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif) + ? req_write->pre_submit_jif : req_read->pre_submit_jif ) + : req_write ? req_write->pre_submit_jif + : req_read ? req_read->pre_submit_jif : now; + + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + if (ent && req_peer && + time_after(now, req_peer->pre_send_jif + ent) && + !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { + drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); + } + if (dt && oldest_submit_jif != now && + time_after(now, oldest_submit_jif + dt) && + !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { + drbd_warn(device, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(device, DRBD_FORCE_DETACH); + } + + /* Reschedule timer for the nearest not already expired timeout. + * Fallback to now + min(effective network timeout, disk timeout). */ + ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent)) + ? req_peer->pre_send_jif + ent : now + et; + dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt)) + ? oldest_submit_jif + dt : now + et; + nt = time_before(ent, dt) ? ent : dt; +out: + spin_unlock_irq(&device->resource->req_lock); + mod_timer(&device->request_timer, nt); +} diff --git a/kernel/drivers/block/drbd/drbd_req.h b/kernel/drivers/block/drbd/drbd_req.h new file mode 100644 index 000000000..9f6a04080 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_req.h @@ -0,0 +1,351 @@ +/* + drbd_req.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2006-2008, Lars Ellenberg . + Copyright (C) 2006-2008, Philipp Reisner . + + DRBD is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + DRBD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_REQ_H +#define _DRBD_REQ_H + +#include + +#include +#include +#include "drbd_int.h" + +/* The request callbacks will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers, + and by the receiver and worker in kernel-thread context. + Try to get the locking right :) */ + +/* + * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are + * associated with IO requests originating from the block layer above us. + * + * There are quite a few things that may happen to a drbd request + * during its lifetime. + * + * It will be created. + * It will be marked with the intention to be + * submitted to local disk and/or + * send via the network. + * + * It has to be placed on the transfer log and other housekeeping lists, + * In case we have a network connection. + * + * It may be identified as a concurrent (write) request + * and be handled accordingly. + * + * It may me handed over to the local disk subsystem. + * It may be completed by the local disk subsystem, + * either successfully or with io-error. + * In case it is a READ request, and it failed locally, + * it may be retried remotely. + * + * It may be queued for sending. + * It may be handed over to the network stack, + * which may fail. + * It may be acknowledged by the "peer" according to the wire_protocol in use. + * this may be a negative ack. + * It may receive a faked ack when the network connection is lost and the + * transfer log is cleaned up. + * Sending may be canceled due to network connection loss. + * When it finally has outlived its time, + * corresponding dirty bits in the resync-bitmap may be cleared or set, + * it will be destroyed, + * and completion will be signalled to the originator, + * with or without "success". + */ + +enum drbd_req_event { + CREATED, + TO_BE_SENT, + TO_BE_SUBMITTED, + + /* XXX yes, now I am inconsistent... + * these are not "events" but "actions" + * oh, well... */ + QUEUE_FOR_NET_WRITE, + QUEUE_FOR_NET_READ, + QUEUE_FOR_SEND_OOS, + + /* An empty flush is queued as P_BARRIER, + * which will cause it to complete "successfully", + * even if the local disk flush failed. + * + * Just like "real" requests, empty flushes (blkdev_issue_flush()) will + * only see an error if neither local nor remote data is reachable. */ + QUEUE_AS_DRBD_BARRIER, + + SEND_CANCELED, + SEND_FAILED, + HANDED_OVER_TO_NETWORK, + OOS_HANDED_TO_NETWORK, + CONNECTION_LOST_WHILE_PENDING, + READ_RETRY_REMOTE_CANCELED, + RECV_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER, + WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */ + CONFLICT_RESOLVED, + POSTPONE_WRITE, + NEG_ACKED, + BARRIER_ACKED, /* in protocol A and B */ + DATA_RECEIVED, /* (remote read) */ + + COMPLETED_OK, + READ_COMPLETED_WITH_ERROR, + READ_AHEAD_COMPLETED_WITH_ERROR, + WRITE_COMPLETED_WITH_ERROR, + DISCARD_COMPLETED_NOTSUPP, + DISCARD_COMPLETED_WITH_ERROR, + + ABORT_DISK_IO, + RESEND, + FAIL_FROZEN_DISK_IO, + RESTART_FROZEN_DISK_IO, + NOTHING, +}; + +/* encoding of request states for now. we don't actually need that many bits. + * we don't need to do atomic bit operations either, since most of the time we + * need to look at the connection state and/or manipulate some lists at the + * same time, so we should hold the request lock anyways. + */ +enum drbd_req_state_bits { + /* 3210 + * 0000: no local possible + * 0001: to be submitted + * UNUSED, we could map: 011: submitted, completion still pending + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free + */ + __RQ_LOCAL_PENDING, + __RQ_LOCAL_COMPLETED, + __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, + + /* 87654 + * 00000: no network possible + * 00001: to be send + * 00011: to be send, on worker queue + * 00101: sent, expecting recv_ack (B) or write_ack (C) + * 11101: sent, + * recv_ack (B) or implicit "ack" (A), + * still waiting for the barrier ack. + * master_bio may already be completed and invalidated. + * 11100: write acked (C), + * data received (for remote read, any protocol) + * or finally the barrier ack has arrived (B,A)... + * request can be freed + * 01100: neg-acked (write, protocol C) + * or neg-d-acked (read, any protocol) + * or killed from the transfer log + * during cleanup after connection loss + * request can be freed + * 01000: canceled or send failed... + * request can be freed + */ + + /* if "SENT" is not set, yet, this can still fail or be canceled. + * if "SENT" is set already, we still wait for an Ack packet. + * when cleared, the master_bio may be completed. + * in (B,A) the request object may still linger on the transaction log + * until the corresponding barrier ack comes in */ + __RQ_NET_PENDING, + + /* If it is QUEUED, and it is a WRITE, it is also registered in the + * transfer log. Currently we need this flag to avoid conflicts between + * worker canceling the request and tl_clear_barrier killing it from + * transfer log. We should restructure the code so this conflict does + * no longer occur. */ + __RQ_NET_QUEUED, + + /* well, actually only "handed over to the network stack". + * + * TODO can potentially be dropped because of the similar meaning + * of RQ_NET_SENT and ~RQ_NET_QUEUED. + * however it is not exactly the same. before we drop it + * we must ensure that we can tell a request with network part + * from a request without, regardless of what happens to it. */ + __RQ_NET_SENT, + + /* when set, the request may be freed (if RQ_NET_QUEUED is clear). + * basically this means the corresponding P_BARRIER_ACK was received */ + __RQ_NET_DONE, + + /* whether or not we know (C) or pretend (B,A) that the write + * was successfully written on the peer. + */ + __RQ_NET_OK, + + /* peer called drbd_set_in_sync() for this write */ + __RQ_NET_SIS, + + /* keep this last, its for the RQ_NET_MASK */ + __RQ_NET_MAX, + + /* Set when this is a write, clear for a read */ + __RQ_WRITE, + + /* Should call drbd_al_complete_io() for this request... */ + __RQ_IN_ACT_LOG, + + /* The peer has sent a retry ACK */ + __RQ_POSTPONED, + + /* would have been completed, + * but was not, because of drbd_suspended() */ + __RQ_COMPLETION_SUSP, + + /* We expect a receive ACK (wire proto B) */ + __RQ_EXP_RECEIVE_ACK, + + /* We expect a write ACK (wite proto C) */ + __RQ_EXP_WRITE_ACK, + + /* waiting for a barrier ack, did an extra kref_get */ + __RQ_EXP_BARR_ACK, +}; + +#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) +#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) +#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) + +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) + +#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) +#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) +#define RQ_NET_SENT (1UL << __RQ_NET_SENT) +#define RQ_NET_DONE (1UL << __RQ_NET_DONE) +#define RQ_NET_OK (1UL << __RQ_NET_OK) +#define RQ_NET_SIS (1UL << __RQ_NET_SIS) + +/* 0x1f8 */ +#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) + +#define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) +#define RQ_POSTPONED (1UL << __RQ_POSTPONED) +#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) +#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK) +#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK) +#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK) + +/* For waking up the frozen transfer log mod_req() has to return if the request + should be counted in the epoch object*/ +#define MR_WRITE 1 +#define MR_READ 2 + +static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src) +{ + struct bio *bio; + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->private_bio = bio; + + bio->bi_private = req; + bio->bi_end_io = drbd_request_endio; + bio->bi_next = NULL; +} + +/* Short lived temporary struct on the stack. + * We could squirrel the error to be returned into + * bio->bi_iter.bi_size, or similar. But that would be too ugly. */ +struct bio_and_error { + struct bio *bio; + int error; +}; + +extern void start_new_tl_epoch(struct drbd_connection *connection); +extern void drbd_req_destroy(struct kref *kref); +extern void _req_may_be_done(struct drbd_request *req, + struct bio_and_error *m); +extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m); +extern void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m); +extern void request_timer_fn(unsigned long data); +extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what); +extern void tl_abort_disk_io(struct drbd_device *device); + +/* this is in drbd_main.c */ +extern void drbd_restart_request(struct drbd_request *req); + +/* use this if you don't want to deal with calling complete_master_bio() + * outside the spinlock, e.g. when walking some list on cleanup. */ +static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what) +{ + struct drbd_device *device = req->device; + struct bio_and_error m; + int rv; + + /* __req_mod possibly frees req, do not touch req after that! */ + rv = __req_mod(req, what, &m); + if (m.bio) + complete_master_bio(device, &m); + + return rv; +} + +/* completion of master bio is outside of our spinlock. + * We still may or may not be inside some irqs disabled section + * of the lower level driver completion callback, so we need to + * spin_lock_irqsave here. */ +static inline int req_mod(struct drbd_request *req, + enum drbd_req_event what) +{ + unsigned long flags; + struct drbd_device *device = req->device; + struct bio_and_error m; + int rv; + + spin_lock_irqsave(&device->resource->req_lock, flags); + rv = __req_mod(req, what, &m); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (m.bio) + complete_master_bio(device, &m); + + return rv; +} + +static inline bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} +static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + +#endif diff --git a/kernel/drivers/block/drbd/drbd_state.c b/kernel/drivers/block/drbd/drbd_state.c new file mode 100644 index 000000000..2d7dd269b --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_state.c @@ -0,0 +1,1918 @@ +/* + drbd_state.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" + +struct after_state_chg_work { + struct drbd_work w; + struct drbd_device *device; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; + +static int w_after_state_ch(struct drbd_work *w, int unused); +static void after_state_ch(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); +static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); +static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn); + +static inline bool is_susp(union drbd_state s) +{ + return s.susp || s.susp_nod || s.susp_fen; +} + +bool conn_all_vols_unconf(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + bool rv = true; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (device->state.disk != D_DISKLESS || + device->state.conn != C_STANDALONE || + device->state.role != R_SECONDARY) { + rv = false; + break; + } + } + rcu_read_unlock(); + + return rv; +} + +/* Unfortunately the states where not correctly ordered, when + they where defined. therefore can not use max_t() here. */ +static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_PRIMARY || role2 == R_PRIMARY) + return R_PRIMARY; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_UNKNOWN; +} +static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) +{ + if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) + return R_UNKNOWN; + if (role1 == R_SECONDARY || role2 == R_SECONDARY) + return R_SECONDARY; + return R_PRIMARY; +} + +enum drbd_role conn_highest_role(struct drbd_connection *connection) +{ + enum drbd_role role = R_UNKNOWN; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + role = max_role(role, device->state.role); + } + rcu_read_unlock(); + + return role; +} + +enum drbd_role conn_highest_peer(struct drbd_connection *connection) +{ + enum drbd_role peer = R_UNKNOWN; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + peer = max_role(peer, device->state.peer); + } + rcu_read_unlock(); + + return peer; +} + +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_MASK; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection) +{ + enum drbd_disk_state disk_state = D_DISKLESS; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk); + } + rcu_read_unlock(); + + return disk_state; +} + +enum drbd_conns conn_lowest_conn(struct drbd_connection *connection) +{ + enum drbd_conns conn = C_MASK; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + conn = min_t(enum drbd_conns, conn, device->state.conn); + } + rcu_read_unlock(); + + return conn; +} + +static bool no_peer_wf_report_params(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + bool rv = true; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) { + rv = false; + break; + } + rcu_read_unlock(); + + return rv; +} + +static void wake_up_all_devices(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + wake_up(&peer_device->device->state_wait); + rcu_read_unlock(); + +} + + +/** + * cl_wide_st_chg() - true if the state change is a cluster wide one + * @device: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. + */ +static int cl_wide_st_chg(struct drbd_device *device, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) || + (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS); +} + +static union drbd_state +apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val) +{ + union drbd_state ns; + ns.i = (os.i & ~mask.i) | val.i; + return ns; +} + +enum drbd_state_rv +drbd_change_state(struct drbd_device *device, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state ns; + enum drbd_state_rv rv; + + spin_lock_irqsave(&device->resource->req_lock, flags); + ns = apply_mask_val(drbd_read_state(device), mask, val); + rv = _drbd_set_state(device, ns, f, NULL); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + */ +void drbd_force_state(struct drbd_device *device, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(device, CS_HARD, mask, val); +} + +static enum drbd_state_rv +_req_st_cond(struct drbd_device *device, union drbd_state mask, + union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + enum drbd_state_rv rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags)) + return SS_CW_FAILED_BY_PEER; + + spin_lock_irqsave(&device->resource->req_lock, flags); + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv >= SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + + if (!cl_wide_st_chg(device, os, ns)) + rv = SS_CW_NO_NEED; + if (rv == SS_UNKNOWN_ERROR) { + rv = is_valid_state(device, ns); + if (rv >= SS_SUCCESS) { + rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + if (rv >= SS_SUCCESS) + rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). + */ +static enum drbd_state_rv +drbd_req_state(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + enum drbd_state_rv rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(device->state_mutex); + + spin_lock_irqsave(&device->resource->req_lock, flags); + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) { + spin_unlock_irqrestore(&device->resource->req_lock, flags); + goto abort; + } + + if (cl_wide_st_chg(device, os, ns)) { + rv = is_valid_state(device, ns); + if (rv == SS_SUCCESS) + rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + + if (drbd_send_state_req(first_peer_device(device), mask, val)) { + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + + wait_event(device->state_wait, + (rv = _req_st_cond(device, mask, val))); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(device, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&device->resource->req_lock, flags); + ns = apply_mask_val(drbd_read_state(device), mask, val); + rv = _drbd_set_state(device, ns, f, &done); + } else { + rv = _drbd_set_state(device, ns, f, &done); + } + + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(device->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +enum drbd_state_rv +_drbd_request_state(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + wait_event(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + BUG_ON(f & CS_SERIALIZE); + + wait_event_cmd(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, + mutex_unlock(device->state_mutex), + mutex_lock(device->state_mutex)); + + return rv; +} + +static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) +{ + drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", + name, + drbd_conn_str(ns.conn), + drbd_role_str(ns.role), + drbd_role_str(ns.peer), + drbd_disk_str(ns.disk), + drbd_disk_str(ns.pdsk), + is_susp(ns) ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-', + ns.susp_fen ? 'F' : '-', + ns.susp_nod ? 'N' : '-' + ); +} + +void print_st_err(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum drbd_state_rv err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(device, " state", os); + print_st(device, "wanted", ns); +} + +static long print_state_change(char *pb, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char *pbp; + pbp = pb; + *pbp = 0; + + if (ns.role != os.role && flags & CS_DC_ROLE) + pbp += sprintf(pbp, "role( %s -> %s ) ", + drbd_role_str(os.role), + drbd_role_str(ns.role)); + if (ns.peer != os.peer && flags & CS_DC_PEER) + pbp += sprintf(pbp, "peer( %s -> %s ) ", + drbd_role_str(os.peer), + drbd_role_str(ns.peer)); + if (ns.conn != os.conn && flags & CS_DC_CONN) + pbp += sprintf(pbp, "conn( %s -> %s ) ", + drbd_conn_str(os.conn), + drbd_conn_str(ns.conn)); + if (ns.disk != os.disk && flags & CS_DC_DISK) + pbp += sprintf(pbp, "disk( %s -> %s ) ", + drbd_disk_str(os.disk), + drbd_disk_str(ns.disk)); + if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK) + pbp += sprintf(pbp, "pdsk( %s -> %s ) ", + drbd_disk_str(os.pdsk), + drbd_disk_str(ns.pdsk)); + + return pbp - pb; +} + +static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK); + + if (ns.aftr_isp != os.aftr_isp) + pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ", + os.aftr_isp, + ns.aftr_isp); + if (ns.peer_isp != os.peer_isp) + pbp += sprintf(pbp, "peer_isp( %d -> %d ) ", + os.peer_isp, + ns.peer_isp); + if (ns.user_isp != os.user_isp) + pbp += sprintf(pbp, "user_isp( %d -> %d ) ", + os.user_isp, + ns.user_isp); + + if (pbp != pb) + drbd_info(device, "%s\n", pb); +} + +static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns, + enum chg_state_flags flags) +{ + char pb[300]; + char *pbp = pb; + + pbp += print_state_change(pbp, os, ns, flags); + + if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP) + pbp += sprintf(pbp, "susp( %d -> %d ) ", + is_susp(os), + is_susp(ns)); + + if (pbp != pb) + drbd_info(connection, "%s\n", pb); +} + + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @device: DRBD device. + * @ns: State to consider. + */ +static enum drbd_state_rv +is_valid_state(struct drbd_device *device, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + enum drbd_state_rv rv = SS_SUCCESS; + struct net_conf *nc; + + rcu_read_lock(); + fp = FP_DONT_CARE; + if (get_ldev(device)) { + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + put_ldev(device); + } + + nc = rcu_dereference(first_peer_device(device)->connection->net_conf); + if (nc) { + if (!nc->two_primaries && ns.role == R_PRIMARY) { + if (ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY) + rv = SS_O_VOL_PEER_PRI; + } + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && device->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (nc->verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + first_peer_device(device)->connection->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + ns.pdsk == D_UNKNOWN) + rv = SS_NEED_CONNECTION; + + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + + rcu_read_unlock(); + + return rv; +} + +/** + * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible + * This function limits state transitions that may be declined by DRBD. I.e. + * user requests (aka soft transitions). + * @device: DRBD device. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection) +{ + enum drbd_state_rv rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... (they do not touch cstate) */ + if (test_bit(STATE_SENT, &connection->flags) && + !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || + (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) + rv = SS_IN_TRANSIENT_STATE; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE) + && os.conn < C_WF_REPORT_PARAMS) + rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + + if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && + os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) + rv = SS_OUTDATE_WO_CONN; + + return rv; +} + +static enum drbd_state_rv +is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc) +{ + /* no change -> nothing to do, at least for the connection part */ + if (oc == nc) + return SS_NOTHING_TO_DO; + + /* disconnect of an unconfigured connection does not make sense */ + if (oc == C_STANDALONE && nc == C_DISCONNECTING) + return SS_ALREADY_STANDALONE; + + /* from C_STANDALONE, we start with C_UNCONNECTED */ + if (oc == C_STANDALONE && nc != C_UNCONNECTED) + return SS_NEED_CONNECTION; + + /* When establishing a connection we need to go through WF_REPORT_PARAMS! + Necessary to do the right thing upon invalidate-remote on a disconnected resource */ + if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED) + return SS_NEED_CONNECTION; + + /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */ + if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING) + return SS_IN_TRANSIENT_STATE; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (oc == C_DISCONNECTING && nc != C_STANDALONE) + return SS_IN_TRANSIENT_STATE; + + return SS_SUCCESS; +} + + +/** + * is_valid_transition() - Returns an SS_ error code if the state transition is not possible + * This limits hard state transitions. Hard state transitions are facts there are + * imposed on DRBD by the environment. E.g. disk broke or network broke down. + * But those hard state transitions are still not allowed to do everything. + * @ns: new state. + * @os: old state. + */ +static enum drbd_state_rv +is_valid_transition(union drbd_state os, union drbd_state ns) +{ + enum drbd_state_rv rv; + + rv = is_valid_conn_transition(os.conn, ns.conn); + + /* we cannot fail (again) if we already detached */ + if (ns.disk == D_FAILED && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + return rv; +} + +static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + drbd_warn(device, "%s\n", msg_table[warn]); +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @device: DRBD device. + * @os: old state. + * @ns: new state. + * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. + */ +static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum sanitize_state_warnings *warn) +{ + enum drbd_fencing_p fp; + enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + + if (warn) + *warn = NO_WARNING; + + fp = FP_DONT_CARE; + if (get_ldev(device)) { + rcu_read_lock(); + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + rcu_read_unlock(); + put_ldev(device); + } + + /* Implications from connection to peer and peer_isp */ + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + /* An implication of the disk states onto the connection state */ + /* Abort resync if a disk fails/detaches */ + if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn) + *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ? + ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; + ns.conn = C_CONNECTED; + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(device, D_NEGOTIATING)) { + if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) { + ns.disk = device->new_state_tmp.disk; + ns.pdsk = device->new_state_tmp.pdsk; + } else { + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(device); + } + + /* D_CONSISTENT and D_OUTDATED vanish when we get connected */ + if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) { + if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) + ns.disk = D_UP_TO_DATE; + if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED) + ns.pdsk = D_UP_TO_DATE; + } + + /* Implications of the connection stat on the disk states */ + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_UNKNOWN; + switch ((enum drbd_conns)ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_STARTING_SYNC_T: + case C_WF_SYNC_UUID: + case C_BEHIND: + disk_min = D_INCONSISTENT; + disk_max = D_OUTDATED; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_VERIFY_S: + case C_VERIFY_T: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_CONNECTED: + disk_min = D_DISKLESS; + disk_max = D_UP_TO_DATE; + pdsk_min = D_DISKLESS; + pdsk_max = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + case C_STARTING_SYNC_S: + case C_AHEAD: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/ + break; + case C_SYNC_TARGET: + disk_min = D_INCONSISTENT; + disk_max = D_INCONSISTENT; + pdsk_min = D_UP_TO_DATE; + pdsk_max = D_UP_TO_DATE; + break; + case C_SYNC_SOURCE: + disk_min = D_UP_TO_DATE; + disk_max = D_UP_TO_DATE; + pdsk_min = D_INCONSISTENT; + pdsk_max = D_INCONSISTENT; + break; + case C_STANDALONE: + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_CONNECTION: + case C_WF_REPORT_PARAMS: + case C_MASK: + break; + } + if (ns.disk > disk_max) + ns.disk = disk_max; + + if (ns.disk < disk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; + ns.disk = disk_min; + } + if (ns.pdsk > pdsk_max) + ns.pdsk = pdsk_max; + + if (ns.pdsk < pdsk_min) { + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; + ns.pdsk = pdsk_min; + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) + ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */ + + if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) && + !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE)) + ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */ + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +void drbd_resume_al(struct drbd_device *device) +{ + if (test_and_clear_bit(AL_SUSPENDED, &device->flags)) + drbd_info(device, "Resumed AL updates\n"); +} + +/* helper for __drbd_set_state */ +static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) +{ + if (first_peer_device(device)->connection->agreed_pro_version < 90) + device->ov_start_sector = 0; + device->rs_total = drbd_bm_bits(device); + device->ov_position = 0; + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + device->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector); + if (bit >= device->rs_total) { + device->ov_start_sector = + BM_BIT_TO_SECT(device->rs_total - 1); + device->rs_total = 1; + } else + device->rs_total -= bit; + device->ov_position = device->ov_start_sector; + } + device->ov_left = device->rs_total; +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @device: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +enum drbd_state_rv +__drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + union drbd_state os; + enum drbd_state_rv rv = SS_SUCCESS; + enum sanitize_state_warnings ssw; + struct after_state_chg_work *ascw; + + os = drbd_read_state(device); + + ns = sanitize_state(device, os, ns, &ssw); + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + rv = is_valid_transition(os, ns); + if (rv < SS_SUCCESS) + return rv; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(device, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(device, os) == rv) + rv = is_valid_soft_transition(os, ns, connection); + } else + rv = is_valid_soft_transition(os, ns, connection); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(device, os, ns, rv); + return rv; + } + + print_sanitize_warnings(device, ssw); + + drbd_pr_state_change(device, os, ns, flags); + + /* Display changes to the susp* flags that where caused by the call to + sanitize_state(). Only display it here if we where not called from + _conn_request_state() */ + if (!(flags & CS_DC_SUSP)) + conn_pr_state_change(connection, os, ns, + (flags & ~CS_DC_MASK) | CS_DC_SUSP); + + /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference + * on the ldev here, to be sure the transition -> D_DISKLESS resp. + * drbd_ldev_destroy() won't happen before our corresponding + * after_state_ch works run, where we put_ldev again. */ + if ((os.disk != D_FAILED && ns.disk == D_FAILED) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) + atomic_inc(&device->local_cnt); + + if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) + clear_bit(RS_DONE, &device->flags); + + /* changes to local_cnt and device flags should be visible before + * changes to state, which again should be visible before anything else + * depending on that change happens. */ + smp_wmb(); + device->state.i = ns.i; + device->resource->susp = ns.susp; + device->resource->susp_nod = ns.susp_nod; + device->resource->susp_fen = ns.susp_fen; + smp_wmb(); + + /* put replicated vs not-replicated requests in seperate epochs */ + if (drbd_should_do_remote((union drbd_dev_state)os.i) != + drbd_should_do_remote((union drbd_dev_state)ns.i)) + start_new_tl_epoch(connection); + + if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) + drbd_print_uuids(device, "attached to UUIDs"); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && + no_peer_wf_report_params(connection)) { + clear_bit(STATE_SENT, &connection->flags); + wake_up_all_devices(connection); + } + + wake_up(&device->misc_wait); + wake_up(&device->state_wait); + wake_up(&connection->ping_wait); + + /* Aborted verify run, or we reached the stop sector. + * Log the last position, unless end-of-device. */ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn <= C_CONNECTED) { + device->ov_start_sector = + BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left); + if (device->ov_left) + drbd_info(device, "Online Verify reached sector %llu\n", + (unsigned long long)device->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + drbd_info(device, "Syncer continues.\n"); + device->rs_paused += (long)jiffies + -(long)device->rs_mark_time[device->rs_last_mark]; + if (ns.conn == C_SYNC_TARGET) + mod_timer(&device->resync_timer, jiffies); + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + drbd_info(device, "Resync suspended\n"); + device->rs_mark_time[device->rs_last_mark] = jiffies; + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + unsigned long now = jiffies; + int i; + + set_ov_position(device, ns.conn); + device->rs_start = now; + device->rs_last_sect_ev = 0; + device->ov_last_oos_size = 0; + device->ov_last_oos_start = 0; + + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = device->ov_left; + device->rs_mark_time[i] = now; + } + + drbd_rs_controller_reset(device); + + if (ns.conn == C_VERIFY_S) { + drbd_info(device, "Starting Online Verify from sector %llu\n", + (unsigned long long)device->ov_position); + mod_timer(&device->resync_timer, jiffies); + } + } + + if (get_ldev(device)) { + u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + mdf &= ~MDF_AL_CLEAN; + if (test_bit(CRASHED_PRIMARY, &device->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (device->state.role == R_PRIMARY || + (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (device->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if (device->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (device->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != device->ldev->md.flags) { + device->ldev->md.flags = mdf; + drbd_md_mark_dirty(device); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]); + put_ldev(device); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &device->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&connection->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&connection->receiver); + + /* Upon network failure, we need to restart the receiver. */ + if (os.conn > C_WF_CONNECTION && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&connection->receiver); + + /* Resume AL writing if we get a connection */ + if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) { + drbd_resume_al(device); + connection->connect_cnt++; + } + + /* remember last attach time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + device->last_reattach_jif = jiffies; + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->device = device; + ascw->done = done; + drbd_queue_work(&connection->sender_work, + &ascw->w); + } else { + drbd_err(device, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +static int w_after_state_ch(struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + struct drbd_device *device = ascw->device; + + after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) + complete(ascw->done); + kfree(ascw); + + return 0; +} + +static void abw_start_sync(struct drbd_device *device, int rv) +{ + if (rv) { + drbd_err(device, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (device->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(device, C_SYNC_SOURCE); + break; + } +} + +int drbd_bitmap_io_from_worker(struct drbd_device *device, + int (*io_fn)(struct drbd_device *), + char *why, enum bm_flag flags) +{ + int rv; + + D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); + + /* open coded non-blocking drbd_suspend_io(device); */ + set_bit(SUSPEND_IO, &device->flags); + + drbd_bm_lock(device, why, flags); + rv = io_fn(device); + drbd_bm_unlock(device); + + drbd_resume_io(device); + + return rv; +} + +/** + * after_state_ch() - Perform after state change actions that may sleep + * @device: DRBD device. + * @os: old state. + * @ns: new state. + * @flags: Flags + */ +static void after_state_ch(struct drbd_device *device, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) +{ + struct drbd_resource *resource = device->resource; + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + struct sib_info sib; + + sib.sib_reason = SIB_STATE_CHANGE; + sib.os = os; + sib.ns = ns; + + if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE) + && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) { + clear_bit(CRASHED_PRIMARY, &device->flags); + if (device->p_uuid) + device->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + /* Inform userspace about the change... */ + drbd_bcast_event(device, &sib); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(device, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. This function might sleep */ + + if (ns.susp_nod) { + enum drbd_req_event what = NOTHING; + + spin_lock_irq(&device->resource->req_lock); + if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED) + what = RESEND; + + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + conn_lowest_disk(connection) > D_NEGOTIATING) + what = RESTART_FROZEN_DISK_IO; + + if (resource->susp_nod && what != NOTHING) { + _tl_restart(connection, what); + _conn_request_state(connection, + (union drbd_state) { { .susp_nod = 1 } }, + (union drbd_state) { { .susp_nod = 0 } }, + CS_VERBOSE); + } + spin_unlock_irq(&device->resource->req_lock); + } + + if (ns.susp_fen) { + spin_lock_irq(&device->resource->req_lock); + if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) { + /* case2: The connection was established again: */ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + clear_bit(NEW_CUR_UUID, &peer_device->device->flags); + rcu_read_unlock(); + _tl_restart(connection, RESEND); + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + } + spin_unlock_irq(&device->resource->req_lock); + } + + /* Became sync source. With protocol >= 96, we still need to send out + * the sync uuid now. Need to do that before any drbd_send_state, or + * the other side may go "paused sync" before receiving the sync uuids, + * which is unexpected. */ + if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) && + connection->agreed_pro_version >= 96 && get_ldev(device)) { + drbd_gen_and_send_sync_uuid(peer_device); + put_ldev(device); + } + + /* Do not change the order of the if above and the two below... */ + if (os.pdsk == D_DISKLESS && + ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + device->rs_total = 0; + device->rs_failed = 0; + atomic_set(&device->rs_pending_cnt, 0); + drbd_rs_cancel_all(device); + + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); + } + /* No point in queuing send_bitmap if we don't have a connection + * anymore, so check also the _current_ state, not only the new state + * at the time this work was queued. */ + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S && + device->state.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL, + "send_bitmap (WFBitMapS)", + BM_LOCKED_TEST_ALLOWED); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + if (get_ldev(device)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + if (drbd_suspended(device)) { + set_bit(NEW_CUR_UUID, &device->flags); + } else { + drbd_uuid_new_current(device); + drbd_send_uuids(peer_device); + } + } + put_ldev(device); + } + } + + if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(device); + drbd_send_uuids(peer_device); + } + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + /* We may still be Primary ourselves. + * No harm done if the bitmap still changes, + * redirtied pages will follow later. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote diskless peer", BM_LOCKED_SET_ALLOWED); + put_ldev(device); + } + + /* Write out all changed bits on demote. + * Though, no need to da that just yet + * if there is a resync going on still */ + if (os.role == R_PRIMARY && ns.role == R_SECONDARY && + device->state.conn <= C_CONNECTED && get_ldev(device)) { + /* No changes to the bitmap expected this time, so assert that, + * even though no harm was done if it did change. */ + drbd_bitmap_io_from_worker(device, &drbd_bm_write, + "demote", BM_LOCKED_TEST_ALLOWED); + put_ldev(device); + } + + /* Last part of the attaching process ... */ + if (ns.conn >= C_CONNECTED && + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { + drbd_send_sizes(peer_device, 0, 0); /* to start sync... */ + drbd_send_uuids(peer_device); + drbd_send_state(peer_device, ns); + } + + /* We want to pause/continue resync, tell peer. */ + if (ns.conn >= C_CONNECTED && + ((os.aftr_isp != ns.aftr_isp) || + (os.user_isp != ns.user_isp))) + drbd_send_state(peer_device, ns); + + /* In case one of the isp bits got set, suspend other devices. */ + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) + suspend_other_sg(device); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in WFReportParams. */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + if (os.conn != C_AHEAD && ns.conn == C_AHEAD) + drbd_send_state(peer_device, ns); + + /* We are in the progress to start a full sync... */ + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) + /* no other bitmap changes expected during this phase */ + drbd_queue_bitmap_io(device, + &drbd_bmio_set_n_write, &abw_start_sync, + "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED); + + /* first half of local IO error, failure to attach, + * or administrative detach */ + if (os.disk != D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; + /* corresponding get_ldev was in __drbd_set_state, to serialize + * our cleanup here with the transition to D_DISKLESS. + * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (device->ldev) { + rcu_read_lock(); + eh = rcu_dereference(device->ldev->disk_conf)->on_io_error; + rcu_read_unlock(); + + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(device, "local-io-error"); + + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &device->flags)) + tl_abort_disk_io(device); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. */ + if (device->state.disk != D_FAILED) + drbd_err(device, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(device->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + drbd_rs_cancel_all(device); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(device); + } + put_ldev(device); + } + + /* second half of local IO error, failure to attach, + * or administrative detach, + * after local_cnt references have reached zero again */ + if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) { + /* We must still be diskless, + * re-attach has to be serialized with this! */ + if (device->state.disk != D_DISKLESS) + drbd_err(device, + "ASSERT FAILED: disk is %s while going diskless\n", + drbd_disk_str(device->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + /* corresponding get_ldev in __drbd_set_state + * this may finally trigger drbd_ldev_destroy. */ + put_ldev(device); + } + + /* Notify peer that I had a local IO error, and did not detached.. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(peer_device, ns); + + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(device); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(device); + + /* sync target done with resync. Explicitly notify peer, even though + * it should (at least for non-empty resyncs) already know itself. */ + if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) + drbd_send_state(peer_device, ns); + + /* Verify finished, or reached stop sector. Peer did not know about + * the stop sector, and we may even have changed the stop sector during + * verify to interrupt/stop early. Send the new state. */ + if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED + && verify_can_do_stop_sector(device)) + drbd_send_state(peer_device, ns); + + /* This triggers bitmap writeout of potentially still unwritten pages + * if the resync finished cleanly, or aborted because of peer disk + * failure, or because of connection loss. + * For resync aborted because of local disk failure, we cannot do + * any bitmap writeout anymore. + * No harm done if some bits change during this phase. + */ + if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { + drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(device); + } + + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(device); + } + + drbd_md_sync(device); +} + +struct after_conn_state_chg_work { + struct drbd_work w; + enum drbd_conns oc; + union drbd_state ns_min; + union drbd_state ns_max; /* new, max state, over all devices */ + enum chg_state_flags flags; + struct drbd_connection *connection; +}; + +static int w_after_conn_state_ch(struct drbd_work *w, int unused) +{ + struct after_conn_state_chg_work *acscw = + container_of(w, struct after_conn_state_chg_work, w); + struct drbd_connection *connection = acscw->connection; + enum drbd_conns oc = acscw->oc; + union drbd_state ns_max = acscw->ns_max; + struct drbd_peer_device *peer_device; + int vnr; + + kfree(acscw); + + /* Upon network configuration, we need to start the receiver */ + if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED) + drbd_thread_start(&connection->receiver); + + if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { + struct net_conf *old_conf; + + mutex_lock(&connection->resource->conf_update); + old_conf = connection->net_conf; + connection->my_addr_len = 0; + connection->peer_addr_len = 0; + RCU_INIT_POINTER(connection->net_conf, NULL); + conn_free_crypto(connection); + mutex_unlock(&connection->resource->conf_update); + + synchronize_rcu(); + kfree(old_conf); + } + + if (ns_max.susp_fen) { + /* case1: The outdate peer handler is successful: */ + if (ns_max.pdsk <= D_OUTDATED) { + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (test_bit(NEW_CUR_UUID, &device->flags)) { + drbd_uuid_new_current(device); + clear_bit(NEW_CUR_UUID, &device->flags); + } + } + rcu_read_unlock(); + spin_lock_irq(&connection->resource->req_lock); + _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE); + spin_unlock_irq(&connection->resource->req_lock); + } + } + kref_put(&connection->kref, drbd_destroy_connection); + + conn_md_sync(connection); + + return 0; +} + +static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf) +{ + enum chg_state_flags flags = ~0; + struct drbd_peer_device *peer_device; + int vnr, first_vol = 1; + union drbd_dev_state os, cs = { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = connection->cstate, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + } }; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + os = device->state; + + if (first_vol) { + cs = os; + first_vol = 0; + continue; + } + + if (cs.role != os.role) + flags &= ~CS_DC_ROLE; + + if (cs.peer != os.peer) + flags &= ~CS_DC_PEER; + + if (cs.conn != os.conn) + flags &= ~CS_DC_CONN; + + if (cs.disk != os.disk) + flags &= ~CS_DC_DISK; + + if (cs.pdsk != os.pdsk) + flags &= ~CS_DC_PDSK; + } + rcu_read_unlock(); + + *pf |= CS_DC_MASK; + *pf &= flags; + (*pcs).i = cs.i; +} + +static enum drbd_state_rv +conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + union drbd_state ns, os; + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + os = drbd_read_state(device); + ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + if (ns.i == os.i) + continue; + + rv = is_valid_transition(os, ns); + + if (rv >= SS_SUCCESS && !(flags & CS_HARD)) { + rv = is_valid_state(device, ns); + if (rv < SS_SUCCESS) { + if (is_valid_state(device, os) == rv) + rv = is_valid_soft_transition(os, ns, connection); + } else + rv = is_valid_soft_transition(os, ns, connection); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(device, os, ns, rv); + break; + } + } + rcu_read_unlock(); + + return rv; +} + +static void +conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags) +{ + union drbd_state ns, os, ns_max = { }; + union drbd_state ns_min = { + { .role = R_MASK, + .peer = R_MASK, + .conn = val.conn, + .disk = D_MASK, + .pdsk = D_MASK + } }; + struct drbd_peer_device *peer_device; + enum drbd_state_rv rv; + int vnr, number_of_volumes = 0; + + if (mask.conn == C_MASK) { + /* remember last connect time so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS) + connection->last_reconnect_jif = jiffies; + + connection->cstate = val.conn; + } + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + number_of_volumes++; + os = drbd_read_state(device); + ns = apply_mask_val(os, mask, val); + ns = sanitize_state(device, os, ns, NULL); + + if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) + ns.disk = os.disk; + + rv = __drbd_set_state(device, ns, flags, NULL); + if (rv < SS_SUCCESS) + BUG(); + + ns.i = device->state.i; + ns_max.role = max_role(ns.role, ns_max.role); + ns_max.peer = max_role(ns.peer, ns_max.peer); + ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn); + ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk); + ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk); + + ns_min.role = min_role(ns.role, ns_min.role); + ns_min.peer = min_role(ns.peer, ns_min.peer); + ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn); + ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk); + ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk); + } + rcu_read_unlock(); + + if (number_of_volumes == 0) { + ns_min = ns_max = (union drbd_state) { { + .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = val.conn, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN + } }; + } + + ns_min.susp = ns_max.susp = connection->resource->susp; + ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod; + ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen; + + *pns_min = ns_min; + *pns_max = ns_max; +} + +static enum drbd_state_rv +_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val) +{ + enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */; + + if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags)) + rv = SS_CW_SUCCESS; + + if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags)) + rv = SS_CW_FAILED_BY_PEER; + + err = conn_is_valid_transition(connection, mask, val, 0); + if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS) + return rv; + + return err; +} + +enum drbd_state_rv +_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv = SS_SUCCESS; + struct after_conn_state_chg_work *acscw; + enum drbd_conns oc = connection->cstate; + union drbd_state ns_max, ns_min, os; + bool have_mutex = false; + + if (mask.conn) { + rv = is_valid_conn_transition(oc, val.conn); + if (rv < SS_SUCCESS) + goto abort; + } + + rv = conn_is_valid_transition(connection, mask, val, flags); + if (rv < SS_SUCCESS) + goto abort; + + if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING && + !(flags & (CS_LOCAL_ONLY | CS_HARD))) { + + /* This will be a cluster-wide state change. + * Need to give up the spinlock, grab the mutex, + * then send the state change request, ... */ + spin_unlock_irq(&connection->resource->req_lock); + mutex_lock(&connection->cstate_mutex); + have_mutex = true; + + set_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + if (conn_send_state_req(connection, mask, val)) { + /* sending failed. */ + clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + rv = SS_CW_FAILED_BY_PEER; + /* need to re-aquire the spin lock, though */ + goto abort_unlocked; + } + + if (val.conn == C_DISCONNECTING) + set_bit(DISCONNECT_SENT, &connection->flags); + + /* ... and re-aquire the spinlock. + * If _conn_rq_cond() returned >= SS_SUCCESS, we must call + * conn_set_state() within the same spinlock. */ + spin_lock_irq(&connection->resource->req_lock); + wait_event_lock_irq(connection->ping_wait, + (rv = _conn_rq_cond(connection, mask, val)), + connection->resource->req_lock); + clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags); + if (rv < SS_SUCCESS) + goto abort; + } + + conn_old_common_state(connection, &os, &flags); + flags |= CS_DC_SUSP; + conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); + conn_pr_state_change(connection, os, ns_max, flags); + + acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); + if (acscw) { + acscw->oc = os.conn; + acscw->ns_min = ns_min; + acscw->ns_max = ns_max; + acscw->flags = flags; + acscw->w.cb = w_after_conn_state_ch; + kref_get(&connection->kref); + acscw->connection = connection; + drbd_queue_work(&connection->sender_work, &acscw->w); + } else { + drbd_err(connection, "Could not kmalloc an acscw\n"); + } + + abort: + if (have_mutex) { + /* mutex_unlock() "... must not be used in interrupt context.", + * so give up the spinlock, then re-aquire it */ + spin_unlock_irq(&connection->resource->req_lock); + abort_unlocked: + mutex_unlock(&connection->cstate_mutex); + spin_lock_irq(&connection->resource->req_lock); + } + if (rv < SS_SUCCESS && flags & CS_VERBOSE) { + drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv)); + drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i); + drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn)); + } + return rv; +} + +enum drbd_state_rv +conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags) +{ + enum drbd_state_rv rv; + + spin_lock_irq(&connection->resource->req_lock); + rv = _conn_request_state(connection, mask, val, flags); + spin_unlock_irq(&connection->resource->req_lock); + + return rv; +} diff --git a/kernel/drivers/block/drbd/drbd_state.h b/kernel/drivers/block/drbd/drbd_state.h new file mode 100644 index 000000000..7f53c4082 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_state.h @@ -0,0 +1,166 @@ +#ifndef DRBD_STATE_H +#define DRBD_STATE_H + +struct drbd_device; +struct drbd_connection; + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. + * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. + */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 +#define susp_nod_MASK 1 +#define susp_fen_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) + +enum chg_state_flags { + CS_HARD = 1 << 0, + CS_VERBOSE = 1 << 1, + CS_WAIT_COMPLETE = 1 << 2, + CS_SERIALIZE = 1 << 3, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, + CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */ + CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */ + CS_DC_PEER = 1 << 6, + CS_DC_CONN = 1 << 7, + CS_DC_DISK = 1 << 8, + CS_DC_PDSK = 1 << 9, + CS_DC_SUSP = 1 << 10, + CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK, + CS_IGN_OUTD_FAIL = 1 << 11, +}; + +/* drbd_dev_state and drbd_state are different types. This is to stress the + small difference. There is no suspended flag (.susp), and no suspended + while fence handler runs flas (susp_fen). */ +union drbd_dev_state { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned _unused:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned peer_isp:1 ; + unsigned user_isp:1 ; + unsigned _pad:11; /* 0 unused */ +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned _pad:11; + unsigned user_isp:1 ; + unsigned peer_isp:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned _unused:1 ; + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ +#else +# error "this endianess is not supported" +#endif + }; + unsigned int i; +}; + +extern enum drbd_state_rv drbd_change_state(struct drbd_device *device, + enum chg_state_flags f, + union drbd_state mask, + union drbd_state val); +extern void drbd_force_state(struct drbd_device *, union drbd_state, + union drbd_state); +extern enum drbd_state_rv _drbd_request_state(struct drbd_device *, + union drbd_state, + union drbd_state, + enum chg_state_flags); + +extern enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, + union drbd_state, enum chg_state_flags); + +extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); +extern void print_st_err(struct drbd_device *, union drbd_state, + union drbd_state, int); + +enum drbd_state_rv +_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +enum drbd_state_rv +conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val, + enum chg_state_flags flags); + +extern void drbd_resume_al(struct drbd_device *device); +extern bool conn_all_vols_unconf(struct drbd_connection *connection); + +/** + * drbd_request_state() - Reqest a state change + * @device: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. + */ +static inline int drbd_request_state(struct drbd_device *device, + union drbd_state mask, + union drbd_state val) +{ + return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED); +} + +enum drbd_role conn_highest_role(struct drbd_connection *connection); +enum drbd_role conn_highest_peer(struct drbd_connection *connection); +enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection); +enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection); +enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection); +enum drbd_conns conn_lowest_conn(struct drbd_connection *connection); + +#endif diff --git a/kernel/drivers/block/drbd/drbd_strings.c b/kernel/drivers/block/drbd/drbd_strings.c new file mode 100644 index 000000000..80b0f63c7 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_strings.c @@ -0,0 +1,118 @@ +/* + drbd.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include +#include "drbd_strings.h" + +static const char *drbd_conn_s_names[] = { + [C_STANDALONE] = "StandAlone", + [C_DISCONNECTING] = "Disconnecting", + [C_UNCONNECTED] = "Unconnected", + [C_TIMEOUT] = "Timeout", + [C_BROKEN_PIPE] = "BrokenPipe", + [C_NETWORK_FAILURE] = "NetworkFailure", + [C_PROTOCOL_ERROR] = "ProtocolError", + [C_WF_CONNECTION] = "WFConnection", + [C_WF_REPORT_PARAMS] = "WFReportParams", + [C_TEAR_DOWN] = "TearDown", + [C_CONNECTED] = "Connected", + [C_STARTING_SYNC_S] = "StartingSyncS", + [C_STARTING_SYNC_T] = "StartingSyncT", + [C_WF_BITMAP_S] = "WFBitMapS", + [C_WF_BITMAP_T] = "WFBitMapT", + [C_WF_SYNC_UUID] = "WFSyncUUID", + [C_SYNC_SOURCE] = "SyncSource", + [C_SYNC_TARGET] = "SyncTarget", + [C_PAUSED_SYNC_S] = "PausedSyncS", + [C_PAUSED_SYNC_T] = "PausedSyncT", + [C_VERIFY_S] = "VerifyS", + [C_VERIFY_T] = "VerifyT", + [C_AHEAD] = "Ahead", + [C_BEHIND] = "Behind", +}; + +static const char *drbd_role_s_names[] = { + [R_PRIMARY] = "Primary", + [R_SECONDARY] = "Secondary", + [R_UNKNOWN] = "Unknown" +}; + +static const char *drbd_disk_s_names[] = { + [D_DISKLESS] = "Diskless", + [D_ATTACHING] = "Attaching", + [D_FAILED] = "Failed", + [D_NEGOTIATING] = "Negotiating", + [D_INCONSISTENT] = "Inconsistent", + [D_OUTDATED] = "Outdated", + [D_UNKNOWN] = "DUnknown", + [D_CONSISTENT] = "Consistent", + [D_UP_TO_DATE] = "UpToDate", +}; + +static const char *drbd_state_sw_errors[] = { + [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", + [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", + [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", + [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", + [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", + [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", + [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", + [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", + [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", + [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", + [-SS_DEVICE_IN_USE] = "Device is held open by someone", + [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", + [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", + [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", + [-SS_NOT_SUPPORTED] = "Peer does not support protocol", + [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", + [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", + [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", + [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", +}; + +const char *drbd_conn_str(enum drbd_conns s) +{ + /* enums are unsigned... */ + return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s]; +} + +const char *drbd_role_str(enum drbd_role s) +{ + return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; +} + +const char *drbd_disk_str(enum drbd_disk_state s) +{ + return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; +} + +const char *drbd_set_st_err_str(enum drbd_state_rv err) +{ + return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : + err > SS_TWO_PRIMARIES ? "TOO_LARGE" + : drbd_state_sw_errors[-err]; +} diff --git a/kernel/drivers/block/drbd/drbd_strings.h b/kernel/drivers/block/drbd/drbd_strings.h new file mode 100644 index 000000000..f9923cc88 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_strings.h @@ -0,0 +1,9 @@ +#ifndef __DRBD_STRINGS_H +#define __DRBD_STRINGS_H + +extern const char *drbd_conn_str(enum drbd_conns); +extern const char *drbd_role_str(enum drbd_role); +extern const char *drbd_disk_str(enum drbd_disk_state); +extern const char *drbd_set_st_err_str(enum drbd_state_rv); + +#endif /* __DRBD_STRINGS_H */ diff --git a/kernel/drivers/block/drbd/drbd_vli.h b/kernel/drivers/block/drbd/drbd_vli.h new file mode 100644 index 000000000..8cb1532a3 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_vli.h @@ -0,0 +1,351 @@ +/* +-*- linux-c -*- + drbd_receiver.c + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_VLI_H +#define _DRBD_VLI_H + +/* + * At a granularity of 4KiB storage represented per bit, + * and stroage sizes of several TiB, + * and possibly small-bandwidth replication, + * the bitmap transfer time can take much too long, + * if transmitted in plain text. + * + * We try to reduce the transferred bitmap information + * by encoding runlengths of bit polarity. + * + * We never actually need to encode a "zero" (runlengths are positive). + * But then we have to store the value of the first bit. + * The first bit of information thus shall encode if the first runlength + * gives the number of set or unset bits. + * + * We assume that large areas are either completely set or unset, + * which gives good compression with any runlength method, + * even when encoding the runlength as fixed size 32bit/64bit integers. + * + * Still, there may be areas where the polarity flips every few bits, + * and encoding the runlength sequence of those areas with fix size + * integers would be much worse than plaintext. + * + * We want to encode small runlength values with minimum code length, + * while still being able to encode a Huge run of all zeros. + * + * Thus we need a Variable Length Integer encoding, VLI. + * + * For some cases, we produce more code bits than plaintext input. + * We need to send incompressible chunks as plaintext, skip over them + * and then see if the next chunk compresses better. + * + * We don't care too much about "excellent" compression ratio for large + * runlengths (all set/all clear): whether we achieve a factor of 100 + * or 1000 is not that much of an issue. + * We do not want to waste too much on short runlengths in the "noisy" + * parts of the bitmap, though. + * + * There are endless variants of VLI, we experimented with: + * * simple byte-based + * * various bit based with different code word length. + * + * To avoid yet an other configuration parameter (choice of bitmap compression + * algorithm) which was difficult to explain and tune, we just chose the one + * variant that turned out best in all test cases. + * Based on real world usage patterns, with device sizes ranging from a few GiB + * to several TiB, file server/mailserver/webserver/mysql/postgress, + * mostly idle to really busy, the all time winner (though sometimes only + * marginally better) is: + */ + +/* + * encoding is "visualised" as + * __little endian__ bitstream, least significant bit first (left most) + * + * this particular encoding is chosen so that the prefix code + * starts as unary encoding the level, then modified so that + * 10 levels can be described in 8bit, with minimal overhead + * for the smaller levels. + * + * Number of data bits follow fibonacci sequence, with the exception of the + * last level (+1 data bit, so it makes 64bit total). The only worse code when + * encoding bit polarity runlength is 1 plain bits => 2 code bits. +prefix data bits max val Nº data bits +0 x 0x2 1 +10 x 0x4 1 +110 xx 0x8 2 +1110 xxx 0x10 3 +11110 xxx xx 0x30 5 +111110 xx xxxxxx 0x130 8 +11111100 xxxxxxxx xxxxx 0x2130 13 +11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 +11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 +11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 + * maximum encodable value: 0x100000400202130 == 2**56 + some */ + +/* compression "table": + transmitted x 0.29 + as plaintext x ........................ + x ........................ + x ........................ + x 0.59 0.21........................ + x ........................................................ + x .. c ................................................... + x 0.44.. o ................................................... + x .......... d ................................................... + x .......... e ................................................... + X............. ................................................... + x.............. b ................................................... +2.0x............... i ................................................... + #X................ t ................................................... + #................. s ........................... plain bits .......... +-+----------------------------------------------------------------------- + 1 16 32 64 +*/ + +/* LEVEL: (total bits, prefix bits, prefix value), + * sorted ascending by number of total bits. + * The rest of the code table is calculated at compiletime from this. */ + +/* fibonacci data 1, 1, ... */ +#define VLI_L_1_1() do { \ + LEVEL( 2, 1, 0x00); \ + LEVEL( 3, 2, 0x01); \ + LEVEL( 5, 3, 0x03); \ + LEVEL( 7, 4, 0x07); \ + LEVEL(10, 5, 0x0f); \ + LEVEL(14, 6, 0x1f); \ + LEVEL(21, 8, 0x3f); \ + LEVEL(29, 8, 0x7f); \ + LEVEL(42, 8, 0xbf); \ + LEVEL(64, 8, 0xff); \ + } while (0) + +/* finds a suitable level to decode the least significant part of in. + * returns number of bits consumed. + * + * BUG() for bad input, as that would mean a buggy code table. */ +static inline int vli_decode_bits(u64 *out, const u64 in) +{ + u64 adj = 1; + +#define LEVEL(t,b,v) \ + do { \ + if ((in & ((1 << b) -1)) == v) { \ + *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ + return t; \ + } \ + adj += 1ULL << (t - b); \ + } while (0) + + VLI_L_1_1(); + + /* NOT REACHED, if VLI_LEVELS code table is defined properly */ + BUG(); +#undef LEVEL +} + +/* return number of code bits needed, + * or negative error number */ +static inline int __vli_encode_bits(u64 *out, const u64 in) +{ + u64 max = 0; + u64 adj = 1; + + if (in == 0) + return -EINVAL; + +#define LEVEL(t,b,v) do { \ + max += 1ULL << (t - b); \ + if (in <= max) { \ + if (out) \ + *out = ((in - adj) << b) | v; \ + return t; \ + } \ + adj = max + 1; \ + } while (0) + + VLI_L_1_1(); + + return -EOVERFLOW; +#undef LEVEL +} + +#undef VLI_L_1_1 + +/* code from here down is independend of actually used bit code */ + +/* + * Code length is determined by some unique (e.g. unary) prefix. + * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, + * not a byte stream. + */ + +/* for the bitstream, we need a cursor */ +struct bitstream_cursor { + /* the current byte */ + u8 *b; + /* the current bit within *b, nomalized: 0..7 */ + unsigned int bit; +}; + +/* initialize cursor to point to first bit of stream */ +static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) +{ + cur->b = s; + cur->bit = 0; +} + +/* advance cursor by that many bits; maximum expected input value: 64, + * but depending on VLI implementation, it may be more. */ +static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) +{ + bits += cur->bit; + cur->b = cur->b + (bits >> 3); + cur->bit = bits & 7; +} + +/* the bitstream itself knows its length */ +struct bitstream { + struct bitstream_cursor cur; + unsigned char *buf; + size_t buf_len; /* in bytes */ + + /* for input stream: + * number of trailing 0 bits for padding + * total number of valid bits in stream: buf_len * 8 - pad_bits */ + unsigned int pad_bits; +}; + +static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) +{ + bs->buf = s; + bs->buf_len = len; + bs->pad_bits = pad_bits; + bitstream_cursor_reset(&bs->cur, bs->buf); +} + +static inline void bitstream_rewind(struct bitstream *bs) +{ + bitstream_cursor_reset(&bs->cur, bs->buf); + memset(bs->buf, 0, bs->buf_len); +} + +/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. + * Ignores "pad_bits". + * Returns zero if bits == 0 (nothing to do). + * Returns number of bits used if successful. + * + * If there is not enough room left in bitstream, + * leaves bitstream unchanged and returns -ENOBUFS. + */ +static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) +{ + unsigned char *b = bs->cur.b; + unsigned int tmp; + + if (bits == 0) + return 0; + + if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) + return -ENOBUFS; + + /* paranoia: strip off hi bits; they should not be set anyways. */ + if (bits < 64) + val &= ~0ULL >> (64 - bits); + + *b++ |= (val & 0xff) << bs->cur.bit; + + for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) + *b++ |= (val >> tmp) & 0xff; + + bitstream_cursor_advance(&bs->cur, bits); + return bits; +} + +/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. + * + * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. + * + * If there are less than the requested number of valid bits left in the + * bitstream, still fetches all available bits. + * + * Returns number of actually fetched bits. + */ +static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) +{ + u64 val; + unsigned int n; + + if (bits > 64) + return -EINVAL; + + if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) + bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) + - bs->cur.bit - bs->pad_bits; + + if (bits == 0) { + *out = 0; + return 0; + } + + /* get the high bits */ + val = 0; + n = (bs->cur.bit + bits + 7) >> 3; + /* n may be at most 9, if cur.bit + bits > 64 */ + /* which means this copies at most 8 byte */ + if (n) { + memcpy(&val, bs->cur.b+1, n - 1); + val = le64_to_cpu(val) << (8 - bs->cur.bit); + } + + /* we still need the low bits */ + val |= bs->cur.b[0] >> bs->cur.bit; + + /* and mask out bits we don't want */ + val &= ~0ULL >> (64 - bits); + + bitstream_cursor_advance(&bs->cur, bits); + *out = val; + + return bits; +} + +/* encodes @in as vli into @bs; + + * return values + * > 0: number of bits successfully stored in bitstream + * -ENOBUFS @bs is full + * -EINVAL input zero (invalid) + * -EOVERFLOW input too large for this vli code (invalid) + */ +static inline int vli_encode_bits(struct bitstream *bs, u64 in) +{ + u64 code = code; + int bits = __vli_encode_bits(&code, in); + + if (bits <= 0) + return bits; + + return bitstream_put_bits(bs, code, bits); +} + +#endif diff --git a/kernel/drivers/block/drbd/drbd_worker.c b/kernel/drivers/block/drbd/drbd_worker.c new file mode 100644 index 000000000..d0fae55d8 --- /dev/null +++ b/kernel/drivers/block/drbd/drbd_worker.c @@ -0,0 +1,2156 @@ +/* + drbd_worker.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drbd_int.h" +#include "drbd_protocol.h" +#include "drbd_req.h" + +static int make_ov_request(struct drbd_device *, int); +static int make_resync_request(struct drbd_device *, int); + +/* endio handlers: + * drbd_md_endio (defined here) + * drbd_request_endio (defined here) + * drbd_peer_request_endio (defined here) + * drbd_bm_endio (defined in drbd_bitmap.c) + * + * For all these callbacks, note the following: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. + * Try to get the locking right :) + * + */ + + +/* About the global_state_lock + Each state transition on an device holds a read lock. In case we have + to evaluate the resync after dependencies, we grab a write lock, because + we need stable states on all devices for that. */ +rwlock_t global_state_lock; + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +void drbd_md_endio(struct bio *bio, int error) +{ + struct drbd_device *device; + + device = bio->bi_private; + device->md_io.error = error; + + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. + * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&device->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(device); + device->md_io.done = 1; + wake_up(&device->misc_wait); + bio_put(bio); + if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */ + put_ldev(device); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local) +{ + unsigned long flags = 0; + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + + spin_lock_irqsave(&device->resource->req_lock, flags); + device->read_cnt += peer_req->i.size >> 9; + list_del(&peer_req->w.list); + if (list_empty(&device->read_ee)) + wake_up(&device->ee_wait); + if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) + __drbd_chk_io_error(device, DRBD_READ_ERROR); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w); + put_ldev(device); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver, final stage. */ +void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local) +{ + unsigned long flags = 0; + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct drbd_interval i; + int do_wake; + u64 block_id; + int do_al_complete_io; + + /* after we moved peer_req to done_ee, + * we may no longer access it, + * it may be freed/reused already! + * (as soon as we release the req_lock) */ + i = peer_req->i; + do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO; + block_id = peer_req->block_id; + peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO; + + spin_lock_irqsave(&device->resource->req_lock, flags); + device->writ_cnt += peer_req->i.size >> 9; + list_move_tail(&peer_req->w.list, &device->done_ee); + + /* + * Do not remove from the write_requests tree here: we did not send the + * Ack yet and did not wake possibly waiting conflicting requests. + * Removed from the tree from "drbd_process_done_ee" within the + * appropriate dw.cb (e_end_block/e_end_resync_block) or from + * _drbd_clear_done_ee. + */ + + do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee); + + /* FIXME do we want to detach for failed REQ_DISCARD? + * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ + if (peer_req->flags & EE_WAS_ERROR) + __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + + if (block_id == ID_SYNCER) + drbd_rs_complete_io(device, i.sector); + + if (do_wake) + wake_up(&device->ee_wait); + + if (do_al_complete_io) + drbd_al_complete_io(device, &i); + + wake_asender(peer_device->connection); + put_ldev(device); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_peer_request_endio(struct bio *bio, int error) +{ + struct drbd_peer_request *peer_req = bio->bi_private; + struct drbd_device *device = peer_req->peer_device->device; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_write = bio_data_dir(bio) == WRITE; + int is_discard = !!(bio->bi_rw & REQ_DISCARD); + + if (error && __ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "%s: error=%d s=%llus\n", + is_write ? (is_discard ? "discard" : "write") + : "read", error, + (unsigned long long)peer_req->i.sector); + if (!error && !uptodate) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_warn(device, "%s: setting error to -EIO s=%llus\n", + is_write ? "write" : "read", + (unsigned long long)peer_req->i.sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + if (error) + set_bit(__EE_WAS_ERROR, &peer_req->flags); + + bio_put(bio); /* no need for the bio anymore */ + if (atomic_dec_and_test(&peer_req->pending_bios)) { + if (is_write) + drbd_endio_write_sec_final(peer_req); + else + drbd_endio_read_sec_final(peer_req); + } +} + +/* read, readA or write requests on R_PRIMARY coming from drbd_make_request + */ +void drbd_request_endio(struct bio *bio, int error) +{ + unsigned long flags; + struct drbd_request *req = bio->bi_private; + struct drbd_device *device = req->device; + struct bio_and_error m; + enum drbd_req_event what; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + if (!error && !uptodate) { + drbd_warn(device, "p %s: setting error to -EIO\n", + bio_data_dir(bio) == WRITE ? "write" : "read"); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + + /* If this request was aborted locally before, + * but now was completed "successfully", + * chances are that this caused arbitrary data corruption. + * + * "aborting" requests, or force-detaching the disk, is intended for + * completely blocked/hung local backing devices which do no longer + * complete requests at all, not even do error completions. In this + * situation, usually a hard-reset and failover is the only way out. + * + * By "aborting", basically faking a local error-completion, + * we allow for a more graceful swichover by cleanly migrating services. + * Still the affected node has to be rebooted "soon". + * + * By completing these requests, we allow the upper layers to re-use + * the associated data pages. + * + * If later the local backing device "recovers", and now DMAs some data + * from disk into the original request pages, in the best case it will + * just put random data into unused pages; but typically it will corrupt + * meanwhile completely unrelated data, causing all sorts of damage. + * + * Which means delayed successful completion, + * especially for READ requests, + * is a reason to panic(). + * + * We assume that a delayed *error* completion is OK, + * though we still will complain noisily about it. + */ + if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); + + if (!error) + panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + } + + /* to avoid recursion in __req_mod */ + if (unlikely(error)) { + if (bio->bi_rw & REQ_DISCARD) + what = (error == -EOPNOTSUPP) + ? DISCARD_COMPLETED_NOTSUPP + : DISCARD_COMPLETED_WITH_ERROR; + else + what = (bio_data_dir(bio) == WRITE) + ? WRITE_COMPLETED_WITH_ERROR + : (bio_rw(bio) == READ) + ? READ_COMPLETED_WITH_ERROR + : READ_AHEAD_COMPLETED_WITH_ERROR; + } else + what = COMPLETED_OK; + + bio_put(req->private_bio); + req->private_bio = ERR_PTR(error); + + /* not req_mod(), we need irqsave here! */ + spin_lock_irqsave(&device->resource->req_lock, flags); + __req_mod(req, what, &m); + spin_unlock_irqrestore(&device->resource->req_lock, flags); + put_ldev(device); + + if (m.bio) + complete_master_bio(device, &m); +} + +void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct page *page = peer_req->pages; + struct page *tmp; + unsigned len; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + while ((tmp = page_chain_next(page))) { + /* all but the last page will be fully used */ + sg_set_page(&sg, page, PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + page = tmp; + } + /* and now the last, possibly only partially used page */ + len = peer_req->i.size & (PAGE_SIZE - 1); + sg_set_page(&sg, page, len ?: PAGE_SIZE, 0); + crypto_hash_update(&desc, &sg, sg.length); + crypto_hash_final(&desc, digest); +} + +void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct bio_vec bvec; + struct bvec_iter iter; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + bio_for_each_segment(bvec, bio, iter) { + sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); + crypto_hash_update(&desc, &sg, sg.length); + } + crypto_hash_final(&desc, digest); +} + +/* MAYBE merge common code with w_e_end_ov_req */ +static int w_e_send_csum(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int digest_size; + void *digest; + int err = 0; + + if (unlikely(cancel)) + goto out; + + if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0)) + goto out; + + digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest); + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. */ + drbd_free_peer_req(device, peer_req); + peer_req = NULL; + inc_rs_pending(device); + err = drbd_send_drequest_csum(peer_device, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + drbd_err(device, "kmalloc() of digest failed.\n"); + err = -ENOMEM; + } + +out: + if (peer_req) + drbd_free_peer_req(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_drequest(..., csum) failed\n"); + return err; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size) +{ + struct drbd_device *device = peer_device->device; + struct drbd_peer_request *peer_req; + + if (!get_ldev(device)) + return -EIO; + + /* GFP_TRY, because if there is no memory available right now, this may + * be rescheduled for later. It is "only" background resync, after all. */ + peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, + size, true /* has real payload */, GFP_TRY); + if (!peer_req) + goto defer; + + peer_req->w.cb = w_e_send_csum; + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->read_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(size >> 9, &device->rs_sect_ev); + if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0) + return 0; + + /* If it failed because of ENOMEM, retry should help. If it failed + * because bio_add_page failed (probably broken lower level driver), + * retry may or may not help. + * If it does not, you may need to force disconnect. */ + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); +defer: + put_ldev(device); + return -EAGAIN; +} + +int w_resync_timer(struct drbd_work *w, int cancel) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, resync_work); + + switch (device->state.conn) { + case C_VERIFY_S: + make_ov_request(device, cancel); + break; + case C_SYNC_TARGET: + make_resync_request(device, cancel); + break; + } + + return 0; +} + +void resync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + + drbd_queue_work_if_unqueued( + &first_peer_device(device)->connection->sender_work, + &device->resync_work); +} + +static void fifo_set(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] = value; +} + +static int fifo_push(struct fifo_buffer *fb, int value) +{ + int ov; + + ov = fb->values[fb->head_index]; + fb->values[fb->head_index++] = value; + + if (fb->head_index >= fb->size) + fb->head_index = 0; + + return ov; +} + +static void fifo_add_val(struct fifo_buffer *fb, int value) +{ + int i; + + for (i = 0; i < fb->size; i++) + fb->values[i] += value; +} + +struct fifo_buffer *fifo_alloc(int fifo_size) +{ + struct fifo_buffer *fb; + + fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO); + if (!fb) + return NULL; + + fb->head_index = 0; + fb->size = fifo_size; + fb->total = 0; + + return fb; +} + +static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in) +{ + struct disk_conf *dc; + unsigned int want; /* The number of sectors we want in-flight */ + int req_sect; /* Number of sectors to request in this turn */ + int correction; /* Number of sectors more we need in-flight */ + int cps; /* correction per invocation of drbd_rs_controller() */ + int steps; /* Number of time steps to plan ahead */ + int curr_corr; + int max_sect; + struct fifo_buffer *plan; + + dc = rcu_dereference(device->ldev->disk_conf); + plan = rcu_dereference(device->rs_plan_s); + + steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */ + + if (device->rs_in_flight + sect_in == 0) { /* At start of resync */ + want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps; + } else { /* normal path */ + want = dc->c_fill_target ? dc->c_fill_target : + sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10); + } + + correction = want - device->rs_in_flight - plan->total; + + /* Plan ahead */ + cps = correction / steps; + fifo_add_val(plan, cps); + plan->total += cps * steps; + + /* What we do in this step */ + curr_corr = fifo_push(plan, 0); + plan->total -= curr_corr; + + req_sect = sect_in + curr_corr; + if (req_sect < 0) + req_sect = 0; + + max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ; + if (req_sect > max_sect) + req_sect = max_sect; + + /* + drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n", + sect_in, device->rs_in_flight, want, correction, + steps, cps, device->rs_planed, curr_corr, req_sect); + */ + + return req_sect; +} + +static int drbd_rs_number_requests(struct drbd_device *device) +{ + unsigned int sect_in; /* Number of sectors that came in since the last turn */ + int number, mxb; + + sect_in = atomic_xchg(&device->rs_sect_in, 0); + device->rs_in_flight -= sect_in; + + rcu_read_lock(); + mxb = drbd_get_max_buffers(device) / 2; + if (rcu_dereference(device->rs_plan_s)->size) { + number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9); + device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME; + } else { + device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate; + number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ); + } + rcu_read_unlock(); + + /* Don't have more than "max-buffers"/2 in-flight. + * Otherwise we may cause the remote site to stall on drbd_alloc_pages(), + * potentially causing a distributed deadlock on congestion during + * online-verify or (checksum-based) resync, if max-buffers, + * socket buffer sizes and resync rate settings are mis-configured. */ + + /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k), + * mxb (as used here, and in drbd_alloc_pages on the peer) is + * "number of pages" (typically also 4k), + * but "rs_in_flight" is in "sectors" (512 Byte). */ + if (mxb - device->rs_in_flight/8 < number) + number = mxb - device->rs_in_flight/8; + + return number; +} + +static int make_resync_request(struct drbd_device *const device, int cancel) +{ + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + int max_bio_size; + int number, rollback_i, size; + int align, requeue = 0; + int i = 0; + + if (unlikely(cancel)) + return 0; + + if (device->rs_total == 0) { + /* empty resync? */ + drbd_resync_finished(device); + return 0; + } + + if (!get_ldev(device)) { + /* Since we only need to access device->rsync a + get_ldev_if_state(device,D_FAILED) would be sufficient, but + to continue resync with a broken disk makes no sense at + all */ + drbd_err(device, "Disk broke down during resync!\n"); + return 0; + } + + max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; + number = drbd_rs_number_requests(device); + if (number <= 0) + goto requeue; + + for (i = 0; i < number; i++) { + /* Stop generating RS requests when half of the send buffer is filled, + * but notify TCP that we'd like to have more space. */ + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + struct sock *sk = connection->data.socket->sk; + int queued = sk->sk_wmem_queued; + int sndbuf = sk->sk_sndbuf; + if (queued > sndbuf / 2) { + requeue = 1; + if (sk->sk_socket) + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + } + } else + requeue = 1; + mutex_unlock(&connection->data.mutex); + if (requeue) + goto requeue; + +next_sector: + size = BM_BLOCK_SIZE; + bit = drbd_bm_find_next(device, device->bm_resync_fo); + + if (bit == DRBD_END_OF_BITMAP) { + device->bm_resync_fo = drbd_bm_bits(device); + put_ldev(device); + return 0; + } + + sector = BM_BIT_TO_SECT(bit); + + if (drbd_try_rs_begin_io(device, sector)) { + device->bm_resync_fo = bit; + goto requeue; + } + device->bm_resync_fo = bit + 1; + + if (unlikely(drbd_bm_test_bit(device, bit) == 0)) { + drbd_rs_complete_io(device, sector); + goto next_sector; + } + +#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE + /* try to find some adjacent bits. + * we stop if we have already the maximum req size. + * + * Additionally always align bigger requests, in order to + * be prepared for all stripe sizes of software RAIDs. + */ + align = 1; + rollback_i = i; + while (i < number) { + if (size + BM_BLOCK_SIZE > max_bio_size) + break; + + /* Be always aligned */ + if (sector & ((1<<(align+3))-1)) + break; + + /* do not cross extent boundaries */ + if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) + break; + /* now, is it actually dirty, after all? + * caution, drbd_bm_test_bit is tri-state for some + * obscure reason; ( b == 0 ) would get the out-of-band + * only accidentally right because of the "oddly sized" + * adjustment below */ + if (drbd_bm_test_bit(device, bit+1) != 1) + break; + bit++; + size += BM_BLOCK_SIZE; + if ((BM_BLOCK_SIZE << align) <= size) + align++; + i++; + } + /* if we merged some, + * reset the offset to start the next drbd_bm_find_next from */ + if (size > BM_BLOCK_SIZE) + device->bm_resync_fo = bit + 1; +#endif + + /* adjust very last sectors, in case we are oddly sized */ + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + if (device->use_csums) { + switch (read_for_csum(peer_device, sector, size)) { + case -EIO: /* Disk failure */ + put_ldev(device); + return -EIO; + case -EAGAIN: /* allocation failed, or ldev busy */ + drbd_rs_complete_io(device, sector); + device->bm_resync_fo = BM_SECT_TO_BIT(sector); + i = rollback_i; + goto requeue; + case 0: + /* everything ok */ + break; + default: + BUG(); + } + } else { + int err; + + inc_rs_pending(device); + err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER); + if (err) { + drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); + dec_rs_pending(device); + put_ldev(device); + return err; + } + } + } + + if (device->bm_resync_fo >= drbd_bm_bits(device)) { + /* last syncer _request_ was sent, + * but the P_RS_DATA_REPLY not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + put_ldev(device); + return 0; + } + + requeue: + device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); + mod_timer(&device->resync_timer, jiffies + SLEEP_TIME); + put_ldev(device); + return 0; +} + +static int make_ov_request(struct drbd_device *device, int cancel) +{ + int number, i, size; + sector_t sector; + const sector_t capacity = drbd_get_capacity(device->this_bdev); + bool stop_sector_reached = false; + + if (unlikely(cancel)) + return 1; + + number = drbd_rs_number_requests(device); + + sector = device->ov_position; + for (i = 0; i < number; i++) { + if (sector >= capacity) + return 1; + + /* We check for "finished" only in the reply path: + * w_e_end_ov_reply(). + * We need to send at least one request out. */ + stop_sector_reached = i > 0 + && verify_can_do_stop_sector(device) + && sector >= device->ov_stop_sector; + if (stop_sector_reached) + break; + + size = BM_BLOCK_SIZE; + + if (drbd_try_rs_begin_io(device, sector)) { + device->ov_position = sector; + goto requeue; + } + + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + inc_rs_pending(device); + if (drbd_send_ov_request(first_peer_device(device), sector, size)) { + dec_rs_pending(device); + return 0; + } + sector += BM_SECT_PER_BIT; + } + device->ov_position = sector; + + requeue: + device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9)); + if (i == 0 || !stop_sector_reached) + mod_timer(&device->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + +int w_ov_finished(struct drbd_work *w, int cancel) +{ + struct drbd_device_work *dw = + container_of(w, struct drbd_device_work, w); + struct drbd_device *device = dw->device; + kfree(dw); + ov_out_of_sync_print(device); + drbd_resync_finished(device); + + return 0; +} + +static int w_resync_finished(struct drbd_work *w, int cancel) +{ + struct drbd_device_work *dw = + container_of(w, struct drbd_device_work, w); + struct drbd_device *device = dw->device; + kfree(dw); + + drbd_resync_finished(device); + + return 0; +} + +static void ping_peer(struct drbd_device *device) +{ + struct drbd_connection *connection = first_peer_device(device)->connection; + + clear_bit(GOT_PING_ACK, &connection->flags); + request_ping(connection); + wait_event(connection->ping_wait, + test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED); +} + +int drbd_resync_finished(struct drbd_device *device) +{ + unsigned long db, dt, dbdt; + unsigned long n_oos; + union drbd_state os, ns; + struct drbd_device_work *dw; + char *khelper_cmd = NULL; + int verify_done = 0; + + /* Remove all elements from the resync LRU. Since future actions + * might set bits in the (main) bitmap, then the entries in the + * resync LRU would be wrong. */ + if (drbd_rs_del_all(device)) { + /* In case this is not possible now, most probably because + * there are P_RS_DATA_REPLY Packets lingering on the worker's + * queue (or even the read operations for those packets + * is not finished by now). Retry in 100ms. */ + + schedule_timeout_interruptible(HZ / 10); + dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC); + if (dw) { + dw->w.cb = w_resync_finished; + dw->device = device; + drbd_queue_work(&first_peer_device(device)->connection->sender_work, + &dw->w); + return 1; + } + drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); + } + + dt = (jiffies - device->rs_start - device->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + + db = device->rs_total; + /* adjust for verify start and stop sectors, respective reached position */ + if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T) + db -= device->ov_left; + + dbdt = Bit2KB(db/dt); + device->rs_paused /= HZ; + + if (!get_ldev(device)) + goto out; + + ping_peer(device); + + spin_lock_irq(&device->resource->req_lock); + os = drbd_read_state(device); + + verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T); + + /* This protects us against multiple calls (that can happen in the presence + of application IO), and against connectivity loss just before we arrive here. */ + if (os.conn <= C_CONNECTED) + goto out_unlock; + + ns = os; + ns.conn = C_CONNECTED; + + drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", + verify_done ? "Online verify" : "Resync", + dt + device->rs_paused, device->rs_paused, dbdt); + + n_oos = drbd_bm_total_weight(device); + + if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { + if (n_oos) { + drbd_alert(device, "Online verify found %lu %dk block out of sync!\n", + n_oos, Bit2KB(1)); + khelper_cmd = "out-of-sync"; + } + } else { + D_ASSERT(device, (n_oos - device->rs_failed) == 0); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) + khelper_cmd = "after-resync-target"; + + if (device->use_csums && device->rs_total) { + const unsigned long s = device->rs_same_csum; + const unsigned long t = device->rs_total; + const int ratio = + (t == 0) ? 0 : + (t < 100000) ? ((s*100)/t) : (s/(t/100)); + drbd_info(device, "%u %% had equal checksums, eliminated: %luK; " + "transferred %luK total %luK\n", + ratio, + Bit2KB(device->rs_same_csum), + Bit2KB(device->rs_total - device->rs_same_csum), + Bit2KB(device->rs_total)); + } + } + + if (device->rs_failed) { + drbd_info(device, " %lu failed blocks\n", device->rs_failed); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + ns.disk = D_INCONSISTENT; + ns.pdsk = D_UP_TO_DATE; + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_INCONSISTENT; + } + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_UP_TO_DATE; + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + if (device->p_uuid) { + int i; + for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) + _drbd_uuid_set(device, i, device->p_uuid[i]); + drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]); + _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]); + } else { + drbd_err(device, "device->p_uuid is NULL! BUG\n"); + } + } + + if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) { + /* for verify runs, we don't update uuids here, + * so there would be nothing to report. */ + drbd_uuid_set_bm(device, 0UL); + drbd_print_uuids(device, "updated UUIDs"); + if (device->p_uuid) { + /* Now the two UUID sets are equal, update what we + * know of the peer. */ + int i; + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) + device->p_uuid[i] = device->ldev->md.uuid[i]; + } + } + } + + _drbd_set_state(device, ns, CS_VERBOSE, NULL); +out_unlock: + spin_unlock_irq(&device->resource->req_lock); + put_ldev(device); +out: + device->rs_total = 0; + device->rs_failed = 0; + device->rs_paused = 0; + + /* reset start sector, if we reached end of device */ + if (verify_done && device->ov_left == 0) + device->ov_start_sector = 0; + + drbd_md_sync(device); + + if (khelper_cmd) + drbd_khelper(device, khelper_cmd); + + return 1; +} + +/* helper */ +static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + if (drbd_peer_req_has_active_page(peer_req)) { + /* This might happen if sendpage() has not finished */ + int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT; + atomic_add(i, &device->pp_in_use_by_net); + atomic_sub(i, &device->pp_in_use); + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->net_ee); + spin_unlock_irq(&device->resource->req_lock); + wake_up(&drbd_pp_wait); + } else + drbd_free_peer_req(device, peer_req); +} + +/** + * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST + * @device: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_data_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int err; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegDReply. sector=%llus.\n", + (unsigned long long)peer_req->i.sector); + + err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req); + } + + dec_unacked(device); + + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block() failed\n"); + return err; +} + +/** + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_rsdata_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + int err; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (get_ldev_if_state(device, D_FAILED)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + if (device->state.conn == C_AHEAD) { + err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req); + } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + if (likely(device->state.pdsk >= D_INCONSISTENT)) { + inc_rs_pending(device); + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Not sending RSDataReply, " + "partner DISKLESS!\n"); + err = 0; + } + } else { + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegRSDReply. sector %llus.\n", + (unsigned long long)peer_req->i.sector); + + err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); + + /* update resync data with failure */ + drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size); + } + + dec_unacked(device); + + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block() failed\n"); + return err; +} + +int w_e_end_csum_rs_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct digest_info *di; + int digest_size; + void *digest = NULL; + int err, eq = 0; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + if (get_ldev(device)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + di = peer_req->digest; + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + /* quick hack to try to avoid a race against reconfiguration. + * a real fix would be much more involved, + * introducing more locking mechanisms */ + if (peer_device->connection->csums_tfm) { + digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm); + D_ASSERT(device, digest_size == di->digest_size); + digest = kmalloc(digest_size, GFP_NOIO); + } + if (digest) { + drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + + if (eq) { + drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size); + /* rs_same_csums unit is BM_BLOCK_SIZE */ + device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT; + err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req); + } else { + inc_rs_pending(device); + peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */ + peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */ + kfree(di); + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + } + } else { + err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req); + if (__ratelimit(&drbd_ratelimit_state)) + drbd_err(device, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(device); + move_to_net_ee_or_free(device, peer_req); + + if (unlikely(err)) + drbd_err(device, "drbd_send_block/ack() failed\n"); + return err; +} + +int w_e_end_ov_req(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + void *digest; + int err = 0; + + if (unlikely(cancel)) + goto out; + + digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (!digest) { + err = 1; /* terminate the connection in case the allocation failed */ + goto out; + } + + if (likely(!(peer_req->flags & EE_WAS_ERROR))) + drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); + else + memset(digest, 0, digest_size); + + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. */ + drbd_free_peer_req(device, peer_req); + peer_req = NULL; + inc_rs_pending(device); + err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY); + if (err) + dec_rs_pending(device); + kfree(digest); + +out: + if (peer_req) + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return err; +} + +void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size) +{ + if (device->ov_last_oos_start + device->ov_last_oos_size == sector) { + device->ov_last_oos_size += size>>9; + } else { + device->ov_last_oos_start = sector; + device->ov_last_oos_size = size>>9; + } + drbd_set_out_of_sync(device, sector, size); +} + +int w_e_end_ov_reply(struct drbd_work *w, int cancel) +{ + struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w); + struct drbd_peer_device *peer_device = peer_req->peer_device; + struct drbd_device *device = peer_device->device; + struct digest_info *di; + void *digest; + sector_t sector = peer_req->i.sector; + unsigned int size = peer_req->i.size; + int digest_size; + int err, eq = 0; + bool stop_sector_reached = false; + + if (unlikely(cancel)) { + drbd_free_peer_req(device, peer_req); + dec_unacked(device); + return 0; + } + + /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all + * the resync lru has been cleaned up already */ + if (get_ldev(device)) { + drbd_rs_complete_io(device, peer_req->i.sector); + put_ldev(device); + } + + di = peer_req->digest; + + if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { + digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest); + + D_ASSERT(device, digest_size == di->digest_size); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + } + + /* Free peer_req and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_alloc_pages due to pp_in_use > max_buffers. */ + drbd_free_peer_req(device, peer_req); + if (!eq) + drbd_ov_out_of_sync_found(device, sector, size); + else + ov_out_of_sync_print(device); + + err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, + eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + + dec_unacked(device); + + --device->ov_left; + + /* let's advance progress step marks only for every other megabyte */ + if ((device->ov_left & 0x200) == 0x200) + drbd_advance_rs_marks(device, device->ov_left); + + stop_sector_reached = verify_can_do_stop_sector(device) && + (sector + (size>>9)) >= device->ov_stop_sector; + + if (device->ov_left == 0 || stop_sector_reached) { + ov_out_of_sync_print(device); + drbd_resync_finished(device); + } + + return err; +} + +/* FIXME + * We need to track the number of pending barrier acks, + * and to be able to wait for them. + * See also comment in drbd_adm_attach before drbd_suspend_io. + */ +static int drbd_send_barrier(struct drbd_connection *connection) +{ + struct p_barrier *p; + struct drbd_socket *sock; + + sock = &connection->data; + p = conn_prepare_command(connection, sock); + if (!p) + return -EIO; + p->barrier = connection->send.current_epoch_nr; + p->pad = 0; + connection->send.current_epoch_writes = 0; + + return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); +} + +int w_send_write_hint(struct drbd_work *w, int cancel) +{ + struct drbd_device *device = + container_of(w, struct drbd_device, unplug_work); + struct drbd_socket *sock; + + if (cancel) + return 0; + sock = &first_peer_device(device)->connection->data; + if (!drbd_prepare_command(first_peer_device(device), sock)) + return -EIO; + return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0); +} + +static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch) +{ + if (!connection->send.seen_any_write_yet) { + connection->send.seen_any_write_yet = true; + connection->send.current_epoch_nr = epoch; + connection->send.current_epoch_writes = 0; + } +} + +static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch) +{ + /* re-init if first write on this connection */ + if (!connection->send.seen_any_write_yet) + return; + if (connection->send.current_epoch_nr != epoch) { + if (connection->send.current_epoch_writes) + drbd_send_barrier(connection); + connection->send.current_epoch_nr = epoch; + } +} + +int w_send_out_of_sync(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *const connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + /* this time, no connection->send.current_epoch_writes++; + * If it was sent, it was the closing barrier for the last + * replicated epoch, before we went into AHEAD mode. + * No more barriers will be sent, until we leave AHEAD mode again. */ + maybe_send_barrier(connection, req->epoch); + + err = drbd_send_out_of_sync(peer_device, req); + req_mod(req, OOS_HANDED_TO_NETWORK); + + return err; +} + +/** + * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_dblock(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + re_init_if_first_write(connection, req->epoch); + maybe_send_barrier(connection, req->epoch); + connection->send.current_epoch_writes++; + + err = drbd_send_dblock(peer_device, req); + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +/** + * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_read_req(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + struct drbd_peer_device *const peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + int err; + + if (unlikely(cancel)) { + req_mod(req, SEND_CANCELED); + return 0; + } + req->pre_send_jif = jiffies; + + /* Even read requests may close a write epoch, + * if there was any yet. */ + maybe_send_barrier(connection, req->epoch); + + err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size, + (unsigned long)req); + + req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK); + + return err; +} + +int w_restart_disk_io(struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + struct drbd_device *device = req->device; + + if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG) + drbd_al_begin_io(device, &req->i); + + drbd_req_make_private_bio(req, req->master_bio); + req->private_bio->bi_bdev = device->ldev->backing_bdev; + generic_make_request(req->private_bio); + + return 0; +} + +static int _drbd_may_sync_now(struct drbd_device *device) +{ + struct drbd_device *odev = device; + int resync_after; + + while (1) { + if (!odev->ldev || odev->state.disk == D_DISKLESS) + return 1; + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + if (resync_after == -1) + return 1; + odev = minor_to_device(resync_after); + if (!odev) + return 1; + if ((odev->state.conn >= C_SYNC_SOURCE && + odev->state.conn <= C_PAUSED_SYNC_T) || + odev->state.aftr_isp || odev->state.peer_isp || + odev->state.user_isp) + return 0; + } +} + +/** + * _drbd_pause_after() - Pause resync on all devices that may not resync now + * @device: DRBD device. + * + * Called from process context only (admin command and after_state_ch). + */ +static int _drbd_pause_after(struct drbd_device *device) +{ + struct drbd_device *odev; + int i, rv = 0; + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, odev, i) { + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (!_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) + != SS_NOTHING_TO_DO); + } + rcu_read_unlock(); + + return rv; +} + +/** + * _drbd_resume_next() - Resume resync on all devices that may resync now + * @device: DRBD device. + * + * Called from process context only (admin command and worker). + */ +static int _drbd_resume_next(struct drbd_device *device) +{ + struct drbd_device *odev; + int i, rv = 0; + + rcu_read_lock(); + idr_for_each_entry(&drbd_devices, odev, i) { + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (odev->state.aftr_isp) { + if (_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) + != SS_NOTHING_TO_DO) ; + } + } + rcu_read_unlock(); + return rv; +} + +void resume_next_sg(struct drbd_device *device) +{ + write_lock_irq(&global_state_lock); + _drbd_resume_next(device); + write_unlock_irq(&global_state_lock); +} + +void suspend_other_sg(struct drbd_device *device) +{ + write_lock_irq(&global_state_lock); + _drbd_pause_after(device); + write_unlock_irq(&global_state_lock); +} + +/* caller must hold global_state_lock */ +enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) +{ + struct drbd_device *odev; + int resync_after; + + if (o_minor == -1) + return NO_ERROR; + if (o_minor < -1 || o_minor > MINORMASK) + return ERR_RESYNC_AFTER; + + /* check for loops */ + odev = minor_to_device(o_minor); + while (1) { + if (odev == device) + return ERR_RESYNC_AFTER_CYCLE; + + /* You are free to depend on diskless, non-existing, + * or not yet/no longer existing minors. + * We only reject dependency loops. + * We cannot follow the dependency chain beyond a detached or + * missing minor. + */ + if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS) + return NO_ERROR; + + rcu_read_lock(); + resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after; + rcu_read_unlock(); + /* dependency chain ends here, no cycles. */ + if (resync_after == -1) + return NO_ERROR; + + /* follow the dependency chain */ + odev = minor_to_device(resync_after); + } +} + +/* caller must hold global_state_lock */ +void drbd_resync_after_changed(struct drbd_device *device) +{ + int changes; + + do { + changes = _drbd_pause_after(device); + changes |= _drbd_resume_next(device); + } while (changes); +} + +void drbd_rs_controller_reset(struct drbd_device *device) +{ + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; + struct fifo_buffer *plan; + + atomic_set(&device->rs_sect_in, 0); + atomic_set(&device->rs_sect_ev, 0); + device->rs_in_flight = 0; + device->rs_last_events = + (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); + + /* Updating the RCU protected object in place is necessary since + this function gets called from atomic context. + It is valid since all other updates also lead to an completely + empty fifo */ + rcu_read_lock(); + plan = rcu_dereference(device->rs_plan_s); + plan->total = 0; + fifo_set(plan, 0); + rcu_read_unlock(); +} + +void start_resync_timer_fn(unsigned long data) +{ + struct drbd_device *device = (struct drbd_device *) data; + drbd_device_post_work(device, RS_START); +} + +static void do_start_resync(struct drbd_device *device) +{ + if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) { + drbd_warn(device, "postponing start_resync ...\n"); + device->start_resync_timer.expires = jiffies + HZ/10; + add_timer(&device->start_resync_timer); + return; + } + + drbd_start_resync(device, C_SYNC_SOURCE); + clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags); +} + +static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device) +{ + bool csums_after_crash_only; + rcu_read_lock(); + csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only; + rcu_read_unlock(); + return connection->agreed_pro_version >= 89 && /* supported? */ + connection->csums_tfm && /* configured? */ + (csums_after_crash_only == 0 /* use for each resync? */ + || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ +} + +/** + * drbd_start_resync() - Start the resync process + * @device: DRBD device. + * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET + * + * This function might bring you directly into one of the + * C_PAUSED_SYNC_* states. + */ +void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; + union drbd_state ns; + int r; + + if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) { + drbd_err(device, "Resync already running!\n"); + return; + } + + if (!test_bit(B_RS_H_DONE, &device->flags)) { + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. */ + r = drbd_khelper(device, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + drbd_info(device, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } else /* C_SYNC_SOURCE */ { + r = drbd_khelper(device, "before-resync-source"); + r = (r >> 8) & 0xff; + if (r > 0) { + if (r == 3) { + drbd_info(device, "before-resync-source handler returned %d, " + "ignoring. Old userland tools?", r); + } else { + drbd_info(device, "before-resync-source handler returned %d, " + "dropping connection.\n", r); + conn_request_state(connection, + NS(conn, C_DISCONNECTING), CS_HARD); + return; + } + } + } + } + + if (current == connection->worker.task) { + /* The worker should not sleep waiting for state_mutex, + that can take long */ + if (!mutex_trylock(device->state_mutex)) { + set_bit(B_RS_H_DONE, &device->flags); + device->start_resync_timer.expires = jiffies + HZ/5; + add_timer(&device->start_resync_timer); + return; + } + } else { + mutex_lock(device->state_mutex); + } + clear_bit(B_RS_H_DONE, &device->flags); + + /* req_lock: serialize with drbd_send_and_submit() and others + * global_state_lock: for stable sync-after dependencies */ + spin_lock_irq(&device->resource->req_lock); + write_lock(&global_state_lock); + /* Did some connection breakage or IO error race with us? */ + if (device->state.conn < C_CONNECTED + || !get_ldev_if_state(device, D_NEGOTIATING)) { + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); + mutex_unlock(device->state_mutex); + return; + } + + ns = drbd_read_state(device); + + ns.aftr_isp = !_drbd_may_sync_now(device); + + ns.conn = side; + + if (side == C_SYNC_TARGET) + ns.disk = D_INCONSISTENT; + else /* side == C_SYNC_SOURCE */ + ns.pdsk = D_INCONSISTENT; + + r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + ns = drbd_read_state(device); + + if (ns.conn < C_CONNECTED) + r = SS_UNKNOWN_ERROR; + + if (r == SS_SUCCESS) { + unsigned long tw = drbd_bm_total_weight(device); + unsigned long now = jiffies; + int i; + + device->rs_failed = 0; + device->rs_paused = 0; + device->rs_same_csum = 0; + device->rs_last_sect_ev = 0; + device->rs_total = tw; + device->rs_start = now; + for (i = 0; i < DRBD_SYNC_MARKS; i++) { + device->rs_mark_left[i] = tw; + device->rs_mark_time[i] = now; + } + _drbd_pause_after(device); + /* Forget potentially stale cached per resync extent bit-counts. + * Open coded drbd_rs_cancel_all(device), we already have IRQs + * disabled, and know the disk state is ok. */ + spin_lock(&device->al_lock); + lc_reset(device->resync); + device->resync_locked = 0; + device->resync_wenr = LC_FREE; + spin_unlock(&device->al_lock); + } + write_unlock(&global_state_lock); + spin_unlock_irq(&device->resource->req_lock); + + if (r == SS_SUCCESS) { + wake_up(&device->al_wait); /* for lc_reset() above */ + /* reset rs_last_bcast when a resync or verify is started, + * to deal with potential jiffies wrap. */ + device->rs_last_bcast = jiffies - HZ; + + drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", + drbd_conn_str(ns.conn), + (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10), + (unsigned long) device->rs_total); + if (side == C_SYNC_TARGET) { + device->bm_resync_fo = 0; + device->use_csums = use_checksum_based_resync(connection, device); + } else { + device->use_csums = 0; + } + + /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid + * with w_send_oos, or the sync target will get confused as to + * how much bits to resync. We cannot do that always, because for an + * empty resync and protocol < 95, we need to do it here, as we call + * drbd_resync_finished from here in that case. + * We drbd_gen_and_send_sync_uuid here for protocol < 96, + * and from after_state_ch otherwise. */ + if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96) + drbd_gen_and_send_sync_uuid(peer_device); + + if (connection->agreed_pro_version < 95 && device->rs_total == 0) { + /* This still has a race (about when exactly the peers + * detect connection loss) that can lead to a full sync + * on next handshake. In 8.3.9 we fixed this with explicit + * resync-finished notifications, but the fix + * introduces a protocol change. Sleeping for some + * time longer than the ping interval + timeout on the + * SyncSource, to give the SyncTarget the chance to + * detect connection loss, then waiting for a ping + * response (implicit in drbd_resync_finished) reduces + * the race considerably, but does not solve it. */ + if (side == C_SYNC_SOURCE) { + struct net_conf *nc; + int timeo; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9; + rcu_read_unlock(); + schedule_timeout_interruptible(timeo); + } + drbd_resync_finished(device); + } + + drbd_rs_controller_reset(device); + /* ns.conn may already be != device->state.conn, + * we may have been paused in between, or become paused until + * the timer triggers. + * No matter, that is handled in resync_timer_fn() */ + if (ns.conn == C_SYNC_TARGET) + mod_timer(&device->resync_timer, jiffies); + + drbd_md_sync(device); + } + put_ldev(device); + mutex_unlock(device->state_mutex); +} + +static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done) +{ + struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, }; + device->rs_last_bcast = jiffies; + + if (!get_ldev(device)) + return; + + drbd_bm_write_lazy(device, 0); + if (resync_done && is_sync_state(device->state.conn)) + drbd_resync_finished(device); + + drbd_bcast_event(device, &sib); + /* update timestamp, in case it took a while to write out stuff */ + device->rs_last_bcast = jiffies; + put_ldev(device); +} + +static void drbd_ldev_destroy(struct drbd_device *device) +{ + lc_destroy(device->resync); + device->resync = NULL; + lc_destroy(device->act_log); + device->act_log = NULL; + + __acquire(local); + drbd_free_ldev(device->ldev); + device->ldev = NULL; + __release(local); + + clear_bit(GOING_DISKLESS, &device->flags); + wake_up(&device->misc_wait); +} + +static void go_diskless(struct drbd_device *device) +{ + D_ASSERT(device, device->state.disk == D_FAILED); + /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will + * inc/dec it frequently. Once we are D_DISKLESS, no one will touch + * the protected members anymore, though, so once put_ldev reaches zero + * again, it will be safe to free them. */ + + /* Try to write changed bitmap pages, read errors may have just + * set some bits outside the area covered by the activity log. + * + * If we have an IO error during the bitmap writeout, + * we will want a full sync next time, just in case. + * (Do we want a specific meta data flag for this?) + * + * If that does not make it to stable storage either, + * we cannot do anything about that anymore. + * + * We still need to check if both bitmap and ldev are present, we may + * end up here after a failed attach, before ldev was even assigned. + */ + if (device->bitmap && device->ldev) { + /* An interrupted resync or similar is allowed to recounts bits + * while we detach. + * Any modifications would not be expected anymore, though. + */ + if (drbd_bitmap_io_from_worker(device, drbd_bm_write, + "detach", BM_LOCKED_TEST_ALLOWED)) { + if (test_bit(WAS_READ_ERROR, &device->flags)) { + drbd_md_set_flag(device, MDF_FULL_SYNC); + drbd_md_sync(device); + } + } + } + + drbd_force_state(device, NS(disk, D_DISKLESS)); +} + +static int do_md_sync(struct drbd_device *device) +{ + drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); + drbd_md_sync(device); + return 0; +} + +/* only called from drbd_worker thread, no locking */ +void __update_timing_details( + struct drbd_thread_timing_details *tdp, + unsigned int *cb_nr, + void *cb, + const char *fn, const unsigned int line) +{ + unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST; + struct drbd_thread_timing_details *td = tdp + i; + + td->start_jif = jiffies; + td->cb_addr = cb; + td->caller_fn = fn; + td->line = line; + td->cb_nr = *cb_nr; + + i = (i+1) % DRBD_THREAD_DETAILS_HIST; + td = tdp + i; + memset(td, 0, sizeof(*td)); + + ++(*cb_nr); +} + +static void do_device_work(struct drbd_device *device, const unsigned long todo) +{ + if (test_bit(MD_SYNC, &todo)) + do_md_sync(device); + if (test_bit(RS_DONE, &todo) || + test_bit(RS_PROGRESS, &todo)) + update_on_disk_bitmap(device, test_bit(RS_DONE, &todo)); + if (test_bit(GO_DISKLESS, &todo)) + go_diskless(device); + if (test_bit(DESTROY_DISK, &todo)) + drbd_ldev_destroy(device); + if (test_bit(RS_START, &todo)) + do_start_resync(device); +} + +#define DRBD_DEVICE_WORK_MASK \ + ((1UL << GO_DISKLESS) \ + |(1UL << DESTROY_DISK) \ + |(1UL << MD_SYNC) \ + |(1UL << RS_START) \ + |(1UL << RS_PROGRESS) \ + |(1UL << RS_DONE) \ + ) + +static unsigned long get_work_bits(unsigned long *flags) +{ + unsigned long old, new; + do { + old = *flags; + new = old & ~DRBD_DEVICE_WORK_MASK; + } while (cmpxchg(flags, old, new) != old); + return old & DRBD_DEVICE_WORK_MASK; +} + +static void do_unqueued_work(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + unsigned long todo = get_work_bits(&device->flags); + if (!todo) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + do_device_work(device, todo); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + +static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list) +{ + spin_lock_irq(&queue->q_lock); + list_splice_tail_init(&queue->q, work_list); + spin_unlock_irq(&queue->q_lock); + return !list_empty(work_list); +} + +static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list) +{ + DEFINE_WAIT(wait); + struct net_conf *nc; + int uncork, cork; + + dequeue_work_batch(&connection->sender_work, work_list); + if (!list_empty(work_list)) + return; + + /* Still nothing to do? + * Maybe we still need to close the current epoch, + * even if no new requests are queued yet. + * + * Also, poke TCP, just in case. + * Then wait for new work (or signal). */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + uncork = nc ? nc->tcp_cork : 0; + rcu_read_unlock(); + if (uncork) { + mutex_lock(&connection->data.mutex); + if (connection->data.socket) + drbd_tcp_uncork(connection->data.socket); + mutex_unlock(&connection->data.mutex); + } + + for (;;) { + int send_barrier; + prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE); + spin_lock_irq(&connection->resource->req_lock); + spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ + if (!list_empty(&connection->sender_work.q)) + list_splice_tail_init(&connection->sender_work.q, work_list); + spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */ + if (!list_empty(work_list) || signal_pending(current)) { + spin_unlock_irq(&connection->resource->req_lock); + break; + } + + /* We found nothing new to do, no to-be-communicated request, + * no other work item. We may still need to close the last + * epoch. Next incoming request epoch will be connection -> + * current transfer log epoch number. If that is different + * from the epoch of the last request we communicated, it is + * safe to send the epoch separating barrier now. + */ + send_barrier = + atomic_read(&connection->current_tle_nr) != + connection->send.current_epoch_nr; + spin_unlock_irq(&connection->resource->req_lock); + + if (send_barrier) + maybe_send_barrier(connection, + connection->send.current_epoch_nr + 1); + + if (test_bit(DEVICE_WORK_PENDING, &connection->flags)) + break; + + /* drbd_send() may have called flush_signals() */ + if (get_t_state(&connection->worker) != RUNNING) + break; + + schedule(); + /* may be woken up for other things but new work, too, + * e.g. if the current epoch got closed. + * In which case we send the barrier above. */ + } + finish_wait(&connection->sender_work.q_wait, &wait); + + /* someone may have changed the config while we have been waiting above. */ + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + cork = nc ? nc->tcp_cork : 0; + rcu_read_unlock(); + mutex_lock(&connection->data.mutex); + if (connection->data.socket) { + if (cork) + drbd_tcp_cork(connection->data.socket); + else if (!uncork) + drbd_tcp_uncork(connection->data.socket); + } + mutex_unlock(&connection->data.mutex); +} + +int drbd_worker(struct drbd_thread *thi) +{ + struct drbd_connection *connection = thi->connection; + struct drbd_work *w = NULL; + struct drbd_peer_device *peer_device; + LIST_HEAD(work_list); + int vnr; + + while (get_t_state(thi) == RUNNING) { + drbd_thread_current_set_cpu(thi); + + if (list_empty(&work_list)) { + update_worker_timing_details(connection, wait_for_work); + wait_for_work(connection, &work_list); + } + + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } + + if (signal_pending(current)) { + flush_signals(current); + if (get_t_state(thi) == RUNNING) { + drbd_warn(connection, "Worker got an unexpected signal\n"); + continue; + } + break; + } + + if (get_t_state(thi) != RUNNING) + break; + + if (!list_empty(&work_list)) { + w = list_first_entry(&work_list, struct drbd_work, list); + list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); + if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0) + continue; + if (connection->cstate >= C_WF_REPORT_PARAMS) + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + } + } + + do { + if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) { + update_worker_timing_details(connection, do_unqueued_work); + do_unqueued_work(connection); + } + if (!list_empty(&work_list)) { + w = list_first_entry(&work_list, struct drbd_work, list); + list_del_init(&w->list); + update_worker_timing_details(connection, w->cb); + w->cb(w, 1); + } else + dequeue_work_batch(&connection->sender_work, &work_list); + } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags)); + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE); + kref_get(&device->kref); + rcu_read_unlock(); + drbd_device_cleanup(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); + + return 0; +} diff --git a/kernel/drivers/block/floppy.c b/kernel/drivers/block/floppy.c new file mode 100644 index 000000000..a08cda955 --- /dev/null +++ b/kernel/drivers/block/floppy.c @@ -0,0 +1,4645 @@ +/* + * linux/drivers/block/floppy.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1993, 1994 Alain Knaff + * Copyright (C) 1998 Alan Cox + */ + +/* + * 02.12.91 - Changed to static variables to indicate need for reset + * and recalibrate. This makes some things easier (output_byte reset + * checking etc), and means less interrupt jumping in case of errors, + * so the code is hopefully easier to understand. + */ + +/* + * This file is certainly a mess. I've tried my best to get it working, + * but I don't like programming floppies, and I have only one anyway. + * Urgel. I should check for more errors, and do more graceful error + * recovery. Seems there are problems with several drives. I've tried to + * correct them. No promises. + */ + +/* + * As with hd.c, all routines within this file can (and will) be called + * by interrupts, so extreme caution is needed. A hardware interrupt + * handler may not sleep, or a kernel panic will happen. Thus I cannot + * call "floppy-on" directly, but have to set a special timer interrupt + * etc. + */ + +/* + * 28.02.92 - made track-buffering routines, based on the routines written + * by entropy@wintermute.wpi.edu (Lawrence Foard). Linus. + */ + +/* + * Automatic floppy-detection and formatting written by Werner Almesberger + * (almesber@nessie.cs.id.ethz.ch), who also corrected some problems with + * the floppy-change signal detection. + */ + +/* + * 1992/7/22 -- Hennus Bergman: Added better error reporting, fixed + * FDC data overrun bug, added some preliminary stuff for vertical + * recording support. + * + * 1992/9/17: Added DMA allocation & DMA functions. -- hhb. + * + * TODO: Errors are still not counted properly. + */ + +/* 1992/9/20 + * Modifications for ``Sector Shifting'' by Rob Hooft (hooft@chem.ruu.nl) + * modeled after the freeware MS-DOS program fdformat/88 V1.8 by + * Christoph H. Hochst\"atter. + * I have fixed the shift values to the ones I always use. Maybe a new + * ioctl() should be created to be able to modify them. + * There is a bug in the driver that makes it impossible to format a + * floppy as the first thing after bootup. + */ + +/* + * 1993/4/29 -- Linus -- cleaned up the timer handling in the kernel, and + * this helped the floppy driver as well. Much cleaner, and still seems to + * work. + */ + +/* 1994/6/24 --bbroad-- added the floppy table entries and made + * minor modifications to allow 2.88 floppies to be run. + */ + +/* 1994/7/13 -- Paul Vojta -- modified the probing code to allow three or more + * disk types. + */ + +/* + * 1994/8/8 -- Alain Knaff -- Switched to fdpatch driver: Support for bigger + * format bug fixes, but unfortunately some new bugs too... + */ + +/* 1994/9/17 -- Koen Holtman -- added logging of physical floppy write + * errors to allow safe writing by specialized programs. + */ + +/* 1995/4/24 -- Dan Fandrich -- added support for Commodore 1581 3.5" disks + * by defining bit 1 of the "stretch" parameter to mean put sectors on the + * opposite side of the disk, leaving the sector IDs alone (i.e. Commodore's + * drives are "upside-down"). + */ + +/* + * 1995/8/26 -- Andreas Busse -- added Mips support. + */ + +/* + * 1995/10/18 -- Ralf Baechle -- Portability cleanup; move machine dependent + * features to asm/floppy.h. + */ + +/* + * 1998/1/21 -- Richard Gooch -- devfs support + */ + +/* + * 1998/05/07 -- Russell King -- More portability cleanups; moved definition of + * interrupt and dma channel to asm/floppy.h. Cleaned up some formatting & + * use of '0' for NULL. + */ + +/* + * 1998/06/07 -- Alan Cox -- Merged the 2.0.34 fixes for resource allocation + * failures. + */ + +/* + * 1998/09/20 -- David Weinehall -- Added slow-down code for buggy PS/2-drives. + */ + +/* + * 1999/08/13 -- Paul Slootman -- floppy stopped working on Alpha after 24 + * days, 6 hours, 32 minutes and 32 seconds (i.e. MAXINT jiffies; ints were + * being used to store jiffies, which are unsigned longs). + */ + +/* + * 2000/08/28 -- Arnaldo Carvalho de Melo + * - get rid of check_region + * - s/suser/capable/ + */ + +/* + * 2001/08/26 -- Paul Gortmaker - fix insmod oops on machines with no + * floppy controller (lingering task on list after module is gone... boom.) + */ + +/* + * 2002/02/07 -- Anton Altaparmakov - Fix io ports reservation to correct range + * (0x3f2-0x3f5, 0x3f7). This fix is a bit of a hack but the proper fix + * requires many non-obvious changes in arch dependent code. + */ + +/* 2003/07/28 -- Daniele Bellucci . + * Better audit of register_blkdev. + */ + +#undef FLOPPY_SILENT_DCL_CLEAR + +#define REALLY_SLOW_IO + +#define DEBUGT 2 + +#define DPRINT(format, args...) \ + pr_info("floppy%d: " format, current_drive, ##args) + +#define DCL_DEBUG /* debug disk change line */ +#ifdef DCL_DEBUG +#define debug_dcl(test, fmt, args...) \ + do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0) +#else +#define debug_dcl(test, fmt, args...) \ + do { if (0) DPRINT(fmt, ##args); } while (0) +#endif + +/* do print messages for unexpected interrupts */ +static int print_unex = 1; +#include +#include +#include +#include +#include +#include +#define FDPATCHES +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* CMOS defines */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * PS/2 floppies have much slower step rates than regular floppies. + * It's been recommended that take about 1/4 of the default speed + * in some more extreme cases. + */ +static DEFINE_MUTEX(floppy_mutex); +static int slow_floppy; + +#include +#include + +static int FLOPPY_IRQ = 6; +static int FLOPPY_DMA = 2; +static int can_use_virtual_dma = 2; +/* ======= + * can use virtual DMA: + * 0 = use of virtual DMA disallowed by config + * 1 = use of virtual DMA prescribed by config + * 2 = no virtual DMA preference configured. By default try hard DMA, + * but fall back on virtual DMA when not enough memory available + */ + +static int use_virtual_dma; +/* ======= + * use virtual DMA + * 0 using hard DMA + * 1 using virtual DMA + * This variable is set to virtual when a DMA mem problem arises, and + * reset back in floppy_grab_irq_and_dma. + * It is not safe to reset it in other circumstances, because the floppy + * driver may have several buffers in use at once, and we do currently not + * record each buffers capabilities + */ + +static DEFINE_SPINLOCK(floppy_lock); + +static unsigned short virtual_dma_port = 0x3f0; +irqreturn_t floppy_interrupt(int irq, void *dev_id); +static int set_dor(int fdc, char mask, char data); + +#define K_64 0x10000 /* 64KB */ + +/* the following is the mask of allowed drives. By default units 2 and + * 3 of both floppy controllers are disabled, because switching on the + * motor of these drives causes system hangs on some PCI computers. drive + * 0 is the low bit (0x1), and drive 7 is the high bit (0x80). Bits are on if + * a drive is allowed. + * + * NOTE: This must come before we include the arch floppy header because + * some ports reference this variable from there. -DaveM + */ + +static int allowed_drive_mask = 0x33; + +#include + +static int irqdma_allocated; + +#include +#include +#include /* for the compatibility eject ioctl */ +#include + +static struct request *current_req; +static void do_fd_request(struct request_queue *q); +static int set_next_request(void); + +#ifndef fd_get_dma_residue +#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA) +#endif + +/* Dma Memory related stuff */ + +#ifndef fd_dma_mem_free +#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size)) +#endif + +#ifndef fd_dma_mem_alloc +#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL, get_order(size)) +#endif + +static inline void fallback_on_nodma_alloc(char **addr, size_t l) +{ +#ifdef FLOPPY_CAN_FALLBACK_ON_NODMA + if (*addr) + return; /* we have the memory */ + if (can_use_virtual_dma != 2) + return; /* no fallback allowed */ + pr_info("DMA memory shortage. Temporarily falling back on virtual DMA\n"); + *addr = (char *)nodma_mem_alloc(l); +#else + return; +#endif +} + +/* End dma memory related stuff */ + +static unsigned long fake_change; +static bool initialized; + +#define ITYPE(x) (((x) >> 2) & 0x1f) +#define TOMINOR(x) ((x & 3) | ((x & 4) << 5)) +#define UNIT(x) ((x) & 0x03) /* drive on fdc */ +#define FDC(x) (((x) & 0x04) >> 2) /* fdc of drive */ + /* reverse mapping from unit and fdc to drive */ +#define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2)) + +#define DP (&drive_params[current_drive]) +#define DRS (&drive_state[current_drive]) +#define DRWE (&write_errors[current_drive]) +#define FDCS (&fdc_state[fdc]) + +#define UDP (&drive_params[drive]) +#define UDRS (&drive_state[drive]) +#define UDRWE (&write_errors[drive]) +#define UFDCS (&fdc_state[FDC(drive)]) + +#define PH_HEAD(floppy, head) (((((floppy)->stretch & 2) >> 1) ^ head) << 2) +#define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH) + +/* read/write */ +#define COMMAND (raw_cmd->cmd[0]) +#define DR_SELECT (raw_cmd->cmd[1]) +#define TRACK (raw_cmd->cmd[2]) +#define HEAD (raw_cmd->cmd[3]) +#define SECTOR (raw_cmd->cmd[4]) +#define SIZECODE (raw_cmd->cmd[5]) +#define SECT_PER_TRACK (raw_cmd->cmd[6]) +#define GAP (raw_cmd->cmd[7]) +#define SIZECODE2 (raw_cmd->cmd[8]) +#define NR_RW 9 + +/* format */ +#define F_SIZECODE (raw_cmd->cmd[2]) +#define F_SECT_PER_TRACK (raw_cmd->cmd[3]) +#define F_GAP (raw_cmd->cmd[4]) +#define F_FILL (raw_cmd->cmd[5]) +#define NR_F 6 + +/* + * Maximum disk size (in kilobytes). + * This default is used whenever the current disk size is unknown. + * [Now it is rather a minimum] + */ +#define MAX_DISK_SIZE 4 /* 3984 */ + +/* + * globals used by 'result()' + */ +#define MAX_REPLIES 16 +static unsigned char reply_buffer[MAX_REPLIES]; +static int inr; /* size of reply buffer, when called from interrupt */ +#define ST0 (reply_buffer[0]) +#define ST1 (reply_buffer[1]) +#define ST2 (reply_buffer[2]) +#define ST3 (reply_buffer[0]) /* result of GETSTATUS */ +#define R_TRACK (reply_buffer[3]) +#define R_HEAD (reply_buffer[4]) +#define R_SECTOR (reply_buffer[5]) +#define R_SIZECODE (reply_buffer[6]) + +#define SEL_DLY (2 * HZ / 100) + +/* + * this struct defines the different floppy drive types. + */ +static struct { + struct floppy_drive_params params; + const char *name; /* name printed while booting */ +} default_drive_params[] = { +/* NOTE: the time values in jiffies should be in msec! + CMOS drive type + | Maximum data rate supported by drive type + | | Head load time, msec + | | | Head unload time, msec (not used) + | | | | Step rate interval, usec + | | | | | Time needed for spinup time (jiffies) + | | | | | | Timeout for spinning down (jiffies) + | | | | | | | Spindown offset (where disk stops) + | | | | | | | | Select delay + | | | | | | | | | RPS + | | | | | | | | | | Max number of tracks + | | | | | | | | | | | Interrupt timeout + | | | | | | | | | | | | Max nonintlv. sectors + | | | | | | | | | | | | | -Max Errors- flags */ +{{0, 500, 16, 16, 8000, 1*HZ, 3*HZ, 0, SEL_DLY, 5, 80, 3*HZ, 20, {3,1,2,0,2}, 0, + 0, { 7, 4, 8, 2, 1, 5, 3,10}, 3*HZ/2, 0 }, "unknown" }, + +{{1, 300, 16, 16, 8000, 1*HZ, 3*HZ, 0, SEL_DLY, 5, 40, 3*HZ, 17, {3,1,2,0,2}, 0, + 0, { 1, 0, 0, 0, 0, 0, 0, 0}, 3*HZ/2, 1 }, "360K PC" }, /*5 1/4 360 KB PC*/ + +{{2, 500, 16, 16, 6000, 4*HZ/10, 3*HZ, 14, SEL_DLY, 6, 83, 3*HZ, 17, {3,1,2,0,2}, 0, + 0, { 2, 5, 6,23,10,20,12, 0}, 3*HZ/2, 2 }, "1.2M" }, /*5 1/4 HD AT*/ + +{{3, 250, 16, 16, 3000, 1*HZ, 3*HZ, 0, SEL_DLY, 5, 83, 3*HZ, 20, {3,1,2,0,2}, 0, + 0, { 4,22,21,30, 3, 0, 0, 0}, 3*HZ/2, 4 }, "720k" }, /*3 1/2 DD*/ + +{{4, 500, 16, 16, 4000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5, 83, 3*HZ, 20, {3,1,2,0,2}, 0, + 0, { 7, 4,25,22,31,21,29,11}, 3*HZ/2, 7 }, "1.44M" }, /*3 1/2 HD*/ + +{{5, 1000, 15, 8, 3000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5, 83, 3*HZ, 40, {3,1,2,0,2}, 0, + 0, { 7, 8, 4,25,28,22,31,21}, 3*HZ/2, 8 }, "2.88M AMI BIOS" }, /*3 1/2 ED*/ + +{{6, 1000, 15, 8, 3000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5, 83, 3*HZ, 40, {3,1,2,0,2}, 0, + 0, { 7, 8, 4,25,28,22,31,21}, 3*HZ/2, 8 }, "2.88M" } /*3 1/2 ED*/ +/* | --autodetected formats--- | | | + * read_track | | Name printed when booting + * | Native format + * Frequency of disk change checks */ +}; + +static struct floppy_drive_params drive_params[N_DRIVE]; +static struct floppy_drive_struct drive_state[N_DRIVE]; +static struct floppy_write_errors write_errors[N_DRIVE]; +static struct timer_list motor_off_timer[N_DRIVE]; +static struct gendisk *disks[N_DRIVE]; +static struct block_device *opened_bdev[N_DRIVE]; +static DEFINE_MUTEX(open_lock); +static struct floppy_raw_cmd *raw_cmd, default_raw_cmd; +static int fdc_queue; + +/* + * This struct defines the different floppy types. + * + * Bit 0 of 'stretch' tells if the tracks need to be doubled for some + * types (e.g. 360kB diskette in 1.2MB drive, etc.). Bit 1 of 'stretch' + * tells if the disk is in Commodore 1581 format, which means side 0 sectors + * are located on side 1 of the disk but with a side 0 ID, and vice-versa. + * This is the same as the Sharp MZ-80 5.25" CP/M disk format, except that the + * 1581's logical side 0 is on physical side 1, whereas the Sharp's logical + * side 0 is on physical side 0 (but with the misnamed sector IDs). + * 'stretch' should probably be renamed to something more general, like + * 'options'. + * + * Bits 2 through 9 of 'stretch' tell the number of the first sector. + * The LSB (bit 2) is flipped. For most disks, the first sector + * is 1 (represented by 0x00<<2). For some CP/M and music sampler + * disks (such as Ensoniq EPS 16plus) it is 0 (represented as 0x01<<2). + * For Amstrad CPC disks it is 0xC1 (represented as 0xC0<<2). + * + * Other parameters should be self-explanatory (see also setfdprm(8)). + */ +/* + Size + | Sectors per track + | | Head + | | | Tracks + | | | | Stretch + | | | | | Gap 1 size + | | | | | | Data rate, | 0x40 for perp + | | | | | | | Spec1 (stepping rate, head unload + | | | | | | | | /fmt gap (gap2) */ +static struct floppy_struct floppy_type[32] = { + { 0, 0,0, 0,0,0x00,0x00,0x00,0x00,NULL }, /* 0 no testing */ + { 720, 9,2,40,0,0x2A,0x02,0xDF,0x50,"d360" }, /* 1 360KB PC */ + { 2400,15,2,80,0,0x1B,0x00,0xDF,0x54,"h1200" }, /* 2 1.2MB AT */ + { 720, 9,1,80,0,0x2A,0x02,0xDF,0x50,"D360" }, /* 3 360KB SS 3.5" */ + { 1440, 9,2,80,0,0x2A,0x02,0xDF,0x50,"D720" }, /* 4 720KB 3.5" */ + { 720, 9,2,40,1,0x23,0x01,0xDF,0x50,"h360" }, /* 5 360KB AT */ + { 1440, 9,2,80,0,0x23,0x01,0xDF,0x50,"h720" }, /* 6 720KB AT */ + { 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,"H1440" }, /* 7 1.44MB 3.5" */ + { 5760,36,2,80,0,0x1B,0x43,0xAF,0x54,"E2880" }, /* 8 2.88MB 3.5" */ + { 6240,39,2,80,0,0x1B,0x43,0xAF,0x28,"E3120" }, /* 9 3.12MB 3.5" */ + + { 2880,18,2,80,0,0x25,0x00,0xDF,0x02,"h1440" }, /* 10 1.44MB 5.25" */ + { 3360,21,2,80,0,0x1C,0x00,0xCF,0x0C,"H1680" }, /* 11 1.68MB 3.5" */ + { 820,10,2,41,1,0x25,0x01,0xDF,0x2E,"h410" }, /* 12 410KB 5.25" */ + { 1640,10,2,82,0,0x25,0x02,0xDF,0x2E,"H820" }, /* 13 820KB 3.5" */ + { 2952,18,2,82,0,0x25,0x00,0xDF,0x02,"h1476" }, /* 14 1.48MB 5.25" */ + { 3444,21,2,82,0,0x25,0x00,0xDF,0x0C,"H1722" }, /* 15 1.72MB 3.5" */ + { 840,10,2,42,1,0x25,0x01,0xDF,0x2E,"h420" }, /* 16 420KB 5.25" */ + { 1660,10,2,83,0,0x25,0x02,0xDF,0x2E,"H830" }, /* 17 830KB 3.5" */ + { 2988,18,2,83,0,0x25,0x00,0xDF,0x02,"h1494" }, /* 18 1.49MB 5.25" */ + { 3486,21,2,83,0,0x25,0x00,0xDF,0x0C,"H1743" }, /* 19 1.74 MB 3.5" */ + + { 1760,11,2,80,0,0x1C,0x09,0xCF,0x00,"h880" }, /* 20 880KB 5.25" */ + { 2080,13,2,80,0,0x1C,0x01,0xCF,0x00,"D1040" }, /* 21 1.04MB 3.5" */ + { 2240,14,2,80,0,0x1C,0x19,0xCF,0x00,"D1120" }, /* 22 1.12MB 3.5" */ + { 3200,20,2,80,0,0x1C,0x20,0xCF,0x2C,"h1600" }, /* 23 1.6MB 5.25" */ + { 3520,22,2,80,0,0x1C,0x08,0xCF,0x2e,"H1760" }, /* 24 1.76MB 3.5" */ + { 3840,24,2,80,0,0x1C,0x20,0xCF,0x00,"H1920" }, /* 25 1.92MB 3.5" */ + { 6400,40,2,80,0,0x25,0x5B,0xCF,0x00,"E3200" }, /* 26 3.20MB 3.5" */ + { 7040,44,2,80,0,0x25,0x5B,0xCF,0x00,"E3520" }, /* 27 3.52MB 3.5" */ + { 7680,48,2,80,0,0x25,0x63,0xCF,0x00,"E3840" }, /* 28 3.84MB 3.5" */ + { 3680,23,2,80,0,0x1C,0x10,0xCF,0x00,"H1840" }, /* 29 1.84MB 3.5" */ + + { 1600,10,2,80,0,0x25,0x02,0xDF,0x2E,"D800" }, /* 30 800KB 3.5" */ + { 3200,20,2,80,0,0x1C,0x00,0xCF,0x2C,"H1600" }, /* 31 1.6MB 3.5" */ +}; + +#define SECTSIZE (_FD_SECTSIZE(*floppy)) + +/* Auto-detection: Disk type used until the next media change occurs. */ +static struct floppy_struct *current_type[N_DRIVE]; + +/* + * User-provided type information. current_type points to + * the respective entry of this array. + */ +static struct floppy_struct user_params[N_DRIVE]; + +static sector_t floppy_sizes[256]; + +static char floppy_device_name[] = "floppy"; + +/* + * The driver is trying to determine the correct media format + * while probing is set. rw_interrupt() clears it after a + * successful access. + */ +static int probing; + +/* Synchronization of FDC access. */ +#define FD_COMMAND_NONE -1 +#define FD_COMMAND_ERROR 2 +#define FD_COMMAND_OKAY 3 + +static volatile int command_status = FD_COMMAND_NONE; +static unsigned long fdc_busy; +static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); +static DECLARE_WAIT_QUEUE_HEAD(command_done); + +/* Errors during formatting are counted here. */ +static int format_errors; + +/* Format request descriptor. */ +static struct format_descr format_req; + +/* + * Rate is 0 for 500kb/s, 1 for 300kbps, 2 for 250kbps + * Spec1 is 0xSH, where S is stepping rate (F=1ms, E=2ms, D=3ms etc), + * H is head unload time (1=16ms, 2=32ms, etc) + */ + +/* + * Track buffer + * Because these are written to by the DMA controller, they must + * not contain a 64k byte boundary crossing, or data will be + * corrupted/lost. + */ +static char *floppy_track_buffer; +static int max_buffer_sectors; + +static int *errors; +typedef void (*done_f)(int); +static const struct cont_t { + void (*interrupt)(void); + /* this is called after the interrupt of the + * main command */ + void (*redo)(void); /* this is called to retry the operation */ + void (*error)(void); /* this is called to tally an error */ + done_f done; /* this is called to say if the operation has + * succeeded/failed */ +} *cont; + +static void floppy_ready(void); +static void floppy_start(void); +static void process_fd_request(void); +static void recalibrate_floppy(void); +static void floppy_shutdown(struct work_struct *); + +static int floppy_request_regions(int); +static void floppy_release_regions(int); +static int floppy_grab_irq_and_dma(void); +static void floppy_release_irq_and_dma(void); + +/* + * The "reset" variable should be tested whenever an interrupt is scheduled, + * after the commands have been sent. This is to ensure that the driver doesn't + * get wedged when the interrupt doesn't come because of a failed command. + * reset doesn't need to be tested before sending commands, because + * output_byte is automatically disabled when reset is set. + */ +static void reset_fdc(void); + +/* + * These are global variables, as that's the easiest way to give + * information to interrupts. They are the data used for the current + * request. + */ +#define NO_TRACK -1 +#define NEED_1_RECAL -2 +#define NEED_2_RECAL -3 + +static atomic_t usage_count = ATOMIC_INIT(0); + +/* buffer related variables */ +static int buffer_track = -1; +static int buffer_drive = -1; +static int buffer_min = -1; +static int buffer_max = -1; + +/* fdc related variables, should end up in a struct */ +static struct floppy_fdc_state fdc_state[N_FDC]; +static int fdc; /* current fdc */ + +static struct workqueue_struct *floppy_wq; + +static struct floppy_struct *_floppy = floppy_type; +static unsigned char current_drive; +static long current_count_sectors; +static unsigned char fsector_t; /* sector in track */ +static unsigned char in_sector_offset; /* offset within physical sector, + * expressed in units of 512 bytes */ + +static inline bool drive_no_geom(int drive) +{ + return !current_type[drive] && !ITYPE(UDRS->fd_device); +} + +#ifndef fd_eject +static inline int fd_eject(int drive) +{ + return -EINVAL; +} +#endif + +/* + * Debugging + * ========= + */ +#ifdef DEBUGT +static long unsigned debugtimer; + +static inline void set_debugt(void) +{ + debugtimer = jiffies; +} + +static inline void debugt(const char *func, const char *msg) +{ + if (DP->flags & DEBUGT) + pr_info("%s:%s dtime=%lu\n", func, msg, jiffies - debugtimer); +} +#else +static inline void set_debugt(void) { } +static inline void debugt(const char *func, const char *msg) { } +#endif /* DEBUGT */ + + +static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown); +static const char *timeout_message; + +static void is_alive(const char *func, const char *message) +{ + /* this routine checks whether the floppy driver is "alive" */ + if (test_bit(0, &fdc_busy) && command_status < 2 && + !delayed_work_pending(&fd_timeout)) { + DPRINT("%s: timeout handler died. %s\n", func, message); + } +} + +static void (*do_floppy)(void) = NULL; + +#define OLOGSIZE 20 + +static void (*lasthandler)(void); +static unsigned long interruptjiffies; +static unsigned long resultjiffies; +static int resultsize; +static unsigned long lastredo; + +static struct output_log { + unsigned char data; + unsigned char status; + unsigned long jiffies; +} output_log[OLOGSIZE]; + +static int output_log_pos; + +#define current_reqD -1 +#define MAXTIMEOUT -2 + +static void __reschedule_timeout(int drive, const char *message) +{ + unsigned long delay; + + if (drive == current_reqD) + drive = current_drive; + + if (drive < 0 || drive >= N_DRIVE) { + delay = 20UL * HZ; + drive = 0; + } else + delay = UDP->timeout; + + mod_delayed_work(floppy_wq, &fd_timeout, delay); + if (UDP->flags & FD_DEBUG) + DPRINT("reschedule timeout %s\n", message); + timeout_message = message; +} + +static void reschedule_timeout(int drive, const char *message) +{ + unsigned long flags; + + spin_lock_irqsave(&floppy_lock, flags); + __reschedule_timeout(drive, message); + spin_unlock_irqrestore(&floppy_lock, flags); +} + +#define INFBOUND(a, b) (a) = max_t(int, a, b) +#define SUPBOUND(a, b) (a) = min_t(int, a, b) + +/* + * Bottom half floppy driver. + * ========================== + * + * This part of the file contains the code talking directly to the hardware, + * and also the main service loop (seek-configure-spinup-command) + */ + +/* + * disk change. + * This routine is responsible for maintaining the FD_DISK_CHANGE flag, + * and the last_checked date. + * + * last_checked is the date of the last check which showed 'no disk change' + * FD_DISK_CHANGE is set under two conditions: + * 1. The floppy has been changed after some i/o to that floppy already + * took place. + * 2. No floppy disk is in the drive. This is done in order to ensure that + * requests are quickly flushed in case there is no disk in the drive. It + * follows that FD_DISK_CHANGE can only be cleared if there is a disk in + * the drive. + * + * For 1., maxblock is observed. Maxblock is 0 if no i/o has taken place yet. + * For 2., FD_DISK_NEWCHANGE is watched. FD_DISK_NEWCHANGE is cleared on + * each seek. If a disk is present, the disk change line should also be + * cleared on each seek. Thus, if FD_DISK_NEWCHANGE is clear, but the disk + * change line is set, this means either that no disk is in the drive, or + * that it has been removed since the last seek. + * + * This means that we really have a third possibility too: + * The floppy has been changed after the last seek. + */ + +static int disk_change(int drive) +{ + int fdc = FDC(drive); + + if (time_before(jiffies, UDRS->select_date + UDP->select_delay)) + DPRINT("WARNING disk change called early\n"); + if (!(FDCS->dor & (0x10 << UNIT(drive))) || + (FDCS->dor & 3) != UNIT(drive) || fdc != FDC(drive)) { + DPRINT("probing disk change on unselected drive\n"); + DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive), + (unsigned int)FDCS->dor); + } + + debug_dcl(UDP->flags, + "checking disk change line for drive %d\n", drive); + debug_dcl(UDP->flags, "jiffies=%lu\n", jiffies); + debug_dcl(UDP->flags, "disk change line=%x\n", fd_inb(FD_DIR) & 0x80); + debug_dcl(UDP->flags, "flags=%lx\n", UDRS->flags); + + if (UDP->flags & FD_BROKEN_DCL) + return test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); + if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) { + set_bit(FD_VERIFY_BIT, &UDRS->flags); + /* verify write protection */ + + if (UDRS->maxblock) /* mark it changed */ + set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); + + /* invalidate its geometry */ + if (UDRS->keep_data >= 0) { + if ((UDP->flags & FTD_MSG) && + current_type[drive] != NULL) + DPRINT("Disk type is undefined after disk change\n"); + current_type[drive] = NULL; + floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1; + } + + return 1; + } else { + UDRS->last_checked = jiffies; + clear_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags); + } + return 0; +} + +static inline int is_selected(int dor, int unit) +{ + return ((dor & (0x10 << unit)) && (dor & 3) == unit); +} + +static bool is_ready_state(int status) +{ + int state = status & (STATUS_READY | STATUS_DIR | STATUS_DMA); + return state == STATUS_READY; +} + +static int set_dor(int fdc, char mask, char data) +{ + unsigned char unit; + unsigned char drive; + unsigned char newdor; + unsigned char olddor; + + if (FDCS->address == -1) + return -1; + + olddor = FDCS->dor; + newdor = (olddor & mask) | data; + if (newdor != olddor) { + unit = olddor & 0x3; + if (is_selected(olddor, unit) && !is_selected(newdor, unit)) { + drive = REVDRIVE(fdc, unit); + debug_dcl(UDP->flags, + "calling disk change from set_dor\n"); + disk_change(drive); + } + FDCS->dor = newdor; + fd_outb(newdor, FD_DOR); + + unit = newdor & 0x3; + if (!is_selected(olddor, unit) && is_selected(newdor, unit)) { + drive = REVDRIVE(fdc, unit); + UDRS->select_date = jiffies; + } + } + return olddor; +} + +static void twaddle(void) +{ + if (DP->select_delay) + return; + fd_outb(FDCS->dor & ~(0x10 << UNIT(current_drive)), FD_DOR); + fd_outb(FDCS->dor, FD_DOR); + DRS->select_date = jiffies; +} + +/* + * Reset all driver information about the current fdc. + * This is needed after a reset, and after a raw command. + */ +static void reset_fdc_info(int mode) +{ + int drive; + + FDCS->spec1 = FDCS->spec2 = -1; + FDCS->need_configure = 1; + FDCS->perp_mode = 1; + FDCS->rawcmd = 0; + for (drive = 0; drive < N_DRIVE; drive++) + if (FDC(drive) == fdc && (mode || UDRS->track != NEED_1_RECAL)) + UDRS->track = NEED_2_RECAL; +} + +/* selects the fdc and drive, and enables the fdc's input/dma. */ +static void set_fdc(int drive) +{ + if (drive >= 0 && drive < N_DRIVE) { + fdc = FDC(drive); + current_drive = drive; + } + if (fdc != 1 && fdc != 0) { + pr_info("bad fdc value\n"); + return; + } + set_dor(fdc, ~0, 8); +#if N_FDC > 1 + set_dor(1 - fdc, ~8, 0); +#endif + if (FDCS->rawcmd == 2) + reset_fdc_info(1); + if (fd_inb(FD_STATUS) != STATUS_READY) + FDCS->reset = 1; +} + +/* locks the driver */ +static int lock_fdc(int drive, bool interruptible) +{ + if (WARN(atomic_read(&usage_count) == 0, + "Trying to lock fdc while usage count=0\n")) + return -1; + + if (wait_event_interruptible(fdc_wait, !test_and_set_bit(0, &fdc_busy))) + return -EINTR; + + command_status = FD_COMMAND_NONE; + + reschedule_timeout(drive, "lock fdc"); + set_fdc(drive); + return 0; +} + +/* unlocks the driver */ +static void unlock_fdc(void) +{ + if (!test_bit(0, &fdc_busy)) + DPRINT("FDC access conflict!\n"); + + raw_cmd = NULL; + command_status = FD_COMMAND_NONE; + cancel_delayed_work(&fd_timeout); + do_floppy = NULL; + cont = NULL; + clear_bit(0, &fdc_busy); + wake_up(&fdc_wait); +} + +/* switches the motor off after a given timeout */ +static void motor_off_callback(unsigned long nr) +{ + unsigned char mask = ~(0x10 << UNIT(nr)); + + set_dor(FDC(nr), mask, 0); +} + +/* schedules motor off */ +static void floppy_off(unsigned int drive) +{ + unsigned long volatile delta; + int fdc = FDC(drive); + + if (!(FDCS->dor & (0x10 << UNIT(drive)))) + return; + + del_timer(motor_off_timer + drive); + + /* make spindle stop in a position which minimizes spinup time + * next time */ + if (UDP->rps) { + delta = jiffies - UDRS->first_read_date + HZ - + UDP->spindown_offset; + delta = ((delta * UDP->rps) % HZ) / UDP->rps; + motor_off_timer[drive].expires = + jiffies + UDP->spindown - delta; + } + add_timer(motor_off_timer + drive); +} + +/* + * cycle through all N_DRIVE floppy drives, for disk change testing. + * stopping at current drive. This is done before any long operation, to + * be sure to have up to date disk change information. + */ +static void scandrives(void) +{ + int i; + int drive; + int saved_drive; + + if (DP->select_delay) + return; + + saved_drive = current_drive; + for (i = 0; i < N_DRIVE; i++) { + drive = (saved_drive + i + 1) % N_DRIVE; + if (UDRS->fd_ref == 0 || UDP->select_delay != 0) + continue; /* skip closed drives */ + set_fdc(drive); + if (!(set_dor(fdc, ~3, UNIT(drive) | (0x10 << UNIT(drive))) & + (0x10 << UNIT(drive)))) + /* switch the motor off again, if it was off to + * begin with */ + set_dor(fdc, ~(0x10 << UNIT(drive)), 0); + } + set_fdc(saved_drive); +} + +static void empty(void) +{ +} + +static void (*floppy_work_fn)(void); + +static void floppy_work_workfn(struct work_struct *work) +{ + floppy_work_fn(); +} + +static DECLARE_WORK(floppy_work, floppy_work_workfn); + +static void schedule_bh(void (*handler)(void)) +{ + WARN_ON(work_pending(&floppy_work)); + + floppy_work_fn = handler; + queue_work(floppy_wq, &floppy_work); +} + +static void (*fd_timer_fn)(void) = NULL; + +static void fd_timer_workfn(struct work_struct *work) +{ + fd_timer_fn(); +} + +static DECLARE_DELAYED_WORK(fd_timer, fd_timer_workfn); + +static void cancel_activity(void) +{ + do_floppy = NULL; + cancel_delayed_work_sync(&fd_timer); + cancel_work_sync(&floppy_work); +} + +/* this function makes sure that the disk stays in the drive during the + * transfer */ +static void fd_watchdog(void) +{ + debug_dcl(DP->flags, "calling disk change from watchdog\n"); + + if (disk_change(current_drive)) { + DPRINT("disk removed during i/o\n"); + cancel_activity(); + cont->done(0); + reset_fdc(); + } else { + cancel_delayed_work(&fd_timer); + fd_timer_fn = fd_watchdog; + queue_delayed_work(floppy_wq, &fd_timer, HZ / 10); + } +} + +static void main_command_interrupt(void) +{ + cancel_delayed_work(&fd_timer); + cont->interrupt(); +} + +/* waits for a delay (spinup or select) to pass */ +static int fd_wait_for_completion(unsigned long expires, + void (*function)(void)) +{ + if (FDCS->reset) { + reset_fdc(); /* do the reset during sleep to win time + * if we don't need to sleep, it's a good + * occasion anyways */ + return 1; + } + + if (time_before(jiffies, expires)) { + cancel_delayed_work(&fd_timer); + fd_timer_fn = function; + queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies); + return 1; + } + return 0; +} + +static void setup_DMA(void) +{ + unsigned long f; + + if (raw_cmd->length == 0) { + int i; + + pr_info("zero dma transfer size:"); + for (i = 0; i < raw_cmd->cmd_count; i++) + pr_cont("%x,", raw_cmd->cmd[i]); + pr_cont("\n"); + cont->done(0); + FDCS->reset = 1; + return; + } + if (((unsigned long)raw_cmd->kernel_data) % 512) { + pr_info("non aligned address: %p\n", raw_cmd->kernel_data); + cont->done(0); + FDCS->reset = 1; + return; + } + f = claim_dma_lock(); + fd_disable_dma(); +#ifdef fd_dma_setup + if (fd_dma_setup(raw_cmd->kernel_data, raw_cmd->length, + (raw_cmd->flags & FD_RAW_READ) ? + DMA_MODE_READ : DMA_MODE_WRITE, FDCS->address) < 0) { + release_dma_lock(f); + cont->done(0); + FDCS->reset = 1; + return; + } + release_dma_lock(f); +#else + fd_clear_dma_ff(); + fd_cacheflush(raw_cmd->kernel_data, raw_cmd->length); + fd_set_dma_mode((raw_cmd->flags & FD_RAW_READ) ? + DMA_MODE_READ : DMA_MODE_WRITE); + fd_set_dma_addr(raw_cmd->kernel_data); + fd_set_dma_count(raw_cmd->length); + virtual_dma_port = FDCS->address; + fd_enable_dma(); + release_dma_lock(f); +#endif +} + +static void show_floppy(void); + +/* waits until the fdc becomes ready */ +static int wait_til_ready(void) +{ + int status; + int counter; + + if (FDCS->reset) + return -1; + for (counter = 0; counter < 10000; counter++) { + status = fd_inb(FD_STATUS); + if (status & STATUS_READY) + return status; + } + if (initialized) { + DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc); + show_floppy(); + } + FDCS->reset = 1; + return -1; +} + +/* sends a command byte to the fdc */ +static int output_byte(char byte) +{ + int status = wait_til_ready(); + + if (status < 0) + return -1; + + if (is_ready_state(status)) { + fd_outb(byte, FD_DATA); + output_log[output_log_pos].data = byte; + output_log[output_log_pos].status = status; + output_log[output_log_pos].jiffies = jiffies; + output_log_pos = (output_log_pos + 1) % OLOGSIZE; + return 0; + } + FDCS->reset = 1; + if (initialized) { + DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n", + byte, fdc, status); + show_floppy(); + } + return -1; +} + +/* gets the response from the fdc */ +static int result(void) +{ + int i; + int status = 0; + + for (i = 0; i < MAX_REPLIES; i++) { + status = wait_til_ready(); + if (status < 0) + break; + status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA; + if ((status & ~STATUS_BUSY) == STATUS_READY) { + resultjiffies = jiffies; + resultsize = i; + return i; + } + if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY)) + reply_buffer[i] = fd_inb(FD_DATA); + else + break; + } + if (initialized) { + DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n", + fdc, status, i); + show_floppy(); + } + FDCS->reset = 1; + return -1; +} + +#define MORE_OUTPUT -2 +/* does the fdc need more output? */ +static int need_more_output(void) +{ + int status = wait_til_ready(); + + if (status < 0) + return -1; + + if (is_ready_state(status)) + return MORE_OUTPUT; + + return result(); +} + +/* Set perpendicular mode as required, based on data rate, if supported. + * 82077 Now tested. 1Mbps data rate only possible with 82077-1. + */ +static void perpendicular_mode(void) +{ + unsigned char perp_mode; + + if (raw_cmd->rate & 0x40) { + switch (raw_cmd->rate & 3) { + case 0: + perp_mode = 2; + break; + case 3: + perp_mode = 3; + break; + default: + DPRINT("Invalid data rate for perpendicular mode!\n"); + cont->done(0); + FDCS->reset = 1; + /* + * convenient way to return to + * redo without too much hassle + * (deep stack et al.) + */ + return; + } + } else + perp_mode = 0; + + if (FDCS->perp_mode == perp_mode) + return; + if (FDCS->version >= FDC_82077_ORIG) { + output_byte(FD_PERPENDICULAR); + output_byte(perp_mode); + FDCS->perp_mode = perp_mode; + } else if (perp_mode) { + DPRINT("perpendicular mode not supported by this FDC.\n"); + } +} /* perpendicular_mode */ + +static int fifo_depth = 0xa; +static int no_fifo; + +static int fdc_configure(void) +{ + /* Turn on FIFO */ + output_byte(FD_CONFIGURE); + if (need_more_output() != MORE_OUTPUT) + return 0; + output_byte(0); + output_byte(0x10 | (no_fifo & 0x20) | (fifo_depth & 0xf)); + output_byte(0); /* pre-compensation from track + 0 upwards */ + return 1; +} + +#define NOMINAL_DTR 500 + +/* Issue a "SPECIFY" command to set the step rate time, head unload time, + * head load time, and DMA disable flag to values needed by floppy. + * + * The value "dtr" is the data transfer rate in Kbps. It is needed + * to account for the data rate-based scaling done by the 82072 and 82077 + * FDC types. This parameter is ignored for other types of FDCs (i.e. + * 8272a). + * + * Note that changing the data transfer rate has a (probably deleterious) + * effect on the parameters subject to scaling for 82072/82077 FDCs, so + * fdc_specify is called again after each data transfer rate + * change. + * + * srt: 1000 to 16000 in microseconds + * hut: 16 to 240 milliseconds + * hlt: 2 to 254 milliseconds + * + * These values are rounded up to the next highest available delay time. + */ +static void fdc_specify(void) +{ + unsigned char spec1; + unsigned char spec2; + unsigned long srt; + unsigned long hlt; + unsigned long hut; + unsigned long dtr = NOMINAL_DTR; + unsigned long scale_dtr = NOMINAL_DTR; + int hlt_max_code = 0x7f; + int hut_max_code = 0xf; + + if (FDCS->need_configure && FDCS->version >= FDC_82072A) { + fdc_configure(); + FDCS->need_configure = 0; + } + + switch (raw_cmd->rate & 0x03) { + case 3: + dtr = 1000; + break; + case 1: + dtr = 300; + if (FDCS->version >= FDC_82078) { + /* chose the default rate table, not the one + * where 1 = 2 Mbps */ + output_byte(FD_DRIVESPEC); + if (need_more_output() == MORE_OUTPUT) { + output_byte(UNIT(current_drive)); + output_byte(0xc0); + } + } + break; + case 2: + dtr = 250; + break; + } + + if (FDCS->version >= FDC_82072) { + scale_dtr = dtr; + hlt_max_code = 0x00; /* 0==256msec*dtr0/dtr (not linear!) */ + hut_max_code = 0x0; /* 0==256msec*dtr0/dtr (not linear!) */ + } + + /* Convert step rate from microseconds to milliseconds and 4 bits */ + srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR); + if (slow_floppy) + srt = srt / 4; + + SUPBOUND(srt, 0xf); + INFBOUND(srt, 0); + + hlt = DIV_ROUND_UP(DP->hlt * scale_dtr / 2, NOMINAL_DTR); + if (hlt < 0x01) + hlt = 0x01; + else if (hlt > 0x7f) + hlt = hlt_max_code; + + hut = DIV_ROUND_UP(DP->hut * scale_dtr / 16, NOMINAL_DTR); + if (hut < 0x1) + hut = 0x1; + else if (hut > 0xf) + hut = hut_max_code; + + spec1 = (srt << 4) | hut; + spec2 = (hlt << 1) | (use_virtual_dma & 1); + + /* If these parameters did not change, just return with success */ + if (FDCS->spec1 != spec1 || FDCS->spec2 != spec2) { + /* Go ahead and set spec1 and spec2 */ + output_byte(FD_SPECIFY); + output_byte(FDCS->spec1 = spec1); + output_byte(FDCS->spec2 = spec2); + } +} /* fdc_specify */ + +/* Set the FDC's data transfer rate on behalf of the specified drive. + * NOTE: with 82072/82077 FDCs, changing the data rate requires a reissue + * of the specify command (i.e. using the fdc_specify function). + */ +static int fdc_dtr(void) +{ + /* If data rate not already set to desired value, set it. */ + if ((raw_cmd->rate & 3) == FDCS->dtr) + return 0; + + /* Set dtr */ + fd_outb(raw_cmd->rate & 3, FD_DCR); + + /* TODO: some FDC/drive combinations (C&T 82C711 with TEAC 1.2MB) + * need a stabilization period of several milliseconds to be + * enforced after data rate changes before R/W operations. + * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies) + */ + FDCS->dtr = raw_cmd->rate & 3; + return fd_wait_for_completion(jiffies + 2UL * HZ / 100, floppy_ready); +} /* fdc_dtr */ + +static void tell_sector(void) +{ + pr_cont(": track %d, head %d, sector %d, size %d", + R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE); +} /* tell_sector */ + +static void print_errors(void) +{ + DPRINT(""); + if (ST0 & ST0_ECE) { + pr_cont("Recalibrate failed!"); + } else if (ST2 & ST2_CRC) { + pr_cont("data CRC error"); + tell_sector(); + } else if (ST1 & ST1_CRC) { + pr_cont("CRC error"); + tell_sector(); + } else if ((ST1 & (ST1_MAM | ST1_ND)) || + (ST2 & ST2_MAM)) { + if (!probing) { + pr_cont("sector not found"); + tell_sector(); + } else + pr_cont("probe failed..."); + } else if (ST2 & ST2_WC) { /* seek error */ + pr_cont("wrong cylinder"); + } else if (ST2 & ST2_BC) { /* cylinder marked as bad */ + pr_cont("bad cylinder"); + } else { + pr_cont("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x", + ST0, ST1, ST2); + tell_sector(); + } + pr_cont("\n"); +} + +/* + * OK, this error interpreting routine is called after a + * DMA read/write has succeeded + * or failed, so we check the results, and copy any buffers. + * hhb: Added better error reporting. + * ak: Made this into a separate routine. + */ +static int interpret_errors(void) +{ + char bad; + + if (inr != 7) { + DPRINT("-- FDC reply error\n"); + FDCS->reset = 1; + return 1; + } + + /* check IC to find cause of interrupt */ + switch (ST0 & ST0_INTR) { + case 0x40: /* error occurred during command execution */ + if (ST1 & ST1_EOC) + return 0; /* occurs with pseudo-DMA */ + bad = 1; + if (ST1 & ST1_WP) { + DPRINT("Drive is write protected\n"); + clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags); + cont->done(0); + bad = 2; + } else if (ST1 & ST1_ND) { + set_bit(FD_NEED_TWADDLE_BIT, &DRS->flags); + } else if (ST1 & ST1_OR) { + if (DP->flags & FTD_MSG) + DPRINT("Over/Underrun - retrying\n"); + bad = 0; + } else if (*errors >= DP->max_errors.reporting) { + print_errors(); + } + if (ST2 & ST2_WC || ST2 & ST2_BC) + /* wrong cylinder => recal */ + DRS->track = NEED_2_RECAL; + return bad; + case 0x80: /* invalid command given */ + DPRINT("Invalid FDC command given!\n"); + cont->done(0); + return 2; + case 0xc0: + DPRINT("Abnormal termination caused by polling\n"); + cont->error(); + return 2; + default: /* (0) Normal command termination */ + return 0; + } +} + +/* + * This routine is called when everything should be correctly set up + * for the transfer (i.e. floppy motor is on, the correct floppy is + * selected, and the head is sitting on the right track). + */ +static void setup_rw_floppy(void) +{ + int i; + int r; + int flags; + int dflags; + unsigned long ready_date; + void (*function)(void); + + flags = raw_cmd->flags; + if (flags & (FD_RAW_READ | FD_RAW_WRITE)) + flags |= FD_RAW_INTR; + + if ((flags & FD_RAW_SPIN) && !(flags & FD_RAW_NO_MOTOR)) { + ready_date = DRS->spinup_date + DP->spinup; + /* If spinup will take a long time, rerun scandrives + * again just before spinup completion. Beware that + * after scandrives, we must again wait for selection. + */ + if (time_after(ready_date, jiffies + DP->select_delay)) { + ready_date -= DP->select_delay; + function = floppy_start; + } else + function = setup_rw_floppy; + + /* wait until the floppy is spinning fast enough */ + if (fd_wait_for_completion(ready_date, function)) + return; + } + dflags = DRS->flags; + + if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE)) + setup_DMA(); + + if (flags & FD_RAW_INTR) + do_floppy = main_command_interrupt; + + r = 0; + for (i = 0; i < raw_cmd->cmd_count; i++) + r |= output_byte(raw_cmd->cmd[i]); + + debugt(__func__, "rw_command"); + + if (r) { + cont->error(); + reset_fdc(); + return; + } + + if (!(flags & FD_RAW_INTR)) { + inr = result(); + cont->interrupt(); + } else if (flags & FD_RAW_NEED_DISK) + fd_watchdog(); +} + +static int blind_seek; + +/* + * This is the routine called after every seek (or recalibrate) interrupt + * from the floppy controller. + */ +static void seek_interrupt(void) +{ + debugt(__func__, ""); + if (inr != 2 || (ST0 & 0xF8) != 0x20) { + DPRINT("seek failed\n"); + DRS->track = NEED_2_RECAL; + cont->error(); + cont->redo(); + return; + } + if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) { + debug_dcl(DP->flags, + "clearing NEWCHANGE flag because of effective seek\n"); + debug_dcl(DP->flags, "jiffies=%lu\n", jiffies); + clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); + /* effective seek */ + DRS->select_date = jiffies; + } + DRS->track = ST1; + floppy_ready(); +} + +static void check_wp(void) +{ + if (test_bit(FD_VERIFY_BIT, &DRS->flags)) { + /* check write protection */ + output_byte(FD_GETSTATUS); + output_byte(UNIT(current_drive)); + if (result() != 1) { + FDCS->reset = 1; + return; + } + clear_bit(FD_VERIFY_BIT, &DRS->flags); + clear_bit(FD_NEED_TWADDLE_BIT, &DRS->flags); + debug_dcl(DP->flags, + "checking whether disk is write protected\n"); + debug_dcl(DP->flags, "wp=%x\n", ST3 & 0x40); + if (!(ST3 & 0x40)) + set_bit(FD_DISK_WRITABLE_BIT, &DRS->flags); + else + clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags); + } +} + +static void seek_floppy(void) +{ + int track; + + blind_seek = 0; + + debug_dcl(DP->flags, "calling disk change from %s\n", __func__); + + if (!test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) && + disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) { + /* the media changed flag should be cleared after the seek. + * If it isn't, this means that there is really no disk in + * the drive. + */ + set_bit(FD_DISK_CHANGED_BIT, &DRS->flags); + cont->done(0); + cont->redo(); + return; + } + if (DRS->track <= NEED_1_RECAL) { + recalibrate_floppy(); + return; + } else if (test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) && + (raw_cmd->flags & FD_RAW_NEED_DISK) && + (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) { + /* we seek to clear the media-changed condition. Does anybody + * know a more elegant way, which works on all drives? */ + if (raw_cmd->track) + track = raw_cmd->track - 1; + else { + if (DP->flags & FD_SILENT_DCL_CLEAR) { + set_dor(fdc, ~(0x10 << UNIT(current_drive)), 0); + blind_seek = 1; + raw_cmd->flags |= FD_RAW_NEED_SEEK; + } + track = 1; + } + } else { + check_wp(); + if (raw_cmd->track != DRS->track && + (raw_cmd->flags & FD_RAW_NEED_SEEK)) + track = raw_cmd->track; + else { + setup_rw_floppy(); + return; + } + } + + do_floppy = seek_interrupt; + output_byte(FD_SEEK); + output_byte(UNIT(current_drive)); + if (output_byte(track) < 0) { + reset_fdc(); + return; + } + debugt(__func__, ""); +} + +static void recal_interrupt(void) +{ + debugt(__func__, ""); + if (inr != 2) + FDCS->reset = 1; + else if (ST0 & ST0_ECE) { + switch (DRS->track) { + case NEED_1_RECAL: + debugt(__func__, "need 1 recal"); + /* after a second recalibrate, we still haven't + * reached track 0. Probably no drive. Raise an + * error, as failing immediately might upset + * computers possessed by the Devil :-) */ + cont->error(); + cont->redo(); + return; + case NEED_2_RECAL: + debugt(__func__, "need 2 recal"); + /* If we already did a recalibrate, + * and we are not at track 0, this + * means we have moved. (The only way + * not to move at recalibration is to + * be already at track 0.) Clear the + * new change flag */ + debug_dcl(DP->flags, + "clearing NEWCHANGE flag because of second recalibrate\n"); + + clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); + DRS->select_date = jiffies; + /* fall through */ + default: + debugt(__func__, "default"); + /* Recalibrate moves the head by at + * most 80 steps. If after one + * recalibrate we don't have reached + * track 0, this might mean that we + * started beyond track 80. Try + * again. */ + DRS->track = NEED_1_RECAL; + break; + } + } else + DRS->track = ST1; + floppy_ready(); +} + +static void print_result(char *message, int inr) +{ + int i; + + DPRINT("%s ", message); + if (inr >= 0) + for (i = 0; i < inr; i++) + pr_cont("repl[%d]=%x ", i, reply_buffer[i]); + pr_cont("\n"); +} + +/* interrupt handler. Note that this can be called externally on the Sparc */ +irqreturn_t floppy_interrupt(int irq, void *dev_id) +{ + int do_print; + unsigned long f; + void (*handler)(void) = do_floppy; + + lasthandler = handler; + interruptjiffies = jiffies; + + f = claim_dma_lock(); + fd_disable_dma(); + release_dma_lock(f); + + do_floppy = NULL; + if (fdc >= N_FDC || FDCS->address == -1) { + /* we don't even know which FDC is the culprit */ + pr_info("DOR0=%x\n", fdc_state[0].dor); + pr_info("floppy interrupt on bizarre fdc %d\n", fdc); + pr_info("handler=%pf\n", handler); + is_alive(__func__, "bizarre fdc"); + return IRQ_NONE; + } + + FDCS->reset = 0; + /* We have to clear the reset flag here, because apparently on boxes + * with level triggered interrupts (PS/2, Sparc, ...), it is needed to + * emit SENSEI's to clear the interrupt line. And FDCS->reset blocks the + * emission of the SENSEI's. + * It is OK to emit floppy commands because we are in an interrupt + * handler here, and thus we have to fear no interference of other + * activity. + */ + + do_print = !handler && print_unex && initialized; + + inr = result(); + if (do_print) + print_result("unexpected interrupt", inr); + if (inr == 0) { + int max_sensei = 4; + do { + output_byte(FD_SENSEI); + inr = result(); + if (do_print) + print_result("sensei", inr); + max_sensei--; + } while ((ST0 & 0x83) != UNIT(current_drive) && + inr == 2 && max_sensei); + } + if (!handler) { + FDCS->reset = 1; + return IRQ_NONE; + } + schedule_bh(handler); + is_alive(__func__, "normal interrupt end"); + + /* FIXME! Was it really for us? */ + return IRQ_HANDLED; +} + +static void recalibrate_floppy(void) +{ + debugt(__func__, ""); + do_floppy = recal_interrupt; + output_byte(FD_RECALIBRATE); + if (output_byte(UNIT(current_drive)) < 0) + reset_fdc(); +} + +/* + * Must do 4 FD_SENSEIs after reset because of ``drive polling''. + */ +static void reset_interrupt(void) +{ + debugt(__func__, ""); + result(); /* get the status ready for set_fdc */ + if (FDCS->reset) { + pr_info("reset set in interrupt, calling %pf\n", cont->error); + cont->error(); /* a reset just after a reset. BAD! */ + } + cont->redo(); +} + +/* + * reset is done by pulling bit 2 of DOR low for a while (old FDCs), + * or by setting the self clearing bit 7 of STATUS (newer FDCs) + */ +static void reset_fdc(void) +{ + unsigned long flags; + + do_floppy = reset_interrupt; + FDCS->reset = 0; + reset_fdc_info(0); + + /* Pseudo-DMA may intercept 'reset finished' interrupt. */ + /* Irrelevant for systems with true DMA (i386). */ + + flags = claim_dma_lock(); + fd_disable_dma(); + release_dma_lock(flags); + + if (FDCS->version >= FDC_82072A) + fd_outb(0x80 | (FDCS->dtr & 3), FD_STATUS); + else { + fd_outb(FDCS->dor & ~0x04, FD_DOR); + udelay(FD_RESET_DELAY); + fd_outb(FDCS->dor, FD_DOR); + } +} + +static void show_floppy(void) +{ + int i; + + pr_info("\n"); + pr_info("floppy driver state\n"); + pr_info("-------------------\n"); + pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n", + jiffies, interruptjiffies, jiffies - interruptjiffies, + lasthandler); + + pr_info("timeout_message=%s\n", timeout_message); + pr_info("last output bytes:\n"); + for (i = 0; i < OLOGSIZE; i++) + pr_info("%2x %2x %lu\n", + output_log[(i + output_log_pos) % OLOGSIZE].data, + output_log[(i + output_log_pos) % OLOGSIZE].status, + output_log[(i + output_log_pos) % OLOGSIZE].jiffies); + pr_info("last result at %lu\n", resultjiffies); + pr_info("last redo_fd_request at %lu\n", lastredo); + print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1, + reply_buffer, resultsize, true); + + pr_info("status=%x\n", fd_inb(FD_STATUS)); + pr_info("fdc_busy=%lu\n", fdc_busy); + if (do_floppy) + pr_info("do_floppy=%pf\n", do_floppy); + if (work_pending(&floppy_work)) + pr_info("floppy_work.func=%pf\n", floppy_work.func); + if (delayed_work_pending(&fd_timer)) + pr_info("delayed work.function=%p expires=%ld\n", + fd_timer.work.func, + fd_timer.timer.expires - jiffies); + if (delayed_work_pending(&fd_timeout)) + pr_info("timer_function=%p expires=%ld\n", + fd_timeout.work.func, + fd_timeout.timer.expires - jiffies); + + pr_info("cont=%p\n", cont); + pr_info("current_req=%p\n", current_req); + pr_info("command_status=%d\n", command_status); + pr_info("\n"); +} + +static void floppy_shutdown(struct work_struct *arg) +{ + unsigned long flags; + + if (initialized) + show_floppy(); + cancel_activity(); + + flags = claim_dma_lock(); + fd_disable_dma(); + release_dma_lock(flags); + + /* avoid dma going to a random drive after shutdown */ + + if (initialized) + DPRINT("floppy timeout called\n"); + FDCS->reset = 1; + if (cont) { + cont->done(0); + cont->redo(); /* this will recall reset when needed */ + } else { + pr_info("no cont in shutdown!\n"); + process_fd_request(); + } + is_alive(__func__, ""); +} + +/* start motor, check media-changed condition and write protection */ +static int start_motor(void (*function)(void)) +{ + int mask; + int data; + + mask = 0xfc; + data = UNIT(current_drive); + if (!(raw_cmd->flags & FD_RAW_NO_MOTOR)) { + if (!(FDCS->dor & (0x10 << UNIT(current_drive)))) { + set_debugt(); + /* no read since this drive is running */ + DRS->first_read_date = 0; + /* note motor start time if motor is not yet running */ + DRS->spinup_date = jiffies; + data |= (0x10 << UNIT(current_drive)); + } + } else if (FDCS->dor & (0x10 << UNIT(current_drive))) + mask &= ~(0x10 << UNIT(current_drive)); + + /* starts motor and selects floppy */ + del_timer(motor_off_timer + current_drive); + set_dor(fdc, mask, data); + + /* wait_for_completion also schedules reset if needed. */ + return fd_wait_for_completion(DRS->select_date + DP->select_delay, + function); +} + +static void floppy_ready(void) +{ + if (FDCS->reset) { + reset_fdc(); + return; + } + if (start_motor(floppy_ready)) + return; + if (fdc_dtr()) + return; + + debug_dcl(DP->flags, "calling disk change from floppy_ready\n"); + if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) && + disk_change(current_drive) && !DP->select_delay) + twaddle(); /* this clears the dcl on certain + * drive/controller combinations */ + +#ifdef fd_chose_dma_mode + if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) { + unsigned long flags = claim_dma_lock(); + fd_chose_dma_mode(raw_cmd->kernel_data, raw_cmd->length); + release_dma_lock(flags); + } +#endif + + if (raw_cmd->flags & (FD_RAW_NEED_SEEK | FD_RAW_NEED_DISK)) { + perpendicular_mode(); + fdc_specify(); /* must be done here because of hut, hlt ... */ + seek_floppy(); + } else { + if ((raw_cmd->flags & FD_RAW_READ) || + (raw_cmd->flags & FD_RAW_WRITE)) + fdc_specify(); + setup_rw_floppy(); + } +} + +static void floppy_start(void) +{ + reschedule_timeout(current_reqD, "floppy start"); + + scandrives(); + debug_dcl(DP->flags, "setting NEWCHANGE in floppy_start\n"); + set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); + floppy_ready(); +} + +/* + * ======================================================================== + * here ends the bottom half. Exported routines are: + * floppy_start, floppy_off, floppy_ready, lock_fdc, unlock_fdc, set_fdc, + * start_motor, reset_fdc, reset_fdc_info, interpret_errors. + * Initialization also uses output_byte, result, set_dor, floppy_interrupt + * and set_dor. + * ======================================================================== + */ +/* + * General purpose continuations. + * ============================== + */ + +static void do_wakeup(void) +{ + reschedule_timeout(MAXTIMEOUT, "do wakeup"); + cont = NULL; + command_status += 2; + wake_up(&command_done); +} + +static const struct cont_t wakeup_cont = { + .interrupt = empty, + .redo = do_wakeup, + .error = empty, + .done = (done_f)empty +}; + +static const struct cont_t intr_cont = { + .interrupt = empty, + .redo = process_fd_request, + .error = empty, + .done = (done_f)empty +}; + +static int wait_til_done(void (*handler)(void), bool interruptible) +{ + int ret; + + schedule_bh(handler); + + if (interruptible) + wait_event_interruptible(command_done, command_status >= 2); + else + wait_event(command_done, command_status >= 2); + + if (command_status < 2) { + cancel_activity(); + cont = &intr_cont; + reset_fdc(); + return -EINTR; + } + + if (FDCS->reset) + command_status = FD_COMMAND_ERROR; + if (command_status == FD_COMMAND_OKAY) + ret = 0; + else + ret = -EIO; + command_status = FD_COMMAND_NONE; + return ret; +} + +static void generic_done(int result) +{ + command_status = result; + cont = &wakeup_cont; +} + +static void generic_success(void) +{ + cont->done(1); +} + +static void generic_failure(void) +{ + cont->done(0); +} + +static void success_and_wakeup(void) +{ + generic_success(); + cont->redo(); +} + +/* + * formatting and rw support. + * ========================== + */ + +static int next_valid_format(void) +{ + int probed_format; + + probed_format = DRS->probed_format; + while (1) { + if (probed_format >= 8 || !DP->autodetect[probed_format]) { + DRS->probed_format = 0; + return 1; + } + if (floppy_type[DP->autodetect[probed_format]].sect) { + DRS->probed_format = probed_format; + return 0; + } + probed_format++; + } +} + +static void bad_flp_intr(void) +{ + int err_count; + + if (probing) { + DRS->probed_format++; + if (!next_valid_format()) + return; + } + err_count = ++(*errors); + INFBOUND(DRWE->badness, err_count); + if (err_count > DP->max_errors.abort) + cont->done(0); + if (err_count > DP->max_errors.reset) + FDCS->reset = 1; + else if (err_count > DP->max_errors.recal) + DRS->track = NEED_2_RECAL; +} + +static void set_floppy(int drive) +{ + int type = ITYPE(UDRS->fd_device); + + if (type) + _floppy = floppy_type + type; + else + _floppy = current_type[drive]; +} + +/* + * formatting support. + * =================== + */ +static void format_interrupt(void) +{ + switch (interpret_errors()) { + case 1: + cont->error(); + case 2: + break; + case 0: + cont->done(1); + } + cont->redo(); +} + +#define FM_MODE(x, y) ((y) & ~(((x)->rate & 0x80) >> 1)) +#define CT(x) ((x) | 0xc0) + +static void setup_format_params(int track) +{ + int n; + int il; + int count; + int head_shift; + int track_shift; + struct fparm { + unsigned char track, head, sect, size; + } *here = (struct fparm *)floppy_track_buffer; + + raw_cmd = &default_raw_cmd; + raw_cmd->track = track; + + raw_cmd->flags = (FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN | + FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK); + raw_cmd->rate = _floppy->rate & 0x43; + raw_cmd->cmd_count = NR_F; + COMMAND = FM_MODE(_floppy, FD_FORMAT); + DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, format_req.head); + F_SIZECODE = FD_SIZECODE(_floppy); + F_SECT_PER_TRACK = _floppy->sect << 2 >> F_SIZECODE; + F_GAP = _floppy->fmt_gap; + F_FILL = FD_FILL_BYTE; + + raw_cmd->kernel_data = floppy_track_buffer; + raw_cmd->length = 4 * F_SECT_PER_TRACK; + + /* allow for about 30ms for data transport per track */ + head_shift = (F_SECT_PER_TRACK + 5) / 6; + + /* a ``cylinder'' is two tracks plus a little stepping time */ + track_shift = 2 * head_shift + 3; + + /* position of logical sector 1 on this track */ + n = (track_shift * format_req.track + head_shift * format_req.head) + % F_SECT_PER_TRACK; + + /* determine interleave */ + il = 1; + if (_floppy->fmt_gap < 0x22) + il++; + + /* initialize field */ + for (count = 0; count < F_SECT_PER_TRACK; ++count) { + here[count].track = format_req.track; + here[count].head = format_req.head; + here[count].sect = 0; + here[count].size = F_SIZECODE; + } + /* place logical sectors */ + for (count = 1; count <= F_SECT_PER_TRACK; ++count) { + here[n].sect = count; + n = (n + il) % F_SECT_PER_TRACK; + if (here[n].sect) { /* sector busy, find next free sector */ + ++n; + if (n >= F_SECT_PER_TRACK) { + n -= F_SECT_PER_TRACK; + while (here[n].sect) + ++n; + } + } + } + if (_floppy->stretch & FD_SECTBASEMASK) { + for (count = 0; count < F_SECT_PER_TRACK; count++) + here[count].sect += FD_SECTBASE(_floppy) - 1; + } +} + +static void redo_format(void) +{ + buffer_track = -1; + setup_format_params(format_req.track << STRETCH(_floppy)); + floppy_start(); + debugt(__func__, "queue format request"); +} + +static const struct cont_t format_cont = { + .interrupt = format_interrupt, + .redo = redo_format, + .error = bad_flp_intr, + .done = generic_done +}; + +static int do_format(int drive, struct format_descr *tmp_format_req) +{ + int ret; + + if (lock_fdc(drive, true)) + return -EINTR; + + set_floppy(drive); + if (!_floppy || + _floppy->track > DP->tracks || + tmp_format_req->track >= _floppy->track || + tmp_format_req->head >= _floppy->head || + (_floppy->sect << 2) % (1 << FD_SIZECODE(_floppy)) || + !_floppy->fmt_gap) { + process_fd_request(); + return -EINVAL; + } + format_req = *tmp_format_req; + format_errors = 0; + cont = &format_cont; + errors = &format_errors; + ret = wait_til_done(redo_format, true); + if (ret == -EINTR) + return -EINTR; + process_fd_request(); + return ret; +} + +/* + * Buffer read/write and support + * ============================= + */ + +static void floppy_end_request(struct request *req, int error) +{ + unsigned int nr_sectors = current_count_sectors; + unsigned int drive = (unsigned long)req->rq_disk->private_data; + + /* current_count_sectors can be zero if transfer failed */ + if (error) + nr_sectors = blk_rq_cur_sectors(req); + if (__blk_end_request(req, error, nr_sectors << 9)) + return; + + /* We're done with the request */ + floppy_off(drive); + current_req = NULL; +} + +/* new request_done. Can handle physical sectors which are smaller than a + * logical buffer */ +static void request_done(int uptodate) +{ + struct request *req = current_req; + struct request_queue *q; + unsigned long flags; + int block; + char msg[sizeof("request done ") + sizeof(int) * 3]; + + probing = 0; + snprintf(msg, sizeof(msg), "request done %d", uptodate); + reschedule_timeout(MAXTIMEOUT, msg); + + if (!req) { + pr_info("floppy.c: no request in request_done\n"); + return; + } + + q = req->q; + + if (uptodate) { + /* maintain values for invalidation on geometry + * change */ + block = current_count_sectors + blk_rq_pos(req); + INFBOUND(DRS->maxblock, block); + if (block > _floppy->sect) + DRS->maxtrack = 1; + + /* unlock chained buffers */ + spin_lock_irqsave(q->queue_lock, flags); + floppy_end_request(req, 0); + spin_unlock_irqrestore(q->queue_lock, flags); + } else { + if (rq_data_dir(req) == WRITE) { + /* record write error information */ + DRWE->write_errors++; + if (DRWE->write_errors == 1) { + DRWE->first_error_sector = blk_rq_pos(req); + DRWE->first_error_generation = DRS->generation; + } + DRWE->last_error_sector = blk_rq_pos(req); + DRWE->last_error_generation = DRS->generation; + } + spin_lock_irqsave(q->queue_lock, flags); + floppy_end_request(req, -EIO); + spin_unlock_irqrestore(q->queue_lock, flags); + } +} + +/* Interrupt handler evaluating the result of the r/w operation */ +static void rw_interrupt(void) +{ + int eoc; + int ssize; + int heads; + int nr_sectors; + + if (R_HEAD >= 2) { + /* some Toshiba floppy controllers occasionnally seem to + * return bogus interrupts after read/write operations, which + * can be recognized by a bad head number (>= 2) */ + return; + } + + if (!DRS->first_read_date) + DRS->first_read_date = jiffies; + + nr_sectors = 0; + ssize = DIV_ROUND_UP(1 << SIZECODE, 4); + + if (ST1 & ST1_EOC) + eoc = 1; + else + eoc = 0; + + if (COMMAND & 0x80) + heads = 2; + else + heads = 1; + + nr_sectors = (((R_TRACK - TRACK) * heads + + R_HEAD - HEAD) * SECT_PER_TRACK + + R_SECTOR - SECTOR + eoc) << SIZECODE >> 2; + + if (nr_sectors / ssize > + DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) { + DPRINT("long rw: %x instead of %lx\n", + nr_sectors, current_count_sectors); + pr_info("rs=%d s=%d\n", R_SECTOR, SECTOR); + pr_info("rh=%d h=%d\n", R_HEAD, HEAD); + pr_info("rt=%d t=%d\n", R_TRACK, TRACK); + pr_info("heads=%d eoc=%d\n", heads, eoc); + pr_info("spt=%d st=%d ss=%d\n", + SECT_PER_TRACK, fsector_t, ssize); + pr_info("in_sector_offset=%d\n", in_sector_offset); + } + + nr_sectors -= in_sector_offset; + INFBOUND(nr_sectors, 0); + SUPBOUND(current_count_sectors, nr_sectors); + + switch (interpret_errors()) { + case 2: + cont->redo(); + return; + case 1: + if (!current_count_sectors) { + cont->error(); + cont->redo(); + return; + } + break; + case 0: + if (!current_count_sectors) { + cont->redo(); + return; + } + current_type[current_drive] = _floppy; + floppy_sizes[TOMINOR(current_drive)] = _floppy->size; + break; + } + + if (probing) { + if (DP->flags & FTD_MSG) + DPRINT("Auto-detected floppy type %s in fd%d\n", + _floppy->name, current_drive); + current_type[current_drive] = _floppy; + floppy_sizes[TOMINOR(current_drive)] = _floppy->size; + probing = 0; + } + + if (CT(COMMAND) != FD_READ || + raw_cmd->kernel_data == bio_data(current_req->bio)) { + /* transfer directly from buffer */ + cont->done(1); + } else if (CT(COMMAND) == FD_READ) { + buffer_track = raw_cmd->track; + buffer_drive = current_drive; + INFBOUND(buffer_max, nr_sectors + fsector_t); + } + cont->redo(); +} + +/* Compute maximal contiguous buffer size. */ +static int buffer_chain_size(void) +{ + struct bio_vec bv; + int size; + struct req_iterator iter; + char *base; + + base = bio_data(current_req->bio); + size = 0; + + rq_for_each_segment(bv, current_req, iter) { + if (page_address(bv.bv_page) + bv.bv_offset != base + size) + break; + + size += bv.bv_len; + } + + return size >> 9; +} + +/* Compute the maximal transfer size */ +static int transfer_size(int ssize, int max_sector, int max_size) +{ + SUPBOUND(max_sector, fsector_t + max_size); + + /* alignment */ + max_sector -= (max_sector % _floppy->sect) % ssize; + + /* transfer size, beginning not aligned */ + current_count_sectors = max_sector - fsector_t; + + return max_sector; +} + +/* + * Move data from/to the track buffer to/from the buffer cache. + */ +static void copy_buffer(int ssize, int max_sector, int max_sector_2) +{ + int remaining; /* number of transferred 512-byte sectors */ + struct bio_vec bv; + char *buffer; + char *dma_buffer; + int size; + struct req_iterator iter; + + max_sector = transfer_size(ssize, + min(max_sector, max_sector_2), + blk_rq_sectors(current_req)); + + if (current_count_sectors <= 0 && CT(COMMAND) == FD_WRITE && + buffer_max > fsector_t + blk_rq_sectors(current_req)) + current_count_sectors = min_t(int, buffer_max - fsector_t, + blk_rq_sectors(current_req)); + + remaining = current_count_sectors << 9; + if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) { + DPRINT("in copy buffer\n"); + pr_info("current_count_sectors=%ld\n", current_count_sectors); + pr_info("remaining=%d\n", remaining >> 9); + pr_info("current_req->nr_sectors=%u\n", + blk_rq_sectors(current_req)); + pr_info("current_req->current_nr_sectors=%u\n", + blk_rq_cur_sectors(current_req)); + pr_info("max_sector=%d\n", max_sector); + pr_info("ssize=%d\n", ssize); + } + + buffer_max = max(max_sector, buffer_max); + + dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9); + + size = blk_rq_cur_bytes(current_req); + + rq_for_each_segment(bv, current_req, iter) { + if (!remaining) + break; + + size = bv.bv_len; + SUPBOUND(size, remaining); + + buffer = page_address(bv.bv_page) + bv.bv_offset; + if (dma_buffer + size > + floppy_track_buffer + (max_buffer_sectors << 10) || + dma_buffer < floppy_track_buffer) { + DPRINT("buffer overrun in copy buffer %d\n", + (int)((floppy_track_buffer - dma_buffer) >> 9)); + pr_info("fsector_t=%d buffer_min=%d\n", + fsector_t, buffer_min); + pr_info("current_count_sectors=%ld\n", + current_count_sectors); + if (CT(COMMAND) == FD_READ) + pr_info("read\n"); + if (CT(COMMAND) == FD_WRITE) + pr_info("write\n"); + break; + } + if (((unsigned long)buffer) % 512) + DPRINT("%p buffer not aligned\n", buffer); + + if (CT(COMMAND) == FD_READ) + memcpy(buffer, dma_buffer, size); + else + memcpy(dma_buffer, buffer, size); + + remaining -= size; + dma_buffer += size; + } + if (remaining) { + if (remaining > 0) + max_sector -= remaining >> 9; + DPRINT("weirdness: remaining %d\n", remaining >> 9); + } +} + +/* work around a bug in pseudo DMA + * (on some FDCs) pseudo DMA does not stop when the CPU stops + * sending data. Hence we need a different way to signal the + * transfer length: We use SECT_PER_TRACK. Unfortunately, this + * does not work with MT, hence we can only transfer one head at + * a time + */ +static void virtualdmabug_workaround(void) +{ + int hard_sectors; + int end_sector; + + if (CT(COMMAND) == FD_WRITE) { + COMMAND &= ~0x80; /* switch off multiple track mode */ + + hard_sectors = raw_cmd->length >> (7 + SIZECODE); + end_sector = SECTOR + hard_sectors - 1; + if (end_sector > SECT_PER_TRACK) { + pr_info("too many sectors %d > %d\n", + end_sector, SECT_PER_TRACK); + return; + } + SECT_PER_TRACK = end_sector; + /* make sure SECT_PER_TRACK + * points to end of transfer */ + } +} + +/* + * Formulate a read/write request. + * this routine decides where to load the data (directly to buffer, or to + * tmp floppy area), how much data to load (the size of the buffer, the whole + * track, or a single sector) + * All floppy_track_buffer handling goes in here. If we ever add track buffer + * allocation on the fly, it should be done here. No other part should need + * modification. + */ + +static int make_raw_rw_request(void) +{ + int aligned_sector_t; + int max_sector; + int max_size; + int tracksize; + int ssize; + + if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n")) + return 0; + + set_fdc((long)current_req->rq_disk->private_data); + + raw_cmd = &default_raw_cmd; + raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; + raw_cmd->cmd_count = NR_RW; + if (rq_data_dir(current_req) == READ) { + raw_cmd->flags |= FD_RAW_READ; + COMMAND = FM_MODE(_floppy, FD_READ); + } else if (rq_data_dir(current_req) == WRITE) { + raw_cmd->flags |= FD_RAW_WRITE; + COMMAND = FM_MODE(_floppy, FD_WRITE); + } else { + DPRINT("%s: unknown command\n", __func__); + return 0; + } + + max_sector = _floppy->sect * _floppy->head; + + TRACK = (int)blk_rq_pos(current_req) / max_sector; + fsector_t = (int)blk_rq_pos(current_req) % max_sector; + if (_floppy->track && TRACK >= _floppy->track) { + if (blk_rq_cur_sectors(current_req) & 1) { + current_count_sectors = 1; + return 1; + } else + return 0; + } + HEAD = fsector_t / _floppy->sect; + + if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) || + test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) && + fsector_t < _floppy->sect) + max_sector = _floppy->sect; + + /* 2M disks have phantom sectors on the first track */ + if ((_floppy->rate & FD_2M) && (!TRACK) && (!HEAD)) { + max_sector = 2 * _floppy->sect / 3; + if (fsector_t >= max_sector) { + current_count_sectors = + min_t(int, _floppy->sect - fsector_t, + blk_rq_sectors(current_req)); + return 1; + } + SIZECODE = 2; + } else + SIZECODE = FD_SIZECODE(_floppy); + raw_cmd->rate = _floppy->rate & 0x43; + if ((_floppy->rate & FD_2M) && (TRACK || HEAD) && raw_cmd->rate == 2) + raw_cmd->rate = 1; + + if (SIZECODE) + SIZECODE2 = 0xff; + else + SIZECODE2 = 0x80; + raw_cmd->track = TRACK << STRETCH(_floppy); + DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD); + GAP = _floppy->gap; + ssize = DIV_ROUND_UP(1 << SIZECODE, 4); + SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE; + SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) + + FD_SECTBASE(_floppy); + + /* tracksize describes the size which can be filled up with sectors + * of size ssize. + */ + tracksize = _floppy->sect - _floppy->sect % ssize; + if (tracksize < _floppy->sect) { + SECT_PER_TRACK++; + if (tracksize <= fsector_t % _floppy->sect) + SECTOR--; + + /* if we are beyond tracksize, fill up using smaller sectors */ + while (tracksize <= fsector_t % _floppy->sect) { + while (tracksize + ssize > _floppy->sect) { + SIZECODE--; + ssize >>= 1; + } + SECTOR++; + SECT_PER_TRACK++; + tracksize += ssize; + } + max_sector = HEAD * _floppy->sect + tracksize; + } else if (!TRACK && !HEAD && !(_floppy->rate & FD_2M) && probing) { + max_sector = _floppy->sect; + } else if (!HEAD && CT(COMMAND) == FD_WRITE) { + /* for virtual DMA bug workaround */ + max_sector = _floppy->sect; + } + + in_sector_offset = (fsector_t % _floppy->sect) % ssize; + aligned_sector_t = fsector_t - in_sector_offset; + max_size = blk_rq_sectors(current_req); + if ((raw_cmd->track == buffer_track) && + (current_drive == buffer_drive) && + (fsector_t >= buffer_min) && (fsector_t < buffer_max)) { + /* data already in track buffer */ + if (CT(COMMAND) == FD_READ) { + copy_buffer(1, max_sector, buffer_max); + return 1; + } + } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) { + if (CT(COMMAND) == FD_WRITE) { + unsigned int sectors; + + sectors = fsector_t + blk_rq_sectors(current_req); + if (sectors > ssize && sectors < ssize + ssize) + max_size = ssize + ssize; + else + max_size = ssize; + } + raw_cmd->flags &= ~FD_RAW_WRITE; + raw_cmd->flags |= FD_RAW_READ; + COMMAND = FM_MODE(_floppy, FD_READ); + } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) { + unsigned long dma_limit; + int direct, indirect; + + indirect = + transfer_size(ssize, max_sector, + max_buffer_sectors * 2) - fsector_t; + + /* + * Do NOT use minimum() here---MAX_DMA_ADDRESS is 64 bits wide + * on a 64 bit machine! + */ + max_size = buffer_chain_size(); + dma_limit = (MAX_DMA_ADDRESS - + ((unsigned long)bio_data(current_req->bio))) >> 9; + if ((unsigned long)max_size > dma_limit) + max_size = dma_limit; + /* 64 kb boundaries */ + if (CROSS_64KB(bio_data(current_req->bio), max_size << 9)) + max_size = (K_64 - + ((unsigned long)bio_data(current_req->bio)) % + K_64) >> 9; + direct = transfer_size(ssize, max_sector, max_size) - fsector_t; + /* + * We try to read tracks, but if we get too many errors, we + * go back to reading just one sector at a time. + * + * This means we should be able to read a sector even if there + * are other bad sectors on this track. + */ + if (!direct || + (indirect * 2 > direct * 3 && + *errors < DP->max_errors.read_track && + ((!probing || + (DP->read_track & (1 << DRS->probed_format)))))) { + max_size = blk_rq_sectors(current_req); + } else { + raw_cmd->kernel_data = bio_data(current_req->bio); + raw_cmd->length = current_count_sectors << 9; + if (raw_cmd->length == 0) { + DPRINT("%s: zero dma transfer attempted\n", __func__); + DPRINT("indirect=%d direct=%d fsector_t=%d\n", + indirect, direct, fsector_t); + return 0; + } + virtualdmabug_workaround(); + return 2; + } + } + + if (CT(COMMAND) == FD_READ) + max_size = max_sector; /* unbounded */ + + /* claim buffer track if needed */ + if (buffer_track != raw_cmd->track || /* bad track */ + buffer_drive != current_drive || /* bad drive */ + fsector_t > buffer_max || + fsector_t < buffer_min || + ((CT(COMMAND) == FD_READ || + (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) && + max_sector > 2 * max_buffer_sectors + buffer_min && + max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)) { + /* not enough space */ + buffer_track = -1; + buffer_drive = current_drive; + buffer_max = buffer_min = aligned_sector_t; + } + raw_cmd->kernel_data = floppy_track_buffer + + ((aligned_sector_t - buffer_min) << 9); + + if (CT(COMMAND) == FD_WRITE) { + /* copy write buffer to track buffer. + * if we get here, we know that the write + * is either aligned or the data already in the buffer + * (buffer will be overwritten) */ + if (in_sector_offset && buffer_track == -1) + DPRINT("internal error offset !=0 on write\n"); + buffer_track = raw_cmd->track; + buffer_drive = current_drive; + copy_buffer(ssize, max_sector, + 2 * max_buffer_sectors + buffer_min); + } else + transfer_size(ssize, max_sector, + 2 * max_buffer_sectors + buffer_min - + aligned_sector_t); + + /* round up current_count_sectors to get dma xfer size */ + raw_cmd->length = in_sector_offset + current_count_sectors; + raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1; + raw_cmd->length <<= 9; + if ((raw_cmd->length < current_count_sectors << 9) || + (raw_cmd->kernel_data != bio_data(current_req->bio) && + CT(COMMAND) == FD_WRITE && + (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max || + aligned_sector_t < buffer_min)) || + raw_cmd->length % (128 << SIZECODE) || + raw_cmd->length <= 0 || current_count_sectors <= 0) { + DPRINT("fractionary current count b=%lx s=%lx\n", + raw_cmd->length, current_count_sectors); + if (raw_cmd->kernel_data != bio_data(current_req->bio)) + pr_info("addr=%d, length=%ld\n", + (int)((raw_cmd->kernel_data - + floppy_track_buffer) >> 9), + current_count_sectors); + pr_info("st=%d ast=%d mse=%d msi=%d\n", + fsector_t, aligned_sector_t, max_sector, max_size); + pr_info("ssize=%x SIZECODE=%d\n", ssize, SIZECODE); + pr_info("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n", + COMMAND, SECTOR, HEAD, TRACK); + pr_info("buffer drive=%d\n", buffer_drive); + pr_info("buffer track=%d\n", buffer_track); + pr_info("buffer_min=%d\n", buffer_min); + pr_info("buffer_max=%d\n", buffer_max); + return 0; + } + + if (raw_cmd->kernel_data != bio_data(current_req->bio)) { + if (raw_cmd->kernel_data < floppy_track_buffer || + current_count_sectors < 0 || + raw_cmd->length < 0 || + raw_cmd->kernel_data + raw_cmd->length > + floppy_track_buffer + (max_buffer_sectors << 10)) { + DPRINT("buffer overrun in schedule dma\n"); + pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n", + fsector_t, buffer_min, raw_cmd->length >> 9); + pr_info("current_count_sectors=%ld\n", + current_count_sectors); + if (CT(COMMAND) == FD_READ) + pr_info("read\n"); + if (CT(COMMAND) == FD_WRITE) + pr_info("write\n"); + return 0; + } + } else if (raw_cmd->length > blk_rq_bytes(current_req) || + current_count_sectors > blk_rq_sectors(current_req)) { + DPRINT("buffer overrun in direct transfer\n"); + return 0; + } else if (raw_cmd->length < current_count_sectors << 9) { + DPRINT("more sectors than bytes\n"); + pr_info("bytes=%ld\n", raw_cmd->length >> 9); + pr_info("sectors=%ld\n", current_count_sectors); + } + if (raw_cmd->length == 0) { + DPRINT("zero dma transfer attempted from make_raw_request\n"); + return 0; + } + + virtualdmabug_workaround(); + return 2; +} + +/* + * Round-robin between our available drives, doing one request from each + */ +static int set_next_request(void) +{ + struct request_queue *q; + int old_pos = fdc_queue; + + do { + q = disks[fdc_queue]->queue; + if (++fdc_queue == N_DRIVE) + fdc_queue = 0; + if (q) { + current_req = blk_fetch_request(q); + if (current_req) + break; + } + } while (fdc_queue != old_pos); + + return current_req != NULL; +} + +static void redo_fd_request(void) +{ + int drive; + int tmp; + + lastredo = jiffies; + if (current_drive < N_DRIVE) + floppy_off(current_drive); + +do_request: + if (!current_req) { + int pending; + + spin_lock_irq(&floppy_lock); + pending = set_next_request(); + spin_unlock_irq(&floppy_lock); + if (!pending) { + do_floppy = NULL; + unlock_fdc(); + return; + } + } + drive = (long)current_req->rq_disk->private_data; + set_fdc(drive); + reschedule_timeout(current_reqD, "redo fd request"); + + set_floppy(drive); + raw_cmd = &default_raw_cmd; + raw_cmd->flags = 0; + if (start_motor(redo_fd_request)) + return; + + disk_change(current_drive); + if (test_bit(current_drive, &fake_change) || + test_bit(FD_DISK_CHANGED_BIT, &DRS->flags)) { + DPRINT("disk absent or changed during operation\n"); + request_done(0); + goto do_request; + } + if (!_floppy) { /* Autodetection */ + if (!probing) { + DRS->probed_format = 0; + if (next_valid_format()) { + DPRINT("no autodetectable formats\n"); + _floppy = NULL; + request_done(0); + goto do_request; + } + } + probing = 1; + _floppy = floppy_type + DP->autodetect[DRS->probed_format]; + } else + probing = 0; + errors = &(current_req->errors); + tmp = make_raw_rw_request(); + if (tmp < 2) { + request_done(tmp); + goto do_request; + } + + if (test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) + twaddle(); + schedule_bh(floppy_start); + debugt(__func__, "queue fd request"); + return; +} + +static const struct cont_t rw_cont = { + .interrupt = rw_interrupt, + .redo = redo_fd_request, + .error = bad_flp_intr, + .done = request_done +}; + +static void process_fd_request(void) +{ + cont = &rw_cont; + schedule_bh(redo_fd_request); +} + +static void do_fd_request(struct request_queue *q) +{ + if (WARN(max_buffer_sectors == 0, + "VFS: %s called on non-open device\n", __func__)) + return; + + if (WARN(atomic_read(&usage_count) == 0, + "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n", + current_req, (long)blk_rq_pos(current_req), current_req->cmd_type, + (unsigned long long) current_req->cmd_flags)) + return; + + if (test_and_set_bit(0, &fdc_busy)) { + /* fdc busy, this new request will be treated when the + current one is done */ + is_alive(__func__, "old request running"); + return; + } + command_status = FD_COMMAND_NONE; + __reschedule_timeout(MAXTIMEOUT, "fd_request"); + set_fdc(0); + process_fd_request(); + is_alive(__func__, ""); +} + +static const struct cont_t poll_cont = { + .interrupt = success_and_wakeup, + .redo = floppy_ready, + .error = generic_failure, + .done = generic_done +}; + +static int poll_drive(bool interruptible, int flag) +{ + /* no auto-sense, just clear dcl */ + raw_cmd = &default_raw_cmd; + raw_cmd->flags = flag; + raw_cmd->track = 0; + raw_cmd->cmd_count = 0; + cont = &poll_cont; + debug_dcl(DP->flags, "setting NEWCHANGE in poll_drive\n"); + set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags); + + return wait_til_done(floppy_ready, interruptible); +} + +/* + * User triggered reset + * ==================== + */ + +static void reset_intr(void) +{ + pr_info("weird, reset interrupt called\n"); +} + +static const struct cont_t reset_cont = { + .interrupt = reset_intr, + .redo = success_and_wakeup, + .error = generic_failure, + .done = generic_done +}; + +static int user_reset_fdc(int drive, int arg, bool interruptible) +{ + int ret; + + if (lock_fdc(drive, interruptible)) + return -EINTR; + + if (arg == FD_RESET_ALWAYS) + FDCS->reset = 1; + if (FDCS->reset) { + cont = &reset_cont; + ret = wait_til_done(reset_fdc, interruptible); + if (ret == -EINTR) + return -EINTR; + } + process_fd_request(); + return 0; +} + +/* + * Misc Ioctl's and support + * ======================== + */ +static inline int fd_copyout(void __user *param, const void *address, + unsigned long size) +{ + return copy_to_user(param, address, size) ? -EFAULT : 0; +} + +static inline int fd_copyin(void __user *param, void *address, + unsigned long size) +{ + return copy_from_user(address, param, size) ? -EFAULT : 0; +} + +static const char *drive_name(int type, int drive) +{ + struct floppy_struct *floppy; + + if (type) + floppy = floppy_type + type; + else { + if (UDP->native_format) + floppy = floppy_type + UDP->native_format; + else + return "(null)"; + } + if (floppy->name) + return floppy->name; + else + return "(null)"; +} + +/* raw commands */ +static void raw_cmd_done(int flag) +{ + int i; + + if (!flag) { + raw_cmd->flags |= FD_RAW_FAILURE; + raw_cmd->flags |= FD_RAW_HARDFAILURE; + } else { + raw_cmd->reply_count = inr; + if (raw_cmd->reply_count > MAX_REPLIES) + raw_cmd->reply_count = 0; + for (i = 0; i < raw_cmd->reply_count; i++) + raw_cmd->reply[i] = reply_buffer[i]; + + if (raw_cmd->flags & (FD_RAW_READ | FD_RAW_WRITE)) { + unsigned long flags; + flags = claim_dma_lock(); + raw_cmd->length = fd_get_dma_residue(); + release_dma_lock(flags); + } + + if ((raw_cmd->flags & FD_RAW_SOFTFAILURE) && + (!raw_cmd->reply_count || (raw_cmd->reply[0] & 0xc0))) + raw_cmd->flags |= FD_RAW_FAILURE; + + if (disk_change(current_drive)) + raw_cmd->flags |= FD_RAW_DISK_CHANGE; + else + raw_cmd->flags &= ~FD_RAW_DISK_CHANGE; + if (raw_cmd->flags & FD_RAW_NO_MOTOR_AFTER) + motor_off_callback(current_drive); + + if (raw_cmd->next && + (!(raw_cmd->flags & FD_RAW_FAILURE) || + !(raw_cmd->flags & FD_RAW_STOP_IF_FAILURE)) && + ((raw_cmd->flags & FD_RAW_FAILURE) || + !(raw_cmd->flags & FD_RAW_STOP_IF_SUCCESS))) { + raw_cmd = raw_cmd->next; + return; + } + } + generic_done(flag); +} + +static const struct cont_t raw_cmd_cont = { + .interrupt = success_and_wakeup, + .redo = floppy_start, + .error = generic_failure, + .done = raw_cmd_done +}; + +static int raw_cmd_copyout(int cmd, void __user *param, + struct floppy_raw_cmd *ptr) +{ + int ret; + + while (ptr) { + struct floppy_raw_cmd cmd = *ptr; + cmd.next = NULL; + cmd.kernel_data = NULL; + ret = copy_to_user(param, &cmd, sizeof(cmd)); + if (ret) + return -EFAULT; + param += sizeof(struct floppy_raw_cmd); + if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) { + if (ptr->length >= 0 && + ptr->length <= ptr->buffer_length) { + long length = ptr->buffer_length - ptr->length; + ret = fd_copyout(ptr->data, ptr->kernel_data, + length); + if (ret) + return ret; + } + } + ptr = ptr->next; + } + + return 0; +} + +static void raw_cmd_free(struct floppy_raw_cmd **ptr) +{ + struct floppy_raw_cmd *next; + struct floppy_raw_cmd *this; + + this = *ptr; + *ptr = NULL; + while (this) { + if (this->buffer_length) { + fd_dma_mem_free((unsigned long)this->kernel_data, + this->buffer_length); + this->buffer_length = 0; + } + next = this->next; + kfree(this); + this = next; + } +} + +static int raw_cmd_copyin(int cmd, void __user *param, + struct floppy_raw_cmd **rcmd) +{ + struct floppy_raw_cmd *ptr; + int ret; + int i; + + *rcmd = NULL; + +loop: + ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER); + if (!ptr) + return -ENOMEM; + *rcmd = ptr; + ret = copy_from_user(ptr, param, sizeof(*ptr)); + ptr->next = NULL; + ptr->buffer_length = 0; + ptr->kernel_data = NULL; + if (ret) + return -EFAULT; + param += sizeof(struct floppy_raw_cmd); + if (ptr->cmd_count > 33) + /* the command may now also take up the space + * initially intended for the reply & the + * reply count. Needed for long 82078 commands + * such as RESTORE, which takes ... 17 command + * bytes. Murphy's law #137: When you reserve + * 16 bytes for a structure, you'll one day + * discover that you really need 17... + */ + return -EINVAL; + + for (i = 0; i < 16; i++) + ptr->reply[i] = 0; + ptr->resultcode = 0; + + if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) { + if (ptr->length <= 0) + return -EINVAL; + ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length); + fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length); + if (!ptr->kernel_data) + return -ENOMEM; + ptr->buffer_length = ptr->length; + } + if (ptr->flags & FD_RAW_WRITE) { + ret = fd_copyin(ptr->data, ptr->kernel_data, ptr->length); + if (ret) + return ret; + } + + if (ptr->flags & FD_RAW_MORE) { + rcmd = &(ptr->next); + ptr->rate &= 0x43; + goto loop; + } + + return 0; +} + +static int raw_cmd_ioctl(int cmd, void __user *param) +{ + struct floppy_raw_cmd *my_raw_cmd; + int drive; + int ret2; + int ret; + + if (FDCS->rawcmd <= 1) + FDCS->rawcmd = 1; + for (drive = 0; drive < N_DRIVE; drive++) { + if (FDC(drive) != fdc) + continue; + if (drive == current_drive) { + if (UDRS->fd_ref > 1) { + FDCS->rawcmd = 2; + break; + } + } else if (UDRS->fd_ref) { + FDCS->rawcmd = 2; + break; + } + } + + if (FDCS->reset) + return -EIO; + + ret = raw_cmd_copyin(cmd, param, &my_raw_cmd); + if (ret) { + raw_cmd_free(&my_raw_cmd); + return ret; + } + + raw_cmd = my_raw_cmd; + cont = &raw_cmd_cont; + ret = wait_til_done(floppy_start, true); + debug_dcl(DP->flags, "calling disk change from raw_cmd ioctl\n"); + + if (ret != -EINTR && FDCS->reset) + ret = -EIO; + + DRS->track = NO_TRACK; + + ret2 = raw_cmd_copyout(cmd, param, my_raw_cmd); + if (!ret) + ret = ret2; + raw_cmd_free(&my_raw_cmd); + return ret; +} + +static int invalidate_drive(struct block_device *bdev) +{ + /* invalidate the buffer track to force a reread */ + set_bit((long)bdev->bd_disk->private_data, &fake_change); + process_fd_request(); + check_disk_change(bdev); + return 0; +} + +static int set_geometry(unsigned int cmd, struct floppy_struct *g, + int drive, int type, struct block_device *bdev) +{ + int cnt; + + /* sanity checking for parameters. */ + if (g->sect <= 0 || + g->head <= 0 || + g->track <= 0 || g->track > UDP->tracks >> STRETCH(g) || + /* check if reserved bits are set */ + (g->stretch & ~(FD_STRETCH | FD_SWAPSIDES | FD_SECTBASEMASK)) != 0) + return -EINVAL; + if (type) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&open_lock); + if (lock_fdc(drive, true)) { + mutex_unlock(&open_lock); + return -EINTR; + } + floppy_type[type] = *g; + floppy_type[type].name = "user format"; + for (cnt = type << 2; cnt < (type << 2) + 4; cnt++) + floppy_sizes[cnt] = floppy_sizes[cnt + 0x80] = + floppy_type[type].size + 1; + process_fd_request(); + for (cnt = 0; cnt < N_DRIVE; cnt++) { + struct block_device *bdev = opened_bdev[cnt]; + if (!bdev || ITYPE(drive_state[cnt].fd_device) != type) + continue; + __invalidate_device(bdev, true); + } + mutex_unlock(&open_lock); + } else { + int oldStretch; + + if (lock_fdc(drive, true)) + return -EINTR; + if (cmd != FDDEFPRM) { + /* notice a disk change immediately, else + * we lose our settings immediately*/ + if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) + return -EINTR; + } + oldStretch = g->stretch; + user_params[drive] = *g; + if (buffer_drive == drive) + SUPBOUND(buffer_max, user_params[drive].sect); + current_type[drive] = &user_params[drive]; + floppy_sizes[drive] = user_params[drive].size; + if (cmd == FDDEFPRM) + DRS->keep_data = -1; + else + DRS->keep_data = 1; + /* invalidation. Invalidate only when needed, i.e. + * when there are already sectors in the buffer cache + * whose number will change. This is useful, because + * mtools often changes the geometry of the disk after + * looking at the boot block */ + if (DRS->maxblock > user_params[drive].sect || + DRS->maxtrack || + ((user_params[drive].sect ^ oldStretch) & + (FD_SWAPSIDES | FD_SECTBASEMASK))) + invalidate_drive(bdev); + else + process_fd_request(); + } + return 0; +} + +/* handle obsolete ioctl's */ +static unsigned int ioctl_table[] = { + FDCLRPRM, + FDSETPRM, + FDDEFPRM, + FDGETPRM, + FDMSGON, + FDMSGOFF, + FDFMTBEG, + FDFMTTRK, + FDFMTEND, + FDSETEMSGTRESH, + FDFLUSH, + FDSETMAXERRS, + FDGETMAXERRS, + FDGETDRVTYP, + FDSETDRVPRM, + FDGETDRVPRM, + FDGETDRVSTAT, + FDPOLLDRVSTAT, + FDRESET, + FDGETFDCSTAT, + FDWERRORCLR, + FDWERRORGET, + FDRAWCMD, + FDEJECT, + FDTWADDLE +}; + +static int normalize_ioctl(unsigned int *cmd, int *size) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ioctl_table); i++) { + if ((*cmd & 0xffff) == (ioctl_table[i] & 0xffff)) { + *size = _IOC_SIZE(*cmd); + *cmd = ioctl_table[i]; + if (*size > _IOC_SIZE(*cmd)) { + pr_info("ioctl not yet supported\n"); + return -EFAULT; + } + return 0; + } + } + return -EINVAL; +} + +static int get_floppy_geometry(int drive, int type, struct floppy_struct **g) +{ + if (type) + *g = &floppy_type[type]; + else { + if (lock_fdc(drive, false)) + return -EINTR; + if (poll_drive(false, 0) == -EINTR) + return -EINTR; + process_fd_request(); + *g = current_type[drive]; + } + if (!*g) + return -ENODEV; + return 0; +} + +static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + int drive = (long)bdev->bd_disk->private_data; + int type = ITYPE(drive_state[drive].fd_device); + struct floppy_struct *g; + int ret; + + ret = get_floppy_geometry(drive, type, &g); + if (ret) + return ret; + + geo->heads = g->head; + geo->sectors = g->sect; + geo->cylinders = g->track; + return 0; +} + +static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, + unsigned long param) +{ + int drive = (long)bdev->bd_disk->private_data; + int type = ITYPE(UDRS->fd_device); + int i; + int ret; + int size; + union inparam { + struct floppy_struct g; /* geometry */ + struct format_descr f; + struct floppy_max_errors max_errors; + struct floppy_drive_params dp; + } inparam; /* parameters coming from user space */ + const void *outparam; /* parameters passed back to user space */ + + /* convert compatibility eject ioctls into floppy eject ioctl. + * We do this in order to provide a means to eject floppy disks before + * installing the new fdutils package */ + if (cmd == CDROMEJECT || /* CD-ROM eject */ + cmd == 0x6470) { /* SunOS floppy eject */ + DPRINT("obsolete eject ioctl\n"); + DPRINT("please use floppycontrol --eject\n"); + cmd = FDEJECT; + } + + if (!((cmd & 0xff00) == 0x0200)) + return -EINVAL; + + /* convert the old style command into a new style command */ + ret = normalize_ioctl(&cmd, &size); + if (ret) + return ret; + + /* permission checks */ + if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) || + ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))) + return -EPERM; + + if (WARN_ON(size < 0 || size > sizeof(inparam))) + return -EINVAL; + + /* copyin */ + memset(&inparam, 0, sizeof(inparam)); + if (_IOC_DIR(cmd) & _IOC_WRITE) { + ret = fd_copyin((void __user *)param, &inparam, size); + if (ret) + return ret; + } + + switch (cmd) { + case FDEJECT: + if (UDRS->fd_ref != 1) + /* somebody else has this drive open */ + return -EBUSY; + if (lock_fdc(drive, true)) + return -EINTR; + + /* do the actual eject. Fails on + * non-Sparc architectures */ + ret = fd_eject(UNIT(drive)); + + set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); + set_bit(FD_VERIFY_BIT, &UDRS->flags); + process_fd_request(); + return ret; + case FDCLRPRM: + if (lock_fdc(drive, true)) + return -EINTR; + current_type[drive] = NULL; + floppy_sizes[drive] = MAX_DISK_SIZE << 1; + UDRS->keep_data = 0; + return invalidate_drive(bdev); + case FDSETPRM: + case FDDEFPRM: + return set_geometry(cmd, &inparam.g, drive, type, bdev); + case FDGETPRM: + ret = get_floppy_geometry(drive, type, + (struct floppy_struct **)&outparam); + if (ret) + return ret; + break; + case FDMSGON: + UDP->flags |= FTD_MSG; + return 0; + case FDMSGOFF: + UDP->flags &= ~FTD_MSG; + return 0; + case FDFMTBEG: + if (lock_fdc(drive, true)) + return -EINTR; + if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) + return -EINTR; + ret = UDRS->flags; + process_fd_request(); + if (ret & FD_VERIFY) + return -ENODEV; + if (!(ret & FD_DISK_WRITABLE)) + return -EROFS; + return 0; + case FDFMTTRK: + if (UDRS->fd_ref != 1) + return -EBUSY; + return do_format(drive, &inparam.f); + case FDFMTEND: + case FDFLUSH: + if (lock_fdc(drive, true)) + return -EINTR; + return invalidate_drive(bdev); + case FDSETEMSGTRESH: + UDP->max_errors.reporting = (unsigned short)(param & 0x0f); + return 0; + case FDGETMAXERRS: + outparam = &UDP->max_errors; + break; + case FDSETMAXERRS: + UDP->max_errors = inparam.max_errors; + break; + case FDGETDRVTYP: + outparam = drive_name(type, drive); + SUPBOUND(size, strlen((const char *)outparam) + 1); + break; + case FDSETDRVPRM: + *UDP = inparam.dp; + break; + case FDGETDRVPRM: + outparam = UDP; + break; + case FDPOLLDRVSTAT: + if (lock_fdc(drive, true)) + return -EINTR; + if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR) + return -EINTR; + process_fd_request(); + /* fall through */ + case FDGETDRVSTAT: + outparam = UDRS; + break; + case FDRESET: + return user_reset_fdc(drive, (int)param, true); + case FDGETFDCSTAT: + outparam = UFDCS; + break; + case FDWERRORCLR: + memset(UDRWE, 0, sizeof(*UDRWE)); + return 0; + case FDWERRORGET: + outparam = UDRWE; + break; + case FDRAWCMD: + if (type) + return -EINVAL; + if (lock_fdc(drive, true)) + return -EINTR; + set_floppy(drive); + i = raw_cmd_ioctl(cmd, (void __user *)param); + if (i == -EINTR) + return -EINTR; + process_fd_request(); + return i; + case FDTWADDLE: + if (lock_fdc(drive, true)) + return -EINTR; + twaddle(); + process_fd_request(); + return 0; + default: + return -EINVAL; + } + + if (_IOC_DIR(cmd) & _IOC_READ) + return fd_copyout((void __user *)param, outparam, size); + + return 0; +} + +static int fd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + int ret; + + mutex_lock(&floppy_mutex); + ret = fd_locked_ioctl(bdev, mode, cmd, param); + mutex_unlock(&floppy_mutex); + + return ret; +} + +static void __init config_types(void) +{ + bool has_drive = false; + int drive; + + /* read drive info out of physical CMOS */ + drive = 0; + if (!UDP->cmos) + UDP->cmos = FLOPPY0_TYPE; + drive = 1; + if (!UDP->cmos && FLOPPY1_TYPE) + UDP->cmos = FLOPPY1_TYPE; + + /* FIXME: additional physical CMOS drive detection should go here */ + + for (drive = 0; drive < N_DRIVE; drive++) { + unsigned int type = UDP->cmos; + struct floppy_drive_params *params; + const char *name = NULL; + static char temparea[32]; + + if (type < ARRAY_SIZE(default_drive_params)) { + params = &default_drive_params[type].params; + if (type) { + name = default_drive_params[type].name; + allowed_drive_mask |= 1 << drive; + } else + allowed_drive_mask &= ~(1 << drive); + } else { + params = &default_drive_params[0].params; + sprintf(temparea, "unknown type %d (usb?)", type); + name = temparea; + } + if (name) { + const char *prepend; + if (!has_drive) { + prepend = ""; + has_drive = true; + pr_info("Floppy drive(s):"); + } else { + prepend = ","; + } + + pr_cont("%s fd%d is %s", prepend, drive, name); + } + *UDP = *params; + } + + if (has_drive) + pr_cont("\n"); +} + +static void floppy_release(struct gendisk *disk, fmode_t mode) +{ + int drive = (long)disk->private_data; + + mutex_lock(&floppy_mutex); + mutex_lock(&open_lock); + if (!UDRS->fd_ref--) { + DPRINT("floppy_release with fd_ref == 0"); + UDRS->fd_ref = 0; + } + if (!UDRS->fd_ref) + opened_bdev[drive] = NULL; + mutex_unlock(&open_lock); + mutex_unlock(&floppy_mutex); +} + +/* + * floppy_open check for aliasing (/dev/fd0 can be the same as + * /dev/PS0 etc), and disallows simultaneous access to the same + * drive with different device numbers. + */ +static int floppy_open(struct block_device *bdev, fmode_t mode) +{ + int drive = (long)bdev->bd_disk->private_data; + int old_dev, new_dev; + int try; + int res = -EBUSY; + char *tmp; + + mutex_lock(&floppy_mutex); + mutex_lock(&open_lock); + old_dev = UDRS->fd_device; + if (opened_bdev[drive] && opened_bdev[drive] != bdev) + goto out2; + + if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) { + set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); + set_bit(FD_VERIFY_BIT, &UDRS->flags); + } + + UDRS->fd_ref++; + + opened_bdev[drive] = bdev; + + res = -ENXIO; + + if (!floppy_track_buffer) { + /* if opening an ED drive, reserve a big buffer, + * else reserve a small one */ + if ((UDP->cmos == 6) || (UDP->cmos == 5)) + try = 64; /* Only 48 actually useful */ + else + try = 32; /* Only 24 actually useful */ + + tmp = (char *)fd_dma_mem_alloc(1024 * try); + if (!tmp && !floppy_track_buffer) { + try >>= 1; /* buffer only one side */ + INFBOUND(try, 16); + tmp = (char *)fd_dma_mem_alloc(1024 * try); + } + if (!tmp && !floppy_track_buffer) + fallback_on_nodma_alloc(&tmp, 2048 * try); + if (!tmp && !floppy_track_buffer) { + DPRINT("Unable to allocate DMA memory\n"); + goto out; + } + if (floppy_track_buffer) { + if (tmp) + fd_dma_mem_free((unsigned long)tmp, try * 1024); + } else { + buffer_min = buffer_max = -1; + floppy_track_buffer = tmp; + max_buffer_sectors = try; + } + } + + new_dev = MINOR(bdev->bd_dev); + UDRS->fd_device = new_dev; + set_capacity(disks[drive], floppy_sizes[new_dev]); + if (old_dev != -1 && old_dev != new_dev) { + if (buffer_drive == drive) + buffer_track = -1; + } + + if (UFDCS->rawcmd == 1) + UFDCS->rawcmd = 2; + + if (!(mode & FMODE_NDELAY)) { + if (mode & (FMODE_READ|FMODE_WRITE)) { + UDRS->last_checked = 0; + clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); + check_disk_change(bdev); + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags)) + goto out; + if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags)) + goto out; + } + res = -EROFS; + if ((mode & FMODE_WRITE) && + !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags)) + goto out; + } + mutex_unlock(&open_lock); + mutex_unlock(&floppy_mutex); + return 0; +out: + UDRS->fd_ref--; + + if (!UDRS->fd_ref) + opened_bdev[drive] = NULL; +out2: + mutex_unlock(&open_lock); + mutex_unlock(&floppy_mutex); + return res; +} + +/* + * Check if the disk has been changed or if a change has been faked. + */ +static unsigned int floppy_check_events(struct gendisk *disk, + unsigned int clearing) +{ + int drive = (long)disk->private_data; + + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || + test_bit(FD_VERIFY_BIT, &UDRS->flags)) + return DISK_EVENT_MEDIA_CHANGE; + + if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) { + lock_fdc(drive, false); + poll_drive(false, 0); + process_fd_request(); + } + + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || + test_bit(FD_VERIFY_BIT, &UDRS->flags) || + test_bit(drive, &fake_change) || + drive_no_geom(drive)) + return DISK_EVENT_MEDIA_CHANGE; + return 0; +} + +/* + * This implements "read block 0" for floppy_revalidate(). + * Needed for format autodetection, checking whether there is + * a disk in the drive, and whether that disk is writable. + */ + +struct rb0_cbdata { + int drive; + struct completion complete; +}; + +static void floppy_rb0_cb(struct bio *bio, int err) +{ + struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; + int drive = cbdata->drive; + + if (err) { + pr_info("floppy: error %d while reading block 0\n", err); + set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); + } + complete(&cbdata->complete); +} + +static int __floppy_read_block_0(struct block_device *bdev, int drive) +{ + struct bio bio; + struct bio_vec bio_vec; + struct page *page; + struct rb0_cbdata cbdata; + size_t size; + + page = alloc_page(GFP_NOIO); + if (!page) { + process_fd_request(); + return -ENOMEM; + } + + size = bdev->bd_block_size; + if (!size) + size = 1024; + + cbdata.drive = drive; + + bio_init(&bio); + bio.bi_io_vec = &bio_vec; + bio_vec.bv_page = page; + bio_vec.bv_len = size; + bio_vec.bv_offset = 0; + bio.bi_vcnt = 1; + bio.bi_iter.bi_size = size; + bio.bi_bdev = bdev; + bio.bi_iter.bi_sector = 0; + bio.bi_flags |= (1 << BIO_QUIET); + bio.bi_private = &cbdata; + bio.bi_end_io = floppy_rb0_cb; + + submit_bio(READ, &bio); + process_fd_request(); + + init_completion(&cbdata.complete); + wait_for_completion(&cbdata.complete); + + __free_page(page); + + return 0; +} + +/* revalidate the floppy disk, i.e. trigger format autodetection by reading + * the bootblock (block 0). "Autodetection" is also needed to check whether + * there is a disk in the drive at all... Thus we also do it for fixed + * geometry formats */ +static int floppy_revalidate(struct gendisk *disk) +{ + int drive = (long)disk->private_data; + int cf; + int res = 0; + + if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || + test_bit(FD_VERIFY_BIT, &UDRS->flags) || + test_bit(drive, &fake_change) || + drive_no_geom(drive)) { + if (WARN(atomic_read(&usage_count) == 0, + "VFS: revalidate called on non-open device.\n")) + return -EFAULT; + + lock_fdc(drive, false); + cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) || + test_bit(FD_VERIFY_BIT, &UDRS->flags)); + if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) { + process_fd_request(); /*already done by another thread */ + return 0; + } + UDRS->maxblock = 0; + UDRS->maxtrack = 0; + if (buffer_drive == drive) + buffer_track = -1; + clear_bit(drive, &fake_change); + clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); + if (cf) + UDRS->generation++; + if (drive_no_geom(drive)) { + /* auto-sensing */ + res = __floppy_read_block_0(opened_bdev[drive], drive); + } else { + if (cf) + poll_drive(false, FD_RAW_NEED_DISK); + process_fd_request(); + } + } + set_capacity(disk, floppy_sizes[UDRS->fd_device]); + return res; +} + +static const struct block_device_operations floppy_fops = { + .owner = THIS_MODULE, + .open = floppy_open, + .release = floppy_release, + .ioctl = fd_ioctl, + .getgeo = fd_getgeo, + .check_events = floppy_check_events, + .revalidate_disk = floppy_revalidate, +}; + +/* + * Floppy Driver initialization + * ============================= + */ + +/* Determine the floppy disk controller type */ +/* This routine was written by David C. Niemi */ +static char __init get_fdc_version(void) +{ + int r; + + output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */ + if (FDCS->reset) + return FDC_NONE; + r = result(); + if (r <= 0x00) + return FDC_NONE; /* No FDC present ??? */ + if ((r == 1) && (reply_buffer[0] == 0x80)) { + pr_info("FDC %d is an 8272A\n", fdc); + return FDC_8272A; /* 8272a/765 don't know DUMPREGS */ + } + if (r != 10) { + pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n", + fdc, r); + return FDC_UNKNOWN; + } + + if (!fdc_configure()) { + pr_info("FDC %d is an 82072\n", fdc); + return FDC_82072; /* 82072 doesn't know CONFIGURE */ + } + + output_byte(FD_PERPENDICULAR); + if (need_more_output() == MORE_OUTPUT) { + output_byte(0); + } else { + pr_info("FDC %d is an 82072A\n", fdc); + return FDC_82072A; /* 82072A as found on Sparcs. */ + } + + output_byte(FD_UNLOCK); + r = result(); + if ((r == 1) && (reply_buffer[0] == 0x80)) { + pr_info("FDC %d is a pre-1991 82077\n", fdc); + return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know + * LOCK/UNLOCK */ + } + if ((r != 1) || (reply_buffer[0] != 0x00)) { + pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n", + fdc, r); + return FDC_UNKNOWN; + } + output_byte(FD_PARTID); + r = result(); + if (r != 1) { + pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n", + fdc, r); + return FDC_UNKNOWN; + } + if (reply_buffer[0] == 0x80) { + pr_info("FDC %d is a post-1991 82077\n", fdc); + return FDC_82077; /* Revised 82077AA passes all the tests */ + } + switch (reply_buffer[0] >> 5) { + case 0x0: + /* Either a 82078-1 or a 82078SL running at 5Volt */ + pr_info("FDC %d is an 82078.\n", fdc); + return FDC_82078; + case 0x1: + pr_info("FDC %d is a 44pin 82078\n", fdc); + return FDC_82078; + case 0x2: + pr_info("FDC %d is a S82078B\n", fdc); + return FDC_S82078B; + case 0x3: + pr_info("FDC %d is a National Semiconductor PC87306\n", fdc); + return FDC_87306; + default: + pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n", + fdc, reply_buffer[0] >> 5); + return FDC_82078_UNKN; + } +} /* get_fdc_version */ + +/* lilo configuration */ + +static void __init floppy_set_flags(int *ints, int param, int param2) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(default_drive_params); i++) { + if (param) + default_drive_params[i].params.flags |= param2; + else + default_drive_params[i].params.flags &= ~param2; + } + DPRINT("%s flag 0x%x\n", param2 ? "Setting" : "Clearing", param); +} + +static void __init daring(int *ints, int param, int param2) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(default_drive_params); i++) { + if (param) { + default_drive_params[i].params.select_delay = 0; + default_drive_params[i].params.flags |= + FD_SILENT_DCL_CLEAR; + } else { + default_drive_params[i].params.select_delay = + 2 * HZ / 100; + default_drive_params[i].params.flags &= + ~FD_SILENT_DCL_CLEAR; + } + } + DPRINT("Assuming %s floppy hardware\n", param ? "standard" : "broken"); +} + +static void __init set_cmos(int *ints, int dummy, int dummy2) +{ + int current_drive = 0; + + if (ints[0] != 2) { + DPRINT("wrong number of parameters for CMOS\n"); + return; + } + current_drive = ints[1]; + if (current_drive < 0 || current_drive >= 8) { + DPRINT("bad drive for set_cmos\n"); + return; + } +#if N_FDC > 1 + if (current_drive >= 4 && !FDC2) + FDC2 = 0x370; +#endif + DP->cmos = ints[2]; + DPRINT("setting CMOS code to %d\n", ints[2]); +} + +static struct param_table { + const char *name; + void (*fn) (int *ints, int param, int param2); + int *var; + int def_param; + int param2; +} config_params[] __initdata = { + {"allowed_drive_mask", NULL, &allowed_drive_mask, 0xff, 0}, /* obsolete */ + {"all_drives", NULL, &allowed_drive_mask, 0xff, 0}, /* obsolete */ + {"asus_pci", NULL, &allowed_drive_mask, 0x33, 0}, + {"irq", NULL, &FLOPPY_IRQ, 6, 0}, + {"dma", NULL, &FLOPPY_DMA, 2, 0}, + {"daring", daring, NULL, 1, 0}, +#if N_FDC > 1 + {"two_fdc", NULL, &FDC2, 0x370, 0}, + {"one_fdc", NULL, &FDC2, 0, 0}, +#endif + {"thinkpad", floppy_set_flags, NULL, 1, FD_INVERTED_DCL}, + {"broken_dcl", floppy_set_flags, NULL, 1, FD_BROKEN_DCL}, + {"messages", floppy_set_flags, NULL, 1, FTD_MSG}, + {"silent_dcl_clear", floppy_set_flags, NULL, 1, FD_SILENT_DCL_CLEAR}, + {"debug", floppy_set_flags, NULL, 1, FD_DEBUG}, + {"nodma", NULL, &can_use_virtual_dma, 1, 0}, + {"omnibook", NULL, &can_use_virtual_dma, 1, 0}, + {"yesdma", NULL, &can_use_virtual_dma, 0, 0}, + {"fifo_depth", NULL, &fifo_depth, 0xa, 0}, + {"nofifo", NULL, &no_fifo, 0x20, 0}, + {"usefifo", NULL, &no_fifo, 0, 0}, + {"cmos", set_cmos, NULL, 0, 0}, + {"slow", NULL, &slow_floppy, 1, 0}, + {"unexpected_interrupts", NULL, &print_unex, 1, 0}, + {"no_unexpected_interrupts", NULL, &print_unex, 0, 0}, + {"L40SX", NULL, &print_unex, 0, 0} + + EXTRA_FLOPPY_PARAMS +}; + +static int __init floppy_setup(char *str) +{ + int i; + int param; + int ints[11]; + + str = get_options(str, ARRAY_SIZE(ints), ints); + if (str) { + for (i = 0; i < ARRAY_SIZE(config_params); i++) { + if (strcmp(str, config_params[i].name) == 0) { + if (ints[0]) + param = ints[1]; + else + param = config_params[i].def_param; + if (config_params[i].fn) + config_params[i].fn(ints, param, + config_params[i]. + param2); + if (config_params[i].var) { + DPRINT("%s=%d\n", str, param); + *config_params[i].var = param; + } + return 1; + } + } + } + if (str) { + DPRINT("unknown floppy option [%s]\n", str); + + DPRINT("allowed options are:"); + for (i = 0; i < ARRAY_SIZE(config_params); i++) + pr_cont(" %s", config_params[i].name); + pr_cont("\n"); + } else + DPRINT("botched floppy option\n"); + DPRINT("Read Documentation/blockdev/floppy.txt\n"); + return 0; +} + +static int have_no_fdc = -ENODEV; + +static ssize_t floppy_cmos_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct platform_device *p = to_platform_device(dev); + int drive; + + drive = p->id; + return sprintf(buf, "%X\n", UDP->cmos); +} + +static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL); + +static struct attribute *floppy_dev_attrs[] = { + &dev_attr_cmos.attr, + NULL +}; + +ATTRIBUTE_GROUPS(floppy_dev); + +static void floppy_device_release(struct device *dev) +{ +} + +static int floppy_resume(struct device *dev) +{ + int fdc; + + for (fdc = 0; fdc < N_FDC; fdc++) + if (FDCS->address != -1) + user_reset_fdc(-1, FD_RESET_ALWAYS, false); + + return 0; +} + +static const struct dev_pm_ops floppy_pm_ops = { + .resume = floppy_resume, + .restore = floppy_resume, +}; + +static struct platform_driver floppy_driver = { + .driver = { + .name = "floppy", + .pm = &floppy_pm_ops, + }, +}; + +static struct platform_device floppy_device[N_DRIVE]; + +static bool floppy_available(int drive) +{ + if (!(allowed_drive_mask & (1 << drive))) + return false; + if (fdc_state[FDC(drive)].version == FDC_NONE) + return false; + return true; +} + +static struct kobject *floppy_find(dev_t dev, int *part, void *data) +{ + int drive = (*part & 3) | ((*part & 0x80) >> 5); + if (drive >= N_DRIVE || !floppy_available(drive)) + return NULL; + if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type)) + return NULL; + *part = 0; + return get_disk(disks[drive]); +} + +static int __init do_floppy_init(void) +{ + int i, unit, drive, err; + + set_debugt(); + interruptjiffies = resultjiffies = jiffies; + +#if defined(CONFIG_PPC) + if (check_legacy_ioport(FDC1)) + return -ENODEV; +#endif + + raw_cmd = NULL; + + floppy_wq = alloc_ordered_workqueue("floppy", 0); + if (!floppy_wq) + return -ENOMEM; + + for (drive = 0; drive < N_DRIVE; drive++) { + disks[drive] = alloc_disk(1); + if (!disks[drive]) { + err = -ENOMEM; + goto out_put_disk; + } + + disks[drive]->queue = blk_init_queue(do_fd_request, &floppy_lock); + if (!disks[drive]->queue) { + err = -ENOMEM; + goto out_put_disk; + } + + blk_queue_max_hw_sectors(disks[drive]->queue, 64); + disks[drive]->major = FLOPPY_MAJOR; + disks[drive]->first_minor = TOMINOR(drive); + disks[drive]->fops = &floppy_fops; + sprintf(disks[drive]->disk_name, "fd%d", drive); + + init_timer(&motor_off_timer[drive]); + motor_off_timer[drive].data = drive; + motor_off_timer[drive].function = motor_off_callback; + } + + err = register_blkdev(FLOPPY_MAJOR, "fd"); + if (err) + goto out_put_disk; + + err = platform_driver_register(&floppy_driver); + if (err) + goto out_unreg_blkdev; + + blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, + floppy_find, NULL, NULL); + + for (i = 0; i < 256; i++) + if (ITYPE(i)) + floppy_sizes[i] = floppy_type[ITYPE(i)].size; + else + floppy_sizes[i] = MAX_DISK_SIZE << 1; + + reschedule_timeout(MAXTIMEOUT, "floppy init"); + config_types(); + + for (i = 0; i < N_FDC; i++) { + fdc = i; + memset(FDCS, 0, sizeof(*FDCS)); + FDCS->dtr = -1; + FDCS->dor = 0x4; +#if defined(__sparc__) || defined(__mc68000__) + /*sparcs/sun3x don't have a DOR reset which we can fall back on to */ +#ifdef __mc68000__ + if (MACH_IS_SUN3X) +#endif + FDCS->version = FDC_82072A; +#endif + } + + use_virtual_dma = can_use_virtual_dma & 1; + fdc_state[0].address = FDC1; + if (fdc_state[0].address == -1) { + cancel_delayed_work(&fd_timeout); + err = -ENODEV; + goto out_unreg_region; + } +#if N_FDC > 1 + fdc_state[1].address = FDC2; +#endif + + fdc = 0; /* reset fdc in case of unexpected interrupt */ + err = floppy_grab_irq_and_dma(); + if (err) { + cancel_delayed_work(&fd_timeout); + err = -EBUSY; + goto out_unreg_region; + } + + /* initialise drive state */ + for (drive = 0; drive < N_DRIVE; drive++) { + memset(UDRS, 0, sizeof(*UDRS)); + memset(UDRWE, 0, sizeof(*UDRWE)); + set_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags); + set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags); + set_bit(FD_VERIFY_BIT, &UDRS->flags); + UDRS->fd_device = -1; + floppy_track_buffer = NULL; + max_buffer_sectors = 0; + } + /* + * Small 10 msec delay to let through any interrupt that + * initialization might have triggered, to not + * confuse detection: + */ + msleep(10); + + for (i = 0; i < N_FDC; i++) { + fdc = i; + FDCS->driver_version = FD_DRIVER_VERSION; + for (unit = 0; unit < 4; unit++) + FDCS->track[unit] = 0; + if (FDCS->address == -1) + continue; + FDCS->rawcmd = 2; + if (user_reset_fdc(-1, FD_RESET_ALWAYS, false)) { + /* free ioports reserved by floppy_grab_irq_and_dma() */ + floppy_release_regions(fdc); + FDCS->address = -1; + FDCS->version = FDC_NONE; + continue; + } + /* Try to determine the floppy controller type */ + FDCS->version = get_fdc_version(); + if (FDCS->version == FDC_NONE) { + /* free ioports reserved by floppy_grab_irq_and_dma() */ + floppy_release_regions(fdc); + FDCS->address = -1; + continue; + } + if (can_use_virtual_dma == 2 && FDCS->version < FDC_82072A) + can_use_virtual_dma = 0; + + have_no_fdc = 0; + /* Not all FDCs seem to be able to handle the version command + * properly, so force a reset for the standard FDC clones, + * to avoid interrupt garbage. + */ + user_reset_fdc(-1, FD_RESET_ALWAYS, false); + } + fdc = 0; + cancel_delayed_work(&fd_timeout); + current_drive = 0; + initialized = true; + if (have_no_fdc) { + DPRINT("no floppy controllers found\n"); + err = have_no_fdc; + goto out_release_dma; + } + + for (drive = 0; drive < N_DRIVE; drive++) { + if (!floppy_available(drive)) + continue; + + floppy_device[drive].name = floppy_device_name; + floppy_device[drive].id = drive; + floppy_device[drive].dev.release = floppy_device_release; + floppy_device[drive].dev.groups = floppy_dev_groups; + + err = platform_device_register(&floppy_device[drive]); + if (err) + goto out_remove_drives; + + /* to be cleaned up... */ + disks[drive]->private_data = (void *)(long)drive; + disks[drive]->flags |= GENHD_FL_REMOVABLE; + disks[drive]->driverfs_dev = &floppy_device[drive].dev; + add_disk(disks[drive]); + } + + return 0; + +out_remove_drives: + while (drive--) { + if (floppy_available(drive)) { + del_gendisk(disks[drive]); + platform_device_unregister(&floppy_device[drive]); + } + } +out_release_dma: + if (atomic_read(&usage_count)) + floppy_release_irq_and_dma(); +out_unreg_region: + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + platform_driver_unregister(&floppy_driver); +out_unreg_blkdev: + unregister_blkdev(FLOPPY_MAJOR, "fd"); +out_put_disk: + destroy_workqueue(floppy_wq); + for (drive = 0; drive < N_DRIVE; drive++) { + if (!disks[drive]) + break; + if (disks[drive]->queue) { + del_timer_sync(&motor_off_timer[drive]); + blk_cleanup_queue(disks[drive]->queue); + disks[drive]->queue = NULL; + } + put_disk(disks[drive]); + } + return err; +} + +#ifndef MODULE +static __init void floppy_async_init(void *data, async_cookie_t cookie) +{ + do_floppy_init(); +} +#endif + +static int __init floppy_init(void) +{ +#ifdef MODULE + return do_floppy_init(); +#else + /* Don't hold up the bootup by the floppy initialization */ + async_schedule(floppy_async_init, NULL); + return 0; +#endif +} + +static const struct io_region { + int offset; + int size; +} io_regions[] = { + { 2, 1 }, + /* address + 3 is sometimes reserved by pnp bios for motherboard */ + { 4, 2 }, + /* address + 6 is reserved, and may be taken by IDE. + * Unfortunately, Adaptec doesn't know this :-(, */ + { 7, 1 }, +}; + +static void floppy_release_allocated_regions(int fdc, const struct io_region *p) +{ + while (p != io_regions) { + p--; + release_region(FDCS->address + p->offset, p->size); + } +} + +#define ARRAY_END(X) (&((X)[ARRAY_SIZE(X)])) + +static int floppy_request_regions(int fdc) +{ + const struct io_region *p; + + for (p = io_regions; p < ARRAY_END(io_regions); p++) { + if (!request_region(FDCS->address + p->offset, + p->size, "floppy")) { + DPRINT("Floppy io-port 0x%04lx in use\n", + FDCS->address + p->offset); + floppy_release_allocated_regions(fdc, p); + return -EBUSY; + } + } + return 0; +} + +static void floppy_release_regions(int fdc) +{ + floppy_release_allocated_regions(fdc, ARRAY_END(io_regions)); +} + +static int floppy_grab_irq_and_dma(void) +{ + if (atomic_inc_return(&usage_count) > 1) + return 0; + + /* + * We might have scheduled a free_irq(), wait it to + * drain first: + */ + flush_workqueue(floppy_wq); + + if (fd_request_irq()) { + DPRINT("Unable to grab IRQ%d for the floppy driver\n", + FLOPPY_IRQ); + atomic_dec(&usage_count); + return -1; + } + if (fd_request_dma()) { + DPRINT("Unable to grab DMA%d for the floppy driver\n", + FLOPPY_DMA); + if (can_use_virtual_dma & 2) + use_virtual_dma = can_use_virtual_dma = 1; + if (!(can_use_virtual_dma & 1)) { + fd_free_irq(); + atomic_dec(&usage_count); + return -1; + } + } + + for (fdc = 0; fdc < N_FDC; fdc++) { + if (FDCS->address != -1) { + if (floppy_request_regions(fdc)) + goto cleanup; + } + } + for (fdc = 0; fdc < N_FDC; fdc++) { + if (FDCS->address != -1) { + reset_fdc_info(1); + fd_outb(FDCS->dor, FD_DOR); + } + } + fdc = 0; + set_dor(0, ~0, 8); /* avoid immediate interrupt */ + + for (fdc = 0; fdc < N_FDC; fdc++) + if (FDCS->address != -1) + fd_outb(FDCS->dor, FD_DOR); + /* + * The driver will try and free resources and relies on us + * to know if they were allocated or not. + */ + fdc = 0; + irqdma_allocated = 1; + return 0; +cleanup: + fd_free_irq(); + fd_free_dma(); + while (--fdc >= 0) + floppy_release_regions(fdc); + atomic_dec(&usage_count); + return -1; +} + +static void floppy_release_irq_and_dma(void) +{ + int old_fdc; +#ifndef __sparc__ + int drive; +#endif + long tmpsize; + unsigned long tmpaddr; + + if (!atomic_dec_and_test(&usage_count)) + return; + + if (irqdma_allocated) { + fd_disable_dma(); + fd_free_dma(); + fd_free_irq(); + irqdma_allocated = 0; + } + set_dor(0, ~0, 8); +#if N_FDC > 1 + set_dor(1, ~8, 0); +#endif + + if (floppy_track_buffer && max_buffer_sectors) { + tmpsize = max_buffer_sectors * 1024; + tmpaddr = (unsigned long)floppy_track_buffer; + floppy_track_buffer = NULL; + max_buffer_sectors = 0; + buffer_min = buffer_max = -1; + fd_dma_mem_free(tmpaddr, tmpsize); + } +#ifndef __sparc__ + for (drive = 0; drive < N_FDC * 4; drive++) + if (timer_pending(motor_off_timer + drive)) + pr_info("motor off timer %d still active\n", drive); +#endif + + if (delayed_work_pending(&fd_timeout)) + pr_info("floppy timer still active:%s\n", timeout_message); + if (delayed_work_pending(&fd_timer)) + pr_info("auxiliary floppy timer still active\n"); + if (work_pending(&floppy_work)) + pr_info("work still pending\n"); + old_fdc = fdc; + for (fdc = 0; fdc < N_FDC; fdc++) + if (FDCS->address != -1) + floppy_release_regions(fdc); + fdc = old_fdc; +} + +#ifdef MODULE + +static char *floppy; + +static void __init parse_floppy_cfg_string(char *cfg) +{ + char *ptr; + + while (*cfg) { + ptr = cfg; + while (*cfg && *cfg != ' ' && *cfg != '\t') + cfg++; + if (*cfg) { + *cfg = '\0'; + cfg++; + } + if (*ptr) + floppy_setup(ptr); + } +} + +static int __init floppy_module_init(void) +{ + if (floppy) + parse_floppy_cfg_string(floppy); + return floppy_init(); +} +module_init(floppy_module_init); + +static void __exit floppy_module_exit(void) +{ + int drive; + + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + unregister_blkdev(FLOPPY_MAJOR, "fd"); + platform_driver_unregister(&floppy_driver); + + destroy_workqueue(floppy_wq); + + for (drive = 0; drive < N_DRIVE; drive++) { + del_timer_sync(&motor_off_timer[drive]); + + if (floppy_available(drive)) { + del_gendisk(disks[drive]); + platform_device_unregister(&floppy_device[drive]); + } + blk_cleanup_queue(disks[drive]->queue); + + /* + * These disks have not called add_disk(). Don't put down + * queue reference in put_disk(). + */ + if (!(allowed_drive_mask & (1 << drive)) || + fdc_state[FDC(drive)].version == FDC_NONE) + disks[drive]->queue = NULL; + + put_disk(disks[drive]); + } + + cancel_delayed_work_sync(&fd_timeout); + cancel_delayed_work_sync(&fd_timer); + + if (atomic_read(&usage_count)) + floppy_release_irq_and_dma(); + + /* eject disk, if any */ + fd_eject(0); +} + +module_exit(floppy_module_exit); + +module_param(floppy, charp, 0); +module_param(FLOPPY_IRQ, int, 0); +module_param(FLOPPY_DMA, int, 0); +MODULE_AUTHOR("Alain L. Knaff"); +MODULE_SUPPORTED_DEVICE("fd"); +MODULE_LICENSE("GPL"); + +/* This doesn't actually get used other than for module information */ +static const struct pnp_device_id floppy_pnpids[] = { + {"PNP0700", 0}, + {} +}; + +MODULE_DEVICE_TABLE(pnp, floppy_pnpids); + +#else + +__setup("floppy=", floppy_setup); +module_init(floppy_init) +#endif + +MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR); diff --git a/kernel/drivers/block/hd.c b/kernel/drivers/block/hd.c new file mode 100644 index 000000000..3abb12182 --- /dev/null +++ b/kernel/drivers/block/hd.c @@ -0,0 +1,804 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This is the low-level hd interrupt support. It traverses the + * request-list, using interrupts to jump between functions. As + * all the functions are called within interrupts, we may not + * sleep. Special care is recommended. + * + * modified by Drew Eckhardt to check nr of hd's from the CMOS. + * + * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug + * in the early extended-partition checks and added DM partitions + * + * IRQ-unmask, drive-id, multiple-mode, support for ">16 heads", + * and general streamlining by Mark Lord. + * + * Removed 99% of above. Use Mark's ide driver for those options. + * This is now a lightweight ST-506 driver. (Paul Gortmaker) + * + * Modified 1995 Russell King for ARM processor. + * + * Bugfix: max_sectors must be <= 255 or the wheels tend to come + * off in a hurry once you queue things up - Paul G. 02/2001 + */ + +/* Uncomment the following if you want verbose error reports. */ +/* #define VERBOSE_ERRORS */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HD_IRQ 14 + +#define REALLY_SLOW_IO +#include +#include + +#ifdef __arm__ +#undef HD_IRQ +#endif +#include +#ifdef __arm__ +#define HD_IRQ IRQ_HARDDISK +#endif + +/* Hd controller regster ports */ + +#define HD_DATA 0x1f0 /* _CTL when writing */ +#define HD_ERROR 0x1f1 /* see err-bits */ +#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */ +#define HD_SECTOR 0x1f3 /* starting sector */ +#define HD_LCYL 0x1f4 /* starting cylinder */ +#define HD_HCYL 0x1f5 /* high byte of starting cyl */ +#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */ +#define HD_STATUS 0x1f7 /* see status-bits */ +#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */ +#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */ +#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */ + +#define HD_CMD 0x3f6 /* used for resets */ +#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */ + +/* Bits of HD_STATUS */ +#define ERR_STAT 0x01 +#define INDEX_STAT 0x02 +#define ECC_STAT 0x04 /* Corrected error */ +#define DRQ_STAT 0x08 +#define SEEK_STAT 0x10 +#define SERVICE_STAT SEEK_STAT +#define WRERR_STAT 0x20 +#define READY_STAT 0x40 +#define BUSY_STAT 0x80 + +/* Bits for HD_ERROR */ +#define MARK_ERR 0x01 /* Bad address mark */ +#define TRK0_ERR 0x02 /* couldn't find track 0 */ +#define ABRT_ERR 0x04 /* Command aborted */ +#define MCR_ERR 0x08 /* media change request */ +#define ID_ERR 0x10 /* ID field not found */ +#define MC_ERR 0x20 /* media changed */ +#define ECC_ERR 0x40 /* Uncorrectable ECC error */ +#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */ +#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */ + +static DEFINE_SPINLOCK(hd_lock); +static struct request_queue *hd_queue; +static struct request *hd_req; + +#define TIMEOUT_VALUE (6*HZ) +#define HD_DELAY 0 + +#define MAX_ERRORS 16 /* Max read/write errors/sector */ +#define RESET_FREQ 8 /* Reset controller every 8th retry */ +#define RECAL_FREQ 4 /* Recalibrate every 4th retry */ +#define MAX_HD 2 + +#define STAT_OK (READY_STAT|SEEK_STAT) +#define OK_STATUS(s) (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK) + +static void recal_intr(void); +static void bad_rw_intr(void); + +static int reset; +static int hd_error; + +/* + * This struct defines the HD's and their types. + */ +struct hd_i_struct { + unsigned int head, sect, cyl, wpcom, lzone, ctl; + int unit; + int recalibrate; + int special_op; +}; + +#ifdef HD_TYPE +static struct hd_i_struct hd_info[] = { HD_TYPE }; +static int NR_HD = ARRAY_SIZE(hd_info); +#else +static struct hd_i_struct hd_info[MAX_HD]; +static int NR_HD; +#endif + +static struct gendisk *hd_gendisk[MAX_HD]; + +static struct timer_list device_timer; + +#define TIMEOUT_VALUE (6*HZ) + +#define SET_TIMER \ + do { \ + mod_timer(&device_timer, jiffies + TIMEOUT_VALUE); \ + } while (0) + +static void (*do_hd)(void) = NULL; +#define SET_HANDLER(x) \ +if ((do_hd = (x)) != NULL) \ + SET_TIMER; \ +else \ + del_timer(&device_timer); + + +#if (HD_DELAY > 0) + +#include + +unsigned long last_req; + +unsigned long read_timer(void) +{ + unsigned long t, flags; + int i; + + raw_spin_lock_irqsave(&i8253_lock, flags); + t = jiffies * 11932; + outb_p(0, 0x43); + i = inb_p(0x40); + i |= inb(0x40) << 8; + raw_spin_unlock_irqrestore(&i8253_lock, flags); + return(t - i); +} +#endif + +static void __init hd_setup(char *str, int *ints) +{ + int hdind = 0; + + if (ints[0] != 3) + return; + if (hd_info[0].head != 0) + hdind = 1; + hd_info[hdind].head = ints[2]; + hd_info[hdind].sect = ints[3]; + hd_info[hdind].cyl = ints[1]; + hd_info[hdind].wpcom = 0; + hd_info[hdind].lzone = ints[1]; + hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0); + NR_HD = hdind+1; +} + +static bool hd_end_request(int err, unsigned int bytes) +{ + if (__blk_end_request(hd_req, err, bytes)) + return true; + hd_req = NULL; + return false; +} + +static bool hd_end_request_cur(int err) +{ + return hd_end_request(err, blk_rq_cur_bytes(hd_req)); +} + +static void dump_status(const char *msg, unsigned int stat) +{ + char *name = "hd?"; + if (hd_req) + name = hd_req->rq_disk->disk_name; + +#ifdef VERBOSE_ERRORS + printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff); + if (stat & BUSY_STAT) printk("Busy "); + if (stat & READY_STAT) printk("DriveReady "); + if (stat & WRERR_STAT) printk("WriteFault "); + if (stat & SEEK_STAT) printk("SeekComplete "); + if (stat & DRQ_STAT) printk("DataRequest "); + if (stat & ECC_STAT) printk("CorrectedError "); + if (stat & INDEX_STAT) printk("Index "); + if (stat & ERR_STAT) printk("Error "); + printk("}\n"); + if ((stat & ERR_STAT) == 0) { + hd_error = 0; + } else { + hd_error = inb(HD_ERROR); + printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff); + if (hd_error & BBD_ERR) printk("BadSector "); + if (hd_error & ECC_ERR) printk("UncorrectableError "); + if (hd_error & ID_ERR) printk("SectorIdNotFound "); + if (hd_error & ABRT_ERR) printk("DriveStatusError "); + if (hd_error & TRK0_ERR) printk("TrackZeroNotFound "); + if (hd_error & MARK_ERR) printk("AddrMarkNotFound "); + printk("}"); + if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) { + printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL), + inb(HD_CURRENT) & 0xf, inb(HD_SECTOR)); + if (hd_req) + printk(", sector=%ld", blk_rq_pos(hd_req)); + } + printk("\n"); + } +#else + printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff); + if ((stat & ERR_STAT) == 0) { + hd_error = 0; + } else { + hd_error = inb(HD_ERROR); + printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff); + } +#endif +} + +static void check_status(void) +{ + int i = inb_p(HD_STATUS); + + if (!OK_STATUS(i)) { + dump_status("check_status", i); + bad_rw_intr(); + } +} + +static int controller_busy(void) +{ + int retries = 100000; + unsigned char status; + + do { + status = inb_p(HD_STATUS); + } while ((status & BUSY_STAT) && --retries); + return status; +} + +static int status_ok(void) +{ + unsigned char status = inb_p(HD_STATUS); + + if (status & BUSY_STAT) + return 1; /* Ancient, but does it make sense??? */ + if (status & WRERR_STAT) + return 0; + if (!(status & READY_STAT)) + return 0; + if (!(status & SEEK_STAT)) + return 0; + return 1; +} + +static int controller_ready(unsigned int drive, unsigned int head) +{ + int retry = 100; + + do { + if (controller_busy() & BUSY_STAT) + return 0; + outb_p(0xA0 | (drive<<4) | head, HD_CURRENT); + if (status_ok()) + return 1; + } while (--retry); + return 0; +} + +static void hd_out(struct hd_i_struct *disk, + unsigned int nsect, + unsigned int sect, + unsigned int head, + unsigned int cyl, + unsigned int cmd, + void (*intr_addr)(void)) +{ + unsigned short port; + +#if (HD_DELAY > 0) + while (read_timer() - last_req < HD_DELAY) + /* nothing */; +#endif + if (reset) + return; + if (!controller_ready(disk->unit, head)) { + reset = 1; + return; + } + SET_HANDLER(intr_addr); + outb_p(disk->ctl, HD_CMD); + port = HD_DATA; + outb_p(disk->wpcom >> 2, ++port); + outb_p(nsect, ++port); + outb_p(sect, ++port); + outb_p(cyl, ++port); + outb_p(cyl >> 8, ++port); + outb_p(0xA0 | (disk->unit << 4) | head, ++port); + outb_p(cmd, ++port); +} + +static void hd_request (void); + +static int drive_busy(void) +{ + unsigned int i; + unsigned char c; + + for (i = 0; i < 500000 ; i++) { + c = inb_p(HD_STATUS); + if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK) + return 0; + } + dump_status("reset timed out", c); + return 1; +} + +static void reset_controller(void) +{ + int i; + + outb_p(4, HD_CMD); + for (i = 0; i < 1000; i++) barrier(); + outb_p(hd_info[0].ctl & 0x0f, HD_CMD); + for (i = 0; i < 1000; i++) barrier(); + if (drive_busy()) + printk("hd: controller still busy\n"); + else if ((hd_error = inb(HD_ERROR)) != 1) + printk("hd: controller reset failed: %02x\n", hd_error); +} + +static void reset_hd(void) +{ + static int i; + +repeat: + if (reset) { + reset = 0; + i = -1; + reset_controller(); + } else { + check_status(); + if (reset) + goto repeat; + } + if (++i < NR_HD) { + struct hd_i_struct *disk = &hd_info[i]; + disk->special_op = disk->recalibrate = 1; + hd_out(disk, disk->sect, disk->sect, disk->head-1, + disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd); + if (reset) + goto repeat; + } else + hd_request(); +} + +/* + * Ok, don't know what to do with the unexpected interrupts: on some machines + * doing a reset and a retry seems to result in an eternal loop. Right now I + * ignore it, and just set the timeout. + * + * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the + * drive enters "idle", "standby", or "sleep" mode, so if the status looks + * "good", we just ignore the interrupt completely. + */ +static void unexpected_hd_interrupt(void) +{ + unsigned int stat = inb_p(HD_STATUS); + + if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) { + dump_status("unexpected interrupt", stat); + SET_TIMER; + } +} + +/* + * bad_rw_intr() now tries to be a bit smarter and does things + * according to the error returned by the controller. + * -Mika Liljeberg (liljeber@cs.Helsinki.FI) + */ +static void bad_rw_intr(void) +{ + struct request *req = hd_req; + + if (req != NULL) { + struct hd_i_struct *disk = req->rq_disk->private_data; + if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) { + hd_end_request_cur(-EIO); + disk->special_op = disk->recalibrate = 1; + } else if (req->errors % RESET_FREQ == 0) + reset = 1; + else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0) + disk->special_op = disk->recalibrate = 1; + /* Otherwise just retry */ + } +} + +static inline int wait_DRQ(void) +{ + int retries; + int stat; + + for (retries = 0; retries < 100000; retries++) { + stat = inb_p(HD_STATUS); + if (stat & DRQ_STAT) + return 0; + } + dump_status("wait_DRQ", stat); + return -1; +} + +static void read_intr(void) +{ + struct request *req; + int i, retries = 100000; + + do { + i = (unsigned) inb_p(HD_STATUS); + if (i & BUSY_STAT) + continue; + if (!OK_STATUS(i)) + break; + if (i & DRQ_STAT) + goto ok_to_read; + } while (--retries > 0); + dump_status("read_intr", i); + bad_rw_intr(); + hd_request(); + return; + +ok_to_read: + req = hd_req; + insw(HD_DATA, bio_data(req->bio), 256); +#ifdef DEBUG + printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", + req->rq_disk->disk_name, blk_rq_pos(req) + 1, + blk_rq_sectors(req) - 1, bio_data(req->bio)+512); +#endif + if (hd_end_request(0, 512)) { + SET_HANDLER(&read_intr); + return; + } + + (void) inb_p(HD_STATUS); +#if (HD_DELAY > 0) + last_req = read_timer(); +#endif + hd_request(); +} + +static void write_intr(void) +{ + struct request *req = hd_req; + int i; + int retries = 100000; + + do { + i = (unsigned) inb_p(HD_STATUS); + if (i & BUSY_STAT) + continue; + if (!OK_STATUS(i)) + break; + if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT)) + goto ok_to_write; + } while (--retries > 0); + dump_status("write_intr", i); + bad_rw_intr(); + hd_request(); + return; + +ok_to_write: + if (hd_end_request(0, 512)) { + SET_HANDLER(&write_intr); + outsw(HD_DATA, bio_data(req->bio), 256); + return; + } + +#if (HD_DELAY > 0) + last_req = read_timer(); +#endif + hd_request(); +} + +static void recal_intr(void) +{ + check_status(); +#if (HD_DELAY > 0) + last_req = read_timer(); +#endif + hd_request(); +} + +/* + * This is another of the error-routines I don't know what to do with. The + * best idea seems to just set reset, and start all over again. + */ +static void hd_times_out(unsigned long dummy) +{ + char *name; + + do_hd = NULL; + + if (!hd_req) + return; + + spin_lock_irq(hd_queue->queue_lock); + reset = 1; + name = hd_req->rq_disk->disk_name; + printk("%s: timeout\n", name); + if (++hd_req->errors >= MAX_ERRORS) { +#ifdef DEBUG + printk("%s: too many errors\n", name); +#endif + hd_end_request_cur(-EIO); + } + hd_request(); + spin_unlock_irq(hd_queue->queue_lock); +} + +static int do_special_op(struct hd_i_struct *disk, struct request *req) +{ + if (disk->recalibrate) { + disk->recalibrate = 0; + hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr); + return reset; + } + if (disk->head > 16) { + printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name); + hd_end_request_cur(-EIO); + } + disk->special_op = 0; + return 1; +} + +/* + * The driver enables interrupts as much as possible. In order to do this, + * (a) the device-interrupt is disabled before entering hd_request(), + * and (b) the timeout-interrupt is disabled before the sti(). + * + * Interrupts are still masked (by default) whenever we are exchanging + * data/cmds with a drive, because some drives seem to have very poor + * tolerance for latency during I/O. The IDE driver has support to unmask + * interrupts for non-broken hardware, so use that driver if required. + */ +static void hd_request(void) +{ + unsigned int block, nsect, sec, track, head, cyl; + struct hd_i_struct *disk; + struct request *req; + + if (do_hd) + return; +repeat: + del_timer(&device_timer); + + if (!hd_req) { + hd_req = blk_fetch_request(hd_queue); + if (!hd_req) { + do_hd = NULL; + return; + } + } + req = hd_req; + + if (reset) { + reset_hd(); + return; + } + disk = req->rq_disk->private_data; + block = blk_rq_pos(req); + nsect = blk_rq_sectors(req); + if (block >= get_capacity(req->rq_disk) || + ((block+nsect) > get_capacity(req->rq_disk))) { + printk("%s: bad access: block=%d, count=%d\n", + req->rq_disk->disk_name, block, nsect); + hd_end_request_cur(-EIO); + goto repeat; + } + + if (disk->special_op) { + if (do_special_op(disk, req)) + goto repeat; + return; + } + sec = block % disk->sect + 1; + track = block / disk->sect; + head = track % disk->head; + cyl = track / disk->head; +#ifdef DEBUG + printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n", + req->rq_disk->disk_name, + req_data_dir(req) == READ ? "read" : "writ", + cyl, head, sec, nsect, bio_data(req->bio)); +#endif + if (req->cmd_type == REQ_TYPE_FS) { + switch (rq_data_dir(req)) { + case READ: + hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ, + &read_intr); + if (reset) + goto repeat; + break; + case WRITE: + hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE, + &write_intr); + if (reset) + goto repeat; + if (wait_DRQ()) { + bad_rw_intr(); + goto repeat; + } + outsw(HD_DATA, bio_data(req->bio), 256); + break; + default: + printk("unknown hd-command\n"); + hd_end_request_cur(-EIO); + break; + } + } +} + +static void do_hd_request(struct request_queue *q) +{ + hd_request(); +} + +static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct hd_i_struct *disk = bdev->bd_disk->private_data; + + geo->heads = disk->head; + geo->sectors = disk->sect; + geo->cylinders = disk->cyl; + return 0; +} + +/* + * Releasing a block device means we sync() it, so that it can safely + * be forgotten about... + */ + +static irqreturn_t hd_interrupt(int irq, void *dev_id) +{ + void (*handler)(void) = do_hd; + + spin_lock(hd_queue->queue_lock); + + do_hd = NULL; + del_timer(&device_timer); + if (!handler) + handler = unexpected_hd_interrupt; + handler(); + + spin_unlock(hd_queue->queue_lock); + + return IRQ_HANDLED; +} + +static const struct block_device_operations hd_fops = { + .getgeo = hd_getgeo, +}; + +static int __init hd_init(void) +{ + int drive; + + if (register_blkdev(HD_MAJOR, "hd")) + return -1; + + hd_queue = blk_init_queue(do_hd_request, &hd_lock); + if (!hd_queue) { + unregister_blkdev(HD_MAJOR, "hd"); + return -ENOMEM; + } + + blk_queue_max_hw_sectors(hd_queue, 255); + init_timer(&device_timer); + device_timer.function = hd_times_out; + blk_queue_logical_block_size(hd_queue, 512); + + if (!NR_HD) { + /* + * We don't know anything about the drive. This means + * that you *MUST* specify the drive parameters to the + * kernel yourself. + * + * If we were on an i386, we used to read this info from + * the BIOS or CMOS. This doesn't work all that well, + * since this assumes that this is a primary or secondary + * drive, and if we're using this legacy driver, it's + * probably an auxiliary controller added to recover + * legacy data off an ST-506 drive. Either way, it's + * definitely safest to have the user explicitly specify + * the information. + */ + printk("hd: no drives specified - use hd=cyl,head,sectors" + " on kernel command line\n"); + goto out; + } + + for (drive = 0 ; drive < NR_HD ; drive++) { + struct gendisk *disk = alloc_disk(64); + struct hd_i_struct *p = &hd_info[drive]; + if (!disk) + goto Enomem; + disk->major = HD_MAJOR; + disk->first_minor = drive << 6; + disk->fops = &hd_fops; + sprintf(disk->disk_name, "hd%c", 'a'+drive); + disk->private_data = p; + set_capacity(disk, p->head * p->sect * p->cyl); + disk->queue = hd_queue; + p->unit = drive; + hd_gendisk[drive] = disk; + printk("%s: %luMB, CHS=%d/%d/%d\n", + disk->disk_name, (unsigned long)get_capacity(disk)/2048, + p->cyl, p->head, p->sect); + } + + if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) { + printk("hd: unable to get IRQ%d for the hard disk driver\n", + HD_IRQ); + goto out1; + } + if (!request_region(HD_DATA, 8, "hd")) { + printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA); + goto out2; + } + if (!request_region(HD_CMD, 1, "hd(cmd)")) { + printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD); + goto out3; + } + + /* Let them fly */ + for (drive = 0; drive < NR_HD; drive++) + add_disk(hd_gendisk[drive]); + + return 0; + +out3: + release_region(HD_DATA, 8); +out2: + free_irq(HD_IRQ, NULL); +out1: + for (drive = 0; drive < NR_HD; drive++) + put_disk(hd_gendisk[drive]); + NR_HD = 0; +out: + del_timer(&device_timer); + unregister_blkdev(HD_MAJOR, "hd"); + blk_cleanup_queue(hd_queue); + return -1; +Enomem: + while (drive--) + put_disk(hd_gendisk[drive]); + goto out; +} + +static int __init parse_hd_setup(char *line) +{ + int ints[6]; + + (void) get_options(line, ARRAY_SIZE(ints), ints); + hd_setup(NULL, ints); + + return 1; +} +__setup("hd=", parse_hd_setup); + +late_initcall(hd_init); diff --git a/kernel/drivers/block/ida_cmd.h b/kernel/drivers/block/ida_cmd.h new file mode 100644 index 000000000..98b5746b3 --- /dev/null +++ b/kernel/drivers/block/ida_cmd.h @@ -0,0 +1,349 @@ +/* + * Disk Array driver for Compaq SMART2 Controllers + * Copyright 1998 Compaq Computer Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + */ +#ifndef ARRAYCMD_H +#define ARRAYCMD_H + +#include +#if 0 +#include +#endif + +/* for the Smart Array 42XX cards */ +#define S42XX_REQUEST_PORT_OFFSET 0x40 +#define S42XX_REPLY_INTR_MASK_OFFSET 0x34 +#define S42XX_REPLY_PORT_OFFSET 0x44 +#define S42XX_INTR_STATUS 0x30 + +#define S42XX_INTR_OFF 0x08 +#define S42XX_INTR_PENDING 0x08 + +#define COMMAND_FIFO 0x04 +#define COMMAND_COMPLETE_FIFO 0x08 +#define INTR_MASK 0x0C +#define INTR_STATUS 0x10 +#define INTR_PENDING 0x14 + +#define FIFO_NOT_EMPTY 0x01 +#define FIFO_NOT_FULL 0x02 + +#define BIG_PROBLEM 0x40 +#define LOG_NOT_CONF 2 + +#pragma pack(1) +typedef struct { + __u32 size; + __u32 addr; +} sg_t; + +#define RCODE_NONFATAL 0x02 +#define RCODE_FATAL 0x04 +#define RCODE_INVREQ 0x10 +typedef struct { + __u16 next; + __u8 cmd; + __u8 rcode; + __u32 blk; + __u16 blk_cnt; + __u8 sg_cnt; + __u8 reserved; +} rhdr_t; + +#define SG_MAX 32 +typedef struct { + rhdr_t hdr; + sg_t sg[SG_MAX]; + __u32 bp; +} rblk_t; + +typedef struct { + __u8 unit; + __u8 prio; + __u16 size; +} chdr_t; + +#define CMD_RWREQ 0x00 +#define CMD_IOCTL_PEND 0x01 +#define CMD_IOCTL_DONE 0x02 + +typedef struct cmdlist { + chdr_t hdr; + rblk_t req; + __u32 size; + int retry_cnt; + __u32 busaddr; + int ctlr; + struct cmdlist *prev; + struct cmdlist *next; + struct request *rq; + int type; +} cmdlist_t; + +#define ID_CTLR 0x11 +typedef struct { + __u8 nr_drvs; + __u32 cfg_sig; + __u8 firm_rev[4]; + __u8 rom_rev[4]; + __u8 hw_rev; + __u32 bb_rev; + __u32 drv_present_map; + __u32 ext_drv_map; + __u32 board_id; + __u8 cfg_error; + __u32 non_disk_bits; + __u8 bad_ram_addr; + __u8 cpu_rev; + __u8 pdpi_rev; + __u8 epic_rev; + __u8 wcxc_rev; + __u8 marketing_rev; + __u8 ctlr_flags; + __u8 host_flags; + __u8 expand_dis; + __u8 scsi_chips; + __u32 max_req_blocks; + __u32 ctlr_clock; + __u8 drvs_per_bus; + __u16 big_drv_present_map[8]; + __u16 big_ext_drv_map[8]; + __u16 big_non_disk_map[8]; + __u16 task_flags; + __u8 icl_bus; + __u8 red_modes; + __u8 cur_red_mode; + __u8 red_ctlr_stat; + __u8 red_fail_reason; + __u8 reserved[403]; +} id_ctlr_t; + +typedef struct { + __u16 cyl; + __u8 heads; + __u8 xsig; + __u8 psectors; + __u16 wpre; + __u8 maxecc; + __u8 drv_ctrl; + __u16 pcyls; + __u8 pheads; + __u16 landz; + __u8 sect_per_track; + __u8 cksum; +} drv_param_t; + +#define ID_LOG_DRV 0x10 +typedef struct { + __u16 blk_size; + __u32 nr_blks; + drv_param_t drv; + __u8 fault_tol; + __u8 reserved; + __u8 bios_disable; +} id_log_drv_t; + +#define ID_LOG_DRV_EXT 0x18 +typedef struct { + __u32 log_drv_id; + __u8 log_drv_label[64]; + __u8 reserved[418]; +} id_log_drv_ext_t; + +#define SENSE_LOG_DRV_STAT 0x12 +typedef struct { + __u8 status; + __u32 fail_map; + __u16 read_err[32]; + __u16 write_err[32]; + __u8 drv_err_data[256]; + __u8 drq_timeout[32]; + __u32 blks_to_recover; + __u8 drv_recovering; + __u16 remap_cnt[32]; + __u32 replace_drv_map; + __u32 act_spare_map; + __u8 spare_stat; + __u8 spare_repl_map[32]; + __u32 repl_ok_map; + __u8 media_exch; + __u8 cache_fail; + __u8 expn_fail; + __u8 unit_flags; + __u16 big_fail_map[8]; + __u16 big_remap_map[128]; + __u16 big_repl_map[8]; + __u16 big_act_spare_map[8]; + __u8 big_spar_repl_map[128]; + __u16 big_repl_ok_map[8]; + __u8 big_drv_rebuild; + __u8 reserved[36]; +} sense_log_drv_stat_t; + +#define START_RECOVER 0x13 + +#define ID_PHYS_DRV 0x15 +typedef struct { + __u8 scsi_bus; + __u8 scsi_id; + __u16 blk_size; + __u32 nr_blks; + __u32 rsvd_blks; + __u8 drv_model[40]; + __u8 drv_sn[40]; + __u8 drv_fw[8]; + __u8 scsi_iq_bits; + __u8 compaq_drv_stmp; + __u8 last_fail; + __u8 phys_drv_flags; + __u8 phys_drv_flags1; + __u8 scsi_lun; + __u8 phys_drv_flags2; + __u8 reserved; + __u32 spi_speed_rules; + __u8 phys_connector[2]; + __u8 phys_box_on_bus; + __u8 phys_bay_in_box; +} id_phys_drv_t; + +#define BLINK_DRV_LEDS 0x16 +typedef struct { + __u32 blink_duration; + __u32 reserved; + __u8 blink[256]; + __u8 reserved1[248]; +} blink_drv_leds_t; + +#define SENSE_BLINK_LEDS 0x17 +typedef struct { + __u32 blink_duration; + __u32 btime_elap; + __u8 blink[256]; + __u8 reserved1[248]; +} sense_blink_leds_t; + +#define IDA_READ 0x20 +#define IDA_WRITE 0x30 +#define IDA_WRITE_MEDIA 0x31 +#define RESET_TO_DIAG 0x40 +#define DIAG_PASS_THRU 0x41 + +#define SENSE_CONFIG 0x50 +#define SET_CONFIG 0x51 +typedef struct { + __u32 cfg_sig; + __u16 compat_port; + __u8 data_dist_mode; + __u8 surf_an_ctrl; + __u16 ctlr_phys_drv; + __u16 log_unit_phys_drv; + __u16 fault_tol_mode; + __u8 phys_drv_param[16]; + drv_param_t drv; + __u32 drv_asgn_map; + __u16 dist_factor; + __u32 spare_asgn_map; + __u8 reserved[6]; + __u16 os; + __u8 ctlr_order; + __u8 extra_info; + __u32 data_offs; + __u8 parity_backedout_write_drvs; + __u8 parity_dist_mode; + __u8 parity_shift_fact; + __u8 bios_disable_flag; + __u32 blks_on_vol; + __u32 blks_per_drv; + __u8 scratch[16]; + __u16 big_drv_map[8]; + __u16 big_spare_map[8]; + __u8 ss_source_vol; + __u8 mix_drv_cap_range; + struct { + __u16 big_drv_map[8]; + __u32 blks_per_drv; + __u16 fault_tol_mode; + __u16 dist_factor; + } MDC_range[4]; + __u8 reserved1[248]; +} config_t; + +#define BYPASS_VOL_STATE 0x52 +#define SS_CREATE_VOL 0x53 +#define CHANGE_CONFIG 0x54 +#define SENSE_ORIG_CONF 0x55 +#define REORDER_LOG_DRV 0x56 +typedef struct { + __u8 old_units[32]; +} reorder_log_drv_t; + +#define LABEL_LOG_DRV 0x57 +typedef struct { + __u8 log_drv_label[64]; +} label_log_drv_t; + +#define SS_TO_VOL 0x58 + +#define SET_SURF_DELAY 0x60 +typedef struct { + __u16 delay; + __u8 reserved[510]; +} surf_delay_t; + +#define SET_OVERHEAT_DELAY 0x61 +typedef struct { + __u16 delay; +} overhead_delay_t; + +#define SET_MP_DELAY +typedef struct { + __u16 delay; + __u8 reserved[510]; +} mp_delay_t; + +#define PASSTHRU_A 0x91 +typedef struct { + __u8 target; + __u8 bus; + __u8 lun; + __u32 timeout; + __u32 flags; + __u8 status; + __u8 error; + __u8 cdb_len; + __u8 sense_error; + __u8 sense_key; + __u32 sense_info; + __u8 sense_code; + __u8 sense_qual; + __u32 residual; + __u8 reserved[4]; + __u8 cdb[12]; +} scsi_param_t; + +#define RESUME_BACKGROUND_ACTIVITY 0x99 +#define SENSE_CONTROLLER_PERFORMANCE 0xa8 +#define FLUSH_CACHE 0xc2 +#define COLLECT_BUFFER 0xd2 +#define READ_FLASH_ROM 0xf6 +#define WRITE_FLASH_ROM 0xf7 +#pragma pack() + +#endif /* ARRAYCMD_H */ diff --git a/kernel/drivers/block/ida_ioctl.h b/kernel/drivers/block/ida_ioctl.h new file mode 100644 index 000000000..888fff9ca --- /dev/null +++ b/kernel/drivers/block/ida_ioctl.h @@ -0,0 +1,87 @@ +/* + * Disk Array driver for Compaq SMART2 Controllers + * Copyright 1998 Compaq Computer Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + */ +#ifndef IDA_IOCTL_H +#define IDA_IOCTL_H + +#include "ida_cmd.h" +#include "cpqarray.h" + +#define IDAGETDRVINFO 0x27272828 +#define IDAPASSTHRU 0x28282929 +#define IDAGETCTLRSIG 0x29293030 +#define IDAREVALIDATEVOLS 0x30303131 +#define IDADRIVERVERSION 0x31313232 +#define IDAGETPCIINFO 0x32323333 + +typedef struct _ida_pci_info_struct +{ + unsigned char bus; + unsigned char dev_fn; + __u32 board_id; +} ida_pci_info_struct; +/* + * Normally, the ioctl determines the logical unit for this command by + * the major,minor number of the fd passed to ioctl. If you need to send + * a command to a different/nonexistant unit (such as during config), you + * can override the normal behavior by setting the unit valid bit. (Normally, + * it should be zero) The controller the command is sent to is still + * determined by the major number of the open device. + */ + +#define UNITVALID 0x80 +typedef struct { + __u8 cmd; + __u8 rcode; + __u8 unit; + __u32 blk; + __u16 blk_cnt; + +/* currently, sg_cnt is assumed to be 1: only the 0th element of sg is used */ + struct { + void __user *addr; + size_t size; + } sg[SG_MAX]; + int sg_cnt; + + union ctlr_cmds { + drv_info_t drv; + unsigned char buf[1024]; + + id_ctlr_t id_ctlr; + drv_param_t drv_param; + id_log_drv_t id_log_drv; + id_log_drv_ext_t id_log_drv_ext; + sense_log_drv_stat_t sense_log_drv_stat; + id_phys_drv_t id_phys_drv; + blink_drv_leds_t blink_drv_leds; + sense_blink_leds_t sense_blink_leds; + config_t config; + reorder_log_drv_t reorder_log_drv; + label_log_drv_t label_log_drv; + surf_delay_t surf_delay; + overhead_delay_t overhead_delay; + mp_delay_t mp_delay; + scsi_param_t scsi_param; + } c; +} ida_ioctl_t; + +#endif /* IDA_IOCTL_H */ diff --git a/kernel/drivers/block/loop.c b/kernel/drivers/block/loop.c new file mode 100644 index 000000000..d7173cb1e --- /dev/null +++ b/kernel/drivers/block/loop.c @@ -0,0 +1,1869 @@ +/* + * linux/drivers/block/loop.c + * + * Written by Theodore Ts'o, 3/29/93 + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + * + * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 + * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 + * + * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 + * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 + * + * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 + * + * Added devfs support - Richard Gooch 16-Jan-1998 + * + * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 + * + * Loadable modules and other fixes by AK, 1998 + * + * Make real block number available to downstream transfer functions, enables + * CBC (and relatives) mode encryption requiring unique IVs per data block. + * Reed H. Petty, rhp@draper.net + * + * Maximum number of loop devices now dynamic via max_loop module parameter. + * Russell Kroll 19990701 + * + * Maximum number of loop devices when compiled-in now selectable by passing + * max_loop=<1-255> to the kernel on boot. + * Erik I. Bolsø, , Oct 31, 1999 + * + * Completely rewrite request handling to be make_request_fn style and + * non blocking, pushing work to a helper thread. Lots of fixes from + * Al Viro too. + * Jens Axboe , Nov 2000 + * + * Support up to 256 loop devices + * Heinz Mauelshagen , Feb 2002 + * + * Support for falling back on the write file operation when the address space + * operations write_begin is not available on the backing filesystem. + * Anton Altaparmakov, 16 Feb 2005 + * + * Still To Fix: + * - Advisory locking is ignored here. + * - Should use an own CAP_* category instead of CAP_SYS_ADMIN + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "loop.h" + +#include + +static DEFINE_IDR(loop_index_idr); +static DEFINE_MUTEX(loop_index_mutex); + +static int max_part; +static int part_shift; + +static struct workqueue_struct *loop_wq; + +static int transfer_xor(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block) +{ + char *raw_buf = kmap_atomic(raw_page) + raw_off; + char *loop_buf = kmap_atomic(loop_page) + loop_off; + char *in, *out, *key; + int i, keysize; + + if (cmd == READ) { + in = raw_buf; + out = loop_buf; + } else { + in = loop_buf; + out = raw_buf; + } + + key = lo->lo_encrypt_key; + keysize = lo->lo_encrypt_key_size; + for (i = 0; i < size; i++) + *out++ = *in++ ^ key[(i & 511) % keysize]; + + kunmap_atomic(loop_buf); + kunmap_atomic(raw_buf); + cond_resched(); + return 0; +} + +static int xor_init(struct loop_device *lo, const struct loop_info64 *info) +{ + if (unlikely(info->lo_encrypt_key_size <= 0)) + return -EINVAL; + return 0; +} + +static struct loop_func_table none_funcs = { + .number = LO_CRYPT_NONE, +}; + +static struct loop_func_table xor_funcs = { + .number = LO_CRYPT_XOR, + .transfer = transfer_xor, + .init = xor_init +}; + +/* xfer_funcs[0] is special - its release function is never called */ +static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { + &none_funcs, + &xor_funcs +}; + +static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) +{ + loff_t loopsize; + + /* Compute loopsize in bytes */ + loopsize = i_size_read(file->f_mapping->host); + if (offset > 0) + loopsize -= offset; + /* offset is beyond i_size, weird but possible */ + if (loopsize < 0) + return 0; + + if (sizelimit > 0 && sizelimit < loopsize) + loopsize = sizelimit; + /* + * Unfortunately, if we want to do I/O on the device, + * the number of 512-byte sectors has to fit into a sector_t. + */ + return loopsize >> 9; +} + +static loff_t get_loop_size(struct loop_device *lo, struct file *file) +{ + return get_size(lo->lo_offset, lo->lo_sizelimit, file); +} + +static int +figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) +{ + loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); + sector_t x = (sector_t)size; + struct block_device *bdev = lo->lo_device; + + if (unlikely((loff_t)x != size)) + return -EFBIG; + if (lo->lo_offset != offset) + lo->lo_offset = offset; + if (lo->lo_sizelimit != sizelimit) + lo->lo_sizelimit = sizelimit; + set_capacity(lo->lo_disk, x); + bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); + /* let user-space know about the new size */ + kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); + return 0; +} + +static inline int +lo_do_transfer(struct loop_device *lo, int cmd, + struct page *rpage, unsigned roffs, + struct page *lpage, unsigned loffs, + int size, sector_t rblock) +{ + int ret; + + ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); + if (likely(!ret)) + return 0; + + printk_ratelimited(KERN_ERR + "loop: Transfer error at byte offset %llu, length %i.\n", + (unsigned long long)rblock << 9, size); + return ret; +} + +static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) +{ + struct iov_iter i; + ssize_t bw; + + iov_iter_bvec(&i, ITER_BVEC, bvec, 1, bvec->bv_len); + + file_start_write(file); + bw = vfs_iter_write(file, &i, ppos); + file_end_write(file); + + if (likely(bw == bvec->bv_len)) + return 0; + + printk_ratelimited(KERN_ERR + "loop: Write error at byte offset %llu, length %i.\n", + (unsigned long long)*ppos, bvec->bv_len); + if (bw >= 0) + bw = -EIO; + return bw; +} + +static int lo_write_simple(struct loop_device *lo, struct request *rq, + loff_t pos) +{ + struct bio_vec bvec; + struct req_iterator iter; + int ret = 0; + + rq_for_each_segment(bvec, rq, iter) { + ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); + if (ret < 0) + break; + cond_resched(); + } + + return ret; +} + +/* + * This is the slow, transforming version that needs to double buffer the + * data as it cannot do the transformations in place without having direct + * access to the destination pages of the backing file. + */ +static int lo_write_transfer(struct loop_device *lo, struct request *rq, + loff_t pos) +{ + struct bio_vec bvec, b; + struct req_iterator iter; + struct page *page; + int ret = 0; + + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) + return -ENOMEM; + + rq_for_each_segment(bvec, rq, iter) { + ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, pos >> 9); + if (unlikely(ret)) + break; + + b.bv_page = page; + b.bv_offset = 0; + b.bv_len = bvec.bv_len; + ret = lo_write_bvec(lo->lo_backing_file, &b, &pos); + if (ret < 0) + break; + } + + __free_page(page); + return ret; +} + +static int lo_read_simple(struct loop_device *lo, struct request *rq, + loff_t pos) +{ + struct bio_vec bvec; + struct req_iterator iter; + struct iov_iter i; + ssize_t len; + + rq_for_each_segment(bvec, rq, iter) { + iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len); + len = vfs_iter_read(lo->lo_backing_file, &i, &pos); + if (len < 0) + return len; + + flush_dcache_page(bvec.bv_page); + + if (len != bvec.bv_len) { + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); + break; + } + cond_resched(); + } + + return 0; +} + +static int lo_read_transfer(struct loop_device *lo, struct request *rq, + loff_t pos) +{ + struct bio_vec bvec, b; + struct req_iterator iter; + struct iov_iter i; + struct page *page; + ssize_t len; + int ret = 0; + + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) + return -ENOMEM; + + rq_for_each_segment(bvec, rq, iter) { + loff_t offset = pos; + + b.bv_page = page; + b.bv_offset = 0; + b.bv_len = bvec.bv_len; + + iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len); + len = vfs_iter_read(lo->lo_backing_file, &i, &pos); + if (len < 0) { + ret = len; + goto out_free_page; + } + + ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page, + bvec.bv_offset, len, offset >> 9); + if (ret) + goto out_free_page; + + flush_dcache_page(bvec.bv_page); + + if (len != bvec.bv_len) { + struct bio *bio; + + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); + break; + } + } + + ret = 0; +out_free_page: + __free_page(page); + return ret; +} + +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) +{ + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + int ret; + + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: + return ret; +} + +static int lo_req_flush(struct loop_device *lo, struct request *rq) +{ + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; + + return ret; +} + +static int do_req_filebacked(struct loop_device *lo, struct request *rq) +{ + loff_t pos; + int ret; + + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; + + if (rq->cmd_flags & REQ_WRITE) { + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + else if (rq->cmd_flags & REQ_DISCARD) + ret = lo_discard(lo, rq, pos); + else if (lo->transfer) + ret = lo_write_transfer(lo, rq, pos); + else + ret = lo_write_simple(lo, rq, pos); + + } else { + if (lo->transfer) + ret = lo_read_transfer(lo, rq, pos); + else + ret = lo_read_simple(lo, rq, pos); + } + + return ret; +} + +struct switch_request { + struct file *file; + struct completion wait; +}; + +/* + * Do the actual switch; called from the BIO completion routine + */ +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) +{ + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; + + /* if no new file, only flush of queued bios requested */ + if (!file) + return; + + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? + mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); +} + +/* + * loop_switch performs the hard work of switching a backing store. + * First it needs to flush existing IO, it does this by sending a magic + * BIO down the pipe. The completion of this BIO does the actual switch. + */ +static int loop_switch(struct loop_device *lo, struct file *file) +{ + struct switch_request w; + + w.file = file; + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + + return 0; +} + +/* + * Helper to flush the IOs in loop, but keeping loop thread running + */ +static int loop_flush(struct loop_device *lo) +{ + return loop_switch(lo, NULL); +} + +/* + * loop_change_fd switched the backing store of a loopback device to + * a new file. This is useful for operating system installers to free up + * the original file and in High Availability environments to switch to + * an alternative location for the content in case of server meltdown. + * This can only work if the loop device is used read-only, and if the + * new backing store is the same size and type as the old backing store. + */ +static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, + unsigned int arg) +{ + struct file *file, *old_file; + struct inode *inode; + int error; + + error = -ENXIO; + if (lo->lo_state != Lo_bound) + goto out; + + /* the loop device has to be read-only */ + error = -EINVAL; + if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto out; + + error = -EBADF; + file = fget(arg); + if (!file) + goto out; + + inode = file->f_mapping->host; + old_file = lo->lo_backing_file; + + error = -EINVAL; + + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + goto out_putf; + + /* size of the new backing store needs to be the same */ + if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) + goto out_putf; + + /* and ... switch */ + error = loop_switch(lo, file); + if (error) + goto out_putf; + + fput(old_file); + if (lo->lo_flags & LO_FLAGS_PARTSCAN) + ioctl_by_bdev(bdev, BLKRRPART, 0); + return 0; + + out_putf: + fput(file); + out: + return error; +} + +static inline int is_loop_device(struct file *file) +{ + struct inode *i = file->f_mapping->host; + + return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; +} + +/* loop sysfs attributes */ + +static ssize_t loop_attr_show(struct device *dev, char *page, + ssize_t (*callback)(struct loop_device *, char *)) +{ + struct gendisk *disk = dev_to_disk(dev); + struct loop_device *lo = disk->private_data; + + return callback(lo, page); +} + +#define LOOP_ATTR_RO(_name) \ +static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ +static ssize_t loop_attr_do_show_##_name(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + return loop_attr_show(d, b, loop_attr_##_name##_show); \ +} \ +static struct device_attribute loop_attr_##_name = \ + __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL); + +static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) +{ + ssize_t ret; + char *p = NULL; + + spin_lock_irq(&lo->lo_lock); + if (lo->lo_backing_file) + p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + spin_unlock_irq(&lo->lo_lock); + + if (IS_ERR_OR_NULL(p)) + ret = PTR_ERR(p); + else { + ret = strlen(p); + memmove(buf, p, ret); + buf[ret++] = '\n'; + buf[ret] = 0; + } + + return ret; +} + +static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); +} + +static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); +} + +static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) +{ + int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); + + return sprintf(buf, "%s\n", autoclear ? "1" : "0"); +} + +static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) +{ + int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); + + return sprintf(buf, "%s\n", partscan ? "1" : "0"); +} + +LOOP_ATTR_RO(backing_file); +LOOP_ATTR_RO(offset); +LOOP_ATTR_RO(sizelimit); +LOOP_ATTR_RO(autoclear); +LOOP_ATTR_RO(partscan); + +static struct attribute *loop_attrs[] = { + &loop_attr_backing_file.attr, + &loop_attr_offset.attr, + &loop_attr_sizelimit.attr, + &loop_attr_autoclear.attr, + &loop_attr_partscan.attr, + NULL, +}; + +static struct attribute_group loop_attribute_group = { + .name = "loop", + .attrs= loop_attrs, +}; + +static int loop_sysfs_init(struct loop_device *lo) +{ + return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); +} + +static void loop_sysfs_exit(struct loop_device *lo) +{ + sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, + &loop_attribute_group); +} + +static void loop_config_discard(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct request_queue *q = lo->lo_queue; + + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + if ((!file->f_op->fallocate) || + lo->lo_encrypt_key_size) { + q->limits.discard_granularity = 0; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = 0; + q->limits.discard_zeroes_data = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + return; + } + + q->limits.discard_granularity = inode->i_sb->s_blocksize; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = UINT_MAX >> 9; + q->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); +} + +static int loop_set_fd(struct loop_device *lo, fmode_t mode, + struct block_device *bdev, unsigned int arg) +{ + struct file *file, *f; + struct inode *inode; + struct address_space *mapping; + unsigned lo_blocksize; + int lo_flags = 0; + int error; + loff_t size; + + /* This is safe, since we have a reference from open(). */ + __module_get(THIS_MODULE); + + error = -EBADF; + file = fget(arg); + if (!file) + goto out; + + error = -EBUSY; + if (lo->lo_state != Lo_unbound) + goto out_putf; + + /* Avoid recursion */ + f = file; + while (is_loop_device(f)) { + struct loop_device *l; + + if (f->f_mapping->host->i_bdev == bdev) + goto out_putf; + + l = f->f_mapping->host->i_bdev->bd_disk->private_data; + if (l->lo_state == Lo_unbound) { + error = -EINVAL; + goto out_putf; + } + f = l->lo_backing_file; + } + + mapping = file->f_mapping; + inode = mapping->host; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) + goto out_putf; + + if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || + !file->f_op->write_iter) + lo_flags |= LO_FLAGS_READ_ONLY; + + lo_blocksize = S_ISBLK(inode->i_mode) ? + inode->i_bdev->bd_block_size : PAGE_SIZE; + + error = -EFBIG; + size = get_loop_size(lo, file); + if ((loff_t)(sector_t)size != size) + goto out_putf; + + error = 0; + + set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + + lo->lo_blocksize = lo_blocksize; + lo->lo_device = bdev; + lo->lo_flags = lo_flags; + lo->lo_backing_file = file; + lo->transfer = NULL; + lo->ioctl = NULL; + lo->lo_sizelimit = 0; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + + if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) + blk_queue_flush(lo->lo_queue, REQ_FLUSH); + + set_capacity(lo->lo_disk, size); + bd_set_size(bdev, size << 9); + loop_sysfs_init(lo); + /* let user-space know about the new size */ + kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); + + set_blocksize(bdev, lo_blocksize); + + lo->lo_state = Lo_bound; + if (part_shift) + lo->lo_flags |= LO_FLAGS_PARTSCAN; + if (lo->lo_flags & LO_FLAGS_PARTSCAN) + ioctl_by_bdev(bdev, BLKRRPART, 0); + + /* Grab the block_device to prevent its destruction after we + * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). + */ + bdgrab(bdev); + return 0; + + out_putf: + fput(file); + out: + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return error; +} + +static int +loop_release_xfer(struct loop_device *lo) +{ + int err = 0; + struct loop_func_table *xfer = lo->lo_encryption; + + if (xfer) { + if (xfer->release) + err = xfer->release(lo); + lo->transfer = NULL; + lo->lo_encryption = NULL; + module_put(xfer->owner); + } + return err; +} + +static int +loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, + const struct loop_info64 *i) +{ + int err = 0; + + if (xfer) { + struct module *owner = xfer->owner; + + if (!try_module_get(owner)) + return -EINVAL; + if (xfer->init) + err = xfer->init(lo, i); + if (err) + module_put(owner); + else + lo->lo_encryption = xfer; + } + return err; +} + +static int loop_clr_fd(struct loop_device *lo) +{ + struct file *filp = lo->lo_backing_file; + gfp_t gfp = lo->old_gfp_mask; + struct block_device *bdev = lo->lo_device; + + if (lo->lo_state != Lo_bound) + return -ENXIO; + + /* + * If we've explicitly asked to tear down the loop device, + * and it has an elevated reference count, set it for auto-teardown when + * the last reference goes away. This stops $!~#$@ udev from + * preventing teardown because it decided that it needs to run blkid on + * the loopback device whenever they appear. xfstests is notorious for + * failing tests because blkid via udev races with a losetup + * /do something like mkfs/losetup -d causing the losetup -d + * command to fail with EBUSY. + */ + if (lo->lo_refcnt > 1) { + lo->lo_flags |= LO_FLAGS_AUTOCLEAR; + mutex_unlock(&lo->lo_ctl_mutex); + return 0; + } + + if (filp == NULL) + return -EINVAL; + + spin_lock_irq(&lo->lo_lock); + lo->lo_state = Lo_rundown; + lo->lo_backing_file = NULL; + spin_unlock_irq(&lo->lo_lock); + + loop_release_xfer(lo); + lo->transfer = NULL; + lo->ioctl = NULL; + lo->lo_device = NULL; + lo->lo_encryption = NULL; + lo->lo_offset = 0; + lo->lo_sizelimit = 0; + lo->lo_encrypt_key_size = 0; + memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); + memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); + memset(lo->lo_file_name, 0, LO_NAME_SIZE); + if (bdev) { + bdput(bdev); + invalidate_bdev(bdev); + } + set_capacity(lo->lo_disk, 0); + loop_sysfs_exit(lo); + if (bdev) { + bd_set_size(bdev, 0); + /* let user-space know about this change */ + kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); + } + mapping_set_gfp_mask(filp->f_mapping, gfp); + lo->lo_state = Lo_unbound; + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) + ioctl_by_bdev(bdev, BLKRRPART, 0); + lo->lo_flags = 0; + if (!part_shift) + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + mutex_unlock(&lo->lo_ctl_mutex); + /* + * Need not hold lo_ctl_mutex to fput backing file. + * Calling fput holding lo_ctl_mutex triggers a circular + * lock dependency possibility warning as fput can take + * bd_mutex which is usually taken before lo_ctl_mutex. + */ + fput(filp); + return 0; +} + +static int +loop_set_status(struct loop_device *lo, const struct loop_info64 *info) +{ + int err; + struct loop_func_table *xfer; + kuid_t uid = current_uid(); + + if (lo->lo_encrypt_key_size && + !uid_eq(lo->lo_key_owner, uid) && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (lo->lo_state != Lo_bound) + return -ENXIO; + if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) + return -EINVAL; + + err = loop_release_xfer(lo); + if (err) + return err; + + if (info->lo_encrypt_type) { + unsigned int type = info->lo_encrypt_type; + + if (type >= MAX_LO_CRYPT) + return -EINVAL; + xfer = xfer_funcs[type]; + if (xfer == NULL) + return -EINVAL; + } else + xfer = NULL; + + err = loop_init_xfer(lo, xfer, info); + if (err) + return err; + + if (lo->lo_offset != info->lo_offset || + lo->lo_sizelimit != info->lo_sizelimit) + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) + return -EFBIG; + + loop_config_discard(lo); + + memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); + memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); + lo->lo_file_name[LO_NAME_SIZE-1] = 0; + lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; + + if (!xfer) + xfer = &none_funcs; + lo->transfer = xfer->transfer; + lo->ioctl = xfer->ioctl; + + if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != + (info->lo_flags & LO_FLAGS_AUTOCLEAR)) + lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; + + if ((info->lo_flags & LO_FLAGS_PARTSCAN) && + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { + lo->lo_flags |= LO_FLAGS_PARTSCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + } + + lo->lo_encrypt_key_size = info->lo_encrypt_key_size; + lo->lo_init[0] = info->lo_init[0]; + lo->lo_init[1] = info->lo_init[1]; + if (info->lo_encrypt_key_size) { + memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, + info->lo_encrypt_key_size); + lo->lo_key_owner = uid; + } + + return 0; +} + +static int +loop_get_status(struct loop_device *lo, struct loop_info64 *info) +{ + struct file *file = lo->lo_backing_file; + struct kstat stat; + int error; + + if (lo->lo_state != Lo_bound) + return -ENXIO; + error = vfs_getattr(&file->f_path, &stat); + if (error) + return error; + memset(info, 0, sizeof(*info)); + info->lo_number = lo->lo_number; + info->lo_device = huge_encode_dev(stat.dev); + info->lo_inode = stat.ino; + info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev); + info->lo_offset = lo->lo_offset; + info->lo_sizelimit = lo->lo_sizelimit; + info->lo_flags = lo->lo_flags; + memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); + memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); + info->lo_encrypt_type = + lo->lo_encryption ? lo->lo_encryption->number : 0; + if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { + info->lo_encrypt_key_size = lo->lo_encrypt_key_size; + memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, + lo->lo_encrypt_key_size); + } + return 0; +} + +static void +loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) +{ + memset(info64, 0, sizeof(*info64)); + info64->lo_number = info->lo_number; + info64->lo_device = info->lo_device; + info64->lo_inode = info->lo_inode; + info64->lo_rdevice = info->lo_rdevice; + info64->lo_offset = info->lo_offset; + info64->lo_sizelimit = 0; + info64->lo_encrypt_type = info->lo_encrypt_type; + info64->lo_encrypt_key_size = info->lo_encrypt_key_size; + info64->lo_flags = info->lo_flags; + info64->lo_init[0] = info->lo_init[0]; + info64->lo_init[1] = info->lo_init[1]; + if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) + memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE); + else + memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); + memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE); +} + +static int +loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) +{ + memset(info, 0, sizeof(*info)); + info->lo_number = info64->lo_number; + info->lo_device = info64->lo_device; + info->lo_inode = info64->lo_inode; + info->lo_rdevice = info64->lo_rdevice; + info->lo_offset = info64->lo_offset; + info->lo_encrypt_type = info64->lo_encrypt_type; + info->lo_encrypt_key_size = info64->lo_encrypt_key_size; + info->lo_flags = info64->lo_flags; + info->lo_init[0] = info64->lo_init[0]; + info->lo_init[1] = info64->lo_init[1]; + if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) + memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); + else + memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); + memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + + /* error in case values were truncated */ + if (info->lo_device != info64->lo_device || + info->lo_rdevice != info64->lo_rdevice || + info->lo_inode != info64->lo_inode || + info->lo_offset != info64->lo_offset) + return -EOVERFLOW; + + return 0; +} + +static int +loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) +{ + struct loop_info info; + struct loop_info64 info64; + + if (copy_from_user(&info, arg, sizeof (struct loop_info))) + return -EFAULT; + loop_info64_from_old(&info, &info64); + return loop_set_status(lo, &info64); +} + +static int +loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) +{ + struct loop_info64 info64; + + if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) + return -EFAULT; + return loop_set_status(lo, &info64); +} + +static int +loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { + struct loop_info info; + struct loop_info64 info64; + int err = 0; + + if (!arg) + err = -EINVAL; + if (!err) + err = loop_get_status(lo, &info64); + if (!err) + err = loop_info64_to_old(&info64, &info); + if (!err && copy_to_user(arg, &info, sizeof(info))) + err = -EFAULT; + + return err; +} + +static int +loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { + struct loop_info64 info64; + int err = 0; + + if (!arg) + err = -EINVAL; + if (!err) + err = loop_get_status(lo, &info64); + if (!err && copy_to_user(arg, &info64, sizeof(info64))) + err = -EFAULT; + + return err; +} + +static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) +{ + if (unlikely(lo->lo_state != Lo_bound)) + return -ENXIO; + + return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); +} + +static int lo_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct loop_device *lo = bdev->bd_disk->private_data; + int err; + + mutex_lock_nested(&lo->lo_ctl_mutex, 1); + switch (cmd) { + case LOOP_SET_FD: + err = loop_set_fd(lo, mode, bdev, arg); + break; + case LOOP_CHANGE_FD: + err = loop_change_fd(lo, bdev, arg); + break; + case LOOP_CLR_FD: + /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ + err = loop_clr_fd(lo); + if (!err) + goto out_unlocked; + break; + case LOOP_SET_STATUS: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_status_old(lo, + (struct loop_info __user *)arg); + break; + case LOOP_GET_STATUS: + err = loop_get_status_old(lo, (struct loop_info __user *) arg); + break; + case LOOP_SET_STATUS64: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_status64(lo, + (struct loop_info64 __user *) arg); + break; + case LOOP_GET_STATUS64: + err = loop_get_status64(lo, (struct loop_info64 __user *) arg); + break; + case LOOP_SET_CAPACITY: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_capacity(lo, bdev); + break; + default: + err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; + } + mutex_unlock(&lo->lo_ctl_mutex); + +out_unlocked: + return err; +} + +#ifdef CONFIG_COMPAT +struct compat_loop_info { + compat_int_t lo_number; /* ioctl r/o */ + compat_dev_t lo_device; /* ioctl r/o */ + compat_ulong_t lo_inode; /* ioctl r/o */ + compat_dev_t lo_rdevice; /* ioctl r/o */ + compat_int_t lo_offset; + compat_int_t lo_encrypt_type; + compat_int_t lo_encrypt_key_size; /* ioctl w/o */ + compat_int_t lo_flags; /* ioctl r/o */ + char lo_name[LO_NAME_SIZE]; + unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ + compat_ulong_t lo_init[2]; + char reserved[4]; +}; + +/* + * Transfer 32-bit compatibility structure in userspace to 64-bit loop info + * - noinlined to reduce stack space usage in main part of driver + */ +static noinline int +loop_info64_from_compat(const struct compat_loop_info __user *arg, + struct loop_info64 *info64) +{ + struct compat_loop_info info; + + if (copy_from_user(&info, arg, sizeof(info))) + return -EFAULT; + + memset(info64, 0, sizeof(*info64)); + info64->lo_number = info.lo_number; + info64->lo_device = info.lo_device; + info64->lo_inode = info.lo_inode; + info64->lo_rdevice = info.lo_rdevice; + info64->lo_offset = info.lo_offset; + info64->lo_sizelimit = 0; + info64->lo_encrypt_type = info.lo_encrypt_type; + info64->lo_encrypt_key_size = info.lo_encrypt_key_size; + info64->lo_flags = info.lo_flags; + info64->lo_init[0] = info.lo_init[0]; + info64->lo_init[1] = info.lo_init[1]; + if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) + memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE); + else + memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); + memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE); + return 0; +} + +/* + * Transfer 64-bit loop info to 32-bit compatibility structure in userspace + * - noinlined to reduce stack space usage in main part of driver + */ +static noinline int +loop_info64_to_compat(const struct loop_info64 *info64, + struct compat_loop_info __user *arg) +{ + struct compat_loop_info info; + + memset(&info, 0, sizeof(info)); + info.lo_number = info64->lo_number; + info.lo_device = info64->lo_device; + info.lo_inode = info64->lo_inode; + info.lo_rdevice = info64->lo_rdevice; + info.lo_offset = info64->lo_offset; + info.lo_encrypt_type = info64->lo_encrypt_type; + info.lo_encrypt_key_size = info64->lo_encrypt_key_size; + info.lo_flags = info64->lo_flags; + info.lo_init[0] = info64->lo_init[0]; + info.lo_init[1] = info64->lo_init[1]; + if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) + memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE); + else + memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); + memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); + + /* error in case values were truncated */ + if (info.lo_device != info64->lo_device || + info.lo_rdevice != info64->lo_rdevice || + info.lo_inode != info64->lo_inode || + info.lo_offset != info64->lo_offset || + info.lo_init[0] != info64->lo_init[0] || + info.lo_init[1] != info64->lo_init[1]) + return -EOVERFLOW; + + if (copy_to_user(arg, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int +loop_set_status_compat(struct loop_device *lo, + const struct compat_loop_info __user *arg) +{ + struct loop_info64 info64; + int ret; + + ret = loop_info64_from_compat(arg, &info64); + if (ret < 0) + return ret; + return loop_set_status(lo, &info64); +} + +static int +loop_get_status_compat(struct loop_device *lo, + struct compat_loop_info __user *arg) +{ + struct loop_info64 info64; + int err = 0; + + if (!arg) + err = -EINVAL; + if (!err) + err = loop_get_status(lo, &info64); + if (!err) + err = loop_info64_to_compat(&info64, arg); + return err; +} + +static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct loop_device *lo = bdev->bd_disk->private_data; + int err; + + switch(cmd) { + case LOOP_SET_STATUS: + mutex_lock(&lo->lo_ctl_mutex); + err = loop_set_status_compat( + lo, (const struct compat_loop_info __user *) arg); + mutex_unlock(&lo->lo_ctl_mutex); + break; + case LOOP_GET_STATUS: + mutex_lock(&lo->lo_ctl_mutex); + err = loop_get_status_compat( + lo, (struct compat_loop_info __user *) arg); + mutex_unlock(&lo->lo_ctl_mutex); + break; + case LOOP_SET_CAPACITY: + case LOOP_CLR_FD: + case LOOP_GET_STATUS64: + case LOOP_SET_STATUS64: + arg = (unsigned long) compat_ptr(arg); + case LOOP_SET_FD: + case LOOP_CHANGE_FD: + err = lo_ioctl(bdev, mode, cmd, arg); + break; + default: + err = -ENOIOCTLCMD; + break; + } + return err; +} +#endif + +static int lo_open(struct block_device *bdev, fmode_t mode) +{ + struct loop_device *lo; + int err = 0; + + mutex_lock(&loop_index_mutex); + lo = bdev->bd_disk->private_data; + if (!lo) { + err = -ENXIO; + goto out; + } + + mutex_lock(&lo->lo_ctl_mutex); + lo->lo_refcnt++; + mutex_unlock(&lo->lo_ctl_mutex); +out: + mutex_unlock(&loop_index_mutex); + return err; +} + +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + struct loop_device *lo = disk->private_data; + int err; + + mutex_lock(&lo->lo_ctl_mutex); + + if (--lo->lo_refcnt) + goto out; + + if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { + /* + * In autoclear mode, stop the loop thread + * and remove configuration after last close. + */ + err = loop_clr_fd(lo); + if (!err) + return; + } else { + /* + * Otherwise keep thread (if running) and config, + * but flush possible ongoing bios in thread. + */ + loop_flush(lo); + } + +out: + mutex_unlock(&lo->lo_ctl_mutex); +} + +static const struct block_device_operations lo_fops = { + .owner = THIS_MODULE, + .open = lo_open, + .release = lo_release, + .ioctl = lo_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = lo_compat_ioctl, +#endif +}; + +/* + * And now the modules code and kernel interface. + */ +static int max_loop; +module_param(max_loop, int, S_IRUGO); +MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); +module_param(max_part, int, S_IRUGO); +MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); + +int loop_register_transfer(struct loop_func_table *funcs) +{ + unsigned int n = funcs->number; + + if (n >= MAX_LO_CRYPT || xfer_funcs[n]) + return -EINVAL; + xfer_funcs[n] = funcs; + return 0; +} + +static int unregister_transfer_cb(int id, void *ptr, void *data) +{ + struct loop_device *lo = ptr; + struct loop_func_table *xfer = data; + + mutex_lock(&lo->lo_ctl_mutex); + if (lo->lo_encryption == xfer) + loop_release_xfer(lo); + mutex_unlock(&lo->lo_ctl_mutex); + return 0; +} + +int loop_unregister_transfer(int number) +{ + unsigned int n = number; + struct loop_func_table *xfer; + + if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL) + return -EINVAL; + + xfer_funcs[n] = NULL; + idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer); + return 0; +} + +EXPORT_SYMBOL(loop_register_transfer); +EXPORT_SYMBOL(loop_unregister_transfer); + +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + blk_mq_start_request(bd->rq); + + if (cmd->rq->cmd_flags & REQ_WRITE) { + struct loop_device *lo = cmd->rq->q->queuedata; + bool need_sched = true; + + spin_lock_irq(&lo->lo_lock); + if (lo->write_started) + need_sched = false; + else + lo->write_started = true; + list_add_tail(&cmd->list, &lo->write_cmd_head); + spin_unlock_irq(&lo->lo_lock); + + if (need_sched) + queue_work(loop_wq, &lo->write_work); + } else { + queue_work(loop_wq, &cmd->read_work); + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = -EIO; + + if (lo->lo_state != Lo_bound) + goto failed; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + goto failed; + + ret = do_req_filebacked(lo, cmd->rq); + + failed: + if (ret) + cmd->rq->errors = -EIO; + blk_mq_complete_request(cmd->rq); +} + +static void loop_queue_write_work(struct work_struct *work) +{ + struct loop_device *lo = + container_of(work, struct loop_device, write_work); + LIST_HEAD(cmd_list); + + spin_lock_irq(&lo->lo_lock); + repeat: + list_splice_init(&lo->write_cmd_head, &cmd_list); + spin_unlock_irq(&lo->lo_lock); + + while (!list_empty(&cmd_list)) { + struct loop_cmd *cmd = list_first_entry(&cmd_list, + struct loop_cmd, list); + list_del_init(&cmd->list); + loop_handle_cmd(cmd); + } + + spin_lock_irq(&lo->lo_lock); + if (!list_empty(&lo->write_cmd_head)) + goto repeat; + lo->write_started = false; + spin_unlock_irq(&lo->lo_lock); +} + +static void loop_queue_read_work(struct work_struct *work) +{ + struct loop_cmd *cmd = + container_of(work, struct loop_cmd, read_work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + INIT_WORK(&cmd->read_work, loop_queue_read_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + +static int loop_add(struct loop_device **l, int i) +{ + struct loop_device *lo; + struct gendisk *disk; + int err; + + err = -ENOMEM; + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + goto out; + + lo->lo_state = Lo_unbound; + + /* allocate id, if @id >= 0, we're requesting that specific id */ + if (i >= 0) { + err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL); + if (err == -ENOSPC) + err = -EEXIST; + } else { + err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); + } + if (err < 0) + goto out_free_dev; + i = err; + + err = -ENOMEM; + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) + goto out_free_idr; + + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } + lo->lo_queue->queuedata = lo; + + INIT_LIST_HEAD(&lo->write_cmd_head); + INIT_WORK(&lo->write_work, loop_queue_write_work); + + disk = lo->lo_disk = alloc_disk(1 << part_shift); + if (!disk) + goto out_free_queue; + + /* + * Disable partition scanning by default. The in-kernel partition + * scanning can be requested individually per-device during its + * setup. Userspace can always add and remove partitions from all + * devices. The needed partition minors are allocated from the + * extended minor space, the main loop device numbers will continue + * to match the loop minors, regardless of the number of partitions + * used. + * + * If max_part is given, partition scanning is globally enabled for + * all loop devices. The minors for the main loop devices will be + * multiples of max_part. + * + * Note: Global-for-all-devices, set-only-at-init, read-only module + * parameteters like 'max_loop' and 'max_part' make things needlessly + * complicated, are too static, inflexible and may surprise + * userspace tools. Parameters like this in general should be avoided. + */ + if (!part_shift) + disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_EXT_DEVT; + mutex_init(&lo->lo_ctl_mutex); + lo->lo_number = i; + spin_lock_init(&lo->lo_lock); + disk->major = LOOP_MAJOR; + disk->first_minor = i << part_shift; + disk->fops = &lo_fops; + disk->private_data = lo; + disk->queue = lo->lo_queue; + sprintf(disk->disk_name, "loop%d", i); + add_disk(disk); + *l = lo; + return lo->lo_number; + +out_free_queue: + blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); +out_free_idr: + idr_remove(&loop_index_idr, i); +out_free_dev: + kfree(lo); +out: + return err; +} + +static void loop_remove(struct loop_device *lo) +{ + blk_cleanup_queue(lo->lo_queue); + del_gendisk(lo->lo_disk); + blk_mq_free_tag_set(&lo->tag_set); + put_disk(lo->lo_disk); + kfree(lo); +} + +static int find_free_cb(int id, void *ptr, void *data) +{ + struct loop_device *lo = ptr; + struct loop_device **l = data; + + if (lo->lo_state == Lo_unbound) { + *l = lo; + return 1; + } + return 0; +} + +static int loop_lookup(struct loop_device **l, int i) +{ + struct loop_device *lo; + int ret = -ENODEV; + + if (i < 0) { + int err; + + err = idr_for_each(&loop_index_idr, &find_free_cb, &lo); + if (err == 1) { + *l = lo; + ret = lo->lo_number; + } + goto out; + } + + /* lookup and return a specific i */ + lo = idr_find(&loop_index_idr, i); + if (lo) { + *l = lo; + ret = lo->lo_number; + } +out: + return ret; +} + +static struct kobject *loop_probe(dev_t dev, int *part, void *data) +{ + struct loop_device *lo; + struct kobject *kobj; + int err; + + mutex_lock(&loop_index_mutex); + err = loop_lookup(&lo, MINOR(dev) >> part_shift); + if (err < 0) + err = loop_add(&lo, MINOR(dev) >> part_shift); + if (err < 0) + kobj = NULL; + else + kobj = get_disk(lo->lo_disk); + mutex_unlock(&loop_index_mutex); + + *part = 0; + return kobj; +} + +static long loop_control_ioctl(struct file *file, unsigned int cmd, + unsigned long parm) +{ + struct loop_device *lo; + int ret = -ENOSYS; + + mutex_lock(&loop_index_mutex); + switch (cmd) { + case LOOP_CTL_ADD: + ret = loop_lookup(&lo, parm); + if (ret >= 0) { + ret = -EEXIST; + break; + } + ret = loop_add(&lo, parm); + break; + case LOOP_CTL_REMOVE: + ret = loop_lookup(&lo, parm); + if (ret < 0) + break; + mutex_lock(&lo->lo_ctl_mutex); + if (lo->lo_state != Lo_unbound) { + ret = -EBUSY; + mutex_unlock(&lo->lo_ctl_mutex); + break; + } + if (lo->lo_refcnt > 0) { + ret = -EBUSY; + mutex_unlock(&lo->lo_ctl_mutex); + break; + } + lo->lo_disk->private_data = NULL; + mutex_unlock(&lo->lo_ctl_mutex); + idr_remove(&loop_index_idr, lo->lo_number); + loop_remove(lo); + break; + case LOOP_CTL_GET_FREE: + ret = loop_lookup(&lo, -1); + if (ret >= 0) + break; + ret = loop_add(&lo, -1); + } + mutex_unlock(&loop_index_mutex); + + return ret; +} + +static const struct file_operations loop_ctl_fops = { + .open = nonseekable_open, + .unlocked_ioctl = loop_control_ioctl, + .compat_ioctl = loop_control_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice loop_misc = { + .minor = LOOP_CTRL_MINOR, + .name = "loop-control", + .fops = &loop_ctl_fops, +}; + +MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR); +MODULE_ALIAS("devname:loop-control"); + +static int __init loop_init(void) +{ + int i, nr; + unsigned long range; + struct loop_device *lo; + int err; + + err = misc_register(&loop_misc); + if (err < 0) + return err; + + part_shift = 0; + if (max_part > 0) { + part_shift = fls(max_part); + + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can decide correct minor number + * if [s]he want to create more devices. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. + */ + max_part = (1UL << part_shift) - 1; + } + + if ((1UL << part_shift) > DISK_MAX_PARTS) { + err = -EINVAL; + goto misc_out; + } + + if (max_loop > 1UL << (MINORBITS - part_shift)) { + err = -EINVAL; + goto misc_out; + } + + /* + * If max_loop is specified, create that many devices upfront. + * This also becomes a hard limit. If max_loop is not specified, + * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module + * init time. Loop devices can be requested on-demand with the + * /dev/loop-control interface, or be instantiated by accessing + * a 'dead' device node. + */ + if (max_loop) { + nr = max_loop; + range = max_loop << part_shift; + } else { + nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT; + range = 1UL << MINORBITS; + } + + if (register_blkdev(LOOP_MAJOR, "loop")) { + err = -EIO; + goto misc_out; + } + + loop_wq = alloc_workqueue("kloopd", + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 0); + if (!loop_wq) { + err = -ENOMEM; + goto misc_out; + } + + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + + /* pre-create number of devices given by config or max_loop */ + mutex_lock(&loop_index_mutex); + for (i = 0; i < nr; i++) + loop_add(&lo, i); + mutex_unlock(&loop_index_mutex); + + printk(KERN_INFO "loop: module loaded\n"); + return 0; + +misc_out: + misc_deregister(&loop_misc); + return err; +} + +static int loop_exit_cb(int id, void *ptr, void *data) +{ + struct loop_device *lo = ptr; + + loop_remove(lo); + return 0; +} + +static void __exit loop_exit(void) +{ + unsigned long range; + + range = max_loop ? max_loop << part_shift : 1UL << MINORBITS; + + idr_for_each(&loop_index_idr, &loop_exit_cb, NULL); + idr_destroy(&loop_index_idr); + + blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); + unregister_blkdev(LOOP_MAJOR, "loop"); + + destroy_workqueue(loop_wq); + + misc_deregister(&loop_misc); +} + +module_init(loop_init); +module_exit(loop_exit); + +#ifndef MODULE +static int __init max_loop_setup(char *str) +{ + max_loop = simple_strtol(str, NULL, 0); + return 1; +} + +__setup("max_loop=", max_loop_setup); +#endif diff --git a/kernel/drivers/block/loop.h b/kernel/drivers/block/loop.h new file mode 100644 index 000000000..301c27f83 --- /dev/null +++ b/kernel/drivers/block/loop.h @@ -0,0 +1,91 @@ +/* + * loop.h + * + * Written by Theodore Ts'o, 3/29/93. + * + * Copyright 1993 by Theodore Ts'o. Redistribution of this file is + * permitted under the GNU General Public License. + */ +#ifndef _LINUX_LOOP_H +#define _LINUX_LOOP_H + +#include +#include +#include +#include +#include +#include +#include + +/* Possible states of device */ +enum { + Lo_unbound, + Lo_bound, + Lo_rundown, +}; + +struct loop_func_table; + +struct loop_device { + int lo_number; + int lo_refcnt; + loff_t lo_offset; + loff_t lo_sizelimit; + int lo_flags; + int (*transfer)(struct loop_device *, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + char lo_file_name[LO_NAME_SIZE]; + char lo_crypt_name[LO_NAME_SIZE]; + char lo_encrypt_key[LO_KEY_SIZE]; + int lo_encrypt_key_size; + struct loop_func_table *lo_encryption; + __u32 lo_init[2]; + kuid_t lo_key_owner; /* Who set the key */ + int (*ioctl)(struct loop_device *, int cmd, + unsigned long arg); + + struct file * lo_backing_file; + struct block_device *lo_device; + unsigned lo_blocksize; + void *key_data; + + gfp_t old_gfp_mask; + + spinlock_t lo_lock; + struct list_head write_cmd_head; + struct work_struct write_work; + bool write_started; + int lo_state; + struct mutex lo_ctl_mutex; + + struct request_queue *lo_queue; + struct blk_mq_tag_set tag_set; + struct gendisk *lo_disk; +}; + +struct loop_cmd { + struct work_struct read_work; + struct request *rq; + struct list_head list; +}; + +/* Support for loadable transfer modules */ +struct loop_func_table { + int number; /* filter type */ + int (*transfer)(struct loop_device *lo, int cmd, + struct page *raw_page, unsigned raw_off, + struct page *loop_page, unsigned loop_off, + int size, sector_t real_block); + int (*init)(struct loop_device *, const struct loop_info64 *); + /* release is called from loop_unregister_transfer or clr_fd */ + int (*release)(struct loop_device *); + int (*ioctl)(struct loop_device *, int cmd, unsigned long arg); + struct module *owner; +}; + +int loop_register_transfer(struct loop_func_table *funcs); +int loop_unregister_transfer(int number); + +#endif diff --git a/kernel/drivers/block/mg_disk.c b/kernel/drivers/block/mg_disk.c new file mode 100644 index 000000000..145ce2aa2 --- /dev/null +++ b/kernel/drivers/block/mg_disk.c @@ -0,0 +1,1112 @@ +/* + * drivers/block/mg_disk.c + * + * Support for the mGine m[g]flash IO mode. + * Based on legacy hd.c + * + * (c) 2008 mGine Co.,LTD + * (c) 2008 unsik Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) + +/* name for block device */ +#define MG_DISK_NAME "mgd" + +#define MG_DISK_MAJ 0 +#define MG_DISK_MAX_PART 16 +#define MG_SECTOR_SIZE 512 +#define MG_MAX_SECTS 256 + +/* Register offsets */ +#define MG_BUFF_OFFSET 0x8000 +#define MG_REG_OFFSET 0xC000 +#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ +#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ +#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) +#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) +#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) +#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) +#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) +#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ +#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ +#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) +#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) + +/* handy status */ +#define MG_STAT_READY (ATA_DRDY | ATA_DSC) +#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \ + ATA_ERR))) == MG_STAT_READY) + +/* error code for others */ +#define MG_ERR_NONE 0 +#define MG_ERR_TIMEOUT 0x100 +#define MG_ERR_INIT_STAT 0x101 +#define MG_ERR_TRANSLATION 0x102 +#define MG_ERR_CTRL_RST 0x103 +#define MG_ERR_INV_STAT 0x104 +#define MG_ERR_RSTOUT 0x105 + +#define MG_MAX_ERRORS 6 /* Max read/write errors */ + +/* command */ +#define MG_CMD_RD 0x20 +#define MG_CMD_WR 0x30 +#define MG_CMD_SLEEP 0x99 +#define MG_CMD_WAKEUP 0xC3 +#define MG_CMD_ID 0xEC +#define MG_CMD_WR_CONF 0x3C +#define MG_CMD_RD_CONF 0x40 + +/* operation mode */ +#define MG_OP_CASCADE (1 << 0) +#define MG_OP_CASCADE_SYNC_RD (1 << 1) +#define MG_OP_CASCADE_SYNC_WR (1 << 2) +#define MG_OP_INTERLEAVE (1 << 3) + +/* synchronous */ +#define MG_BURST_LAT_4 (3 << 4) +#define MG_BURST_LAT_5 (4 << 4) +#define MG_BURST_LAT_6 (5 << 4) +#define MG_BURST_LAT_7 (6 << 4) +#define MG_BURST_LAT_8 (7 << 4) +#define MG_BURST_LEN_4 (1 << 1) +#define MG_BURST_LEN_8 (2 << 1) +#define MG_BURST_LEN_16 (3 << 1) +#define MG_BURST_LEN_32 (4 << 1) +#define MG_BURST_LEN_CONT (0 << 1) + +/* timeout value (unit: ms) */ +#define MG_TMAX_CONF_TO_CMD 1 +#define MG_TMAX_WAIT_RD_DRQ 10 +#define MG_TMAX_WAIT_WR_DRQ 500 +#define MG_TMAX_RST_TO_BUSY 10 +#define MG_TMAX_HDRST_TO_RDY 500 +#define MG_TMAX_SWRST_TO_RDY 500 +#define MG_TMAX_RSTOUT 3000 + +#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) + +/* main structure for mflash driver */ +struct mg_host { + struct device *dev; + + struct request_queue *breq; + struct request *req; + spinlock_t lock; + struct gendisk *gd; + + struct timer_list timer; + void (*mg_do_intr) (struct mg_host *); + + u16 id[ATA_ID_WORDS]; + + u16 cyls; + u16 heads; + u16 sectors; + u32 n_sectors; + u32 nres_sectors; + + void __iomem *dev_base; + unsigned int irq; + unsigned int rst; + unsigned int rstout; + + u32 major; + u32 error; +}; + +/* + * Debugging macro and defines + */ +#undef DO_MG_DEBUG +#ifdef DO_MG_DEBUG +# define MG_DBG(fmt, args...) \ + printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) +#else /* CONFIG_MG_DEBUG */ +# define MG_DBG(fmt, args...) do { } while (0) +#endif /* CONFIG_MG_DEBUG */ + +static void mg_request(struct request_queue *); + +static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes) +{ + if (__blk_end_request(host->req, err, nr_bytes)) + return true; + + host->req = NULL; + return false; +} + +static bool mg_end_request_cur(struct mg_host *host, int err) +{ + return mg_end_request(host, err, blk_rq_cur_bytes(host->req)); +} + +static void mg_dump_status(const char *msg, unsigned int stat, + struct mg_host *host) +{ + char *name = MG_DISK_NAME; + + if (host->req) + name = host->req->rq_disk->disk_name; + + printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff); + if (stat & ATA_BUSY) + printk("Busy "); + if (stat & ATA_DRDY) + printk("DriveReady "); + if (stat & ATA_DF) + printk("WriteFault "); + if (stat & ATA_DSC) + printk("SeekComplete "); + if (stat & ATA_DRQ) + printk("DataRequest "); + if (stat & ATA_CORR) + printk("CorrectedError "); + if (stat & ATA_ERR) + printk("Error "); + printk("}\n"); + if ((stat & ATA_ERR) == 0) { + host->error = 0; + } else { + host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR); + printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg, + host->error & 0xff); + if (host->error & ATA_BBK) + printk("BadSector "); + if (host->error & ATA_UNC) + printk("UncorrectableError "); + if (host->error & ATA_IDNF) + printk("SectorIdNotFound "); + if (host->error & ATA_ABORTED) + printk("DriveStatusError "); + if (host->error & ATA_AMNF) + printk("AddrMarkNotFound "); + printk("}"); + if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) { + if (host->req) + printk(", sector=%u", + (unsigned int)blk_rq_pos(host->req)); + } + printk("\n"); + } +} + +static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) +{ + u8 status; + unsigned long expire, cur_jiffies; + struct mg_drv_data *prv_data = host->dev->platform_data; + + host->error = MG_ERR_NONE; + expire = jiffies + msecs_to_jiffies(msec); + + /* These 2 times dummy status read prevents reading invalid + * status. A very little time (3 times of mflash operating clk) + * is required for busy bit is set. Use dummy read instead of + * busy wait, because mflash's PLL is machine dependent. + */ + if (prv_data->use_polling) { + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + } + + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + + do { + cur_jiffies = jiffies; + if (status & ATA_BUSY) { + if (expect == ATA_BUSY) + break; + } else { + /* Check the error condition! */ + if (status & ATA_ERR) { + mg_dump_status("mg_wait", status, host); + break; + } + + if (expect == MG_STAT_READY) + if (MG_READY_OK(status)) + break; + + if (expect == ATA_DRQ) + if (status & ATA_DRQ) + break; + } + if (!msec) { + mg_dump_status("not ready", status, host); + return MG_ERR_INV_STAT; + } + + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + } while (time_before(cur_jiffies, expire)); + + if (time_after_eq(cur_jiffies, expire) && msec) + host->error = MG_ERR_TIMEOUT; + + return host->error; +} + +static unsigned int mg_wait_rstout(u32 rstout, u32 msec) +{ + unsigned long expire; + + expire = jiffies + msecs_to_jiffies(msec); + while (time_before(jiffies, expire)) { + if (gpio_get_value(rstout) == 1) + return MG_ERR_NONE; + msleep(10); + } + + return MG_ERR_RSTOUT; +} + +static void mg_unexpected_intr(struct mg_host *host) +{ + u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + + mg_dump_status("mg_unexpected_intr", status, host); +} + +static irqreturn_t mg_irq(int irq, void *dev_id) +{ + struct mg_host *host = dev_id; + void (*handler)(struct mg_host *) = host->mg_do_intr; + + spin_lock(&host->lock); + + host->mg_do_intr = NULL; + del_timer(&host->timer); + if (!handler) + handler = mg_unexpected_intr; + handler(host); + + spin_unlock(&host->lock); + + return IRQ_HANDLED; +} + +/* local copy of ata_id_string() */ +static void mg_id_string(const u16 *id, unsigned char *s, + unsigned int ofs, unsigned int len) +{ + unsigned int c; + + BUG_ON(len & 1); + + while (len > 0) { + c = id[ofs] >> 8; + *s = c; + s++; + + c = id[ofs] & 0xff; + *s = c; + s++; + + ofs++; + len -= 2; + } +} + +/* local copy of ata_id_c_string() */ +static void mg_id_c_string(const u16 *id, unsigned char *s, + unsigned int ofs, unsigned int len) +{ + unsigned char *p; + + mg_id_string(id, s, ofs, len - 1); + + p = s + strnlen(s, len - 1); + while (p > s && p[-1] == ' ') + p--; + *p = '\0'; +} + +static int mg_get_disk_id(struct mg_host *host) +{ + u32 i; + s32 err; + const u16 *id = host->id; + struct mg_drv_data *prv_data = host->dev->platform_data; + char fwrev[ATA_ID_FW_REV_LEN + 1]; + char model[ATA_ID_PROD_LEN + 1]; + char serial[ATA_ID_SERNO_LEN + 1]; + + if (!prv_data->use_polling) + outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + + outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND); + err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ); + if (err) + return err; + + for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++) + host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base + + MG_BUFF_OFFSET + i * 2)); + + outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD); + if (err) + return err; + + if ((id[ATA_ID_FIELD_VALID] & 1) == 0) + return MG_ERR_TRANSLATION; + + host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY); + host->cyls = id[ATA_ID_CYLS]; + host->heads = id[ATA_ID_HEADS]; + host->sectors = id[ATA_ID_SECTORS]; + + if (MG_RES_SEC && host->heads && host->sectors) { + /* modify cyls, n_sectors */ + host->cyls = (host->n_sectors - MG_RES_SEC) / + host->heads / host->sectors; + host->nres_sectors = host->n_sectors - host->cyls * + host->heads * host->sectors; + host->n_sectors -= host->nres_sectors; + } + + mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev)); + mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model)); + mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial)); + printk(KERN_INFO "mg_disk: model: %s\n", model); + printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev); + printk(KERN_INFO "mg_disk: serial: %s\n", serial); + printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n", + host->n_sectors, host->nres_sectors); + + if (!prv_data->use_polling) + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + + return err; +} + + +static int mg_disk_init(struct mg_host *host) +{ + struct mg_drv_data *prv_data = host->dev->platform_data; + s32 err; + u8 init_status; + + /* hdd rst low */ + gpio_set_value(host->rst, 0); + err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); + if (err) + return err; + + /* hdd rst high */ + gpio_set_value(host->rst, 1); + err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY); + if (err) + return err; + + /* soft reset on */ + outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0), + (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); + if (err) + return err; + + /* soft reset off */ + outb(prv_data->use_polling ? ATA_NIEN : 0, + (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY); + if (err) + return err; + + init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf; + + if (init_status == 0xf) + return MG_ERR_INIT_STAT; + + return err; +} + +static void mg_bad_rw_intr(struct mg_host *host) +{ + if (host->req) + if (++host->req->errors >= MG_MAX_ERRORS || + host->error == MG_ERR_TIMEOUT) + mg_end_request_cur(host, -EIO); +} + +static unsigned int mg_out(struct mg_host *host, + unsigned int sect_num, + unsigned int sect_cnt, + unsigned int cmd, + void (*intr_addr)(struct mg_host *)) +{ + struct mg_drv_data *prv_data = host->dev->platform_data; + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return host->error; + + if (!prv_data->use_polling) { + host->mg_do_intr = intr_addr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } + if (MG_RES_SEC) + sect_num += MG_RES_SEC; + outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT); + outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM); + outb((u8)(sect_num >> 8), (unsigned long)host->dev_base + + MG_REG_CYL_LOW); + outb((u8)(sect_num >> 16), (unsigned long)host->dev_base + + MG_REG_CYL_HIGH); + outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS), + (unsigned long)host->dev_base + MG_REG_DRV_HEAD); + outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND); + return MG_ERR_NONE; +} + +static void mg_read_one(struct mg_host *host, struct request *req) +{ + u16 *buff = (u16 *)bio_data(req->bio); + u32 i; + + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); +} + +static void mg_read(struct request *req) +{ + struct mg_host *host = req->rq_disk->private_data; + + if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), + MG_CMD_RD, NULL) != MG_ERR_NONE) + mg_bad_rw_intr(host); + + MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", + blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio)); + + do { + if (mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + + mg_read_one(host, req); + + outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); +} + +static void mg_write_one(struct mg_host *host, struct request *req) +{ + u16 *buff = (u16 *)bio_data(req->bio); + u32 i; + + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); +} + +static void mg_write(struct request *req) +{ + struct mg_host *host = req->rq_disk->private_data; + unsigned int rem = blk_rq_sectors(req); + + if (mg_out(host, blk_rq_pos(req), rem, + MG_CMD_WR, NULL) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + + MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", + rem, blk_rq_pos(req), bio_data(req->bio)); + + if (mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + + do { + mg_write_one(host, req); + + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + + rem--; + if (rem > 1 && mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } else if (mg_wait(host, MG_STAT_READY, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); +} + +static void mg_read_intr(struct mg_host *host) +{ + struct request *req = host->req; + u32 i; + + /* check status */ + do { + i = inb((unsigned long)host->dev_base + MG_REG_STATUS); + if (i & ATA_BUSY) + break; + if (!MG_READY_OK(i)) + break; + if (i & ATA_DRQ) + goto ok_to_read; + } while (0); + mg_dump_status("mg_read_intr", i, host); + mg_bad_rw_intr(host); + mg_request(host->breq); + return; + +ok_to_read: + mg_read_one(host, req); + + MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", + blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio)); + + /* send read confirm */ + outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + + if (mg_end_request(host, 0, MG_SECTOR_SIZE)) { + /* set handler if read remains */ + host->mg_do_intr = mg_read_intr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } else /* goto next request */ + mg_request(host->breq); +} + +static void mg_write_intr(struct mg_host *host) +{ + struct request *req = host->req; + u32 i; + bool rem; + + /* check status */ + do { + i = inb((unsigned long)host->dev_base + MG_REG_STATUS); + if (i & ATA_BUSY) + break; + if (!MG_READY_OK(i)) + break; + if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ)) + goto ok_to_write; + } while (0); + mg_dump_status("mg_write_intr", i, host); + mg_bad_rw_intr(host); + mg_request(host->breq); + return; + +ok_to_write: + if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { + /* write 1 sector and set handler if remains */ + mg_write_one(host, req); + MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", + blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio)); + host->mg_do_intr = mg_write_intr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } + + /* send write confirm */ + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + + if (!rem) + mg_request(host->breq); +} + +static void mg_times_out(unsigned long data) +{ + struct mg_host *host = (struct mg_host *)data; + char *name; + + spin_lock_irq(&host->lock); + + if (!host->req) + goto out_unlock; + + host->mg_do_intr = NULL; + + name = host->req->rq_disk->disk_name; + printk(KERN_DEBUG "%s: timeout\n", name); + + host->error = MG_ERR_TIMEOUT; + mg_bad_rw_intr(host); + +out_unlock: + mg_request(host->breq); + spin_unlock_irq(&host->lock); +} + +static void mg_request_poll(struct request_queue *q) +{ + struct mg_host *host = q->queuedata; + + while (1) { + if (!host->req) { + host->req = blk_fetch_request(q); + if (!host->req) + break; + } + + if (unlikely(host->req->cmd_type != REQ_TYPE_FS)) { + mg_end_request_cur(host, -EIO); + continue; + } + + if (rq_data_dir(host->req) == READ) + mg_read(host->req); + else + mg_write(host->req); + } +} + +static unsigned int mg_issue_req(struct request *req, + struct mg_host *host, + unsigned int sect_num, + unsigned int sect_cnt) +{ + switch (rq_data_dir(req)) { + case READ: + if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) + != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return host->error; + } + break; + case WRITE: + /* TODO : handler */ + outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) + != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return host->error; + } + del_timer(&host->timer); + mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ); + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + if (host->error) { + mg_bad_rw_intr(host); + return host->error; + } + mg_write_one(host, req); + mod_timer(&host->timer, jiffies + 3 * HZ); + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + break; + } + return MG_ERR_NONE; +} + +/* This function also called from IRQ context */ +static void mg_request(struct request_queue *q) +{ + struct mg_host *host = q->queuedata; + struct request *req; + u32 sect_num, sect_cnt; + + while (1) { + if (!host->req) { + host->req = blk_fetch_request(q); + if (!host->req) + break; + } + req = host->req; + + /* check unwanted request call */ + if (host->mg_do_intr) + return; + + del_timer(&host->timer); + + sect_num = blk_rq_pos(req); + /* deal whole segments */ + sect_cnt = blk_rq_sectors(req); + + /* sanity check */ + if (sect_num >= get_capacity(req->rq_disk) || + ((sect_num + sect_cnt) > + get_capacity(req->rq_disk))) { + printk(KERN_WARNING + "%s: bad access: sector=%d, count=%d\n", + req->rq_disk->disk_name, + sect_num, sect_cnt); + mg_end_request_cur(host, -EIO); + continue; + } + + if (unlikely(req->cmd_type != REQ_TYPE_FS)) { + mg_end_request_cur(host, -EIO); + continue; + } + + if (!mg_issue_req(req, host, sect_num, sect_cnt)) + return; + } +} + +static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct mg_host *host = bdev->bd_disk->private_data; + + geo->cylinders = (unsigned short)host->cyls; + geo->heads = (unsigned char)host->heads; + geo->sectors = (unsigned char)host->sectors; + return 0; +} + +static const struct block_device_operations mg_disk_ops = { + .getgeo = mg_getgeo +}; + +#ifdef CONFIG_PM_SLEEP +static int mg_suspend(struct device *dev) +{ + struct mg_drv_data *prv_data = dev->platform_data; + struct mg_host *host = prv_data->host; + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return -EIO; + + if (!prv_data->use_polling) + outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + + outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND); + /* wait until mflash deep sleep */ + msleep(1); + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) { + if (!prv_data->use_polling) + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + return -EIO; + } + + return 0; +} + +static int mg_resume(struct device *dev) +{ + struct mg_drv_data *prv_data = dev->platform_data; + struct mg_host *host = prv_data->host; + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return -EIO; + + outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND); + /* wait until mflash wakeup */ + msleep(1); + + if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) + return -EIO; + + if (!prv_data->use_polling) + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); + + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); + +static int mg_probe(struct platform_device *plat_dev) +{ + struct mg_host *host; + struct resource *rsc; + struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + int err = 0; + + if (!prv_data) { + printk(KERN_ERR "%s:%d fail (no driver_data)\n", + __func__, __LINE__); + err = -EINVAL; + goto probe_err; + } + + /* alloc mg_host */ + host = kzalloc(sizeof(struct mg_host), GFP_KERNEL); + if (!host) { + printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n", + __func__, __LINE__); + err = -ENOMEM; + goto probe_err; + } + host->major = MG_DISK_MAJ; + + /* link each other */ + prv_data->host = host; + host->dev = &plat_dev->dev; + + /* io remap */ + rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0); + if (!rsc) { + printk(KERN_ERR "%s:%d platform_get_resource fail\n", + __func__, __LINE__); + err = -EINVAL; + goto probe_err_2; + } + host->dev_base = ioremap(rsc->start, resource_size(rsc)); + if (!host->dev_base) { + printk(KERN_ERR "%s:%d ioremap fail\n", + __func__, __LINE__); + err = -EIO; + goto probe_err_2; + } + MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base); + + /* get reset pin */ + rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, + MG_RST_PIN); + if (!rsc) { + printk(KERN_ERR "%s:%d get reset pin fail\n", + __func__, __LINE__); + err = -EIO; + goto probe_err_3; + } + host->rst = rsc->start; + + /* init rst pin */ + err = gpio_request(host->rst, MG_RST_PIN); + if (err) + goto probe_err_3; + gpio_direction_output(host->rst, 1); + + /* reset out pin */ + if (!(prv_data->dev_attr & MG_DEV_MASK)) { + err = -EINVAL; + goto probe_err_3a; + } + + if (prv_data->dev_attr != MG_BOOT_DEV) { + rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO, + MG_RSTOUT_PIN); + if (!rsc) { + printk(KERN_ERR "%s:%d get reset-out pin fail\n", + __func__, __LINE__); + err = -EIO; + goto probe_err_3a; + } + host->rstout = rsc->start; + err = gpio_request(host->rstout, MG_RSTOUT_PIN); + if (err) + goto probe_err_3a; + gpio_direction_input(host->rstout); + } + + /* disk reset */ + if (prv_data->dev_attr == MG_STORAGE_DEV) { + /* If POR seq. not yet finished, wait */ + err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT); + if (err) + goto probe_err_3b; + err = mg_disk_init(host); + if (err) { + printk(KERN_ERR "%s:%d fail (err code : %d)\n", + __func__, __LINE__, err); + err = -EIO; + goto probe_err_3b; + } + } + + /* get irq resource */ + if (!prv_data->use_polling) { + host->irq = platform_get_irq(plat_dev, 0); + if (host->irq == -ENXIO) { + err = host->irq; + goto probe_err_3b; + } + err = request_irq(host->irq, mg_irq, + IRQF_TRIGGER_RISING, + MG_DEV_NAME, host); + if (err) { + printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n", + __func__, __LINE__, err); + goto probe_err_3b; + } + + } + + /* get disk id */ + err = mg_get_disk_id(host); + if (err) { + printk(KERN_ERR "%s:%d fail (err code : %d)\n", + __func__, __LINE__, err); + err = -EIO; + goto probe_err_4; + } + + err = register_blkdev(host->major, MG_DISK_NAME); + if (err < 0) { + printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n", + __func__, __LINE__, err); + goto probe_err_4; + } + if (!host->major) + host->major = err; + + spin_lock_init(&host->lock); + + if (prv_data->use_polling) + host->breq = blk_init_queue(mg_request_poll, &host->lock); + else + host->breq = blk_init_queue(mg_request, &host->lock); + + if (!host->breq) { + err = -ENOMEM; + printk(KERN_ERR "%s:%d (blk_init_queue) fail\n", + __func__, __LINE__); + goto probe_err_5; + } + host->breq->queuedata = host; + + /* mflash is random device, thanx for the noop */ + err = elevator_change(host->breq, "noop"); + if (err) { + printk(KERN_ERR "%s:%d (elevator_init) fail\n", + __func__, __LINE__); + goto probe_err_6; + } + blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS); + blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); + + init_timer(&host->timer); + host->timer.function = mg_times_out; + host->timer.data = (unsigned long)host; + + host->gd = alloc_disk(MG_DISK_MAX_PART); + if (!host->gd) { + printk(KERN_ERR "%s:%d (alloc_disk) fail\n", + __func__, __LINE__); + err = -ENOMEM; + goto probe_err_7; + } + host->gd->major = host->major; + host->gd->first_minor = 0; + host->gd->fops = &mg_disk_ops; + host->gd->queue = host->breq; + host->gd->private_data = host; + sprintf(host->gd->disk_name, MG_DISK_NAME"a"); + + set_capacity(host->gd, host->n_sectors); + + add_disk(host->gd); + + return err; + +probe_err_7: + del_timer_sync(&host->timer); +probe_err_6: + blk_cleanup_queue(host->breq); +probe_err_5: + unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); +probe_err_4: + if (!prv_data->use_polling) + free_irq(host->irq, host); +probe_err_3b: + gpio_free(host->rstout); +probe_err_3a: + gpio_free(host->rst); +probe_err_3: + iounmap(host->dev_base); +probe_err_2: + kfree(host); +probe_err: + return err; +} + +static int mg_remove(struct platform_device *plat_dev) +{ + struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_host *host = prv_data->host; + int err = 0; + + /* delete timer */ + del_timer_sync(&host->timer); + + /* remove disk */ + if (host->gd) { + del_gendisk(host->gd); + put_disk(host->gd); + } + /* remove queue */ + if (host->breq) + blk_cleanup_queue(host->breq); + + /* unregister blk device */ + unregister_blkdev(host->major, MG_DISK_NAME); + + /* free irq */ + if (!prv_data->use_polling) + free_irq(host->irq, host); + + /* free reset-out pin */ + if (prv_data->dev_attr != MG_BOOT_DEV) + gpio_free(host->rstout); + + /* free rst pin */ + if (host->rst) + gpio_free(host->rst); + + /* unmap io */ + if (host->dev_base) + iounmap(host->dev_base); + + /* free mg_host */ + kfree(host); + + return err; +} + +static struct platform_driver mg_disk_driver = { + .probe = mg_probe, + .remove = mg_remove, + .driver = { + .name = MG_DEV_NAME, + .pm = &mg_pm, + } +}; + +/**************************************************************************** + * + * Module stuff + * + ****************************************************************************/ + +static int __init mg_init(void) +{ + printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n"); + return platform_driver_register(&mg_disk_driver); +} + +static void __exit mg_exit(void) +{ + printk(KERN_INFO "mflash driver : bye bye\n"); + platform_driver_unregister(&mg_disk_driver); +} + +module_init(mg_init); +module_exit(mg_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("unsik Kim "); +MODULE_DESCRIPTION("mGine m[g]flash device driver"); diff --git a/kernel/drivers/block/mtip32xx/Kconfig b/kernel/drivers/block/mtip32xx/Kconfig new file mode 100644 index 000000000..0ba837fc6 --- /dev/null +++ b/kernel/drivers/block/mtip32xx/Kconfig @@ -0,0 +1,9 @@ +# +# mtip32xx device driver configuration +# + +config BLK_DEV_PCIESSD_MTIP32XX + tristate "Block Device Driver for Micron PCIe SSDs" + depends on PCI + help + This enables the block driver for Micron PCIe SSDs. diff --git a/kernel/drivers/block/mtip32xx/Makefile b/kernel/drivers/block/mtip32xx/Makefile new file mode 100644 index 000000000..4fbef8c83 --- /dev/null +++ b/kernel/drivers/block/mtip32xx/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for Block device driver for Micron PCIe SSD +# + +obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o diff --git a/kernel/drivers/block/mtip32xx/mtip32xx.c b/kernel/drivers/block/mtip32xx/mtip32xx.c new file mode 100644 index 000000000..3bd7ca985 --- /dev/null +++ b/kernel/drivers/block/mtip32xx/mtip32xx.c @@ -0,0 +1,4745 @@ +/* + * Driver for the Micron P320 SSD + * Copyright (C) 2011 Micron Technology, Inc. + * + * Portions of this code were derived from works subjected to the + * following copyright: + * Copyright (C) 2009 Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include <../drivers/ata/ahci.h> +#include +#include +#include +#include "mtip32xx.h" + +#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) + +/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */ +#define AHCI_RX_FIS_SZ 0x100 +#define AHCI_RX_FIS_OFFSET 0x0 +#define AHCI_IDFY_SZ ATA_SECT_SIZE +#define AHCI_IDFY_OFFSET 0x400 +#define AHCI_SECTBUF_SZ ATA_SECT_SIZE +#define AHCI_SECTBUF_OFFSET 0x800 +#define AHCI_SMARTBUF_SZ ATA_SECT_SIZE +#define AHCI_SMARTBUF_OFFSET 0xC00 +/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */ +#define BLOCK_DMA_ALLOC_SZ 4096 + +/* DMA region containing command table (should be 8192 bytes) */ +#define AHCI_CMD_SLOT_SZ sizeof(struct mtip_cmd_hdr) +#define AHCI_CMD_TBL_SZ (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ) +#define AHCI_CMD_TBL_OFFSET 0x0 + +/* DMA region per command (contains header and SGL) */ +#define AHCI_CMD_TBL_HDR_SZ 0x80 +#define AHCI_CMD_TBL_HDR_OFFSET 0x0 +#define AHCI_CMD_TBL_SGL_SZ (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg)) +#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ +#define CMD_DMA_ALLOC_SZ (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ) + + +#define HOST_CAP_NZDMA (1 << 19) +#define HOST_HSORG 0xFC +#define HSORG_DISABLE_SLOTGRP_INTR (1<<24) +#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16) +#define HSORG_HWREV 0xFF00 +#define HSORG_STYLE 0x8 +#define HSORG_SLOTGROUPS 0x7 + +#define PORT_COMMAND_ISSUE 0x38 +#define PORT_SDBV 0x7C + +#define PORT_OFFSET 0x100 +#define PORT_MEM_SIZE 0x80 + +#define PORT_IRQ_ERR \ + (PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \ + PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \ + PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \ + PORT_IRQ_OVERFLOW) +#define PORT_IRQ_LEGACY \ + (PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS) +#define PORT_IRQ_HANDLED \ + (PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \ + PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \ + PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY) +#define DEF_PORT_IRQ \ + (PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS) + +/* product numbers */ +#define MTIP_PRODUCT_UNKNOWN 0x00 +#define MTIP_PRODUCT_ASICFPGA 0x11 + +/* Device instance number, incremented each time a device is probed. */ +static int instance; + +struct list_head online_list; +struct list_head removing_list; +spinlock_t dev_lock; + +/* + * Global variable used to hold the major block device number + * allocated in mtip_init(). + */ +static int mtip_major; +static struct dentry *dfs_parent; +static struct dentry *dfs_device_status; + +static u32 cpu_use[NR_CPUS]; + +static DEFINE_SPINLOCK(rssd_index_lock); +static DEFINE_IDA(rssd_index_ida); + +static int mtip_block_initialize(struct driver_data *dd); + +#ifdef CONFIG_COMPAT +struct mtip_compat_ide_task_request_s { + __u8 io_ports[8]; + __u8 hob_ports[8]; + ide_reg_valid_t out_flags; + ide_reg_valid_t in_flags; + int data_phase; + int req_cmd; + compat_ulong_t out_size; + compat_ulong_t in_size; +}; +#endif + +/* + * This function check_for_surprise_removal is called + * while card is removed from the system and it will + * read the vendor id from the configration space + * + * @pdev Pointer to the pci_dev structure. + * + * return value + * true if device removed, else false + */ +static bool mtip_check_surprise_removal(struct pci_dev *pdev) +{ + u16 vendor_id = 0; + struct driver_data *dd = pci_get_drvdata(pdev); + + if (dd->sr) + return true; + + /* Read the vendorID from the configuration space */ + pci_read_config_word(pdev, 0x00, &vendor_id); + if (vendor_id == 0xFFFF) { + dd->sr = true; + if (dd->queue) + set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags); + else + dev_warn(&dd->pdev->dev, + "%s: dd->queue is NULL\n", __func__); + if (dd->port) { + set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + } else + dev_warn(&dd->pdev->dev, + "%s: dd->port is NULL\n", __func__); + return true; /* device removed */ + } + + return false; /* device present */ +} + +static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) +{ + struct request *rq; + + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + return blk_mq_rq_to_pdu(rq); +} + +static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd) +{ + blk_put_request(blk_mq_rq_from_pdu(cmd)); +} + +/* + * Once we add support for one hctx per mtip group, this will change a bit + */ +static struct request *mtip_rq_from_tag(struct driver_data *dd, + unsigned int tag) +{ + struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0]; + + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd, + unsigned int tag) +{ + struct request *rq = mtip_rq_from_tag(dd, tag); + + return blk_mq_rq_to_pdu(rq); +} + +/* + * IO completion function. + * + * This completion function is called by the driver ISR when a + * command that was issued by the kernel completes. It first calls the + * asynchronous completion function which normally calls back into the block + * layer passing the asynchronous callback data, then unmaps the + * scatter list associated with the completed command, and finally + * clears the allocated bit associated with the completed command. + * + * @port Pointer to the port data structure. + * @tag Tag of the command. + * @data Pointer to driver_data. + * @status Completion status. + * + * return value + * None + */ +static void mtip_async_complete(struct mtip_port *port, + int tag, struct mtip_cmd *cmd, int status) +{ + struct driver_data *dd = port->dd; + struct request *rq; + + if (unlikely(!dd) || unlikely(!port)) + return; + + if (unlikely(status == PORT_IRQ_TF_ERR)) { + dev_warn(&port->dd->pdev->dev, + "Command tag %d failed due to TFE\n", tag); + } + + /* Unmap the DMA scatter list entries */ + dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction); + + rq = mtip_rq_from_tag(dd, tag); + + if (unlikely(cmd->unaligned)) + up(&port->cmd_slot_unal); + + blk_mq_end_request(rq, status ? -EIO : 0); +} + +/* + * Reset the HBA (without sleeping) + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 The reset was successful. + * -1 The HBA Reset bit did not clear. + */ +static int mtip_hba_reset(struct driver_data *dd) +{ + unsigned long timeout; + + /* Set the reset bit */ + writel(HOST_RESET, dd->mmio + HOST_CTL); + + /* Flush */ + readl(dd->mmio + HOST_CTL); + + /* Spin for up to 2 seconds, waiting for reset acknowledgement */ + timeout = jiffies + msecs_to_jiffies(2000); + do { + mdelay(10); + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) + return -1; + + } while ((readl(dd->mmio + HOST_CTL) & HOST_RESET) + && time_before(jiffies, timeout)); + + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) + return -1; + + return 0; +} + +/* + * Issue a command to the hardware. + * + * Set the appropriate bit in the s_active and Command Issue hardware + * registers, causing hardware command processing to begin. + * + * @port Pointer to the port structure. + * @tag The tag of the command to be issued. + * + * return value + * None + */ +static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) +{ + int group = tag >> 5; + + /* guard SACT and CI registers */ + spin_lock(&port->cmd_issue_lock[group]); + writel((1 << MTIP_TAG_BIT(tag)), + port->s_active[MTIP_TAG_INDEX(tag)]); + writel((1 << MTIP_TAG_BIT(tag)), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); + spin_unlock(&port->cmd_issue_lock[group]); +} + +/* + * Enable/disable the reception of FIS + * + * @port Pointer to the port data structure + * @enable 1 to enable, 0 to disable + * + * return value + * Previous state: 1 enabled, 0 disabled + */ +static int mtip_enable_fis(struct mtip_port *port, int enable) +{ + u32 tmp; + + /* enable FIS reception */ + tmp = readl(port->mmio + PORT_CMD); + if (enable) + writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD); + else + writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD); + + /* Flush */ + readl(port->mmio + PORT_CMD); + + return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX)); +} + +/* + * Enable/disable the DMA engine + * + * @port Pointer to the port data structure + * @enable 1 to enable, 0 to disable + * + * return value + * Previous state: 1 enabled, 0 disabled. + */ +static int mtip_enable_engine(struct mtip_port *port, int enable) +{ + u32 tmp; + + /* enable FIS reception */ + tmp = readl(port->mmio + PORT_CMD); + if (enable) + writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD); + else + writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD); + + readl(port->mmio + PORT_CMD); + return (((tmp & PORT_CMD_START) == PORT_CMD_START)); +} + +/* + * Enables the port DMA engine and FIS reception. + * + * return value + * None + */ +static inline void mtip_start_port(struct mtip_port *port) +{ + /* Enable FIS reception */ + mtip_enable_fis(port, 1); + + /* Enable the DMA engine */ + mtip_enable_engine(port, 1); +} + +/* + * Deinitialize a port by disabling port interrupts, the DMA engine, + * and FIS reception. + * + * @port Pointer to the port structure + * + * return value + * None + */ +static inline void mtip_deinit_port(struct mtip_port *port) +{ + /* Disable interrupts on this port */ + writel(0, port->mmio + PORT_IRQ_MASK); + + /* Disable the DMA engine */ + mtip_enable_engine(port, 0); + + /* Disable FIS reception */ + mtip_enable_fis(port, 0); +} + +/* + * Initialize a port. + * + * This function deinitializes the port by calling mtip_deinit_port() and + * then initializes it by setting the command header and RX FIS addresses, + * clearing the SError register and any pending port interrupts before + * re-enabling the default set of port interrupts. + * + * @port Pointer to the port structure. + * + * return value + * None + */ +static void mtip_init_port(struct mtip_port *port) +{ + int i; + mtip_deinit_port(port); + + /* Program the command list base and FIS base addresses */ + if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) { + writel((port->command_list_dma >> 16) >> 16, + port->mmio + PORT_LST_ADDR_HI); + writel((port->rxfis_dma >> 16) >> 16, + port->mmio + PORT_FIS_ADDR_HI); + } + + writel(port->command_list_dma & 0xFFFFFFFF, + port->mmio + PORT_LST_ADDR); + writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR); + + /* Clear SError */ + writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); + + /* reset the completed registers.*/ + for (i = 0; i < port->dd->slot_groups; i++) + writel(0xFFFFFFFF, port->completed[i]); + + /* Clear any pending interrupts for this port */ + writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT); + + /* Clear any pending interrupts on the HBA. */ + writel(readl(port->dd->mmio + HOST_IRQ_STAT), + port->dd->mmio + HOST_IRQ_STAT); + + /* Enable port interrupts */ + writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK); +} + +/* + * Restart a port + * + * @port Pointer to the port data structure. + * + * return value + * None + */ +static void mtip_restart_port(struct mtip_port *port) +{ + unsigned long timeout; + + /* Disable the DMA engine */ + mtip_enable_engine(port, 0); + + /* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */ + timeout = jiffies + msecs_to_jiffies(500); + while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) + && time_before(jiffies, timeout)) + ; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + + /* + * Chip quirk: escalate to hba reset if + * PxCMD.CR not clear after 500 ms + */ + if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) { + dev_warn(&port->dd->pdev->dev, + "PxCMD.CR not clear, escalating reset\n"); + + if (mtip_hba_reset(port->dd)) + dev_err(&port->dd->pdev->dev, + "HBA reset escalation failed.\n"); + + /* 30 ms delay before com reset to quiesce chip */ + mdelay(30); + } + + dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n"); + + /* Set PxSCTL.DET */ + writel(readl(port->mmio + PORT_SCR_CTL) | + 1, port->mmio + PORT_SCR_CTL); + readl(port->mmio + PORT_SCR_CTL); + + /* Wait 1 ms to quiesce chip function */ + timeout = jiffies + msecs_to_jiffies(1); + while (time_before(jiffies, timeout)) + ; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + + /* Clear PxSCTL.DET */ + writel(readl(port->mmio + PORT_SCR_CTL) & ~1, + port->mmio + PORT_SCR_CTL); + readl(port->mmio + PORT_SCR_CTL); + + /* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */ + timeout = jiffies + msecs_to_jiffies(500); + while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) + && time_before(jiffies, timeout)) + ; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + + if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) + dev_warn(&port->dd->pdev->dev, + "COM reset failed\n"); + + mtip_init_port(port); + mtip_start_port(port); + +} + +static int mtip_device_reset(struct driver_data *dd) +{ + int rv = 0; + + if (mtip_check_surprise_removal(dd->pdev)) + return 0; + + if (mtip_hba_reset(dd) < 0) + rv = -EFAULT; + + mdelay(1); + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Enable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + return rv; +} + +/* + * Helper function for tag logging + */ +static void print_tags(struct driver_data *dd, + char *msg, + unsigned long *tagbits, + int cnt) +{ + unsigned char tagmap[128]; + int group, tagmap_len = 0; + + memset(tagmap, 0, sizeof(tagmap)); + for (group = SLOTBITS_IN_LONGS; group > 0; group--) + tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ", + tagbits[group-1]); + dev_warn(&dd->pdev->dev, + "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); +} + +/* + * Internal command completion callback function. + * + * This function is normally called by the driver ISR when an internal + * command completed. This function signals the command completion by + * calling complete(). + * + * @port Pointer to the port data structure. + * @tag Tag of the command that has completed. + * @data Pointer to a completion structure. + * @status Completion status. + * + * return value + * None + */ +static void mtip_completion(struct mtip_port *port, + int tag, struct mtip_cmd *command, int status) +{ + struct completion *waiting = command->comp_data; + if (unlikely(status == PORT_IRQ_TF_ERR)) + dev_warn(&port->dd->pdev->dev, + "Internal command %d completed with TFE\n", tag); + + complete(waiting); +} + +static void mtip_null_completion(struct mtip_port *port, + int tag, struct mtip_cmd *command, int status) +{ +} + +static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, + dma_addr_t buffer_dma, unsigned int sectors); +static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, + struct smart_attr *attrib); +/* + * Handle an error. + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * None + */ +static void mtip_handle_tfe(struct driver_data *dd) +{ + int group, tag, bit, reissue, rv; + struct mtip_port *port; + struct mtip_cmd *cmd; + u32 completed; + struct host_to_dev_fis *fis; + unsigned long tagaccum[SLOTBITS_IN_LONGS]; + unsigned int cmd_cnt = 0; + unsigned char *buf; + char *fail_reason = NULL; + int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0; + + dev_warn(&dd->pdev->dev, "Taskfile error\n"); + + port = dd->port; + + set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_TAG_INTERNAL, port->allocated)) { + cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); + dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); + + if (cmd->comp_data && cmd->comp_func) { + cmd->comp_func(port, MTIP_TAG_INTERNAL, + cmd, PORT_IRQ_TF_ERR); + } + goto handle_tfe_exit; + } + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + /* Loop through all the groups */ + for (group = 0; group < dd->slot_groups; group++) { + completed = readl(port->completed[group]); + + dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed); + + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); + + /* Process successfully completed commands */ + for (bit = 0; bit < 32 && completed; bit++) { + if (!(completed & (1<comp_func)) { + set_bit(tag, tagaccum); + cmd_cnt++; + cmd->comp_func(port, tag, cmd, 0); + } else { + dev_err(&port->dd->pdev->dev, + "Missing completion func for tag %d", + tag); + if (mtip_check_surprise_removal(dd->pdev)) { + /* don't proceed further */ + return; + } + } + } + } + + print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt); + + /* Restart the port */ + mdelay(20); + mtip_restart_port(port); + + /* Trying to determine the cause of the error */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + fail_all_ncq_write = 1; + fail_reason = "write protect"; + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + fail_all_ncq_cmds = 1; + fail_reason = "thermal shutdown"; + } + if (buf[288] == 0xBF) { + set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed. Secure erase required.\n"); + fail_all_ncq_cmds = 1; + fail_reason = "rebuild failed"; + } + } + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); + + /* Loop through all the groups */ + for (group = 0; group < dd->slot_groups; group++) { + for (bit = 0; bit < 32; bit++) { + reissue = 1; + tag = (group << 5) + bit; + cmd = mtip_cmd_from_tag(dd, tag); + + fis = (struct host_to_dev_fis *)cmd->command; + + /* Should re-issue? */ + if (tag == MTIP_TAG_INTERNAL || + fis->command == ATA_CMD_SET_FEATURES) + reissue = 0; + else { + if (fail_all_ncq_cmds || + (fail_all_ncq_write && + fis->command == ATA_CMD_FPDMA_WRITE)) { + dev_warn(&dd->pdev->dev, + " Fail: %s w/tag %d [%s].\n", + fis->command == ATA_CMD_FPDMA_WRITE ? + "write" : "read", + tag, + fail_reason != NULL ? + fail_reason : "unknown"); + if (cmd->comp_func) { + cmd->comp_func(port, tag, + cmd, -ENODATA); + } + continue; + } + } + + /* + * First check if this command has + * exceeded its retries. + */ + if (reissue && (cmd->retries-- > 0)) { + + set_bit(tag, tagaccum); + + /* Re-issue the command. */ + mtip_issue_ncq_command(port, tag); + + continue; + } + + /* Retire a command that will not be reissued */ + dev_warn(&port->dd->pdev->dev, + "retiring tag %d\n", tag); + + if (cmd->comp_func) + cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR); + else + dev_warn(&port->dd->pdev->dev, + "Bad completion for tag %d\n", + tag); + } + } + print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); + +handle_tfe_exit: + /* clear eh_active */ + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); +} + +/* + * Handle a set device bits interrupt + */ +static inline void mtip_workq_sdbfx(struct mtip_port *port, int group, + u32 completed) +{ + struct driver_data *dd = port->dd; + int tag, bit; + struct mtip_cmd *command; + + if (!completed) { + WARN_ON_ONCE(!completed); + return; + } + /* clear completed status register in the hardware.*/ + writel(completed, port->completed[group]); + + /* Process completed commands. */ + for (bit = 0; (bit < 32) && completed; bit++) { + if (completed & 0x01) { + tag = (group << 5) | bit; + + /* skip internal command slot. */ + if (unlikely(tag == MTIP_TAG_INTERNAL)) + continue; + + command = mtip_cmd_from_tag(dd, tag); + if (likely(command->comp_func)) + command->comp_func(port, tag, command, 0); + else { + dev_dbg(&dd->pdev->dev, + "Null completion for tag %d", + tag); + + if (mtip_check_surprise_removal( + dd->pdev)) { + return; + } + } + } + completed >>= 1; + } + + /* If last, re-enable interrupts */ + if (atomic_dec_return(&dd->irq_workers_active) == 0) + writel(0xffffffff, dd->mmio + HOST_IRQ_STAT); +} + +/* + * Process legacy pio and d2h interrupts + */ +static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) +{ + struct mtip_port *port = dd->port; + struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); + + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && + (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL))) { + if (cmd->comp_func) { + cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0); + return; + } + } + + return; +} + +/* + * Demux and handle errors + */ +static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat) +{ + + if (unlikely(port_stat & PORT_IRQ_CONNECT)) { + dev_warn(&dd->pdev->dev, + "Clearing PxSERR.DIAG.x\n"); + writel((1 << 26), dd->port->mmio + PORT_SCR_ERR); + } + + if (unlikely(port_stat & PORT_IRQ_PHYRDY)) { + dev_warn(&dd->pdev->dev, + "Clearing PxSERR.DIAG.n\n"); + writel((1 << 16), dd->port->mmio + PORT_SCR_ERR); + } + + if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) { + dev_warn(&dd->pdev->dev, + "Port stat errors %x unhandled\n", + (port_stat & ~PORT_IRQ_HANDLED)); + if (mtip_check_surprise_removal(dd->pdev)) + return; + } + if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) { + set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + } +} + +static inline irqreturn_t mtip_handle_irq(struct driver_data *data) +{ + struct driver_data *dd = (struct driver_data *) data; + struct mtip_port *port = dd->port; + u32 hba_stat, port_stat; + int rv = IRQ_NONE; + int do_irq_enable = 1, i, workers; + struct mtip_work *twork; + + hba_stat = readl(dd->mmio + HOST_IRQ_STAT); + if (hba_stat) { + rv = IRQ_HANDLED; + + /* Acknowledge the interrupt status on the port.*/ + port_stat = readl(port->mmio + PORT_IRQ_STAT); + writel(port_stat, port->mmio + PORT_IRQ_STAT); + + /* Demux port status */ + if (likely(port_stat & PORT_IRQ_SDB_FIS)) { + do_irq_enable = 0; + WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0); + + /* Start at 1: group zero is always local? */ + for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS; + i++) { + twork = &dd->work[i]; + twork->completed = readl(port->completed[i]); + if (twork->completed) + workers++; + } + + atomic_set(&dd->irq_workers_active, workers); + if (workers) { + for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) { + twork = &dd->work[i]; + if (twork->completed) + queue_work_on( + twork->cpu_binding, + dd->isr_workq, + &twork->work); + } + + if (likely(dd->work[0].completed)) + mtip_workq_sdbfx(port, 0, + dd->work[0].completed); + + } else { + /* + * Chip quirk: SDB interrupt but nothing + * to complete + */ + do_irq_enable = 1; + } + } + + if (unlikely(port_stat & PORT_IRQ_ERR)) { + if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + /* don't proceed further */ + return IRQ_HANDLED; + } + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag)) + return rv; + + mtip_process_errors(dd, port_stat & PORT_IRQ_ERR); + } + + if (unlikely(port_stat & PORT_IRQ_LEGACY)) + mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY); + } + + /* acknowledge interrupt */ + if (unlikely(do_irq_enable)) + writel(hba_stat, dd->mmio + HOST_IRQ_STAT); + + return rv; +} + +/* + * HBA interrupt subroutine. + * + * @irq IRQ number. + * @instance Pointer to the driver data structure. + * + * return value + * IRQ_HANDLED A HBA interrupt was pending and handled. + * IRQ_NONE This interrupt was not for the HBA. + */ +static irqreturn_t mtip_irq_handler(int irq, void *instance) +{ + struct driver_data *dd = instance; + + return mtip_handle_irq(dd); +} + +static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) +{ + writel(1 << MTIP_TAG_BIT(tag), + port->cmd_issue[MTIP_TAG_INDEX(tag)]); +} + +static bool mtip_pause_ncq(struct mtip_port *port, + struct host_to_dev_fis *fis) +{ + struct host_to_dev_fis *reply; + unsigned long task_file_data; + + reply = port->rxfis + RX_FIS_D2H_REG; + task_file_data = readl(port->mmio+PORT_TFDATA); + + if (fis->command == ATA_CMD_SEC_ERASE_UNIT) + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + + if ((task_file_data & 1)) + return false; + + if (fis->command == ATA_CMD_SEC_ERASE_PREP) { + set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + port->ic_pause_timer = jiffies; + return true; + } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) && + (fis->features == 0x03)) { + set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = jiffies; + return true; + } else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) || + ((fis->command == 0xFC) && + (fis->features == 0x27 || fis->features == 0x72 || + fis->features == 0x62 || fis->features == 0x26))) { + /* Com reset after secure erase or lowlevel format */ + mtip_restart_port(port); + return false; + } + + return false; +} + +/* + * Wait for port to quiesce + * + * @port Pointer to port data structure + * @timeout Max duration to wait (ms) + * + * return value + * 0 Success + * -EBUSY Commands still active + */ +static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) +{ + unsigned long to; + unsigned int n; + unsigned int active = 1; + + blk_mq_stop_hw_queues(port->dd->queue); + + to = jiffies + msecs_to_jiffies(timeout); + do { + if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { + msleep(20); + continue; /* svc thd is actively issuing commands */ + } + + msleep(100); + if (mtip_check_surprise_removal(port->dd->pdev)) + goto err_fault; + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + goto err_fault; + + /* + * Ignore s_active bit 0 of array element 0. + * This bit will always be set + */ + active = readl(port->s_active[0]) & 0xFFFFFFFE; + for (n = 1; n < port->dd->slot_groups; n++) + active |= readl(port->s_active[n]); + + if (!active) + break; + } while (time_before(jiffies, to)); + + blk_mq_start_stopped_hw_queues(port->dd->queue, true); + return active ? -EBUSY : 0; +err_fault: + blk_mq_start_stopped_hw_queues(port->dd->queue, true); + return -EFAULT; +} + +/* + * Execute an internal command and wait for the completion. + * + * @port Pointer to the port data structure. + * @fis Pointer to the FIS that describes the command. + * @fis_len Length in WORDS of the FIS. + * @buffer DMA accessible for command data. + * @buf_len Length, in bytes, of the data buffer. + * @opts Command header options, excluding the FIS length + * and the number of PRD entries. + * @timeout Time in ms to wait for the command to complete. + * + * return value + * 0 Command completed successfully. + * -EFAULT The buffer address is not correctly aligned. + * -EBUSY Internal command or other IO in progress. + * -EAGAIN Time out waiting for command to complete. + */ +static int mtip_exec_internal_command(struct mtip_port *port, + struct host_to_dev_fis *fis, + int fis_len, + dma_addr_t buffer, + int buf_len, + u32 opts, + gfp_t atomic, + unsigned long timeout) +{ + struct mtip_cmd_sg *command_sg; + DECLARE_COMPLETION_ONSTACK(wait); + struct mtip_cmd *int_cmd; + struct driver_data *dd = port->dd; + int rv = 0; + + /* Make sure the buffer is 8 byte aligned. This is asic specific. */ + if (buffer & 0x00000007) { + dev_err(&dd->pdev->dev, "SG buffer is not 8 byte aligned\n"); + return -EFAULT; + } + + int_cmd = mtip_get_int_command(dd); + + set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = 0; + + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + + if (atomic == GFP_KERNEL) { + if (fis->command != ATA_CMD_STANDBYNOW1) { + /* wait for io to complete if non atomic */ + if (mtip_quiesce_io(port, + MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) { + dev_warn(&dd->pdev->dev, + "Failed to quiesce IO\n"); + mtip_put_int_command(dd, int_cmd); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + return -EBUSY; + } + } + + /* Set the completion function and data for the command. */ + int_cmd->comp_data = &wait; + int_cmd->comp_func = mtip_completion; + + } else { + /* Clear completion - we're going to poll */ + int_cmd->comp_data = NULL; + int_cmd->comp_func = mtip_null_completion; + } + + /* Copy the command to the command table */ + memcpy(int_cmd->command, fis, fis_len*4); + + /* Populate the SG list */ + int_cmd->command_header->opts = + __force_bit2int cpu_to_le32(opts | fis_len); + if (buf_len) { + command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ; + + command_sg->info = + __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF); + command_sg->dba = + __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF); + command_sg->dba_upper = + __force_bit2int cpu_to_le32((buffer >> 16) >> 16); + + int_cmd->command_header->opts |= + __force_bit2int cpu_to_le32((1 << 16)); + } + + /* Populate the command header */ + int_cmd->command_header->byte_count = 0; + + /* Issue the command to the hardware */ + mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL); + + if (atomic == GFP_KERNEL) { + /* Wait for the command to complete or timeout. */ + if ((rv = wait_for_completion_interruptible_timeout( + &wait, + msecs_to_jiffies(timeout))) <= 0) { + if (rv == -ERESTARTSYS) { /* interrupted */ + dev_err(&dd->pdev->dev, + "Internal command [%02X] was interrupted after %lu ms\n", + fis->command, timeout); + rv = -EINTR; + goto exec_ic_exit; + } else if (rv == 0) /* timeout */ + dev_err(&dd->pdev->dev, + "Internal command did not complete [%02X] within timeout of %lu ms\n", + fis->command, timeout); + else + dev_err(&dd->pdev->dev, + "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n", + fis->command, rv, timeout); + + if (mtip_check_surprise_removal(dd->pdev) || + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag)) { + dev_err(&dd->pdev->dev, + "Internal command [%02X] wait returned due to SR\n", + fis->command); + rv = -ENXIO; + goto exec_ic_exit; + } + mtip_device_reset(dd); /* recover from timeout issue */ + rv = -EAGAIN; + goto exec_ic_exit; + } + } else { + u32 hba_stat, port_stat; + + /* Spin for checking if command still outstanding */ + timeout = jiffies + msecs_to_jiffies(timeout); + while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) + && time_before(jiffies, timeout)) { + if (mtip_check_surprise_removal(dd->pdev)) { + rv = -ENXIO; + goto exec_ic_exit; + } + if ((fis->command != ATA_CMD_STANDBYNOW1) && + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag)) { + rv = -ENXIO; + goto exec_ic_exit; + } + port_stat = readl(port->mmio + PORT_IRQ_STAT); + if (!port_stat) + continue; + + if (port_stat & PORT_IRQ_ERR) { + dev_err(&dd->pdev->dev, + "Internal command [%02X] failed\n", + fis->command); + mtip_device_reset(dd); + rv = -EIO; + goto exec_ic_exit; + } else { + writel(port_stat, port->mmio + PORT_IRQ_STAT); + hba_stat = readl(dd->mmio + HOST_IRQ_STAT); + if (hba_stat) + writel(hba_stat, + dd->mmio + HOST_IRQ_STAT); + } + break; + } + } + + if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) { + rv = -ENXIO; + if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { + mtip_device_reset(dd); + rv = -EAGAIN; + } + } +exec_ic_exit: + /* Clear the allocated and active bits for the internal command. */ + mtip_put_int_command(dd, int_cmd); + if (rv >= 0 && mtip_pause_ncq(port, fis)) { + /* NCQ paused */ + return rv; + } + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + + return rv; +} + +/* + * Byte-swap ATA ID strings. + * + * ATA identify data contains strings in byte-swapped 16-bit words. + * They must be swapped (on all architectures) to be usable as C strings. + * This function swaps bytes in-place. + * + * @buf The buffer location of the string + * @len The number of bytes to swap + * + * return value + * None + */ +static inline void ata_swap_string(u16 *buf, unsigned int len) +{ + int i; + for (i = 0; i < (len/2); i++) + be16_to_cpus(&buf[i]); +} + +static void mtip_set_timeout(struct driver_data *dd, + struct host_to_dev_fis *fis, + unsigned int *timeout, u8 erasemode) +{ + switch (fis->command) { + case ATA_CMD_DOWNLOAD_MICRO: + *timeout = 120000; /* 2 minutes */ + break; + case ATA_CMD_SEC_ERASE_UNIT: + case 0xFC: + if (erasemode) + *timeout = ((*(dd->port->identify + 90) * 2) * 60000); + else + *timeout = ((*(dd->port->identify + 89) * 2) * 60000); + break; + case ATA_CMD_STANDBYNOW1: + *timeout = 120000; /* 2 minutes */ + break; + case 0xF7: + case 0xFA: + *timeout = 60000; /* 60 seconds */ + break; + case ATA_CMD_SMART: + *timeout = 15000; /* 15 seconds */ + break; + default: + *timeout = MTIP_IOCTL_CMD_TIMEOUT_MS; + break; + } +} + +/* + * Request the device identity information. + * + * If a user space buffer is not specified, i.e. is NULL, the + * identify information is still read from the drive and placed + * into the identify data buffer (@e port->identify) in the + * port data structure. + * When the identify buffer contains valid identify information @e + * port->identify_valid is non-zero. + * + * @port Pointer to the port structure. + * @user_buffer A user space buffer where the identify data should be + * copied. + * + * return value + * 0 Command completed successfully. + * -EFAULT An error occurred while coping data to the user buffer. + * -1 Command failed. + */ +static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) +{ + int rv = 0; + struct host_to_dev_fis fis; + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return -EFAULT; + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_ID_ATA; + + /* Set the identify information as invalid. */ + port->identify_valid = 0; + + /* Clear the identify information. */ + memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + port->identify_dma, + sizeof(u16) * ATA_ID_WORDS, + 0, + GFP_KERNEL, + MTIP_INT_CMD_TIMEOUT_MS) + < 0) { + rv = -1; + goto out; + } + + /* + * Perform any necessary byte-swapping. Yes, the kernel does in fact + * perform field-sensitive swapping on the string fields. + * See the kernel use of ata_id_string() for proof of this. + */ +#ifdef __LITTLE_ENDIAN + ata_swap_string(port->identify + 27, 40); /* model string*/ + ata_swap_string(port->identify + 23, 8); /* firmware string*/ + ata_swap_string(port->identify + 10, 20); /* serial# string*/ +#else + { + int i; + for (i = 0; i < ATA_ID_WORDS; i++) + port->identify[i] = le16_to_cpu(port->identify[i]); + } +#endif + + /* Check security locked state */ + if (port->identify[128] & 0x4) + set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + else + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); + +#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */ + /* Demux ID.DRAT & ID.RZAT to determine trim support */ + if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5)) + port->dd->trim_supp = true; + else +#endif + port->dd->trim_supp = false; + + /* Set the identify buffer as valid. */ + port->identify_valid = 1; + + if (user_buffer) { + if (copy_to_user( + user_buffer, + port->identify, + ATA_ID_WORDS * sizeof(u16))) { + rv = -EFAULT; + goto out; + } + } + +out: + return rv; +} + +/* + * Issue a standby immediate command to the device. + * + * @port Pointer to the port structure. + * + * return value + * 0 Command was executed successfully. + * -1 An error occurred while executing the command. + */ +static int mtip_standby_immediate(struct mtip_port *port) +{ + int rv; + struct host_to_dev_fis fis; + unsigned long start; + unsigned int timeout; + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_STANDBYNOW1; + + mtip_set_timeout(port->dd, &fis, &timeout, 0); + + start = jiffies; + rv = mtip_exec_internal_command(port, + &fis, + 5, + 0, + 0, + 0, + GFP_ATOMIC, + timeout); + dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", + jiffies_to_msecs(jiffies - start)); + if (rv) + dev_warn(&port->dd->pdev->dev, + "STANDBY IMMEDIATE command failed.\n"); + + return rv; +} + +/* + * Issue a READ LOG EXT command to the device. + * + * @port pointer to the port structure. + * @page page number to fetch + * @buffer pointer to buffer + * @buffer_dma dma address corresponding to @buffer + * @sectors page length to fetch, in sectors + * + * return value + * @rv return value from mtip_exec_internal_command() + */ +static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, + dma_addr_t buffer_dma, unsigned int sectors) +{ + struct host_to_dev_fis fis; + + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_READ_LOG_EXT; + fis.sect_count = sectors & 0xFF; + fis.sect_cnt_ex = (sectors >> 8) & 0xFF; + fis.lba_low = page; + fis.lba_mid = 0; + fis.device = ATA_DEVICE_OBS; + + memset(buffer, 0, sectors * ATA_SECT_SIZE); + + return mtip_exec_internal_command(port, + &fis, + 5, + buffer_dma, + sectors * ATA_SECT_SIZE, + 0, + GFP_ATOMIC, + MTIP_INT_CMD_TIMEOUT_MS); +} + +/* + * Issue a SMART READ DATA command to the device. + * + * @port pointer to the port structure. + * @buffer pointer to buffer + * @buffer_dma dma address corresponding to @buffer + * + * return value + * @rv return value from mtip_exec_internal_command() + */ +static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer, + dma_addr_t buffer_dma) +{ + struct host_to_dev_fis fis; + + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_SMART; + fis.features = 0xD0; + fis.sect_count = 1; + fis.lba_mid = 0x4F; + fis.lba_hi = 0xC2; + fis.device = ATA_DEVICE_OBS; + + return mtip_exec_internal_command(port, + &fis, + 5, + buffer_dma, + ATA_SECT_SIZE, + 0, + GFP_ATOMIC, + 15000); +} + +/* + * Get the value of a smart attribute + * + * @port pointer to the port structure + * @id attribute number + * @attrib pointer to return attrib information corresponding to @id + * + * return value + * -EINVAL NULL buffer passed or unsupported attribute @id. + * -EPERM Identify data not valid, SMART not supported or not enabled + */ +static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, + struct smart_attr *attrib) +{ + int rv, i; + struct smart_attr *pattr; + + if (!attrib) + return -EINVAL; + + if (!port->identify_valid) { + dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n"); + return -EPERM; + } + if (!(port->identify[82] & 0x1)) { + dev_warn(&port->dd->pdev->dev, "SMART not supported\n"); + return -EPERM; + } + if (!(port->identify[85] & 0x1)) { + dev_warn(&port->dd->pdev->dev, "SMART not enabled\n"); + return -EPERM; + } + + memset(port->smart_buf, 0, ATA_SECT_SIZE); + rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma); + if (rv) { + dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n"); + return rv; + } + + pattr = (struct smart_attr *)(port->smart_buf + 2); + for (i = 0; i < 29; i++, pattr++) + if (pattr->attr_id == id) { + memcpy(attrib, pattr, sizeof(struct smart_attr)); + break; + } + + if (i == 29) { + dev_warn(&port->dd->pdev->dev, + "Query for invalid SMART attribute ID\n"); + rv = -EINVAL; + } + + return rv; +} + +/* + * Trim unused sectors + * + * @dd pointer to driver_data structure + * @lba starting lba + * @len # of 512b sectors to trim + * + * return value + * -ENOMEM Out of dma memory + * -EINVAL Invalid parameters passed in, trim not supported + * -EIO Error submitting trim request to hw + */ +static int mtip_send_trim(struct driver_data *dd, unsigned int lba, + unsigned int len) +{ + int i, rv = 0; + u64 tlba, tlen, sect_left; + struct mtip_trim_entry *buf; + dma_addr_t dma_addr; + struct host_to_dev_fis fis; + + if (!len || dd->trim_supp == false) + return -EINVAL; + + /* Trim request too big */ + WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES)); + + /* Trim request not aligned on 4k boundary */ + WARN_ON(len % 8 != 0); + + /* Warn if vu_trim structure is too big */ + WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE); + + /* Allocate a DMA buffer for the trim structure */ + buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr, + GFP_KERNEL); + if (!buf) + return -ENOMEM; + memset(buf, 0, ATA_SECT_SIZE); + + for (i = 0, sect_left = len, tlba = lba; + i < MTIP_MAX_TRIM_ENTRIES && sect_left; + i++) { + tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ? + MTIP_MAX_TRIM_ENTRY_LEN : + sect_left); + buf[i].lba = __force_bit2int cpu_to_le32(tlba); + buf[i].range = __force_bit2int cpu_to_le16(tlen); + tlba += tlen; + sect_left -= tlen; + } + WARN_ON(sect_left != 0); + + /* Build the fis */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = 0xfb; + fis.features = 0x60; + fis.sect_count = 1; + fis.device = ATA_DEVICE_OBS; + + if (mtip_exec_internal_command(dd->port, + &fis, + 5, + dma_addr, + ATA_SECT_SIZE, + 0, + GFP_KERNEL, + MTIP_TRIM_TIMEOUT_MS) < 0) + rv = -EIO; + + dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr); + return rv; +} + +/* + * Get the drive capacity. + * + * @dd Pointer to the device data structure. + * @sectors Pointer to the variable that will receive the sector count. + * + * return value + * 1 Capacity was returned successfully. + * 0 The identify information is invalid. + */ +static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors) +{ + struct mtip_port *port = dd->port; + u64 total, raw0, raw1, raw2, raw3; + raw0 = port->identify[100]; + raw1 = port->identify[101]; + raw2 = port->identify[102]; + raw3 = port->identify[103]; + total = raw0 | raw1<<16 | raw2<<32 | raw3<<48; + *sectors = total; + return (bool) !!port->identify_valid; +} + +/* + * Display the identify command data. + * + * @port Pointer to the port data structure. + * + * return value + * None + */ +static void mtip_dump_identify(struct mtip_port *port) +{ + sector_t sectors; + unsigned short revid; + char cbuf[42]; + + if (!port->identify_valid) + return; + + strlcpy(cbuf, (char *)(port->identify+10), 21); + dev_info(&port->dd->pdev->dev, + "Serial No.: %s\n", cbuf); + + strlcpy(cbuf, (char *)(port->identify+23), 9); + dev_info(&port->dd->pdev->dev, + "Firmware Ver.: %s\n", cbuf); + + strlcpy(cbuf, (char *)(port->identify+27), 41); + dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); + + dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", + port->identify[128], + port->identify[128] & 0x4 ? "(LOCKED)" : ""); + + if (mtip_hw_get_capacity(port->dd, §ors)) + dev_info(&port->dd->pdev->dev, + "Capacity: %llu sectors (%llu MB)\n", + (u64)sectors, + ((u64)sectors) * ATA_SECT_SIZE >> 20); + + pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); + switch (revid & 0xFF) { + case 0x1: + strlcpy(cbuf, "A0", 3); + break; + case 0x3: + strlcpy(cbuf, "A2", 3); + break; + default: + strlcpy(cbuf, "?", 2); + break; + } + dev_info(&port->dd->pdev->dev, + "Card Type: %s\n", cbuf); +} + +/* + * Map the commands scatter list into the command table. + * + * @command Pointer to the command. + * @nents Number of scatter list entries. + * + * return value + * None + */ +static inline void fill_command_sg(struct driver_data *dd, + struct mtip_cmd *command, + int nents) +{ + int n; + unsigned int dma_len; + struct mtip_cmd_sg *command_sg; + struct scatterlist *sg = command->sg; + + command_sg = command->command + AHCI_CMD_TBL_HDR_SZ; + + for (n = 0; n < nents; n++) { + dma_len = sg_dma_len(sg); + if (dma_len > 0x400000) + dev_err(&dd->pdev->dev, + "DMA segment length truncated\n"); + command_sg->info = __force_bit2int + cpu_to_le32((dma_len-1) & 0x3FFFFF); + command_sg->dba = __force_bit2int + cpu_to_le32(sg_dma_address(sg)); + command_sg->dba_upper = __force_bit2int + cpu_to_le32((sg_dma_address(sg) >> 16) >> 16); + command_sg++; + sg++; + } +} + +/* + * @brief Execute a drive command. + * + * return value 0 The command completed successfully. + * return value -1 An error occurred while executing the command. + */ +static int exec_drive_task(struct mtip_port *port, u8 *command) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + unsigned int to; + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = command[0]; + fis.features = command[1]; + fis.sect_count = command[2]; + fis.sector = command[3]; + fis.cyl_low = command[4]; + fis.cyl_hi = command[5]; + fis.device = command[6] & ~0x10; /* Clear the dev bit*/ + + mtip_set_timeout(port->dd, &fis, &to, 0); + + dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", + __func__, + command[0], + command[1], + command[2], + command[3], + command[4], + command[5], + command[6]); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + 0, + 0, + 0, + GFP_KERNEL, + to) < 0) { + return -1; + } + + command[0] = reply->command; /* Status*/ + command[1] = reply->features; /* Error*/ + command[4] = reply->cyl_low; + command[5] = reply->cyl_hi; + + dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n", + __func__, + command[0], + command[1], + command[4], + command[5]); + + return 0; +} + +/* + * @brief Execute a drive command. + * + * @param port Pointer to the port data structure. + * @param command Pointer to the user specified command parameters. + * @param user_buffer Pointer to the user space buffer where read sector + * data should be copied. + * + * return value 0 The command completed successfully. + * return value -EFAULT An error occurred while copying the completion + * data to the user space buffer. + * return value -1 An error occurred while executing the command. + */ +static int exec_drive_command(struct mtip_port *port, u8 *command, + void __user *user_buffer) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply; + u8 *buf = NULL; + dma_addr_t dma_addr = 0; + int rv = 0, xfer_sz = command[3]; + unsigned int to; + + if (xfer_sz) { + if (!user_buffer) + return -EFAULT; + + buf = dmam_alloc_coherent(&port->dd->pdev->dev, + ATA_SECT_SIZE * xfer_sz, + &dma_addr, + GFP_KERNEL); + if (!buf) { + dev_err(&port->dd->pdev->dev, + "Memory allocation failed (%d bytes)\n", + ATA_SECT_SIZE * xfer_sz); + return -ENOMEM; + } + memset(buf, 0, ATA_SECT_SIZE * xfer_sz); + } + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = command[0]; + fis.features = command[2]; + fis.sect_count = command[3]; + if (fis.command == ATA_CMD_SMART) { + fis.sector = command[1]; + fis.cyl_low = 0x4F; + fis.cyl_hi = 0xC2; + } + + mtip_set_timeout(port->dd, &fis, &to, 0); + + if (xfer_sz) + reply = (port->rxfis + RX_FIS_PIO_SETUP); + else + reply = (port->rxfis + RX_FIS_D2H_REG); + + dbg_printk(MTIP_DRV_NAME + " %s: User Command: cmd %x, sect %x, " + "feat %x, sectcnt %x\n", + __func__, + command[0], + command[1], + command[2], + command[3]); + + /* Execute the command. */ + if (mtip_exec_internal_command(port, + &fis, + 5, + (xfer_sz ? dma_addr : 0), + (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), + 0, + GFP_KERNEL, + to) + < 0) { + rv = -EFAULT; + goto exit_drive_command; + } + + /* Collect the completion status. */ + command[0] = reply->command; /* Status*/ + command[1] = reply->features; /* Error*/ + command[2] = reply->sect_count; + + dbg_printk(MTIP_DRV_NAME + " %s: Completion Status: stat %x, " + "err %x, nsect %x\n", + __func__, + command[0], + command[1], + command[2]); + + if (xfer_sz) { + if (copy_to_user(user_buffer, + buf, + ATA_SECT_SIZE * command[3])) { + rv = -EFAULT; + goto exit_drive_command; + } + } +exit_drive_command: + if (buf) + dmam_free_coherent(&port->dd->pdev->dev, + ATA_SECT_SIZE * xfer_sz, buf, dma_addr); + return rv; +} + +/* + * Indicates whether a command has a single sector payload. + * + * @command passed to the device to perform the certain event. + * @features passed to the device to perform the certain event. + * + * return value + * 1 command is one that always has a single sector payload, + * regardless of the value in the Sector Count field. + * 0 otherwise + * + */ +static unsigned int implicit_sector(unsigned char command, + unsigned char features) +{ + unsigned int rv = 0; + + /* list of commands that have an implicit sector count of 1 */ + switch (command) { + case ATA_CMD_SEC_SET_PASS: + case ATA_CMD_SEC_UNLOCK: + case ATA_CMD_SEC_ERASE_PREP: + case ATA_CMD_SEC_ERASE_UNIT: + case ATA_CMD_SEC_FREEZE_LOCK: + case ATA_CMD_SEC_DISABLE_PASS: + case ATA_CMD_PMP_READ: + case ATA_CMD_PMP_WRITE: + rv = 1; + break; + case ATA_CMD_SET_MAX: + if (features == ATA_SET_MAX_UNLOCK) + rv = 1; + break; + case ATA_CMD_SMART: + if ((features == ATA_SMART_READ_VALUES) || + (features == ATA_SMART_READ_THRESHOLDS)) + rv = 1; + break; + case ATA_CMD_CONF_OVERLAY: + if ((features == ATA_DCO_IDENTIFY) || + (features == ATA_DCO_SET)) + rv = 1; + break; + } + return rv; +} + +/* + * Executes a taskfile + * See ide_taskfile_ioctl() for derivation + */ +static int exec_drive_taskfile(struct driver_data *dd, + void __user *buf, + ide_task_request_t *req_task, + int outtotal) +{ + struct host_to_dev_fis fis; + struct host_to_dev_fis *reply; + u8 *outbuf = NULL; + u8 *inbuf = NULL; + dma_addr_t outbuf_dma = 0; + dma_addr_t inbuf_dma = 0; + dma_addr_t dma_buffer = 0; + int err = 0; + unsigned int taskin = 0; + unsigned int taskout = 0; + u8 nsect = 0; + unsigned int timeout; + unsigned int force_single_sector; + unsigned int transfer_size; + unsigned long task_file_data; + int intotal = outtotal + req_task->out_size; + int erasemode = 0; + + taskout = req_task->out_size; + taskin = req_task->in_size; + /* 130560 = 512 * 0xFF*/ + if (taskin > 130560 || taskout > 130560) { + err = -EINVAL; + goto abort; + } + + if (taskout) { + outbuf = kzalloc(taskout, GFP_KERNEL); + if (outbuf == NULL) { + err = -ENOMEM; + goto abort; + } + if (copy_from_user(outbuf, buf + outtotal, taskout)) { + err = -EFAULT; + goto abort; + } + outbuf_dma = pci_map_single(dd->pdev, + outbuf, + taskout, + DMA_TO_DEVICE); + if (outbuf_dma == 0) { + err = -ENOMEM; + goto abort; + } + dma_buffer = outbuf_dma; + } + + if (taskin) { + inbuf = kzalloc(taskin, GFP_KERNEL); + if (inbuf == NULL) { + err = -ENOMEM; + goto abort; + } + + if (copy_from_user(inbuf, buf + intotal, taskin)) { + err = -EFAULT; + goto abort; + } + inbuf_dma = pci_map_single(dd->pdev, + inbuf, + taskin, DMA_FROM_DEVICE); + if (inbuf_dma == 0) { + err = -ENOMEM; + goto abort; + } + dma_buffer = inbuf_dma; + } + + /* only supports PIO and non-data commands from this ioctl. */ + switch (req_task->data_phase) { + case TASKFILE_OUT: + nsect = taskout / ATA_SECT_SIZE; + reply = (dd->port->rxfis + RX_FIS_PIO_SETUP); + break; + case TASKFILE_IN: + reply = (dd->port->rxfis + RX_FIS_PIO_SETUP); + break; + case TASKFILE_NO_DATA: + reply = (dd->port->rxfis + RX_FIS_D2H_REG); + break; + default: + err = -EINVAL; + goto abort; + } + + /* Build the FIS. */ + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = req_task->io_ports[7]; + fis.features = req_task->io_ports[1]; + fis.sect_count = req_task->io_ports[2]; + fis.lba_low = req_task->io_ports[3]; + fis.lba_mid = req_task->io_ports[4]; + fis.lba_hi = req_task->io_ports[5]; + /* Clear the dev bit*/ + fis.device = req_task->io_ports[6] & ~0x10; + + if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) { + req_task->in_flags.all = + IDE_TASKFILE_STD_IN_FLAGS | + (IDE_HOB_STD_IN_FLAGS << 8); + fis.lba_low_ex = req_task->hob_ports[3]; + fis.lba_mid_ex = req_task->hob_ports[4]; + fis.lba_hi_ex = req_task->hob_ports[5]; + fis.features_ex = req_task->hob_ports[1]; + fis.sect_cnt_ex = req_task->hob_ports[2]; + + } else { + req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS; + } + + force_single_sector = implicit_sector(fis.command, fis.features); + + if ((taskin || taskout) && (!fis.sect_count)) { + if (nsect) + fis.sect_count = nsect; + else { + if (!force_single_sector) { + dev_warn(&dd->pdev->dev, + "data movement but " + "sect_count is 0\n"); + err = -EINVAL; + goto abort; + } + } + } + + dbg_printk(MTIP_DRV_NAME + " %s: cmd %x, feat %x, nsect %x," + " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x," + " head/dev %x\n", + __func__, + fis.command, + fis.features, + fis.sect_count, + fis.lba_low, + fis.lba_mid, + fis.lba_hi, + fis.device); + + /* check for erase mode support during secure erase.*/ + if ((fis.command == ATA_CMD_SEC_ERASE_UNIT) && outbuf && + (outbuf[0] & MTIP_SEC_ERASE_MODE)) { + erasemode = 1; + } + + mtip_set_timeout(dd, &fis, &timeout, erasemode); + + /* Determine the correct transfer size.*/ + if (force_single_sector) + transfer_size = ATA_SECT_SIZE; + else + transfer_size = ATA_SECT_SIZE * fis.sect_count; + + /* Execute the command.*/ + if (mtip_exec_internal_command(dd->port, + &fis, + 5, + dma_buffer, + transfer_size, + 0, + GFP_KERNEL, + timeout) < 0) { + err = -EIO; + goto abort; + } + + task_file_data = readl(dd->port->mmio+PORT_TFDATA); + + if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) { + reply = dd->port->rxfis + RX_FIS_PIO_SETUP; + req_task->io_ports[7] = reply->control; + } else { + reply = dd->port->rxfis + RX_FIS_D2H_REG; + req_task->io_ports[7] = reply->command; + } + + /* reclaim the DMA buffers.*/ + if (inbuf_dma) + pci_unmap_single(dd->pdev, inbuf_dma, + taskin, DMA_FROM_DEVICE); + if (outbuf_dma) + pci_unmap_single(dd->pdev, outbuf_dma, + taskout, DMA_TO_DEVICE); + inbuf_dma = 0; + outbuf_dma = 0; + + /* return the ATA registers to the caller.*/ + req_task->io_ports[1] = reply->features; + req_task->io_ports[2] = reply->sect_count; + req_task->io_ports[3] = reply->lba_low; + req_task->io_ports[4] = reply->lba_mid; + req_task->io_ports[5] = reply->lba_hi; + req_task->io_ports[6] = reply->device; + + if (req_task->out_flags.all & 1) { + + req_task->hob_ports[3] = reply->lba_low_ex; + req_task->hob_ports[4] = reply->lba_mid_ex; + req_task->hob_ports[5] = reply->lba_hi_ex; + req_task->hob_ports[1] = reply->features_ex; + req_task->hob_ports[2] = reply->sect_cnt_ex; + } + dbg_printk(MTIP_DRV_NAME + " %s: Completion: stat %x," + "err %x, sect_cnt %x, lbalo %x," + "lbamid %x, lbahi %x, dev %x\n", + __func__, + req_task->io_ports[7], + req_task->io_ports[1], + req_task->io_ports[2], + req_task->io_ports[3], + req_task->io_ports[4], + req_task->io_ports[5], + req_task->io_ports[6]); + + if (taskout) { + if (copy_to_user(buf + outtotal, outbuf, taskout)) { + err = -EFAULT; + goto abort; + } + } + if (taskin) { + if (copy_to_user(buf + intotal, inbuf, taskin)) { + err = -EFAULT; + goto abort; + } + } +abort: + if (inbuf_dma) + pci_unmap_single(dd->pdev, inbuf_dma, + taskin, DMA_FROM_DEVICE); + if (outbuf_dma) + pci_unmap_single(dd->pdev, outbuf_dma, + taskout, DMA_TO_DEVICE); + kfree(outbuf); + kfree(inbuf); + + return err; +} + +/* + * Handle IOCTL calls from the Block Layer. + * + * This function is called by the Block Layer when it receives an IOCTL + * command that it does not understand. If the IOCTL command is not supported + * this function returns -ENOTTY. + * + * @dd Pointer to the driver data structure. + * @cmd IOCTL command passed from the Block Layer. + * @arg IOCTL argument passed from the Block Layer. + * + * return value + * 0 The IOCTL completed successfully. + * -ENOTTY The specified command is not supported. + * -EFAULT An error occurred copying data to a user space buffer. + * -EIO An error occurred while executing the command. + */ +static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case HDIO_GET_IDENTITY: + { + if (copy_to_user((void __user *)arg, dd->port->identify, + sizeof(u16) * ATA_ID_WORDS)) + return -EFAULT; + break; + } + case HDIO_DRIVE_CMD: + { + u8 drive_command[4]; + + /* Copy the user command info to our buffer. */ + if (copy_from_user(drive_command, + (void __user *) arg, + sizeof(drive_command))) + return -EFAULT; + + /* Execute the drive command. */ + if (exec_drive_command(dd->port, + drive_command, + (void __user *) (arg+4))) + return -EIO; + + /* Copy the status back to the users buffer. */ + if (copy_to_user((void __user *) arg, + drive_command, + sizeof(drive_command))) + return -EFAULT; + + break; + } + case HDIO_DRIVE_TASK: + { + u8 drive_command[7]; + + /* Copy the user command info to our buffer. */ + if (copy_from_user(drive_command, + (void __user *) arg, + sizeof(drive_command))) + return -EFAULT; + + /* Execute the drive command. */ + if (exec_drive_task(dd->port, drive_command)) + return -EIO; + + /* Copy the status back to the users buffer. */ + if (copy_to_user((void __user *) arg, + drive_command, + sizeof(drive_command))) + return -EFAULT; + + break; + } + case HDIO_DRIVE_TASKFILE: { + ide_task_request_t req_task; + int ret, outtotal; + + if (copy_from_user(&req_task, (void __user *) arg, + sizeof(req_task))) + return -EFAULT; + + outtotal = sizeof(req_task); + + ret = exec_drive_taskfile(dd, (void __user *) arg, + &req_task, outtotal); + + if (copy_to_user((void __user *) arg, &req_task, + sizeof(req_task))) + return -EFAULT; + + return ret; + } + + default: + return -EINVAL; + } + return 0; +} + +/* + * Submit an IO to the hw + * + * This function is called by the block layer to issue an io + * to the device. Upon completion, the callback function will + * be called with the data parameter passed as the callback data. + * + * @dd Pointer to the driver data structure. + * @start First sector to read. + * @nsect Number of sectors to read. + * @nents Number of entries in scatter list for the read command. + * @tag The tag of this read command. + * @callback Pointer to the function that should be called + * when the read completes. + * @data Callback data passed to the callback function + * when the read completes. + * @dir Direction (read or write) + * + * return value + * None + */ +static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq, + struct mtip_cmd *command, int nents, + struct blk_mq_hw_ctx *hctx) +{ + struct host_to_dev_fis *fis; + struct mtip_port *port = dd->port; + int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + u64 start = blk_rq_pos(rq); + unsigned int nsect = blk_rq_sectors(rq); + + /* Map the scatter list for DMA access */ + nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); + + prefetch(&port->flags); + + command->scatter_ents = nents; + + /* + * The number of retries for this command before it is + * reported as a failure to the upper layers. + */ + command->retries = MTIP_MAX_RETRIES; + + /* Fill out fis */ + fis = command->command; + fis->type = 0x27; + fis->opts = 1 << 7; + if (dma_dir == DMA_FROM_DEVICE) + fis->command = ATA_CMD_FPDMA_READ; + else + fis->command = ATA_CMD_FPDMA_WRITE; + fis->lba_low = start & 0xFF; + fis->lba_mid = (start >> 8) & 0xFF; + fis->lba_hi = (start >> 16) & 0xFF; + fis->lba_low_ex = (start >> 24) & 0xFF; + fis->lba_mid_ex = (start >> 32) & 0xFF; + fis->lba_hi_ex = (start >> 40) & 0xFF; + fis->device = 1 << 6; + fis->features = nsect & 0xFF; + fis->features_ex = (nsect >> 8) & 0xFF; + fis->sect_count = ((rq->tag << 3) | (rq->tag >> 5)); + fis->sect_cnt_ex = 0; + fis->control = 0; + fis->res2 = 0; + fis->res3 = 0; + fill_command_sg(dd, command, nents); + + if (unlikely(command->unaligned)) + fis->device |= 1 << 7; + + /* Populate the command header */ + command->command_header->opts = + __force_bit2int cpu_to_le32( + (nents << 16) | 5 | AHCI_CMD_PREFETCH); + command->command_header->byte_count = 0; + + /* + * Set the completion function and data for the command + * within this layer. + */ + command->comp_data = dd; + command->comp_func = mtip_async_complete; + command->direction = dma_dir; + + /* + * To prevent this command from being issued + * if an internal command is in progress or error handling is active. + */ + if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) { + set_bit(rq->tag, port->cmds_to_issue); + set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); + return; + } + + /* Issue the command to the hardware */ + mtip_issue_ncq_command(port, rq->tag); +} + +/* + * Sysfs status dump. + * + * @dev Pointer to the device structure, passed by the kernrel. + * @attr Pointer to the device_attribute structure passed by the kernel. + * @buf Pointer to the char buffer that will receive the stats info. + * + * return value + * The size, in bytes, of the data copied into buf. + */ +static ssize_t mtip_hw_show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct driver_data *dd = dev_to_disk(dev)->private_data; + int size = 0; + + if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag)) + size += sprintf(buf, "%s", "thermal_shutdown\n"); + else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag)) + size += sprintf(buf, "%s", "write_protect\n"); + else + size += sprintf(buf, "%s", "online\n"); + + return size; +} + +static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); + +/* debugsfs entries */ + +static ssize_t show_device_status(struct device_driver *drv, char *buf) +{ + int size = 0; + struct driver_data *dd, *tmp; + unsigned long flags; + char id_buf[42]; + u16 status = 0; + + spin_lock_irqsave(&dev_lock, flags); + size += sprintf(&buf[size], "Devices Present:\n"); + list_for_each_entry_safe(dd, tmp, &online_list, online_list) { + if (dd->pdev) { + if (dd->port && + dd->port->identify && + dd->port->identify_valid) { + strlcpy(id_buf, + (char *) (dd->port->identify + 10), 21); + status = *(dd->port->identify + 141); + } else { + memset(id_buf, 0, 42); + status = 0; + } + + if (dd->port && + test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { + size += sprintf(&buf[size], + " device %s %s (ftl rebuild %d %%)\n", + dev_name(&dd->pdev->dev), + id_buf, + status); + } else { + size += sprintf(&buf[size], + " device %s %s\n", + dev_name(&dd->pdev->dev), + id_buf); + } + } + } + + size += sprintf(&buf[size], "Devices Being Removed:\n"); + list_for_each_entry_safe(dd, tmp, &removing_list, remove_list) { + if (dd->pdev) { + if (dd->port && + dd->port->identify && + dd->port->identify_valid) { + strlcpy(id_buf, + (char *) (dd->port->identify+10), 21); + status = *(dd->port->identify + 141); + } else { + memset(id_buf, 0, 42); + status = 0; + } + + if (dd->port && + test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { + size += sprintf(&buf[size], + " device %s %s (ftl rebuild %d %%)\n", + dev_name(&dd->pdev->dev), + id_buf, + status); + } else { + size += sprintf(&buf[size], + " device %s %s\n", + dev_name(&dd->pdev->dev), + id_buf); + } + } + } + spin_unlock_irqrestore(&dev_lock, flags); + + return size; +} + +static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + int size = *offset; + char *buf; + int rv = 0; + + if (!len || *offset) + return 0; + + buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); + if (!buf) { + dev_err(&dd->pdev->dev, + "Memory allocation: status buffer\n"); + return -ENOMEM; + } + + size += show_device_status(NULL, buf); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + rv = -EFAULT; + + kfree(buf); + return rv ? rv : *offset; +} + +static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char *buf; + u32 group_allocated; + int size = *offset; + int n, rv = 0; + + if (!len || size) + return 0; + + buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); + if (!buf) { + dev_err(&dd->pdev->dev, + "Memory allocation: register buffer\n"); + return -ENOMEM; + } + + size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->s_active[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Command Issue : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->cmd_issue[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Completed : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->completed[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n", + readl(dd->port->mmio + PORT_IRQ_STAT)); + size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n", + readl(dd->mmio + HOST_IRQ_STAT)); + size += sprintf(&buf[size], "\n"); + + size += sprintf(&buf[size], "L/ Allocated : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->allocated[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->allocated[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + size += sprintf(&buf[size], "L/ Commands in Q : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->cmds_to_issue[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->cmds_to_issue[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + rv = -EFAULT; + + kfree(buf); + return rv ? rv : *offset; +} + +static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char *buf; + int size = *offset; + int rv = 0; + + if (!len || size) + return 0; + + buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); + if (!buf) { + dev_err(&dd->pdev->dev, + "Memory allocation: flag buffer\n"); + return -ENOMEM; + } + + size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", + dd->port->flags); + size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", + dd->dd_flag); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + rv = -EFAULT; + + kfree(buf); + return rv ? rv : *offset; +} + +static const struct file_operations mtip_device_status_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_device_status, + .llseek = no_llseek, +}; + +static const struct file_operations mtip_regs_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_registers, + .llseek = no_llseek, +}; + +static const struct file_operations mtip_flags_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_flags, + .llseek = no_llseek, +}; + +/* + * Create the sysfs related attributes. + * + * @dd Pointer to the driver data structure. + * @kobj Pointer to the kobj for the block device. + * + * return value + * 0 Operation completed successfully. + * -EINVAL Invalid parameter. + */ +static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) +{ + if (!kobj || !dd) + return -EINVAL; + + if (sysfs_create_file(kobj, &dev_attr_status.attr)) + dev_warn(&dd->pdev->dev, + "Error creating 'status' sysfs entry\n"); + return 0; +} + +/* + * Remove the sysfs related attributes. + * + * @dd Pointer to the driver data structure. + * @kobj Pointer to the kobj for the block device. + * + * return value + * 0 Operation completed successfully. + * -EINVAL Invalid parameter. + */ +static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) +{ + if (!kobj || !dd) + return -EINVAL; + + sysfs_remove_file(kobj, &dev_attr_status.attr); + + return 0; +} + +static int mtip_hw_debugfs_init(struct driver_data *dd) +{ + if (!dfs_parent) + return -1; + + dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); + if (IS_ERR_OR_NULL(dd->dfs_node)) { + dev_warn(&dd->pdev->dev, + "Error creating node %s under debugfs\n", + dd->disk->disk_name); + dd->dfs_node = NULL; + return -1; + } + + debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd, + &mtip_flags_fops); + debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd, + &mtip_regs_fops); + + return 0; +} + +static void mtip_hw_debugfs_exit(struct driver_data *dd) +{ + if (dd->dfs_node) + debugfs_remove_recursive(dd->dfs_node); +} + +static int mtip_free_orphan(struct driver_data *dd) +{ + struct kobject *kobj; + + if (dd->bdev) { + if (dd->bdev->bd_holders >= 1) + return -2; + + bdput(dd->bdev); + dd->bdev = NULL; + } + + mtip_hw_debugfs_exit(dd); + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + + if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) && + test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { + put_disk(dd->disk); + } else { + if (dd->disk) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } + del_gendisk(dd->disk); + dd->disk = NULL; + } + if (dd->queue) { + dd->queue->queuedata = NULL; + blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + dd->queue = NULL; + } + } + kfree(dd); + return 0; +} + +/* + * Perform any init/resume time hardware setup + * + * @dd Pointer to the driver data structure. + * + * return value + * None + */ +static inline void hba_setup(struct driver_data *dd) +{ + u32 hwdata; + hwdata = readl(dd->mmio + HOST_HSORG); + + /* interrupt bug workaround: use only 1 IS bit.*/ + writel(hwdata | + HSORG_DISABLE_SLOTGRP_INTR | + HSORG_DISABLE_SLOTGRP_PXIS, + dd->mmio + HOST_HSORG); +} + +static int mtip_device_unaligned_constrained(struct driver_data *dd) +{ + return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0); +} + +/* + * Detect the details of the product, and store anything needed + * into the driver data structure. This includes product type and + * version and number of slot groups. + * + * @dd Pointer to the driver data structure. + * + * return value + * None + */ +static void mtip_detect_product(struct driver_data *dd) +{ + u32 hwdata; + unsigned int rev, slotgroups; + + /* + * HBA base + 0xFC [15:0] - vendor-specific hardware interface + * info register: + * [15:8] hardware/software interface rev# + * [ 3] asic-style interface + * [ 2:0] number of slot groups, minus 1 (only valid for asic-style). + */ + hwdata = readl(dd->mmio + HOST_HSORG); + + dd->product_type = MTIP_PRODUCT_UNKNOWN; + dd->slot_groups = 1; + + if (hwdata & 0x8) { + dd->product_type = MTIP_PRODUCT_ASICFPGA; + rev = (hwdata & HSORG_HWREV) >> 8; + slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1; + dev_info(&dd->pdev->dev, + "ASIC-FPGA design, HS rev 0x%x, " + "%i slot groups [%i slots]\n", + rev, + slotgroups, + slotgroups * 32); + + if (slotgroups > MTIP_MAX_SLOT_GROUPS) { + dev_warn(&dd->pdev->dev, + "Warning: driver only supports " + "%i slot groups.\n", MTIP_MAX_SLOT_GROUPS); + slotgroups = MTIP_MAX_SLOT_GROUPS; + } + dd->slot_groups = slotgroups; + return; + } + + dev_warn(&dd->pdev->dev, "Unrecognized product id\n"); +} + +/* + * Blocking wait for FTL rebuild to complete + * + * @dd Pointer to the DRIVER_DATA structure. + * + * return value + * 0 FTL rebuild completed successfully + * -EFAULT FTL rebuild error/timeout/interruption + */ +static int mtip_ftl_rebuild_poll(struct driver_data *dd) +{ + unsigned long timeout, cnt = 0, start; + + dev_warn(&dd->pdev->dev, + "FTL rebuild in progress. Polling for completion.\n"); + + start = jiffies; + timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS); + + do { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) + return -EFAULT; + if (mtip_check_surprise_removal(dd->pdev)) + return -EFAULT; + + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; + + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + ssleep(1); + /* Print message every 3 minutes */ + if (cnt++ >= 180) { + dev_warn(&dd->pdev->dev, + "FTL rebuild in progress (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + cnt = 0; + } + } else { + dev_warn(&dd->pdev->dev, + "FTL rebuild complete (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + mtip_block_initialize(dd); + return 0; + } + ssleep(10); + } while (time_before(jiffies, timeout)); + + /* Check for timeout */ + dev_err(&dd->pdev->dev, + "Timed out waiting for FTL rebuild to complete (%d secs).\n", + jiffies_to_msecs(jiffies - start) / 1000); + return -EFAULT; +} + +/* + * service thread to issue queued commands + * + * @data Pointer to the driver data structure. + * + * return value + * 0 + */ + +static int mtip_service_thread(void *data) +{ + struct driver_data *dd = (struct driver_data *)data; + unsigned long slot, slot_start, slot_wrap; + unsigned int num_cmd_slots = dd->slot_groups * 32; + struct mtip_port *port = dd->port; + int ret; + + while (1) { + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + + /* + * the condition is to check neither an internal command is + * is in progress nor error handling is active + */ + wait_event_interruptible(port->svc_wait, (port->flags) && + !(port->flags & MTIP_PF_PAUSE_IO)); + + set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + + if (kthread_should_stop() || + test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) + goto st_out; + + /* If I am an orphan, start self cleanup */ + if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) + break; + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) + goto st_out; + +restart_eh: + /* Demux bits: start with error handling */ + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) { + mtip_handle_tfe(dd); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); + } + + if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) + goto restart_eh; + + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { + slot = 1; + /* used to restrict the loop to one iteration */ + slot_start = num_cmd_slots; + slot_wrap = 0; + while (1) { + slot = find_next_bit(port->cmds_to_issue, + num_cmd_slots, slot); + if (slot_wrap == 1) { + if ((slot_start >= slot) || + (slot >= num_cmd_slots)) + break; + } + if (unlikely(slot_start == num_cmd_slots)) + slot_start = slot; + + if (unlikely(slot == num_cmd_slots)) { + slot = 1; + slot_wrap = 1; + continue; + } + + /* Issue the command to the hardware */ + mtip_issue_ncq_command(port, slot); + + clear_bit(slot, port->cmds_to_issue); + } + + clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); + } + + if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { + if (mtip_ftl_rebuild_poll(dd) < 0) + set_bit(MTIP_DDF_REBUILD_FAILED_BIT, + &dd->dd_flag); + clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); + } + } + + /* wait for pci remove to exit */ + while (1) { + if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag)) + break; + msleep_interruptible(1000); + if (kthread_should_stop()) + goto st_out; + } + + while (1) { + ret = mtip_free_orphan(dd); + if (!ret) { + /* NOTE: All data structures are invalid, do not + * access any here */ + return 0; + } + msleep_interruptible(1000); + if (kthread_should_stop()) + goto st_out; + } +st_out: + return 0; +} + +/* + * DMA region teardown + * + * @dd Pointer to driver_data structure + * + * return value + * None + */ +static void mtip_dma_free(struct driver_data *dd) +{ + struct mtip_port *port = dd->port; + + if (port->block1) + dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + port->block1, port->block1_dma); + + if (port->command_list) { + dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, + port->command_list, port->command_list_dma); + } +} + +/* + * DMA region setup + * + * @dd Pointer to driver_data structure + * + * return value + * -ENOMEM Not enough free DMA region space to initialize driver + */ +static int mtip_dma_alloc(struct driver_data *dd) +{ + struct mtip_port *port = dd->port; + + /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */ + port->block1 = + dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + &port->block1_dma, GFP_KERNEL); + if (!port->block1) + return -ENOMEM; + memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ); + + /* Allocate dma memory for command list */ + port->command_list = + dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ, + &port->command_list_dma, GFP_KERNEL); + if (!port->command_list) { + dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ, + port->block1, port->block1_dma); + port->block1 = NULL; + port->block1_dma = 0; + return -ENOMEM; + } + memset(port->command_list, 0, AHCI_CMD_TBL_SZ); + + /* Setup all pointers into first DMA region */ + port->rxfis = port->block1 + AHCI_RX_FIS_OFFSET; + port->rxfis_dma = port->block1_dma + AHCI_RX_FIS_OFFSET; + port->identify = port->block1 + AHCI_IDFY_OFFSET; + port->identify_dma = port->block1_dma + AHCI_IDFY_OFFSET; + port->log_buf = port->block1 + AHCI_SECTBUF_OFFSET; + port->log_buf_dma = port->block1_dma + AHCI_SECTBUF_OFFSET; + port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET; + port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET; + + return 0; +} + +static int mtip_hw_get_identify(struct driver_data *dd) +{ + struct smart_attr attr242; + unsigned char *buf; + int rv; + + if (mtip_get_identify(dd->port, NULL) < 0) + return -EFAULT; + + if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == + MTIP_FTL_REBUILD_MAGIC) { + set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); + return MTIP_FTL_REBUILD_MAGIC; + } + mtip_dump_identify(dd->port); + + /* check write protect, over temp and rebuild statuses */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + /* TODO */ + } + } + + /* get write protect progess */ + memset(&attr242, 0, sizeof(struct smart_attr)); + if (mtip_get_smart_attr(dd->port, 242, &attr242)) + dev_warn(&dd->pdev->dev, + "Unable to check write protect progress\n"); + else + dev_info(&dd->pdev->dev, + "Write protect progress: %u%% (%u blocks)\n", + attr242.cur, le32_to_cpu(attr242.data)); + + return rv; +} + +/* + * Called once for each card. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 on success, else an error code. + */ +static int mtip_hw_init(struct driver_data *dd) +{ + int i; + int rv; + unsigned int num_command_slots; + unsigned long timeout, timetaken; + + dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; + + mtip_detect_product(dd); + if (dd->product_type == MTIP_PRODUCT_UNKNOWN) { + rv = -EIO; + goto out1; + } + num_command_slots = dd->slot_groups * 32; + + hba_setup(dd); + + dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL, + dd->numa_node); + if (!dd->port) { + dev_err(&dd->pdev->dev, + "Memory allocation: port structure\n"); + return -ENOMEM; + } + + /* Continue workqueue setup */ + for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) + dd->work[i].port = dd->port; + + /* Enable unaligned IO constraints for some devices */ + if (mtip_device_unaligned_constrained(dd)) + dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS; + else + dd->unal_qdepth = 0; + + sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth); + + /* Spinlock to prevent concurrent issue */ + for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++) + spin_lock_init(&dd->port->cmd_issue_lock[i]); + + /* Set the port mmio base address. */ + dd->port->mmio = dd->mmio + PORT_OFFSET; + dd->port->dd = dd; + + /* DMA allocations */ + rv = mtip_dma_alloc(dd); + if (rv < 0) + goto out1; + + /* Setup the pointers to the extended s_active and CI registers. */ + for (i = 0; i < dd->slot_groups; i++) { + dd->port->s_active[i] = + dd->port->mmio + i*0x80 + PORT_SCR_ACT; + dd->port->cmd_issue[i] = + dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE; + dd->port->completed[i] = + dd->port->mmio + i*0x80 + PORT_SDBV; + } + + timetaken = jiffies; + timeout = jiffies + msecs_to_jiffies(30000); + while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) && + time_before(jiffies, timeout)) { + mdelay(100); + } + if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + timetaken = jiffies - timetaken; + dev_warn(&dd->pdev->dev, + "Surprise removal detected at %u ms\n", + jiffies_to_msecs(timetaken)); + rv = -ENODEV; + goto out2 ; + } + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { + timetaken = jiffies - timetaken; + dev_warn(&dd->pdev->dev, + "Removal detected at %u ms\n", + jiffies_to_msecs(timetaken)); + rv = -EFAULT; + goto out2; + } + + /* Conditionally reset the HBA. */ + if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) { + if (mtip_hba_reset(dd) < 0) { + dev_err(&dd->pdev->dev, + "Card did not reset within timeout\n"); + rv = -EIO; + goto out2; + } + } else { + /* Clear any pending interrupts on the HBA */ + writel(readl(dd->mmio + HOST_IRQ_STAT), + dd->mmio + HOST_IRQ_STAT); + } + + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Setup the ISR and enable interrupts. */ + rv = devm_request_irq(&dd->pdev->dev, + dd->pdev->irq, + mtip_irq_handler, + IRQF_SHARED, + dev_driver_string(&dd->pdev->dev), + dd); + + if (rv) { + dev_err(&dd->pdev->dev, + "Unable to allocate IRQ %d\n", dd->pdev->irq); + goto out2; + } + irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding)); + + /* Enable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + init_waitqueue_head(&dd->port->svc_wait); + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { + rv = -EFAULT; + goto out3; + } + + return rv; + +out3: + /* Disable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + /* Release the IRQ. */ + irq_set_affinity_hint(dd->pdev->irq, NULL); + devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + +out2: + mtip_deinit_port(dd->port); + mtip_dma_free(dd); + +out1: + /* Free the memory allocated for the for structure. */ + kfree(dd->port); + + return rv; +} + +static void mtip_standby_drive(struct driver_data *dd) +{ + if (dd->sr) + return; + + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) && + !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) + if (mtip_standby_immediate(dd->port)) + dev_warn(&dd->pdev->dev, + "STANDBY IMMEDIATE failed\n"); +} + +/* + * Called to deinitialize an interface. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_hw_exit(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!dd->sr) { + /* de-initialize the port. */ + mtip_deinit_port(dd->port); + + /* Disable interrupts on the HBA. */ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + } + + /* Release the IRQ. */ + irq_set_affinity_hint(dd->pdev->irq, NULL); + devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + + /* Free dma regions */ + mtip_dma_free(dd); + + /* Free the memory allocated for the for structure. */ + kfree(dd->port); + dd->port = NULL; + + return 0; +} + +/* + * Issue a Standby Immediate command to the device. + * + * This function is called by the Block Layer just before the + * system powers off during a shutdown. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_hw_shutdown(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive so that it + * saves its state. + */ + if (!dd->sr && dd->port) + mtip_standby_immediate(dd->port); + + return 0; +} + +/* + * Suspend function + * + * This function is called by the Block Layer just before the + * system hibernates. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 Suspend was successful + * -EFAULT Suspend was not successful + */ +static int mtip_hw_suspend(struct driver_data *dd) +{ + /* + * Send standby immediate (E0h) to the drive + * so that it saves its state. + */ + if (mtip_standby_immediate(dd->port) != 0) { + dev_err(&dd->pdev->dev, + "Failed standby-immediate command\n"); + return -EFAULT; + } + + /* Disable interrupts on the HBA.*/ + writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN, + dd->mmio + HOST_CTL); + mtip_deinit_port(dd->port); + + return 0; +} + +/* + * Resume function + * + * This function is called by the Block Layer as the + * system resumes. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 Resume was successful + * -EFAULT Resume was not successful + */ +static int mtip_hw_resume(struct driver_data *dd) +{ + /* Perform any needed hardware setup steps */ + hba_setup(dd); + + /* Reset the HBA */ + if (mtip_hba_reset(dd) != 0) { + dev_err(&dd->pdev->dev, + "Unable to reset the HBA\n"); + return -EFAULT; + } + + /* + * Enable the port, DMA engine, and FIS reception specific + * h/w in controller. + */ + mtip_init_port(dd->port); + mtip_start_port(dd->port); + + /* Enable interrupts on the HBA.*/ + writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN, + dd->mmio + HOST_CTL); + + return 0; +} + +/* + * Helper function for reusing disk name + * upon hot insertion. + */ +static int rssd_disk_name_format(char *prefix, + int index, + char *buf, + int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + +/* + * Block layer IOCTL handler. + * + * @dev Pointer to the block_device structure. + * @mode ignored + * @cmd IOCTL command passed from the user application. + * @arg Argument passed from the user application. + * + * return value + * 0 IOCTL completed successfully. + * -ENOTTY IOCTL not supported or invalid driver data + * structure pointer. + */ +static int mtip_block_ioctl(struct block_device *dev, + fmode_t mode, + unsigned cmd, + unsigned long arg) +{ + struct driver_data *dd = dev->bd_disk->private_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!dd) + return -ENOTTY; + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) + return -ENOTTY; + + switch (cmd) { + case BLKFLSBUF: + return -ENOTTY; + default: + return mtip_hw_ioctl(dd, cmd, arg); + } +} + +#ifdef CONFIG_COMPAT +/* + * Block layer compat IOCTL handler. + * + * @dev Pointer to the block_device structure. + * @mode ignored + * @cmd IOCTL command passed from the user application. + * @arg Argument passed from the user application. + * + * return value + * 0 IOCTL completed successfully. + * -ENOTTY IOCTL not supported or invalid driver data + * structure pointer. + */ +static int mtip_block_compat_ioctl(struct block_device *dev, + fmode_t mode, + unsigned cmd, + unsigned long arg) +{ + struct driver_data *dd = dev->bd_disk->private_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + if (!dd) + return -ENOTTY; + + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) + return -ENOTTY; + + switch (cmd) { + case BLKFLSBUF: + return -ENOTTY; + case HDIO_DRIVE_TASKFILE: { + struct mtip_compat_ide_task_request_s __user *compat_req_task; + ide_task_request_t req_task; + int compat_tasksize, outtotal, ret; + + compat_tasksize = + sizeof(struct mtip_compat_ide_task_request_s); + + compat_req_task = + (struct mtip_compat_ide_task_request_s __user *) arg; + + if (copy_from_user(&req_task, (void __user *) arg, + compat_tasksize - (2 * sizeof(compat_long_t)))) + return -EFAULT; + + if (get_user(req_task.out_size, &compat_req_task->out_size)) + return -EFAULT; + + if (get_user(req_task.in_size, &compat_req_task->in_size)) + return -EFAULT; + + outtotal = sizeof(struct mtip_compat_ide_task_request_s); + + ret = exec_drive_taskfile(dd, (void __user *) arg, + &req_task, outtotal); + + if (copy_to_user((void __user *) arg, &req_task, + compat_tasksize - + (2 * sizeof(compat_long_t)))) + return -EFAULT; + + if (put_user(req_task.out_size, &compat_req_task->out_size)) + return -EFAULT; + + if (put_user(req_task.in_size, &compat_req_task->in_size)) + return -EFAULT; + + return ret; + } + default: + return mtip_hw_ioctl(dd, cmd, arg); + } +} +#endif + +/* + * Obtain the geometry of the device. + * + * You may think that this function is obsolete, but some applications, + * fdisk for example still used CHS values. This function describes the + * device as having 224 heads and 56 sectors per cylinder. These values are + * chosen so that each cylinder is aligned on a 4KB boundary. Since a + * partition is described in terms of a start and end cylinder this means + * that each partition is also 4KB aligned. Non-aligned partitions adversely + * affects performance. + * + * @dev Pointer to the block_device strucutre. + * @geo Pointer to a hd_geometry structure. + * + * return value + * 0 Operation completed successfully. + * -ENOTTY An error occurred while reading the drive capacity. + */ +static int mtip_block_getgeo(struct block_device *dev, + struct hd_geometry *geo) +{ + struct driver_data *dd = dev->bd_disk->private_data; + sector_t capacity; + + if (!dd) + return -ENOTTY; + + if (!(mtip_hw_get_capacity(dd, &capacity))) { + dev_warn(&dd->pdev->dev, + "Could not get drive capacity.\n"); + return -ENOTTY; + } + + geo->heads = 224; + geo->sectors = 56; + sector_div(capacity, (geo->heads * geo->sectors)); + geo->cylinders = capacity; + return 0; +} + +/* + * Block device operation function. + * + * This structure contains pointers to the functions required by the block + * layer. + */ +static const struct block_device_operations mtip_block_ops = { + .ioctl = mtip_block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = mtip_block_compat_ioctl, +#endif + .getgeo = mtip_block_getgeo, + .owner = THIS_MODULE +}; + +/* + * Block layer make request function. + * + * This function is called by the kernel to process a BIO for + * the P320 device. + * + * @queue Pointer to the request queue. Unused other than to obtain + * the driver data structure. + * @rq Pointer to the request. + * + */ +static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) +{ + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + unsigned int nents; + + if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) { + return -ENXIO; + } + if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { + return -ENODATA; + } + if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, + &dd->dd_flag) && + rq_data_dir(rq))) { + return -ENODATA; + } + if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))) + return -ENODATA; + if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) + return -ENXIO; + } + + if (rq->cmd_flags & REQ_DISCARD) { + int err; + + err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); + blk_mq_end_request(rq, err); + return 0; + } + + /* Create the scatter list for this request. */ + nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg); + + /* Issue the read/write. */ + mtip_hw_submit_io(dd, rq, cmd, nents, hctx); + return 0; +} + +static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct driver_data *dd = hctx->queue->queuedata; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (rq_data_dir(rq) == READ || !dd->unal_qdepth) + return false; + + /* + * If unaligned depth must be limited on this controller, mark it + * as unaligned if the IO isn't on a 4k boundary (start of length). + */ + if (blk_rq_sectors(rq) <= 64) { + if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7)) + cmd->unaligned = 1; + } + + if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal)) + return true; + + return false; +} + +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + int ret; + + if (unlikely(mtip_check_unal_depth(hctx, rq))) + return BLK_MQ_RQ_QUEUE_BUSY; + + blk_mq_start_request(rq); + + ret = mtip_submit_request(hctx, rq); + if (likely(!ret)) + return BLK_MQ_RQ_QUEUE_OK; + + rq->errors = ret; + return BLK_MQ_RQ_QUEUE_ERROR; +} + +static void mtip_free_cmd(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (!cmd->command) + return; + + dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + cmd->command, cmd->command_dma); +} + +static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, + unsigned int request_idx, unsigned int numa_node) +{ + struct driver_data *dd = data; + struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); + u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + + cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, + &cmd->command_dma, GFP_KERNEL); + if (!cmd->command) + return -ENOMEM; + + memset(cmd->command, 0, CMD_DMA_ALLOC_SZ); + + /* Point the command headers at the command tables. */ + cmd->command_header = dd->port->command_list + + (sizeof(struct mtip_cmd_hdr) * request_idx); + cmd->command_header_dma = dd->port->command_list_dma + + (sizeof(struct mtip_cmd_hdr) * request_idx); + + if (host_cap_64) + cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16); + + cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF); + + sg_init_table(cmd->sg, MTIP_MAX_SG); + return 0; +} + +static struct blk_mq_ops mtip_mq_ops = { + .queue_rq = mtip_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = mtip_init_cmd, + .exit_request = mtip_free_cmd, +}; + +/* + * Block layer initialization function. + * + * This function is called once by the PCI layer for each P320 + * device that is connected to the system. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 on success else an error code. + */ +static int mtip_block_initialize(struct driver_data *dd) +{ + int rv = 0, wait_for_rebuild = 0; + sector_t capacity; + unsigned int index = 0; + struct kobject *kobj; + unsigned char thd_name[16]; + + if (dd->disk) + goto skip_create_disk; /* hw init done, before rebuild */ + + if (mtip_hw_init(dd)) { + rv = -EINVAL; + goto protocol_init_error; + } + + dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node); + if (dd->disk == NULL) { + dev_err(&dd->pdev->dev, + "Unable to allocate gendisk structure\n"); + rv = -EINVAL; + goto alloc_disk_error; + } + + /* Generate the disk name, implemented same as in sd.c */ + do { + if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL)) + goto ida_get_error; + + spin_lock(&rssd_index_lock); + rv = ida_get_new(&rssd_index_ida, &index); + spin_unlock(&rssd_index_lock); + } while (rv == -EAGAIN); + + if (rv) + goto ida_get_error; + + rv = rssd_disk_name_format("rssd", + index, + dd->disk->disk_name, + DISK_NAME_LEN); + if (rv) + goto disk_index_error; + + dd->disk->driverfs_dev = &dd->pdev->dev; + dd->disk->major = dd->major; + dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS; + dd->disk->fops = &mtip_block_ops; + dd->disk->private_data = dd; + dd->index = index; + + mtip_hw_debugfs_init(dd); + +skip_create_disk: + memset(&dd->tags, 0, sizeof(dd->tags)); + dd->tags.ops = &mtip_mq_ops; + dd->tags.nr_hw_queues = 1; + dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS; + dd->tags.reserved_tags = 1; + dd->tags.cmd_size = sizeof(struct mtip_cmd); + dd->tags.numa_node = dd->numa_node; + dd->tags.flags = BLK_MQ_F_SHOULD_MERGE; + dd->tags.driver_data = dd; + + rv = blk_mq_alloc_tag_set(&dd->tags); + if (rv) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + goto block_queue_alloc_init_error; + } + + /* Allocate the request queue. */ + dd->queue = blk_mq_init_queue(&dd->tags); + if (IS_ERR(dd->queue)) { + dev_err(&dd->pdev->dev, + "Unable to allocate request queue\n"); + rv = -ENOMEM; + goto block_queue_alloc_init_error; + } + + dd->disk->queue = dd->queue; + dd->queue->queuedata = dd; + + /* Initialize the protocol layer. */ + wait_for_rebuild = mtip_hw_get_identify(dd); + if (wait_for_rebuild < 0) { + dev_err(&dd->pdev->dev, + "Protocol layer initialization failed\n"); + rv = -EINVAL; + goto init_hw_cmds_error; + } + + /* + * if rebuild pending, start the service thread, and delay the block + * queue creation and add_disk() + */ + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + goto start_service_thread; + + /* Set device limits. */ + set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); + clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags); + blk_queue_max_segments(dd->queue, MTIP_MAX_SG); + blk_queue_physical_block_size(dd->queue, 4096); + blk_queue_max_hw_sectors(dd->queue, 0xffff); + blk_queue_max_segment_size(dd->queue, 0x400000); + blk_queue_io_min(dd->queue, 4096); + blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask); + + /* + * write back cache is not supported in the device. FUA depends on + * write back cache support, hence setting flush support to zero. + */ + blk_queue_flush(dd->queue, 0); + + /* Signal trim support */ + if (dd->trim_supp == true) { + set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags); + dd->queue->limits.discard_granularity = 4096; + blk_queue_max_discard_sectors(dd->queue, + MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES); + dd->queue->limits.discard_zeroes_data = 0; + } + + /* Set the capacity of the device in 512 byte sectors. */ + if (!(mtip_hw_get_capacity(dd, &capacity))) { + dev_warn(&dd->pdev->dev, + "Could not read drive capacity\n"); + rv = -EIO; + goto read_capacity_error; + } + set_capacity(dd->disk, capacity); + + /* Enable the block device and add it to /dev */ + add_disk(dd->disk); + + dd->bdev = bdget_disk(dd->disk, 0); + /* + * Now that the disk is active, initialize any sysfs attributes + * managed by the protocol layer. + */ + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_init(dd, kobj); + kobject_put(kobj); + } + + if (dd->mtip_svc_handler) { + set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); + return rv; /* service thread created for handling rebuild */ + } + +start_service_thread: + sprintf(thd_name, "mtip_svc_thd_%02d", index); + dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread, + dd, dd->numa_node, "%s", + thd_name); + + if (IS_ERR(dd->mtip_svc_handler)) { + dev_err(&dd->pdev->dev, "service thread failed to start\n"); + dd->mtip_svc_handler = NULL; + rv = -EFAULT; + goto kthread_run_error; + } + wake_up_process(dd->mtip_svc_handler); + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + rv = wait_for_rebuild; + + return rv; + +kthread_run_error: + bdput(dd->bdev); + dd->bdev = NULL; + + /* Delete our gendisk. This also removes the device from /dev */ + del_gendisk(dd->disk); + +read_capacity_error: +init_hw_cmds_error: + blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); +block_queue_alloc_init_error: + mtip_hw_debugfs_exit(dd); +disk_index_error: + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, index); + spin_unlock(&rssd_index_lock); + +ida_get_error: + put_disk(dd->disk); + +alloc_disk_error: + mtip_hw_exit(dd); /* De-initialize the protocol layer. */ + +protocol_init_error: + return rv; +} + +/* + * Block layer deinitialization function. + * + * Called by the PCI layer as each P320 device is removed. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_block_remove(struct driver_data *dd) +{ + struct kobject *kobj; + + if (!dd->sr) { + mtip_hw_debugfs_exit(dd); + + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } + + /* Clean up the sysfs attributes, if created */ + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } + } + + mtip_standby_drive(dd); + + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + if (dd->bdev) { + bdput(dd->bdev); + dd->bdev = NULL; + } + if (dd->disk) { + if (dd->disk->queue) { + del_gendisk(dd->disk); + blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + dd->queue = NULL; + } else + put_disk(dd->disk); + } + dd->disk = NULL; + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + } else { + dev_info(&dd->pdev->dev, "device %s surprise removal\n", + dd->disk->disk_name); + } + + /* De-initialize the protocol layer. */ + mtip_hw_exit(dd); + + return 0; +} + +/* + * Function called by the PCI layer when just before the + * machine shuts down. + * + * If a protocol layer shutdown function is present it will be called + * by this function. + * + * @dd Pointer to the driver data structure. + * + * return value + * 0 + */ +static int mtip_block_shutdown(struct driver_data *dd) +{ + mtip_hw_shutdown(dd); + + /* Delete our gendisk structure, and cleanup the blk queue. */ + if (dd->disk) { + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); + + if (dd->disk->queue) { + del_gendisk(dd->disk); + blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + } else + put_disk(dd->disk); + dd->disk = NULL; + dd->queue = NULL; + } + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + return 0; +} + +static int mtip_block_suspend(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, + "Suspending %s ...\n", dd->disk->disk_name); + mtip_hw_suspend(dd); + return 0; +} + +static int mtip_block_resume(struct driver_data *dd) +{ + dev_info(&dd->pdev->dev, "Resuming %s ...\n", + dd->disk->disk_name); + mtip_hw_resume(dd); + return 0; +} + +static void drop_cpu(int cpu) +{ + cpu_use[cpu]--; +} + +static int get_least_used_cpu_on_node(int node) +{ + int cpu, least_used_cpu, least_cnt; + const struct cpumask *node_mask; + + node_mask = cpumask_of_node(node); + least_used_cpu = cpumask_first(node_mask); + least_cnt = cpu_use[least_used_cpu]; + cpu = least_used_cpu; + + for_each_cpu(cpu, node_mask) { + if (cpu_use[cpu] < least_cnt) { + least_used_cpu = cpu; + least_cnt = cpu_use[cpu]; + } + } + cpu_use[least_used_cpu]++; + return least_used_cpu; +} + +/* Helper for selecting a node in round robin mode */ +static inline int mtip_get_next_rr_node(void) +{ + static int next_node = -1; + + if (next_node == -1) { + next_node = first_online_node; + return next_node; + } + + next_node = next_online_node(next_node); + if (next_node == MAX_NUMNODES) + next_node = first_online_node; + return next_node; +} + +static DEFINE_HANDLER(0); +static DEFINE_HANDLER(1); +static DEFINE_HANDLER(2); +static DEFINE_HANDLER(3); +static DEFINE_HANDLER(4); +static DEFINE_HANDLER(5); +static DEFINE_HANDLER(6); +static DEFINE_HANDLER(7); + +static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev) +{ + int pos; + unsigned short pcie_dev_ctrl; + + pos = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pos) { + pci_read_config_word(pdev, + pos + PCI_EXP_DEVCTL, + &pcie_dev_ctrl); + if (pcie_dev_ctrl & (1 << 11) || + pcie_dev_ctrl & (1 << 4)) { + dev_info(&dd->pdev->dev, + "Disabling ERO/No-Snoop on bridge device %04x:%04x\n", + pdev->vendor, pdev->device); + pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN | + PCI_EXP_DEVCTL_RELAX_EN); + pci_write_config_word(pdev, + pos + PCI_EXP_DEVCTL, + pcie_dev_ctrl); + } + } +} + +static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev) +{ + /* + * This workaround is specific to AMD/ATI chipset with a PCI upstream + * device with device id 0x5aXX + */ + if (pdev->bus && pdev->bus->self) { + if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI && + ((pdev->bus->self->device & 0xff00) == 0x5a00)) { + mtip_disable_link_opts(dd, pdev->bus->self); + } else { + /* Check further up the topology */ + struct pci_dev *parent_dev = pdev->bus->self; + if (parent_dev->bus && + parent_dev->bus->parent && + parent_dev->bus->parent->self && + parent_dev->bus->parent->self->vendor == + PCI_VENDOR_ID_ATI && + (parent_dev->bus->parent->self->device & + 0xff00) == 0x5a00) { + mtip_disable_link_opts(dd, + parent_dev->bus->parent->self); + } + } + } +} + +/* + * Called for each supported PCI device detected. + * + * This function allocates the private data structure, enables the + * PCI device and then calls the block layer initialization function. + * + * return value + * 0 on success else an error code. + */ +static int mtip_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int rv = 0; + struct driver_data *dd = NULL; + char cpu_list[256]; + const struct cpumask *node_mask; + int cpu, i = 0, j = 0; + int my_node = NUMA_NO_NODE; + unsigned long flags; + + /* Allocate memory for this devices private data. */ + my_node = pcibus_to_node(pdev->bus); + if (my_node != NUMA_NO_NODE) { + if (!node_online(my_node)) + my_node = mtip_get_next_rr_node(); + } else { + dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n"); + my_node = mtip_get_next_rr_node(); + } + dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n", + my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev), + cpu_to_node(raw_smp_processor_id()), raw_smp_processor_id()); + + dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node); + if (dd == NULL) { + dev_err(&pdev->dev, + "Unable to allocate memory for driver data\n"); + return -ENOMEM; + } + + /* Attach the private data to this PCI device. */ + pci_set_drvdata(pdev, dd); + + rv = pcim_enable_device(pdev); + if (rv < 0) { + dev_err(&pdev->dev, "Unable to enable device\n"); + goto iomap_err; + } + + /* Map BAR5 to memory. */ + rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME); + if (rv < 0) { + dev_err(&pdev->dev, "Unable to map regions\n"); + goto iomap_err; + } + + if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) { + rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + + if (rv) { + rv = pci_set_consistent_dma_mask(pdev, + DMA_BIT_MASK(32)); + if (rv) { + dev_warn(&pdev->dev, + "64-bit DMA enable failed\n"); + goto setmask_err; + } + } + } + + /* Copy the info we may need later into the private data structure. */ + dd->major = mtip_major; + dd->instance = instance; + dd->pdev = pdev; + dd->numa_node = my_node; + + INIT_LIST_HEAD(&dd->online_list); + INIT_LIST_HEAD(&dd->remove_list); + + memset(dd->workq_name, 0, 32); + snprintf(dd->workq_name, 31, "mtipq%d", dd->instance); + + dd->isr_workq = create_workqueue(dd->workq_name); + if (!dd->isr_workq) { + dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance); + rv = -ENOMEM; + goto block_initialize_err; + } + + memset(cpu_list, 0, sizeof(cpu_list)); + + node_mask = cpumask_of_node(dd->numa_node); + if (!cpumask_empty(node_mask)) { + for_each_cpu(cpu, node_mask) + { + snprintf(&cpu_list[j], 256 - j, "%d ", cpu); + j = strlen(cpu_list); + } + + dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n", + dd->numa_node, + topology_physical_package_id(cpumask_first(node_mask)), + nr_cpus_node(dd->numa_node), + cpu_list); + } else + dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n"); + + dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node); + dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n", + cpu_to_node(dd->isr_binding), dd->isr_binding); + + /* first worker context always runs in ISR */ + dd->work[0].cpu_binding = dd->isr_binding; + dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node); + dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node); + dd->work[3].cpu_binding = dd->work[0].cpu_binding; + dd->work[4].cpu_binding = dd->work[1].cpu_binding; + dd->work[5].cpu_binding = dd->work[2].cpu_binding; + dd->work[6].cpu_binding = dd->work[2].cpu_binding; + dd->work[7].cpu_binding = dd->work[1].cpu_binding; + + /* Log the bindings */ + for_each_present_cpu(cpu) { + memset(cpu_list, 0, sizeof(cpu_list)); + for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) { + if (dd->work[i].cpu_binding == cpu) { + snprintf(&cpu_list[j], 256 - j, "%d ", i); + j = strlen(cpu_list); + } + } + if (j) + dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list); + } + + INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0); + INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1); + INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2); + INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3); + INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4); + INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5); + INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6); + INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7); + + pci_set_master(pdev); + rv = pci_enable_msi(pdev); + if (rv) { + dev_warn(&pdev->dev, + "Unable to enable MSI interrupt.\n"); + goto msi_initialize_err; + } + + mtip_fix_ero_nosnoop(dd, pdev); + + /* Initialize the block layer. */ + rv = mtip_block_initialize(dd); + if (rv < 0) { + dev_err(&pdev->dev, + "Unable to initialize block layer\n"); + goto block_initialize_err; + } + + /* + * Increment the instance count so that each device has a unique + * instance number. + */ + instance++; + if (rv != MTIP_FTL_REBUILD_MAGIC) + set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); + else + rv = 0; /* device in rebuild state, return 0 from probe */ + + /* Add to online list even if in ftl rebuild */ + spin_lock_irqsave(&dev_lock, flags); + list_add(&dd->online_list, &online_list); + spin_unlock_irqrestore(&dev_lock, flags); + + goto done; + +block_initialize_err: + pci_disable_msi(pdev); + +msi_initialize_err: + if (dd->isr_workq) { + flush_workqueue(dd->isr_workq); + destroy_workqueue(dd->isr_workq); + drop_cpu(dd->work[0].cpu_binding); + drop_cpu(dd->work[1].cpu_binding); + drop_cpu(dd->work[2].cpu_binding); + } +setmask_err: + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); + +iomap_err: + kfree(dd); + pci_set_drvdata(pdev, NULL); + return rv; +done: + return rv; +} + +/* + * Called for each probed device when the device is removed or the + * driver is unloaded. + * + * return value + * None + */ +static void mtip_pci_remove(struct pci_dev *pdev) +{ + struct driver_data *dd = pci_get_drvdata(pdev); + unsigned long flags, to; + + set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); + + spin_lock_irqsave(&dev_lock, flags); + list_del_init(&dd->online_list); + list_add(&dd->remove_list, &removing_list); + spin_unlock_irqrestore(&dev_lock, flags); + + mtip_check_surprise_removal(pdev); + synchronize_irq(dd->pdev->irq); + + /* Spin until workers are done */ + to = jiffies + msecs_to_jiffies(4000); + do { + msleep(20); + } while (atomic_read(&dd->irq_workers_active) != 0 && + time_before(jiffies, to)); + + if (atomic_read(&dd->irq_workers_active) != 0) { + dev_warn(&dd->pdev->dev, + "Completion workers still active!\n"); + } + + /* Clean up the block layer. */ + mtip_block_remove(dd); + + if (dd->isr_workq) { + flush_workqueue(dd->isr_workq); + destroy_workqueue(dd->isr_workq); + drop_cpu(dd->work[0].cpu_binding); + drop_cpu(dd->work[1].cpu_binding); + drop_cpu(dd->work[2].cpu_binding); + } + + pci_disable_msi(pdev); + + spin_lock_irqsave(&dev_lock, flags); + list_del_init(&dd->remove_list); + spin_unlock_irqrestore(&dev_lock, flags); + + if (!dd->sr) + kfree(dd); + else + set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag); + + pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); + pci_set_drvdata(pdev, NULL); +} + +/* + * Called for each probed device when the device is suspended. + * + * return value + * 0 Success + * <0 Error + */ +static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) +{ + int rv = 0; + struct driver_data *dd = pci_get_drvdata(pdev); + + if (!dd) { + dev_err(&pdev->dev, + "Driver private datastructure is NULL\n"); + return -EFAULT; + } + + set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); + + /* Disable ports & interrupts then send standby immediate */ + rv = mtip_block_suspend(dd); + if (rv < 0) { + dev_err(&pdev->dev, + "Failed to suspend controller\n"); + return rv; + } + + /* + * Save the pci config space to pdev structure & + * disable the device + */ + pci_save_state(pdev); + pci_disable_device(pdev); + + /* Move to Low power state*/ + pci_set_power_state(pdev, PCI_D3hot); + + return rv; +} + +/* + * Called for each probed device when the device is resumed. + * + * return value + * 0 Success + * <0 Error + */ +static int mtip_pci_resume(struct pci_dev *pdev) +{ + int rv = 0; + struct driver_data *dd; + + dd = pci_get_drvdata(pdev); + if (!dd) { + dev_err(&pdev->dev, + "Driver private datastructure is NULL\n"); + return -EFAULT; + } + + /* Move the device to active State */ + pci_set_power_state(pdev, PCI_D0); + + /* Restore PCI configuration space */ + pci_restore_state(pdev); + + /* Enable the PCI device*/ + rv = pcim_enable_device(pdev); + if (rv < 0) { + dev_err(&pdev->dev, + "Failed to enable card during resume\n"); + goto err; + } + pci_set_master(pdev); + + /* + * Calls hbaReset, initPort, & startPort function + * then enables interrupts + */ + rv = mtip_block_resume(dd); + if (rv < 0) + dev_err(&pdev->dev, "Unable to resume\n"); + +err: + clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); + + return rv; +} + +/* + * Shutdown routine + * + * return value + * None + */ +static void mtip_pci_shutdown(struct pci_dev *pdev) +{ + struct driver_data *dd = pci_get_drvdata(pdev); + if (dd) + mtip_block_shutdown(dd); +} + +/* Table of device ids supported by this driver. */ +static const struct pci_device_id mtip_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) }, + { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) }, + { 0 } +}; + +/* Structure that describes the PCI driver functions. */ +static struct pci_driver mtip_pci_driver = { + .name = MTIP_DRV_NAME, + .id_table = mtip_pci_tbl, + .probe = mtip_pci_probe, + .remove = mtip_pci_remove, + .suspend = mtip_pci_suspend, + .resume = mtip_pci_resume, + .shutdown = mtip_pci_shutdown, +}; + +MODULE_DEVICE_TABLE(pci, mtip_pci_tbl); + +/* + * Module initialization function. + * + * Called once when the module is loaded. This function allocates a major + * block device number to the Cyclone devices and registers the PCI layer + * of the driver. + * + * Return value + * 0 on success else error code. + */ +static int __init mtip_init(void) +{ + int error; + + pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); + + spin_lock_init(&dev_lock); + + INIT_LIST_HEAD(&online_list); + INIT_LIST_HEAD(&removing_list); + + /* Allocate a major block device number to use with this driver. */ + error = register_blkdev(0, MTIP_DRV_NAME); + if (error <= 0) { + pr_err("Unable to register block device (%d)\n", + error); + return -EBUSY; + } + mtip_major = error; + + dfs_parent = debugfs_create_dir("rssd", NULL); + if (IS_ERR_OR_NULL(dfs_parent)) { + pr_warn("Error creating debugfs parent\n"); + dfs_parent = NULL; + } + if (dfs_parent) { + dfs_device_status = debugfs_create_file("device_status", + S_IRUGO, dfs_parent, NULL, + &mtip_device_status_fops); + if (IS_ERR_OR_NULL(dfs_device_status)) { + pr_err("Error creating device_status node\n"); + dfs_device_status = NULL; + } + } + + /* Register our PCI operations. */ + error = pci_register_driver(&mtip_pci_driver); + if (error) { + debugfs_remove(dfs_parent); + unregister_blkdev(mtip_major, MTIP_DRV_NAME); + } + + return error; +} + +/* + * Module de-initialization function. + * + * Called once when the module is unloaded. This function deallocates + * the major block device number allocated by mtip_init() and + * unregisters the PCI layer of the driver. + * + * Return value + * none + */ +static void __exit mtip_exit(void) +{ + /* Release the allocated major block device number. */ + unregister_blkdev(mtip_major, MTIP_DRV_NAME); + + /* Unregister the PCI driver. */ + pci_unregister_driver(&mtip_pci_driver); + + debugfs_remove_recursive(dfs_parent); +} + +MODULE_AUTHOR("Micron Technology, Inc"); +MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(MTIP_DRV_VERSION); + +module_init(mtip_init); +module_exit(mtip_exit); diff --git a/kernel/drivers/block/mtip32xx/mtip32xx.h b/kernel/drivers/block/mtip32xx/mtip32xx.h new file mode 100644 index 000000000..ba1b31ee2 --- /dev/null +++ b/kernel/drivers/block/mtip32xx/mtip32xx.h @@ -0,0 +1,511 @@ +/* + * mtip32xx.h - Header file for the P320 SSD Block Driver + * Copyright (C) 2011 Micron Technology, Inc. + * + * Portions of this code were derived from works subjected to the + * following copyright: + * Copyright (C) 2009 Integrated Device Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __MTIP32XX_H__ +#define __MTIP32XX_H__ + +#include +#include +#include +#include +#include + +/* Offset of Subsystem Device ID in pci confoguration space */ +#define PCI_SUBSYSTEM_DEVICEID 0x2E + +/* offset of Device Control register in PCIe extended capabilites space */ +#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48 + +/* check for erase mode support during secure erase */ +#define MTIP_SEC_ERASE_MODE 0x2 + +/* # of times to retry timed out/failed IOs */ +#define MTIP_MAX_RETRIES 2 + +/* Various timeout values in ms */ +#define MTIP_NCQ_CMD_TIMEOUT_MS 15000 +#define MTIP_IOCTL_CMD_TIMEOUT_MS 5000 +#define MTIP_INT_CMD_TIMEOUT_MS 5000 +#define MTIP_QUIESCE_IO_TIMEOUT_MS (MTIP_NCQ_CMD_TIMEOUT_MS * \ + (MTIP_MAX_RETRIES + 1)) + +/* check for timeouts every 500ms */ +#define MTIP_TIMEOUT_CHECK_PERIOD 500 + +/* ftl rebuild */ +#define MTIP_FTL_REBUILD_OFFSET 142 +#define MTIP_FTL_REBUILD_MAGIC 0xED51 +#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000 + +/* unaligned IO handling */ +#define MTIP_MAX_UNALIGNED_SLOTS 2 + +/* Macro to extract the tag bit number from a tag value. */ +#define MTIP_TAG_BIT(tag) (tag & 0x1F) + +/* + * Macro to extract the tag index from a tag value. The index + * is used to access the correct s_active/Command Issue register based + * on the tag value. + */ +#define MTIP_TAG_INDEX(tag) (tag >> 5) + +/* + * Maximum number of scatter gather entries + * a single command may have. + */ +#define MTIP_MAX_SG 504 + +/* + * Maximum number of slot groups (Command Issue & s_active registers) + * NOTE: This is the driver maximum; check dd->slot_groups for actual value. + */ +#define MTIP_MAX_SLOT_GROUPS 8 + +/* Internal command tag. */ +#define MTIP_TAG_INTERNAL 0 + +/* Micron Vendor ID & P320x SSD Device ID */ +#define PCI_VENDOR_ID_MICRON 0x1344 +#define P320H_DEVICE_ID 0x5150 +#define P320M_DEVICE_ID 0x5151 +#define P320S_DEVICE_ID 0x5152 +#define P325M_DEVICE_ID 0x5153 +#define P420H_DEVICE_ID 0x5160 +#define P420M_DEVICE_ID 0x5161 +#define P425M_DEVICE_ID 0x5163 + +/* Driver name and version strings */ +#define MTIP_DRV_NAME "mtip32xx" +#define MTIP_DRV_VERSION "1.3.1" + +/* Maximum number of minor device numbers per device. */ +#define MTIP_MAX_MINORS 16 + +/* Maximum number of supported command slots. */ +#define MTIP_MAX_COMMAND_SLOTS (MTIP_MAX_SLOT_GROUPS * 32) + +/* + * Per-tag bitfield size in longs. + * Linux bit manipulation functions + * (i.e. test_and_set_bit, find_next_zero_bit) + * manipulate memory in longs, so we try to make the math work. + * take the slot groups and find the number of longs, rounding up. + * Careful! i386 and x86_64 use different size longs! + */ +#define U32_PER_LONG (sizeof(long) / sizeof(u32)) +#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \ + (U32_PER_LONG-1))/U32_PER_LONG) + +/* BAR number used to access the HBA registers. */ +#define MTIP_ABAR 5 + +#ifdef DEBUG + #define dbg_printk(format, arg...) \ + printk(pr_fmt(format), ##arg); +#else + #define dbg_printk(format, arg...) +#endif + +#define MTIP_DFS_MAX_BUF_SIZE 1024 + +#define __force_bit2int (unsigned int __force) + +enum { + /* below are bit numbers in 'flags' defined in mtip_port */ + MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */ + MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */ + MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */ + MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */ + MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | + (1 << MTIP_PF_EH_ACTIVE_BIT) | + (1 << MTIP_PF_SE_ACTIVE_BIT) | + (1 << MTIP_PF_DM_ACTIVE_BIT)), + + MTIP_PF_SVC_THD_ACTIVE_BIT = 4, + MTIP_PF_ISSUE_CMDS_BIT = 5, + MTIP_PF_REBUILD_BIT = 6, + MTIP_PF_SR_CLEANUP_BIT = 7, + MTIP_PF_SVC_THD_STOP_BIT = 8, + + /* below are bit numbers in 'dd_flag' defined in driver_data */ + MTIP_DDF_SEC_LOCK_BIT = 0, + MTIP_DDF_REMOVE_PENDING_BIT = 1, + MTIP_DDF_OVER_TEMP_BIT = 2, + MTIP_DDF_WRITE_PROTECT_BIT = 3, + MTIP_DDF_REMOVE_DONE_BIT = 4, + MTIP_DDF_CLEANUP_BIT = 5, + MTIP_DDF_RESUME_BIT = 6, + MTIP_DDF_INIT_DONE_BIT = 7, + MTIP_DDF_REBUILD_FAILED_BIT = 8, + + MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | + (1 << MTIP_DDF_SEC_LOCK_BIT) | + (1 << MTIP_DDF_OVER_TEMP_BIT) | + (1 << MTIP_DDF_WRITE_PROTECT_BIT) | + (1 << MTIP_DDF_REBUILD_FAILED_BIT)), + +}; + +struct smart_attr { + u8 attr_id; + u16 flags; + u8 cur; + u8 worst; + u32 data; + u8 res[3]; +} __packed; + +struct mtip_work { + struct work_struct work; + void *port; + int cpu_binding; + u32 completed; +} ____cacheline_aligned_in_smp; + +#define DEFINE_HANDLER(group) \ + void mtip_workq_sdbf##group(struct work_struct *work) \ + { \ + struct mtip_work *w = (struct mtip_work *) work; \ + mtip_workq_sdbfx(w->port, group, w->completed); \ + } + +#define MTIP_TRIM_TIMEOUT_MS 240000 +#define MTIP_MAX_TRIM_ENTRIES 8 +#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8 + +struct mtip_trim_entry { + u32 lba; /* starting lba of region */ + u16 rsvd; /* unused */ + u16 range; /* # of 512b blocks to trim */ +} __packed; + +struct mtip_trim { + /* Array of regions to trim */ + struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES]; +} __packed; + +/* Register Frame Information Structure (FIS), host to device. */ +struct host_to_dev_fis { + /* + * FIS type. + * - 27h Register FIS, host to device. + * - 34h Register FIS, device to host. + * - 39h DMA Activate FIS, device to host. + * - 41h DMA Setup FIS, bi-directional. + * - 46h Data FIS, bi-directional. + * - 58h BIST Activate FIS, bi-directional. + * - 5Fh PIO Setup FIS, device to host. + * - A1h Set Device Bits FIS, device to host. + */ + unsigned char type; + unsigned char opts; + unsigned char command; + unsigned char features; + + union { + unsigned char lba_low; + unsigned char sector; + }; + union { + unsigned char lba_mid; + unsigned char cyl_low; + }; + union { + unsigned char lba_hi; + unsigned char cyl_hi; + }; + union { + unsigned char device; + unsigned char head; + }; + + union { + unsigned char lba_low_ex; + unsigned char sector_ex; + }; + union { + unsigned char lba_mid_ex; + unsigned char cyl_low_ex; + }; + union { + unsigned char lba_hi_ex; + unsigned char cyl_hi_ex; + }; + unsigned char features_ex; + + unsigned char sect_count; + unsigned char sect_cnt_ex; + unsigned char res2; + unsigned char control; + + unsigned int res3; +}; + +/* Command header structure. */ +struct mtip_cmd_hdr { + /* + * Command options. + * - Bits 31:16 Number of PRD entries. + * - Bits 15:8 Unused in this implementation. + * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries. + * - Bit 6 Write bit, should be set when writing data to the device. + * - Bit 5 Unused in this implementation. + * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes). + */ + unsigned int opts; + /* This field is unsed when using NCQ. */ + union { + unsigned int byte_count; + unsigned int status; + }; + /* + * Lower 32 bits of the command table address associated with this + * header. The command table addresses must be 128 byte aligned. + */ + unsigned int ctba; + /* + * If 64 bit addressing is used this field is the upper 32 bits + * of the command table address associated with this command. + */ + unsigned int ctbau; + /* Reserved and unused. */ + unsigned int res[4]; +}; + +/* Command scatter gather structure (PRD). */ +struct mtip_cmd_sg { + /* + * Low 32 bits of the data buffer address. For P320 this + * address must be 8 byte aligned signified by bits 2:0 being + * set to 0. + */ + unsigned int dba; + /* + * When 64 bit addressing is used this field is the upper + * 32 bits of the data buffer address. + */ + unsigned int dba_upper; + /* Unused. */ + unsigned int reserved; + /* + * Bit 31: interrupt when this data block has been transferred. + * Bits 30..22: reserved + * Bits 21..0: byte count (minus 1). For P320 the byte count must be + * 8 byte aligned signified by bits 2:0 being set to 1. + */ + unsigned int info; +}; +struct mtip_port; + +/* Structure used to describe a command. */ +struct mtip_cmd { + + struct mtip_cmd_hdr *command_header; /* ptr to command header entry */ + + dma_addr_t command_header_dma; /* corresponding physical address */ + + void *command; /* ptr to command table entry */ + + dma_addr_t command_dma; /* corresponding physical address */ + + void *comp_data; /* data passed to completion function comp_func() */ + /* + * Completion function called by the ISR upon completion of + * a command. + */ + void (*comp_func)(struct mtip_port *port, + int tag, + struct mtip_cmd *cmd, + int status); + + int scatter_ents; /* Number of scatter list entries used */ + + int unaligned; /* command is unaligned on 4k boundary */ + + struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */ + + int retries; /* The number of retries left for this command. */ + + int direction; /* Data transfer direction */ +}; + +/* Structure used to describe a port. */ +struct mtip_port { + /* Pointer back to the driver data for this port. */ + struct driver_data *dd; + /* + * Used to determine if the data pointed to by the + * identify field is valid. + */ + unsigned long identify_valid; + /* Base address of the memory mapped IO for the port. */ + void __iomem *mmio; + /* Array of pointers to the memory mapped s_active registers. */ + void __iomem *s_active[MTIP_MAX_SLOT_GROUPS]; + /* Array of pointers to the memory mapped completed registers. */ + void __iomem *completed[MTIP_MAX_SLOT_GROUPS]; + /* Array of pointers to the memory mapped Command Issue registers. */ + void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS]; + /* + * Pointer to the beginning of the command header memory as used + * by the driver. + */ + void *command_list; + /* + * Pointer to the beginning of the command header memory as used + * by the DMA. + */ + dma_addr_t command_list_dma; + /* + * Pointer to the beginning of the RX FIS memory as used + * by the driver. + */ + void *rxfis; + /* + * Pointer to the beginning of the RX FIS memory as used + * by the DMA. + */ + dma_addr_t rxfis_dma; + /* + * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART + */ + void *block1; + /* + * DMA address of region for RX Fis, Identify, RLE10, and SMART + */ + dma_addr_t block1_dma; + /* + * Pointer to the beginning of the identify data memory as used + * by the driver. + */ + u16 *identify; + /* + * Pointer to the beginning of the identify data memory as used + * by the DMA. + */ + dma_addr_t identify_dma; + /* + * Pointer to the beginning of a sector buffer that is used + * by the driver when issuing internal commands. + */ + u16 *sector_buffer; + /* + * Pointer to the beginning of a sector buffer that is used + * by the DMA when the driver issues internal commands. + */ + dma_addr_t sector_buffer_dma; + /* + * Bit significant, used to determine if a command slot has + * been allocated. i.e. the slot is in use. Bits are cleared + * when the command slot and all associated data structures + * are no longer needed. + */ + u16 *log_buf; + dma_addr_t log_buf_dma; + + u8 *smart_buf; + dma_addr_t smart_buf_dma; + + unsigned long allocated[SLOTBITS_IN_LONGS]; + /* + * used to queue commands when an internal command is in progress + * or error handling is active + */ + unsigned long cmds_to_issue[SLOTBITS_IN_LONGS]; + /* Used by mtip_service_thread to wait for an event */ + wait_queue_head_t svc_wait; + /* + * indicates the state of the port. Also, helps the service thread + * to determine its action on wake up. + */ + unsigned long flags; + /* + * Timer used to complete commands that have been active for too long. + */ + unsigned long ic_pause_timer; + + /* Semaphore to control queue depth of unaligned IOs */ + struct semaphore cmd_slot_unal; + + /* Spinlock for working around command-issue bug. */ + spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS]; +}; + +/* + * Driver private data structure. + * + * One structure is allocated per probed device. + */ +struct driver_data { + void __iomem *mmio; /* Base address of the HBA registers. */ + + int major; /* Major device number. */ + + int instance; /* Instance number. First device probed is 0, ... */ + + struct gendisk *disk; /* Pointer to our gendisk structure. */ + + struct pci_dev *pdev; /* Pointer to the PCI device structure. */ + + struct request_queue *queue; /* Our request queue. */ + + struct blk_mq_tag_set tags; /* blk_mq tags */ + + struct mtip_port *port; /* Pointer to the port data structure. */ + + unsigned product_type; /* magic value declaring the product type */ + + unsigned slot_groups; /* number of slot groups the product supports */ + + unsigned long index; /* Index to determine the disk name */ + + unsigned long dd_flag; /* NOTE: use atomic bit operations on this */ + + struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ + + struct dentry *dfs_node; + + bool trim_supp; /* flag indicating trim support */ + + bool sr; + + int numa_node; /* NUMA support */ + + char workq_name[32]; + + struct workqueue_struct *isr_workq; + + atomic_t irq_workers_active; + + struct mtip_work work[MTIP_MAX_SLOT_GROUPS]; + + int isr_binding; + + struct block_device *bdev; + + struct list_head online_list; /* linkage for online list */ + + struct list_head remove_list; /* linkage for removing list */ + + int unal_qdepth; /* qdepth of unaligned IO queue */ +}; + +#endif diff --git a/kernel/drivers/block/nbd.c b/kernel/drivers/block/nbd.c new file mode 100644 index 000000000..39e5f7fae --- /dev/null +++ b/kernel/drivers/block/nbd.c @@ -0,0 +1,896 @@ +/* + * Network block device - make block devices work over TCP + * + * Note that you can not swap over this thing, yet. Seems to work but + * deadlocks sometimes - you can not swap over TCP in general. + * + * Copyright 1997-2000, 2008 Pavel Machek + * Parts copyright 2001 Steven Whitehouse + * + * This file is released under GPLv2 or later. + * + * (part of code stolen from loop.c) + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +struct nbd_device { + int flags; + int harderror; /* Code of hard error */ + struct socket * sock; /* If == NULL, device is not ready, yet */ + int magic; + + spinlock_t queue_lock; + struct list_head queue_head; /* Requests waiting result */ + struct request *active_req; + wait_queue_head_t active_wq; + struct list_head waiting_queue; /* Requests to be sent */ + wait_queue_head_t waiting_wq; + + struct mutex tx_lock; + struct gendisk *disk; + int blksize; + loff_t bytesize; + pid_t pid; /* pid of nbd-client, if attached */ + int xmit_timeout; + int disconnect; /* a disconnect has been requested by user */ +}; + +#define NBD_MAGIC 0x68797548 + +static unsigned int nbds_max = 16; +static struct nbd_device *nbd_dev; +static int max_part; + +/* + * Use just one lock (or at most 1 per NIC). Two arguments for this: + * 1. Each NIC is essentially a synchronization point for all servers + * accessed through that NIC so there's no need to have more locks + * than NICs anyway. + * 2. More locks lead to more "Dirty cache line bouncing" which will slow + * down each lock to the point where they're actually slower than just + * a single lock. + * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this! + */ +static DEFINE_SPINLOCK(nbd_lock); + +static inline struct device *nbd_to_dev(struct nbd_device *nbd) +{ + return disk_to_dev(nbd->disk); +} + +static const char *nbdcmd_to_ascii(int cmd) +{ + switch (cmd) { + case NBD_CMD_READ: return "read"; + case NBD_CMD_WRITE: return "write"; + case NBD_CMD_DISC: return "disconnect"; + case NBD_CMD_FLUSH: return "flush"; + case NBD_CMD_TRIM: return "trim/discard"; + } + return "invalid"; +} + +static void nbd_end_request(struct nbd_device *nbd, struct request *req) +{ + int error = req->errors ? -EIO : 0; + struct request_queue *q = req->q; + unsigned long flags; + + dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req, + error ? "failed" : "done"); + + spin_lock_irqsave(q->queue_lock, flags); + __blk_end_request_all(req, error); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +/* + * Forcibly shutdown the socket causing all listeners to error + */ +static void sock_shutdown(struct nbd_device *nbd, int lock) +{ + if (lock) + mutex_lock(&nbd->tx_lock); + if (nbd->sock) { + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; + } + if (lock) + mutex_unlock(&nbd->tx_lock); +} + +static void nbd_xmit_timeout(unsigned long arg) +{ + struct task_struct *task = (struct task_struct *)arg; + + printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n", + task->comm, task->pid); + force_sig(SIGKILL, task); +} + +/* + * Send or receive packet. + */ +static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, + int msg_flags) +{ + struct socket *sock = nbd->sock; + int result; + struct msghdr msg; + struct kvec iov; + sigset_t blocked, oldset; + unsigned long pflags = current->flags; + + if (unlikely(!sock)) { + dev_err(disk_to_dev(nbd->disk), + "Attempted %s on closed socket in sock_xmit\n", + (send ? "send" : "recv")); + return -EINVAL; + } + + /* Allow interception of SIGKILL only + * Don't allow other signals to interrupt the transmission */ + siginitsetinv(&blocked, sigmask(SIGKILL)); + sigprocmask(SIG_SETMASK, &blocked, &oldset); + + current->flags |= PF_MEMALLOC; + do { + sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; + iov.iov_base = buf; + iov.iov_len = size; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + + if (send) { + struct timer_list ti; + + if (nbd->xmit_timeout) { + init_timer(&ti); + ti.function = nbd_xmit_timeout; + ti.data = (unsigned long)current; + ti.expires = jiffies + nbd->xmit_timeout; + add_timer(&ti); + } + result = kernel_sendmsg(sock, &msg, &iov, 1, size); + if (nbd->xmit_timeout) + del_timer_sync(&ti); + } else + result = kernel_recvmsg(sock, &msg, &iov, 1, size, + msg.msg_flags); + + if (signal_pending(current)) { + siginfo_t info; + printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n", + task_pid_nr(current), current->comm, + dequeue_signal_lock(current, ¤t->blocked, &info)); + result = -EINTR; + sock_shutdown(nbd, !send); + break; + } + + if (result <= 0) { + if (result == 0) + result = -EPIPE; /* short read */ + break; + } + size -= result; + buf += result; + } while (size > 0); + + sigprocmask(SIG_SETMASK, &oldset, NULL); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + + return result; +} + +static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, + int flags) +{ + int result; + void *kaddr = kmap(bvec->bv_page); + result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset, + bvec->bv_len, flags); + kunmap(bvec->bv_page); + return result; +} + +/* always call with the tx_lock held */ +static int nbd_send_req(struct nbd_device *nbd, struct request *req) +{ + int result, flags; + struct nbd_request request; + unsigned long size = blk_rq_bytes(req); + + memset(&request, 0, sizeof(request)); + request.magic = htonl(NBD_REQUEST_MAGIC); + request.type = htonl(nbd_cmd(req)); + + if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) { + request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); + request.len = htonl(size); + } + memcpy(request.handle, &req, sizeof(req)); + + dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", + req, nbdcmd_to_ascii(nbd_cmd(req)), + (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); + result = sock_xmit(nbd, 1, &request, sizeof(request), + (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); + if (result <= 0) { + dev_err(disk_to_dev(nbd->disk), + "Send control failed (result %d)\n", result); + return -EIO; + } + + if (nbd_cmd(req) == NBD_CMD_WRITE) { + struct req_iterator iter; + struct bio_vec bvec; + /* + * we are really probing at internals to determine + * whether to set MSG_MORE or not... + */ + rq_for_each_segment(bvec, req, iter) { + flags = 0; + if (!rq_iter_last(bvec, iter)) + flags = MSG_MORE; + dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", + req, bvec.bv_len); + result = sock_send_bvec(nbd, &bvec, flags); + if (result <= 0) { + dev_err(disk_to_dev(nbd->disk), + "Send data failed (result %d)\n", + result); + return -EIO; + } + } + } + return 0; +} + +static struct request *nbd_find_request(struct nbd_device *nbd, + struct request *xreq) +{ + struct request *req, *tmp; + int err; + + err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); + if (unlikely(err)) + return ERR_PTR(err); + + spin_lock(&nbd->queue_lock); + list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { + if (req != xreq) + continue; + list_del_init(&req->queuelist); + spin_unlock(&nbd->queue_lock); + return req; + } + spin_unlock(&nbd->queue_lock); + + return ERR_PTR(-ENOENT); +} + +static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) +{ + int result; + void *kaddr = kmap(bvec->bv_page); + result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len, + MSG_WAITALL); + kunmap(bvec->bv_page); + return result; +} + +/* NULL returned = something went wrong, inform userspace */ +static struct request *nbd_read_stat(struct nbd_device *nbd) +{ + int result; + struct nbd_reply reply; + struct request *req; + + reply.magic = 0; + result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); + if (result <= 0) { + dev_err(disk_to_dev(nbd->disk), + "Receive control failed (result %d)\n", result); + goto harderror; + } + + if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { + dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", + (unsigned long)ntohl(reply.magic)); + result = -EPROTO; + goto harderror; + } + + req = nbd_find_request(nbd, *(struct request **)reply.handle); + if (IS_ERR(req)) { + result = PTR_ERR(req); + if (result != -ENOENT) + goto harderror; + + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", + reply.handle); + result = -EBADR; + goto harderror; + } + + if (ntohl(reply.error)) { + dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", + ntohl(reply.error)); + req->errors++; + return req; + } + + dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); + if (nbd_cmd(req) == NBD_CMD_READ) { + struct req_iterator iter; + struct bio_vec bvec; + + rq_for_each_segment(bvec, req, iter) { + result = sock_recv_bvec(nbd, &bvec); + if (result <= 0) { + dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", + result); + req->errors++; + return req; + } + dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", + req, bvec.bv_len); + } + } + return req; +harderror: + nbd->harderror = result; + return NULL; +} + +static ssize_t pid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%ld\n", + (long) ((struct nbd_device *)disk->private_data)->pid); +} + +static struct device_attribute pid_attr = { + .attr = { .name = "pid", .mode = S_IRUGO}, + .show = pid_show, +}; + +static int nbd_do_it(struct nbd_device *nbd) +{ + struct request *req; + int ret; + + BUG_ON(nbd->magic != NBD_MAGIC); + + sk_set_memalloc(nbd->sock->sk); + nbd->pid = task_pid_nr(current); + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); + if (ret) { + dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); + nbd->pid = 0; + return ret; + } + + while ((req = nbd_read_stat(nbd)) != NULL) + nbd_end_request(nbd, req); + + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->pid = 0; + return 0; +} + +static void nbd_clear_que(struct nbd_device *nbd) +{ + struct request *req; + + BUG_ON(nbd->magic != NBD_MAGIC); + + /* + * Because we have set nbd->sock to NULL under the tx_lock, all + * modifications to the list must have completed by now. For + * the same reason, the active_req must be NULL. + * + * As a consequence, we don't need to take the spin lock while + * purging the list here. + */ + BUG_ON(nbd->sock); + BUG_ON(nbd->active_req); + + while (!list_empty(&nbd->queue_head)) { + req = list_entry(nbd->queue_head.next, struct request, + queuelist); + list_del_init(&req->queuelist); + req->errors++; + nbd_end_request(nbd, req); + } + + while (!list_empty(&nbd->waiting_queue)) { + req = list_entry(nbd->waiting_queue.next, struct request, + queuelist); + list_del_init(&req->queuelist); + req->errors++; + nbd_end_request(nbd, req); + } +} + + +static void nbd_handle_req(struct nbd_device *nbd, struct request *req) +{ + if (req->cmd_type != REQ_TYPE_FS) + goto error_out; + + nbd_cmd(req) = NBD_CMD_READ; + if (rq_data_dir(req) == WRITE) { + if ((req->cmd_flags & REQ_DISCARD)) { + WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM)); + nbd_cmd(req) = NBD_CMD_TRIM; + } else + nbd_cmd(req) = NBD_CMD_WRITE; + if (nbd->flags & NBD_FLAG_READ_ONLY) { + dev_err(disk_to_dev(nbd->disk), + "Write on read-only\n"); + goto error_out; + } + } + + if (req->cmd_flags & REQ_FLUSH) { + BUG_ON(unlikely(blk_rq_sectors(req))); + nbd_cmd(req) = NBD_CMD_FLUSH; + } + + req->errors = 0; + + mutex_lock(&nbd->tx_lock); + if (unlikely(!nbd->sock)) { + mutex_unlock(&nbd->tx_lock); + dev_err(disk_to_dev(nbd->disk), + "Attempted send on closed socket\n"); + goto error_out; + } + + nbd->active_req = req; + + if (nbd_send_req(nbd, req) != 0) { + dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); + req->errors++; + nbd_end_request(nbd, req); + } else { + spin_lock(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->queue_head); + spin_unlock(&nbd->queue_lock); + } + + nbd->active_req = NULL; + mutex_unlock(&nbd->tx_lock); + wake_up_all(&nbd->active_wq); + + return; + +error_out: + req->errors++; + nbd_end_request(nbd, req); +} + +static int nbd_thread(void *data) +{ + struct nbd_device *nbd = data; + struct request *req; + + set_user_nice(current, MIN_NICE); + while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { + /* wait for something to do */ + wait_event_interruptible(nbd->waiting_wq, + kthread_should_stop() || + !list_empty(&nbd->waiting_queue)); + + /* extract request */ + if (list_empty(&nbd->waiting_queue)) + continue; + + spin_lock_irq(&nbd->queue_lock); + req = list_entry(nbd->waiting_queue.next, struct request, + queuelist); + list_del_init(&req->queuelist); + spin_unlock_irq(&nbd->queue_lock); + + /* handle request */ + nbd_handle_req(nbd, req); + } + return 0; +} + +/* + * We always wait for result of write, for now. It would be nice to make it optional + * in future + * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) + * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } + */ + +static void do_nbd_request(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct request *req; + + while ((req = blk_fetch_request(q)) != NULL) { + struct nbd_device *nbd; + + spin_unlock_irq(q->queue_lock); + + nbd = req->rq_disk->private_data; + + BUG_ON(nbd->magic != NBD_MAGIC); + + dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n", + req, req->cmd_type); + + if (unlikely(!nbd->sock)) { + dev_err(disk_to_dev(nbd->disk), + "Attempted send on closed socket\n"); + req->errors++; + nbd_end_request(nbd, req); + spin_lock_irq(q->queue_lock); + continue; + } + + spin_lock_irq(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->waiting_queue); + spin_unlock_irq(&nbd->queue_lock); + + wake_up(&nbd->waiting_wq); + + spin_lock_irq(q->queue_lock); + } +} + +/* Must be called with tx_lock held */ + +static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case NBD_DISCONNECT: { + struct request sreq; + + dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); + if (!nbd->sock) + return -EINVAL; + + mutex_unlock(&nbd->tx_lock); + fsync_bdev(bdev); + mutex_lock(&nbd->tx_lock); + blk_rq_init(NULL, &sreq); + sreq.cmd_type = REQ_TYPE_SPECIAL; + nbd_cmd(&sreq) = NBD_CMD_DISC; + + /* Check again after getting mutex back. */ + if (!nbd->sock) + return -EINVAL; + + nbd->disconnect = 1; + + nbd_send_req(nbd, &sreq); + return 0; + } + + case NBD_CLEAR_SOCK: { + struct socket *sock = nbd->sock; + nbd->sock = NULL; + nbd_clear_que(nbd); + BUG_ON(!list_empty(&nbd->queue_head)); + BUG_ON(!list_empty(&nbd->waiting_queue)); + kill_bdev(bdev); + if (sock) + sockfd_put(sock); + return 0; + } + + case NBD_SET_SOCK: { + struct socket *sock; + int err; + if (nbd->sock) + return -EBUSY; + sock = sockfd_lookup(arg, &err); + if (sock) { + nbd->sock = sock; + if (max_part > 0) + bdev->bd_invalidated = 1; + nbd->disconnect = 0; /* we're connected now */ + return 0; + } + return -EINVAL; + } + + case NBD_SET_BLKSIZE: + nbd->blksize = arg; + nbd->bytesize &= ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); + return 0; + + case NBD_SET_SIZE: + nbd->bytesize = arg & ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); + return 0; + + case NBD_SET_TIMEOUT: + nbd->xmit_timeout = arg * HZ; + return 0; + + case NBD_SET_FLAGS: + nbd->flags = arg; + return 0; + + case NBD_SET_SIZE_BLOCKS: + nbd->bytesize = ((u64) arg) * nbd->blksize; + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); + return 0; + + case NBD_DO_IT: { + struct task_struct *thread; + struct socket *sock; + int error; + + if (nbd->pid) + return -EBUSY; + if (!nbd->sock) + return -EINVAL; + + mutex_unlock(&nbd->tx_lock); + + if (nbd->flags & NBD_FLAG_READ_ONLY) + set_device_ro(bdev, true); + if (nbd->flags & NBD_FLAG_SEND_TRIM) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + nbd->disk->queue); + if (nbd->flags & NBD_FLAG_SEND_FLUSH) + blk_queue_flush(nbd->disk->queue, REQ_FLUSH); + else + blk_queue_flush(nbd->disk->queue, 0); + + thread = kthread_run(nbd_thread, nbd, "%s", + nbd->disk->disk_name); + if (IS_ERR(thread)) { + mutex_lock(&nbd->tx_lock); + return PTR_ERR(thread); + } + + error = nbd_do_it(nbd); + kthread_stop(thread); + + mutex_lock(&nbd->tx_lock); + if (error) + return error; + sock_shutdown(nbd, 0); + sock = nbd->sock; + nbd->sock = NULL; + nbd_clear_que(nbd); + dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); + kill_bdev(bdev); + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); + set_device_ro(bdev, false); + if (sock) + sockfd_put(sock); + nbd->flags = 0; + nbd->bytesize = 0; + bdev->bd_inode->i_size = 0; + set_capacity(nbd->disk, 0); + if (max_part > 0) + ioctl_by_bdev(bdev, BLKRRPART, 0); + if (nbd->disconnect) /* user requested, ignore socket errors */ + return 0; + return nbd->harderror; + } + + case NBD_CLEAR_QUE: + /* + * This is for compatibility only. The queue is always cleared + * by NBD_DO_IT or NBD_CLEAR_SOCK. + */ + return 0; + + case NBD_PRINT_DEBUG: + dev_info(disk_to_dev(nbd->disk), + "next = %p, prev = %p, head = %p\n", + nbd->queue_head.next, nbd->queue_head.prev, + &nbd->queue_head); + return 0; + } + return -ENOTTY; +} + +static int nbd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nbd_device *nbd = bdev->bd_disk->private_data; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + BUG_ON(nbd->magic != NBD_MAGIC); + + mutex_lock(&nbd->tx_lock); + error = __nbd_ioctl(bdev, nbd, cmd, arg); + mutex_unlock(&nbd->tx_lock); + + return error; +} + +static const struct block_device_operations nbd_fops = +{ + .owner = THIS_MODULE, + .ioctl = nbd_ioctl, +}; + +/* + * And here should be modules and kernel interface + * (Just smiley confuses emacs :-) + */ + +static int __init nbd_init(void) +{ + int err = -ENOMEM; + int i; + int part_shift; + + BUILD_BUG_ON(sizeof(struct nbd_request) != 28); + + if (max_part < 0) { + printk(KERN_ERR "nbd: max_part must be >= 0\n"); + return -EINVAL; + } + + part_shift = 0; + if (max_part > 0) { + part_shift = fls(max_part); + + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can know the max number of + * partition kernel should be able to manage. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. + */ + max_part = (1UL << part_shift) - 1; + } + + if ((1UL << part_shift) > DISK_MAX_PARTS) + return -EINVAL; + + if (nbds_max > 1UL << (MINORBITS - part_shift)) + return -EINVAL; + + nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL); + if (!nbd_dev) + return -ENOMEM; + + for (i = 0; i < nbds_max; i++) { + struct gendisk *disk = alloc_disk(1 << part_shift); + if (!disk) + goto out; + nbd_dev[i].disk = disk; + /* + * The new linux 2.5 block layer implementation requires + * every gendisk to have its very own request_queue struct. + * These structs are big so we dynamically allocate them. + */ + disk->queue = blk_init_queue(do_nbd_request, &nbd_lock); + if (!disk->queue) { + put_disk(disk); + goto out; + } + /* + * Tell the block layer that we are not a rotational device + */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue); + disk->queue->limits.discard_granularity = 512; + disk->queue->limits.max_discard_sectors = UINT_MAX; + disk->queue->limits.discard_zeroes_data = 0; + blk_queue_max_hw_sectors(disk->queue, 65536); + disk->queue->limits.max_sectors = 256; + } + + if (register_blkdev(NBD_MAJOR, "nbd")) { + err = -EIO; + goto out; + } + + printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR); + + for (i = 0; i < nbds_max; i++) { + struct gendisk *disk = nbd_dev[i].disk; + nbd_dev[i].magic = NBD_MAGIC; + INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); + spin_lock_init(&nbd_dev[i].queue_lock); + INIT_LIST_HEAD(&nbd_dev[i].queue_head); + mutex_init(&nbd_dev[i].tx_lock); + init_waitqueue_head(&nbd_dev[i].active_wq); + init_waitqueue_head(&nbd_dev[i].waiting_wq); + nbd_dev[i].blksize = 1024; + nbd_dev[i].bytesize = 0; + disk->major = NBD_MAJOR; + disk->first_minor = i << part_shift; + disk->fops = &nbd_fops; + disk->private_data = &nbd_dev[i]; + sprintf(disk->disk_name, "nbd%d", i); + set_capacity(disk, 0); + add_disk(disk); + } + + return 0; +out: + while (i--) { + blk_cleanup_queue(nbd_dev[i].disk->queue); + put_disk(nbd_dev[i].disk); + } + kfree(nbd_dev); + return err; +} + +static void __exit nbd_cleanup(void) +{ + int i; + for (i = 0; i < nbds_max; i++) { + struct gendisk *disk = nbd_dev[i].disk; + nbd_dev[i].magic = 0; + if (disk) { + del_gendisk(disk); + blk_cleanup_queue(disk->queue); + put_disk(disk); + } + } + unregister_blkdev(NBD_MAJOR, "nbd"); + kfree(nbd_dev); + printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR); +} + +module_init(nbd_init); +module_exit(nbd_cleanup); + +MODULE_DESCRIPTION("Network Block Device"); +MODULE_LICENSE("GPL"); + +module_param(nbds_max, int, 0444); +MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)"); +module_param(max_part, int, 0444); +MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)"); diff --git a/kernel/drivers/block/null_blk.c b/kernel/drivers/block/null_blk.c new file mode 100644 index 000000000..65cd61a41 --- /dev/null +++ b/kernel/drivers/block/null_blk.c @@ -0,0 +1,674 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nullb_cmd { + struct list_head list; + struct llist_node ll_list; + struct call_single_data csd; + struct request *rq; + struct bio *bio; + unsigned int tag; + struct nullb_queue *nq; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + + struct nullb_cmd *cmds; +}; + +struct nullb { + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct blk_mq_tag_set tag_set; + struct hrtimer timer; + unsigned int queue_depth; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; +}; + +static LIST_HEAD(nullb_list); +static struct mutex lock; +static int null_major; +static int nullb_indexes; + +struct completion_queue { + struct llist_head list; + struct hrtimer timer; +}; + +/* + * These are per-cpu for now, they will need to be configured by the + * complete_queues parameter and appropriately mapped. + */ +static DEFINE_PER_CPU(struct completion_queue, completion_queues); + +enum { + NULL_IRQ_NONE = 0, + NULL_IRQ_SOFTIRQ = 1, + NULL_IRQ_TIMER = 2, +}; + +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + +static int submit_queues; +module_param(submit_queues, int, S_IRUGO); +MODULE_PARM_DESC(submit_queues, "Number of submission queues"); + +static int home_node = NUMA_NO_NODE; +module_param(home_node, int, S_IRUGO); +MODULE_PARM_DESC(home_node, "Home node for the device"); + +static int queue_mode = NULL_Q_MQ; + +static int null_param_store_val(const char *str, int *val, int min, int max) +{ + int ret, new_val; + + ret = kstrtoint(str, 10, &new_val); + if (ret) + return -EINVAL; + + if (new_val < min || new_val > max) + return -EINVAL; + + *val = new_val; + return 0; +} + +static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); +} + +static struct kernel_param_ops null_queue_mode_param_ops = { + .set = null_set_queue_mode, + .get = param_get_int, +}; + +device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO); +MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); + +static int gb = 250; +module_param(gb, int, S_IRUGO); +MODULE_PARM_DESC(gb, "Size in GB"); + +static int bs = 512; +module_param(bs, int, S_IRUGO); +MODULE_PARM_DESC(bs, "Block size (in bytes)"); + +static int nr_devices = 2; +module_param(nr_devices, int, S_IRUGO); +MODULE_PARM_DESC(nr_devices, "Number of devices to register"); + +static int irqmode = NULL_IRQ_SOFTIRQ; + +static int null_set_irqmode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &irqmode, NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); +MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); + +static int completion_nsec = 10000; +module_param(completion_nsec, int, S_IRUGO); +MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); + +static int hw_queue_depth = 64; +module_param(hw_queue_depth, int, S_IRUGO); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64"); + +static bool use_per_node_hctx = false; +module_param(use_per_node_hctx, bool, S_IRUGO); +MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); + +static void put_tag(struct nullb_queue *nq, unsigned int tag) +{ + clear_bit_unlock(tag, nq->tag_map); + + if (waitqueue_active(&nq->wait)) + wake_up(&nq->wait); +} + +static unsigned int get_tag(struct nullb_queue *nq) +{ + unsigned int tag; + + do { + tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); + if (tag >= nq->queue_depth) + return -1U; + } while (test_and_set_bit_lock(tag, nq->tag_map)); + + return tag; +} + +static void free_cmd(struct nullb_cmd *cmd) +{ + put_tag(cmd->nq, cmd->tag); +} + +static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + unsigned int tag; + + tag = get_tag(nq); + if (tag != -1U) { + cmd = &nq->cmds[tag]; + cmd->tag = tag; + cmd->nq = nq; + return cmd; + } + + return NULL; +} + +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +{ + struct nullb_cmd *cmd; + DEFINE_WAIT(wait); + + cmd = __alloc_cmd(nq); + if (cmd || !can_wait) + return cmd; + + do { + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + cmd = __alloc_cmd(nq); + if (cmd) + break; + + io_schedule(); + } while (1); + + finish_wait(&nq->wait, &wait); + return cmd; +} + +static void end_cmd(struct nullb_cmd *cmd) +{ + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_end_request(cmd->rq, 0); + return; + case NULL_Q_RQ: + INIT_LIST_HEAD(&cmd->rq->queuelist); + blk_end_request_all(cmd->rq, 0); + break; + case NULL_Q_BIO: + bio_endio(cmd->bio, 0); + break; + } + + free_cmd(cmd); +} + +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) +{ + struct completion_queue *cq; + struct llist_node *entry; + struct nullb_cmd *cmd; + + cq = &per_cpu(completion_queues, smp_processor_id()); + + while ((entry = llist_del_all(&cq->list)) != NULL) { + entry = llist_reverse_order(entry); + do { + cmd = container_of(entry, struct nullb_cmd, ll_list); + entry = entry->next; + end_cmd(cmd); + } while (entry); + } + + return HRTIMER_NORESTART; +} + +static void null_cmd_end_timer(struct nullb_cmd *cmd) +{ + struct completion_queue *cq = &per_cpu(completion_queues, get_cpu()); + + cmd->ll_list.next = NULL; + if (llist_add(&cmd->ll_list, &cq->list)) { + ktime_t kt = ktime_set(0, completion_nsec); + + hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL); + } + + put_cpu(); +} + +static void null_softirq_done_fn(struct request *rq) +{ + if (queue_mode == NULL_Q_MQ) + end_cmd(blk_mq_rq_to_pdu(rq)); + else + end_cmd(rq->special); +} + +static inline void null_handle_cmd(struct nullb_cmd *cmd) +{ + /* Complete IO by inline, softirq or timer */ + switch (irqmode) { + case NULL_IRQ_SOFTIRQ: + switch (queue_mode) { + case NULL_Q_MQ: + blk_mq_complete_request(cmd->rq); + break; + case NULL_Q_RQ: + blk_complete_request(cmd->rq); + break; + case NULL_Q_BIO: + /* + * XXX: no proper submitting cpu information available. + */ + end_cmd(cmd); + break; + } + break; + case NULL_IRQ_NONE: + end_cmd(cmd); + break; + case NULL_IRQ_TIMER: + null_cmd_end_timer(cmd); + break; + } +} + +static struct nullb_queue *nullb_to_queue(struct nullb *nullb) +{ + int index = 0; + + if (nullb->nr_queues != 1) + index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); + + return &nullb->queues[index]; +} + +static void null_queue_bio(struct request_queue *q, struct bio *bio) +{ + struct nullb *nullb = q->queuedata; + struct nullb_queue *nq = nullb_to_queue(nullb); + struct nullb_cmd *cmd; + + cmd = alloc_cmd(nq, 1); + cmd->bio = bio; + + null_handle_cmd(cmd); +} + +static int null_rq_prep_fn(struct request_queue *q, struct request *req) +{ + struct nullb *nullb = q->queuedata; + struct nullb_queue *nq = nullb_to_queue(nullb); + struct nullb_cmd *cmd; + + cmd = alloc_cmd(nq, 0); + if (cmd) { + cmd->rq = req; + req->special = cmd; + return BLKPREP_OK; + } + + return BLKPREP_DEFER; +} + +static void null_request_fn(struct request_queue *q) +{ + struct request *rq; + + while ((rq = blk_fetch_request(q)) != NULL) { + struct nullb_cmd *cmd = rq->special; + + spin_unlock_irq(q->queue_lock); + null_handle_cmd(cmd); + spin_lock_irq(q->queue_lock); + } +} + +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + + cmd->rq = bd->rq; + cmd->nq = hctx->driver_data; + + blk_mq_start_request(bd->rq); + + null_handle_cmd(cmd); + return BLK_MQ_RQ_QUEUE_OK; +} + +static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) +{ + BUG_ON(!nullb); + BUG_ON(!nq); + + init_waitqueue_head(&nq->wait); + nq->queue_depth = nullb->queue_depth; +} + +static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct nullb *nullb = data; + struct nullb_queue *nq = &nullb->queues[index]; + + hctx->driver_data = nq; + null_init_queue(nullb, nq); + nullb->nr_queues++; + + return 0; +} + +static struct blk_mq_ops null_mq_ops = { + .queue_rq = null_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = null_init_hctx, + .complete = null_softirq_done_fn, +}; + +static void null_del_dev(struct nullb *nullb) +{ + list_del_init(&nullb->list); + + del_gendisk(nullb->disk); + blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); + put_disk(nullb->disk); + kfree(nullb); +} + +static int null_open(struct block_device *bdev, fmode_t mode) +{ + return 0; +} + +static void null_release(struct gendisk *disk, fmode_t mode) +{ +} + +static const struct block_device_operations null_fops = { + .owner = THIS_MODULE, + .open = null_open, + .release = null_release, +}; + +static int setup_commands(struct nullb_queue *nq) +{ + struct nullb_cmd *cmd; + int i, tag_size; + + nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL); + if (!nq->cmds) + return -ENOMEM; + + tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; + nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL); + if (!nq->tag_map) { + kfree(nq->cmds); + return -ENOMEM; + } + + for (i = 0; i < nq->queue_depth; i++) { + cmd = &nq->cmds[i]; + INIT_LIST_HEAD(&cmd->list); + cmd->ll_list.next = NULL; + cmd->tag = -1U; + } + + return 0; +} + +static void cleanup_queue(struct nullb_queue *nq) +{ + kfree(nq->tag_map); + kfree(nq->cmds); +} + +static void cleanup_queues(struct nullb *nullb) +{ + int i; + + for (i = 0; i < nullb->nr_queues; i++) + cleanup_queue(&nullb->queues[i]); + + kfree(nullb->queues); +} + +static int setup_queues(struct nullb *nullb) +{ + nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue), + GFP_KERNEL); + if (!nullb->queues) + return -ENOMEM; + + nullb->nr_queues = 0; + nullb->queue_depth = hw_queue_depth; + + return 0; +} + +static int init_driver_queues(struct nullb *nullb) +{ + struct nullb_queue *nq; + int i, ret = 0; + + for (i = 0; i < submit_queues; i++) { + nq = &nullb->queues[i]; + + null_init_queue(nullb, nq); + + ret = setup_commands(nq); + if (ret) + return ret; + nullb->nr_queues++; + } + return 0; +} + +static int null_add_dev(void) +{ + struct gendisk *disk; + struct nullb *nullb; + sector_t size; + int rv; + + nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); + if (!nullb) { + rv = -ENOMEM; + goto out; + } + + spin_lock_init(&nullb->lock); + + if (queue_mode == NULL_Q_MQ && use_per_node_hctx) + submit_queues = nr_online_nodes; + + rv = setup_queues(nullb); + if (rv) + goto out_free_nullb; + + if (queue_mode == NULL_Q_MQ) { + nullb->tag_set.ops = &null_mq_ops; + nullb->tag_set.nr_hw_queues = submit_queues; + nullb->tag_set.queue_depth = hw_queue_depth; + nullb->tag_set.numa_node = home_node; + nullb->tag_set.cmd_size = sizeof(struct nullb_cmd); + nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + nullb->tag_set.driver_data = nullb; + + rv = blk_mq_alloc_tag_set(&nullb->tag_set); + if (rv) + goto out_cleanup_queues; + + nullb->q = blk_mq_init_queue(&nullb->tag_set); + if (IS_ERR(nullb->q)) { + rv = -ENOMEM; + goto out_cleanup_tags; + } + } else if (queue_mode == NULL_Q_BIO) { + nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node); + if (!nullb->q) { + rv = -ENOMEM; + goto out_cleanup_queues; + } + blk_queue_make_request(nullb->q, null_queue_bio); + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; + } else { + nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node); + if (!nullb->q) { + rv = -ENOMEM; + goto out_cleanup_queues; + } + blk_queue_prep_rq(nullb->q, null_rq_prep_fn); + blk_queue_softirq_done(nullb->q, null_softirq_done_fn); + rv = init_driver_queues(nullb); + if (rv) + goto out_cleanup_blk_queue; + } + + nullb->q->queuedata = nullb; + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) { + rv = -ENOMEM; + goto out_cleanup_blk_queue; + } + + mutex_lock(&lock); + list_add_tail(&nullb->list, &nullb_list); + nullb->index = nullb_indexes++; + mutex_unlock(&lock); + + blk_queue_logical_block_size(nullb->q, bs); + blk_queue_physical_block_size(nullb->q, bs); + + size = gb * 1024 * 1024 * 1024ULL; + sector_div(size, bs); + set_capacity(disk, size); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + disk->fops = &null_fops; + disk->private_data = nullb; + disk->queue = nullb->q; + sprintf(disk->disk_name, "nullb%d", nullb->index); + add_disk(disk); + return 0; + +out_cleanup_blk_queue: + blk_cleanup_queue(nullb->q); +out_cleanup_tags: + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); +out_cleanup_queues: + cleanup_queues(nullb); +out_free_nullb: + kfree(nullb); +out: + return rv; +} + +static int __init null_init(void) +{ + unsigned int i; + + if (bs > PAGE_SIZE) { + pr_warn("null_blk: invalid block size\n"); + pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE); + bs = PAGE_SIZE; + } + + if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { + if (submit_queues < nr_online_nodes) { + pr_warn("null_blk: submit_queues param is set to %u.", + nr_online_nodes); + submit_queues = nr_online_nodes; + } + } else if (submit_queues > nr_cpu_ids) + submit_queues = nr_cpu_ids; + else if (!submit_queues) + submit_queues = 1; + + mutex_init(&lock); + + /* Initialize a separate list for each CPU for issuing softirqs */ + for_each_possible_cpu(i) { + struct completion_queue *cq = &per_cpu(completion_queues, i); + + init_llist_head(&cq->list); + + if (irqmode != NULL_IRQ_TIMER) + continue; + + hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cq->timer.function = null_cmd_timer_expired; + } + + null_major = register_blkdev(0, "nullb"); + if (null_major < 0) + return null_major; + + for (i = 0; i < nr_devices; i++) { + if (null_add_dev()) { + unregister_blkdev(null_major, "nullb"); + return -EINVAL; + } + } + + pr_info("null: module loaded\n"); + return 0; +} + +static void __exit null_exit(void) +{ + struct nullb *nullb; + + unregister_blkdev(null_major, "nullb"); + + mutex_lock(&lock); + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + null_del_dev(nullb); + } + mutex_unlock(&lock); +} + +module_init(null_init); +module_exit(null_exit); + +MODULE_AUTHOR("Jens Axboe "); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/nvme-core.c b/kernel/drivers/block/nvme-core.c new file mode 100644 index 000000000..683dff272 --- /dev/null +++ b/kernel/drivers/block/nvme-core.c @@ -0,0 +1,3178 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NVME_MINORS (1U << MINORBITS) +#define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 256 +#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) +#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define ADMIN_TIMEOUT (admin_timeout * HZ) +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + +static unsigned char admin_timeout = 60; +module_param(admin_timeout, byte, 0644); +MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); + +unsigned char nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); + +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + +static int use_threaded_interrupts; +module_param(use_threaded_interrupts, int, 0); + +static DEFINE_SPINLOCK(dev_list_lock); +static LIST_HEAD(dev_list); +static struct task_struct *nvme_thread; +static struct workqueue_struct *nvme_workq; +static wait_queue_head_t nvme_kthread_wait; + +static struct class *nvme_class; + +static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_process_cq(struct nvme_queue *nvmeq); + +struct async_cmd_info { + struct kthread_work work; + struct kthread_worker *worker; + struct request *req; + u32 result; + int status; + void *ctx; +}; + +/* + * An NVM Express queue. Each device has at least two (one for admin + * commands and one for I/O commands). + */ +struct nvme_queue { + struct device *q_dmadev; + struct nvme_dev *dev; + char irqname[24]; /* nvme4294967295-65535\0 */ + spinlock_t q_lock; + struct nvme_command *sq_cmds; + volatile struct nvme_completion *cqes; + dma_addr_t sq_dma_addr; + dma_addr_t cq_dma_addr; + u32 __iomem *q_db; + u16 q_depth; + s16 cq_vector; + u16 sq_head; + u16 sq_tail; + u16 cq_head; + u16 qid; + u8 cq_phase; + u8 cqe_seen; + struct async_cmd_info cmdinfo; + struct blk_mq_hw_ctx *hctx; +}; + +/* + * Check we didin't inadvertently grow the command struct + */ +static inline void _nvme_check_size(void) +{ + BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); + BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); + BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); + BUILD_BUG_ON(sizeof(struct nvme_command) != 64); + BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); + BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); + BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); +} + +typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, + struct nvme_completion *); + +struct nvme_cmd_info { + nvme_completion_fn fn; + void *ctx; + int aborted; + struct nvme_queue *nvmeq; + struct nvme_iod iod[0]; +}; + +/* + * Max size of iod being embedded in the request payload + */ +#define NVME_INT_PAGES 2 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) +#define NVME_INT_MASK 0x01 + +/* + * Will slightly overestimate the number of pages needed. This is OK + * as it only leads to a small amount of wasted memory for the lifetime of + * the I/O. + */ +static int nvme_npages(unsigned size, struct nvme_dev *dev) +{ + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); +} + +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + unsigned int ret = sizeof(struct nvme_cmd_info); + + ret += sizeof(struct nvme_iod); + ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); + ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + + return ret; +} + +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(nvmeq->hctx); + nvmeq->hctx = hctx; + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; +} + +static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct nvme_queue *nvmeq = hctx->driver_data; + + nvmeq->hctx = NULL; +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[ + (hctx_idx % dev->queue_count) + 1]; + + if (!nvmeq->hctx) + nvmeq->hctx = hctx; + + /* nvmeq queues are shared between namespaces. We assume here that + * blk-mq map the tags so they match up with the nvme queue tags. */ + WARN_ON(nvmeq->hctx->tags != hctx->tags); + + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; +} + +static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, + nvme_completion_fn handler) +{ + cmd->fn = handler; + cmd->ctx = ctx; + cmd->aborted = 0; + blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); +} + +static void *iod_get_private(struct nvme_iod *iod) +{ + return (void *) (iod->private & ~0x1UL); +} + +/* + * If bit 0 is set, the iod is embedded in the request payload. + */ +static bool iod_should_kfree(struct nvme_iod *iod) +{ + return (iod->private & NVME_INT_MASK) == 0; +} + +/* Special values must be less than 0x1000 */ +#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) +#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) +#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) +#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) + +static void special_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + if (ctx == CMD_CTX_CANCELLED) + return; + if (ctx == CMD_CTX_COMPLETED) { + dev_warn(nvmeq->q_dmadev, + "completed id %d twice on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + if (ctx == CMD_CTX_INVALID) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe->command_id, le16_to_cpup(&cqe->sq_id)); + return; + } + dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); +} + +static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) +{ + void *ctx; + + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_CANCELLED; + return ctx; +} + +static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); +} + +static void abort_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct request *req = ctx; + + u16 status = le16_to_cpup(&cqe->status) >> 1; + u32 result = le32_to_cpup(&cqe->result); + + blk_mq_free_hctx_request(nvmeq->hctx, req); + + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + ++nvmeq->dev->abort_limit; +} + +static void async_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct async_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); +} + +static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, + unsigned int tag) +{ + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); + + return blk_mq_rq_to_pdu(req); +} + +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, + nvme_completion_fn *fn) +{ + struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); + void *ctx; + if (tag >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_COMPLETED; + return ctx; +} + +/** + * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * @nvmeq: The queue to use + * @cmd: The command to send + * + * Safe to use from interrupt context + */ +static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + u16 tail = nvmeq->sq_tail; + + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); + if (++tail == nvmeq->q_depth) + tail = 0; + writel(tail, nvmeq->q_db); + nvmeq->sq_tail = tail; + + return 0; +} + +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&nvmeq->q_lock, flags); + ret = __nvme_submit_cmd(nvmeq, cmd); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return ret; +} + +static __le64 **iod_list(struct nvme_iod *iod) +{ + return ((void *)iod) + iod->offset; +} + +static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, + unsigned nseg, unsigned long private) +{ + iod->private = private; + iod->offset = offsetof(struct nvme_iod, sg[nseg]); + iod->npages = -1; + iod->length = nbytes; + iod->nents = 0; +} + +static struct nvme_iod * +__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, + unsigned long priv, gfp_t gfp) +{ + struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + + sizeof(__le64 *) * nvme_npages(bytes, dev) + + sizeof(struct scatterlist) * nseg, gfp); + + if (iod) + iod_init(iod, bytes, nseg, priv); + + return iod; +} + +static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, + gfp_t gfp) +{ + unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) : + sizeof(struct nvme_dsm_range); + struct nvme_iod *iod; + + if (rq->nr_phys_segments <= NVME_INT_PAGES && + size <= NVME_INT_BYTES(dev)) { + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + + iod = cmd->iod; + iod_init(iod, size, rq->nr_phys_segments, + (unsigned long) rq | NVME_INT_MASK); + return iod; + } + + return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, + (unsigned long) rq, gfp); +} + +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +{ + const int last_prp = dev->page_size / 8 - 1; + int i; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma = iod->first_dma; + + if (iod->npages == 0) + dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + for (i = 0; i < iod->npages; i++) { + __le64 *prp_list = list[i]; + dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); + dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); + prp_dma = next_prp_dma; + } + + if (iod_should_kfree(iod)) + kfree(iod); +} + +static int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == v) + pi->ref_tag = cpu_to_be32(p); +} + +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ + if (be32_to_cpu(pi->ref_tag) == p) + pi->ref_tag = cpu_to_be32(v); +} + +/** + * nvme_dif_remap - remaps ref tags to bip seed and physical lba + * + * The virtual start sector is the one that was originally submitted by the + * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical + * start sector may be different. Remap protection information to match the + * physical LBA on writes, and back to the original seed on reads. + * + * Type 0 and 3 do not have a ref tag, so no remapping required. + */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ + struct nvme_ns *ns = req->rq_disk->private_data; + struct bio_integrity_payload *bip; + struct t10_pi_tuple *pi; + void *p, *pmap; + u32 i, nlb, ts, phys, virt; + + if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) + return; + + bip = bio_integrity(req->bio); + if (!bip) + return; + + pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; + + p = pmap; + virt = bip_get_seed(bip); + phys = nvme_block_nr(ns, blk_rq_pos(req)); + nlb = (blk_rq_bytes(req) >> ns->lba_shift); + ts = ns->disk->integrity->tuple_size; + + for (i = 0; i < nlb; i++, virt++, phys++) { + pi = (struct t10_pi_tuple *)p; + dif_swap(phys, virt, pi); + p += ts; + } + kunmap_atomic(pmap); +} + +static int nvme_noop_verify(struct blk_integrity_iter *iter) +{ + return 0; +} + +static int nvme_noop_generate(struct blk_integrity_iter *iter) +{ + return 0; +} + +struct blk_integrity nvme_meta_noop = { + .name = "NVME_META_NOOP", + .generate_fn = nvme_noop_generate, + .verify_fn = nvme_noop_verify, +}; + +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity = t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity = t10_pi_type1_crc; + break; + default: + integrity = nvme_meta_noop; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static void nvme_dif_remap(struct request *req, + void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) +{ +} +static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) +{ +} +static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) +{ +} +static void nvme_init_integrity(struct nvme_ns *ns) +{ +} +#endif + +static void req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct nvme_iod *iod = ctx; + struct request *req = iod_get_private(iod); + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (unlikely(status)) { + if (!(status & NVME_SC_DNR || blk_noretry_request(req)) + && (jiffies - req->start_time) < req->timeout) { + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); + return; + } + req->errors = nvme_error_status(status); + } else + req->errors = 0; + + if (cmd_rq->aborted) + dev_warn(&nvmeq->dev->pci_dev->dev, + "completing aborted command with status:%04x\n", + status); + + if (iod->nents) { + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + } + } + nvme_free_iod(nvmeq->dev, iod); + + blk_mq_complete_request(req); +} + +/* length is in bytes. gfp flags indicates whether we may sleep. */ +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, + gfp_t gfp) +{ + struct dma_pool *pool; + int length = total_len; + struct scatterlist *sg = iod->sg; + int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg); + u32 page_size = dev->page_size; + int offset = dma_addr & (page_size - 1); + __le64 *prp_list; + __le64 **list = iod_list(iod); + dma_addr_t prp_dma; + int nprps, i; + + length -= (page_size - offset); + if (length <= 0) + return total_len; + + dma_len -= (page_size - offset); + if (dma_len) { + dma_addr += (page_size - offset); + } else { + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + if (length <= page_size) { + iod->first_dma = dma_addr; + return total_len; + } + + nprps = DIV_ROUND_UP(length, page_size); + if (nprps <= (256 / 8)) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) { + iod->first_dma = dma_addr; + iod->npages = -1; + return (total_len - length) + page_size; + } + list[0] = prp_list; + iod->first_dma = prp_dma; + i = 0; + for (;;) { + if (i == page_size >> 3) { + __le64 *old_prp_list = prp_list; + prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + if (!prp_list) + return total_len - length; + list[iod->npages++] = prp_list; + prp_list[0] = old_prp_list[i - 1]; + old_prp_list[i - 1] = cpu_to_le64(prp_dma); + i = 1; + } + prp_list[i++] = cpu_to_le64(dma_addr); + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; + if (length <= 0) + break; + if (dma_len > 0) + continue; + BUG_ON(dma_len < 0); + sg = sg_next(sg); + dma_addr = sg_dma_address(sg); + dma_len = sg_dma_len(sg); + } + + return total_len; +} + +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_iod *iod) +{ + struct nvme_dsm_range *range = + (struct nvme_dsm_range *)iod_list(iod)[0]; + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.command_id = req->tag; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); +} + +static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, + int cmdid) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.command_id = cmdid; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); +} + +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, + struct nvme_ns *ns) +{ + struct request *req = iod_get_private(iod); + struct nvme_command *cmnd; + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + memset(cmnd, 0, sizeof(*cmnd)); + + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (blk_integrity_rq(req)) { + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + } else if (ns->ms) + control |= NVME_RW_PRINFO_PRACT; + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod; + enum dma_data_direction dma_dir; + + /* + * If formated with metadata, require the block layer provide a buffer + * unless this namespace is formated such that the metadata can be + * stripped/generated by the controller with PRACT=1. + */ + if (ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8)) { + req->errors = -EFAULT; + blk_mq_complete_request(req); + return BLK_MQ_RQ_QUEUE_OK; + } + } + + iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); + if (!iod) + return BLK_MQ_RQ_QUEUE_BUSY; + + if (req->cmd_flags & REQ_DISCARD) { + void *range; + /* + * We reuse the small pool to allocate the 16-byte range here + * as it is not worth having a special pool for these or + * additional cases to handle freeing the iod. + */ + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, + GFP_ATOMIC, + &iod->first_dma); + if (!range) + goto retry_cmd; + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + } else if (req->nr_phys_segments) { + dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + + sg_init_table(iod->sg, req->nr_phys_segments); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) + goto error_cmd; + + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) + goto retry_cmd; + + if (blk_rq_bytes(req) != + nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, + iod->nents, dma_dir); + goto retry_cmd; + } + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) + goto error_cmd; + + sg_init_table(iod->meta_sg, 1); + if (blk_rq_map_integrity_sg( + req->q, req->bio, iod->meta_sg) != 1) + goto error_cmd; + + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) + goto error_cmd; + } + } + + nvme_set_info(cmd, iod, req_completion); + spin_lock_irq(&nvmeq->q_lock); + if (req->cmd_flags & REQ_DISCARD) + nvme_submit_discard(nvmeq, ns, req, iod); + else if (req->cmd_flags & REQ_FLUSH) + nvme_submit_flush(nvmeq, ns, req->tag); + else + nvme_submit_iod(nvmeq, iod, ns); + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; + + error_cmd: + nvme_free_iod(nvmeq->dev, iod); + return BLK_MQ_RQ_QUEUE_ERROR; + retry_cmd: + nvme_free_iod(nvmeq->dev, iod); + return BLK_MQ_RQ_QUEUE_BUSY; +} + +static int nvme_process_cq(struct nvme_queue *nvmeq) +{ + u16 head, phase; + + head = nvmeq->cq_head; + phase = nvmeq->cq_phase; + + for (;;) { + void *ctx; + nvme_completion_fn fn; + struct nvme_completion cqe = nvmeq->cqes[head]; + if ((le16_to_cpu(cqe.status) & 1) != phase) + break; + nvmeq->sq_head = le16_to_cpu(cqe.sq_head); + if (++head == nvmeq->q_depth) { + head = 0; + phase = !phase; + } + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); + fn(nvmeq, ctx, &cqe); + } + + /* If the controller ignores the cq head doorbell and continuously + * writes to the queue, it is theoretically possible to wrap around + * the queue twice and mistakenly return IRQ_NONE. Linux only + * requires that 0.1% of your interrupts are handled, so this isn't + * a big problem. + */ + if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) + return 0; + + writel(head, nvmeq->q_db + nvmeq->dev->db_stride); + nvmeq->cq_head = head; + nvmeq->cq_phase = phase; + + nvmeq->cqe_seen = 1; + return 1; +} + +/* Admin queue isn't initialized as a request queue. If at some point this + * happens anyway, make sure to notify the user */ +static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; +} + +static irqreturn_t nvme_irq(int irq, void *data) +{ + irqreturn_t result; + struct nvme_queue *nvmeq = data; + spin_lock(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; + nvmeq->cqe_seen = 0; + spin_unlock(&nvmeq->q_lock); + return result; +} + +static irqreturn_t nvme_irq_check(int irq, void *data) +{ + struct nvme_queue *nvmeq = data; + struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; + if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) + return IRQ_NONE; + return IRQ_WAKE_THREAD; +} + +struct sync_cmd_info { + struct task_struct *task; + u32 result; + int status; +}; + +static void sync_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) +{ + struct sync_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + wake_up_process(cmdinfo->task); +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, + u32 *result, unsigned timeout) +{ + struct sync_cmd_info cmdinfo; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; + + cmdinfo.task = current; + cmdinfo.status = -EINTR; + + cmd->common.command_id = req->tag; + + nvme_set_info(cmd_rq, &cmdinfo, sync_completion); + + set_current_state(TASK_UNINTERRUPTIBLE); + nvme_submit_cmd(nvmeq, cmd); + schedule(); + + if (result) + *result = cmdinfo.result; + return cmdinfo.status; +} + +static int nvme_submit_async_admin_req(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + struct nvme_cmd_info *cmd_info; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->cmd_flags |= REQ_NO_TIMEOUT; + cmd_info = blk_mq_rq_to_pdu(req); + nvme_set_info(cmd_info, NULL, async_req_completion); + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = req->tag; + + blk_mq_free_hctx_request(nvmeq->hctx, req); + return __nvme_submit_cmd(nvmeq, &c); +} + +static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, + struct nvme_command *cmd, + struct async_cmd_info *cmdinfo, unsigned timeout) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct request *req; + struct nvme_cmd_info *cmd_rq; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout; + cmd_rq = blk_mq_rq_to_pdu(req); + cmdinfo->req = req; + nvme_set_info(cmd_rq, cmdinfo, async_completion); + cmdinfo->status = -EINTR; + + cmd->common.command_id = req->tag; + + return nvme_submit_cmd(nvmeq, cmd); +} + +static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result, unsigned timeout) +{ + int res; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); + res = nvme_submit_sync_cmd(req, cmd, result, timeout); + blk_mq_free_request(req); + return res; +} + +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result) +{ + return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); +} + +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_command *cmd, u32 *result) +{ + int res; + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), + false); + if (IS_ERR(req)) + return PTR_ERR(req); + res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); + blk_mq_free_request(req); + return res; +} + +static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(id); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + + memset(&c, 0, sizeof(c)); + c.create_cq.opcode = nvme_admin_create_cq; + c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); + c.create_cq.cqid = cpu_to_le16(qid); + c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_cq.cq_flags = cpu_to_le16(flags); + c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, + struct nvme_queue *nvmeq) +{ + struct nvme_command c; + int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + + memset(&c, 0, sizeof(c)); + c.create_sq.opcode = nvme_admin_create_sq; + c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); + c.create_sq.sqid = cpu_to_le16(qid); + c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); + c.create_sq.sq_flags = cpu_to_le16(flags); + c.create_sq.cqid = cpu_to_le16(qid); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); +} + +static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) +{ + return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); +} + +int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, + dma_addr_t dma_addr) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.identify.opcode = nvme_admin_identify; + c.identify.nsid = cpu_to_le32(nsid); + c.identify.prp1 = cpu_to_le64(dma_addr); + c.identify.cns = cpu_to_le32(cns); + + return nvme_submit_admin_cmd(dev, &c, NULL); +} + +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return nvme_submit_admin_cmd(dev, &c, result); +} + +/** + * nvme_abort_req - Attempt aborting a request + * + * Schedule controller reset if the command was already aborted once before and + * still hasn't been returned to the driver, or if this is the admin queue. + */ +static void nvme_abort_req(struct request *req) +{ + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; + struct nvme_dev *dev = nvmeq->dev; + struct request *abort_req; + struct nvme_cmd_info *abort_cmd; + struct nvme_command cmd; + + if (!nvmeq->qid || cmd_rq->aborted) { + unsigned long flags; + + spin_lock_irqsave(&dev_list_lock, flags); + if (work_busy(&dev->reset_work)) + goto out; + list_del_init(&dev->node); + dev_warn(&dev->pci_dev->dev, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + out: + spin_unlock_irqrestore(&dev_list_lock, flags); + return; + } + + if (!dev->abort_limit) + return; + + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, + false); + if (IS_ERR(abort_req)) + return; + + abort_cmd = blk_mq_rq_to_pdu(abort_req); + nvme_set_info(abort_cmd, abort_req, abort_completion); + + memset(&cmd, 0, sizeof(cmd)); + cmd.abort.opcode = nvme_admin_abort_cmd; + cmd.abort.cid = req->tag; + cmd.abort.sqid = cpu_to_le16(nvmeq->qid); + cmd.abort.command_id = abort_req->tag; + + --dev->abort_limit; + cmd_rq->aborted = 1; + + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, + nvmeq->qid); + if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { + dev_warn(nvmeq->q_dmadev, + "Could not abort I/O %d QID %d", + req->tag, nvmeq->qid); + blk_mq_free_request(abort_req); + } +} + +static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, + struct request *req, void *data, bool reserved) +{ + struct nvme_queue *nvmeq = data; + void *ctx; + nvme_completion_fn fn; + struct nvme_cmd_info *cmd; + struct nvme_completion cqe; + + if (!blk_mq_request_started(req)) + return; + + cmd = blk_mq_rq_to_pdu(req); + + if (cmd->ctx == CMD_CTX_CANCELLED) + return; + + if (blk_queue_dying(req->q)) + cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); + else + cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); + + + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", + req->tag, nvmeq->qid); + ctx = cancel_cmd_info(cmd, &fn); + fn(nvmeq, ctx, &cqe); +} + +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) +{ + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; + + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); + spin_lock_irq(&nvmeq->q_lock); + nvme_abort_req(req); + spin_unlock_irq(&nvmeq->q_lock); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} + +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ + dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), + (void *)nvmeq->cqes, nvmeq->cq_dma_addr); + dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), + nvmeq->sq_cmds, nvmeq->sq_dma_addr); + kfree(nvmeq); +} + +static void nvme_free_queues(struct nvme_dev *dev, int lowest) +{ + int i; + + for (i = dev->queue_count - 1; i >= lowest; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + dev->queue_count--; + dev->queues[i] = NULL; + nvme_free_queue(nvmeq); + } +} + +/** + * nvme_suspend_queue - put queue into suspended state + * @nvmeq - queue to suspend + */ +static int nvme_suspend_queue(struct nvme_queue *nvmeq) +{ + int vector; + + spin_lock_irq(&nvmeq->q_lock); + if (nvmeq->cq_vector == -1) { + spin_unlock_irq(&nvmeq->q_lock); + return 1; + } + vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; + nvmeq->dev->online_queues--; + nvmeq->cq_vector = -1; + spin_unlock_irq(&nvmeq->q_lock); + + if (!nvmeq->qid && nvmeq->dev->admin_q) + blk_mq_freeze_queue_start(nvmeq->dev->admin_q); + + irq_set_affinity_hint(vector, NULL); + free_irq(vector, nvmeq); + + return 0; +} + +static void nvme_clear_queue(struct nvme_queue *nvmeq) +{ + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + + spin_lock_irq(&nvmeq->q_lock); + if (hctx && hctx->tags) + blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); + spin_unlock_irq(&nvmeq->q_lock); +} + +static void nvme_disable_queue(struct nvme_dev *dev, int qid) +{ + struct nvme_queue *nvmeq = dev->queues[qid]; + + if (!nvmeq) + return; + if (nvme_suspend_queue(nvmeq)) + return; + + /* Don't tell the adapter to delete the admin queue. + * Don't tell a removed adapter to delete IO queues. */ + if (qid && readl(&dev->bar->csts) != -1) { + adapter_delete_sq(dev, qid); + adapter_delete_cq(dev, qid); + } + + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); +} + +static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth) +{ + struct device *dmadev = &dev->pci_dev->dev; + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); + if (!nvmeq) + return NULL; + + nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); + if (!nvmeq->cqes) + goto free_nvmeq; + + nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + goto free_cqdma; + + nvmeq->q_dmadev = dmadev; + nvmeq->dev = dev; + snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", + dev->instance, qid); + spin_lock_init(&nvmeq->q_lock); + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + nvmeq->q_depth = depth; + nvmeq->qid = qid; + dev->queue_count++; + dev->queues[qid] = nvmeq; + + return nvmeq; + + free_cqdma: + dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); + free_nvmeq: + kfree(nvmeq); + return NULL; +} + +static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, + const char *name) +{ + if (use_threaded_interrupts) + return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, + nvme_irq_check, nvme_irq, IRQF_SHARED, + name, nvmeq); + return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, + IRQF_SHARED, name, nvmeq); +} + +static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) +{ + struct nvme_dev *dev = nvmeq->dev; + + spin_lock_irq(&nvmeq->q_lock); + nvmeq->sq_tail = 0; + nvmeq->cq_head = 0; + nvmeq->cq_phase = 1; + nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); +} + +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) +{ + struct nvme_dev *dev = nvmeq->dev; + int result; + + nvmeq->cq_vector = qid - 1; + result = adapter_alloc_cq(dev, qid, nvmeq); + if (result < 0) + return result; + + result = adapter_alloc_sq(dev, qid, nvmeq); + if (result < 0) + goto release_cq; + + result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + if (result < 0) + goto release_sq; + + nvme_init_queue(nvmeq, qid); + return result; + + release_sq: + adapter_delete_sq(dev, qid); + release_cq: + adapter_delete_cq(dev, qid); + return result; +} + +static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) +{ + unsigned long timeout; + u32 bit = enabled ? NVME_CSTS_RDY : 0; + + timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + + while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return 0; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) +{ + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config &= ~NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); + + return nvme_wait_ready(dev, cap, false); +} + +static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) +{ + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); + + return nvme_wait_ready(dev, cap, true); +} + +static int nvme_shutdown_ctrl(struct nvme_dev *dev) +{ + unsigned long timeout; + + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_SHN_NORMAL; + + writel(dev->ctrl_config, &dev->bar->cc); + + timeout = SHUTDOWN_TIMEOUT + jiffies; + while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != + NVME_CSTS_SHST_CMPLT) { + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(&dev->pci_dev->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return 0; +} + +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_admin_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_exit_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .exit_hctx = nvme_exit_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { + blk_cleanup_queue(dev->admin_q); + blk_mq_free_tag_set(&dev->admin_tagset); + } +} + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.reserved_tags = 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->admin_q)) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + if (!blk_get_queue(dev->admin_q)) { + nvme_dev_remove_admin(dev); + return -ENODEV; + } + } else + blk_mq_unfreeze_queue(dev->admin_q); + + return 0; +} + +static int nvme_configure_admin_queue(struct nvme_dev *dev) +{ + int result; + u32 aqa; + u64 cap = readq(&dev->bar->cap); + struct nvme_queue *nvmeq; + unsigned page_shift = PAGE_SHIFT; + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; + unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; + + if (page_shift < dev_page_min) { + dev_err(&dev->pci_dev->dev, + "Minimum device page size (%u) too large for " + "host (%u)\n", 1 << dev_page_min, + 1 << page_shift); + return -ENODEV; + } + if (page_shift > dev_page_max) { + dev_info(&dev->pci_dev->dev, + "Device maximum page size (%u) smaller than " + "host (%u); enabling work-around\n", + 1 << dev_page_max, 1 << page_shift); + page_shift = dev_page_max; + } + + result = nvme_disable_ctrl(dev, cap); + if (result < 0) + return result; + + nvmeq = dev->queues[0]; + if (!nvmeq) { + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + if (!nvmeq) + return -ENOMEM; + } + + aqa = nvmeq->q_depth - 1; + aqa |= aqa << 16; + + dev->page_size = 1 << page_shift; + + dev->ctrl_config = NVME_CC_CSS_NVM; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + + writel(aqa, &dev->bar->aqa); + writeq(nvmeq->sq_dma_addr, &dev->bar->asq); + writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + + result = nvme_enable_ctrl(dev, cap); + if (result) + goto free_nvmeq; + + nvmeq->cq_vector = 0; + result = queue_request_irq(dev, nvmeq, nvmeq->irqname); + if (result) + goto free_nvmeq; + + return result; + + free_nvmeq: + nvme_free_queues(dev, 0); + return result; +} + +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, + unsigned long addr, unsigned length) +{ + int i, err, count, nents, offset; + struct scatterlist *sg; + struct page **pages; + struct nvme_iod *iod; + + if (addr & 3) + return ERR_PTR(-EINVAL); + if (!length || length > INT_MAX - PAGE_SIZE) + return ERR_PTR(-EINVAL); + + offset = offset_in_page(addr); + count = DIV_ROUND_UP(offset + length, PAGE_SIZE); + pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + err = get_user_pages_fast(addr, count, 1, pages); + if (err < count) { + count = err; + err = -EFAULT; + goto put_pages; + } + + err = -ENOMEM; + iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); + if (!iod) + goto put_pages; + + sg = iod->sg; + sg_init_table(sg, count); + for (i = 0; i < count; i++) { + sg_set_page(&sg[i], pages[i], + min_t(unsigned, length, PAGE_SIZE - offset), + offset); + length -= (PAGE_SIZE - offset); + offset = 0; + } + sg_mark_end(&sg[i - 1]); + iod->nents = count; + + nents = dma_map_sg(&dev->pci_dev->dev, sg, count, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (!nents) + goto free_iod; + + kfree(pages); + return iod; + + free_iod: + kfree(iod); + put_pages: + for (i = 0; i < count; i++) + put_page(pages[i]); + kfree(pages); + return ERR_PTR(err); +} + +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, + struct nvme_iod *iod) +{ + int i; + + dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, + write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + + for (i = 0; i < iod->nents; i++) + put_page(sg_page(&iod->sg[i])); +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_dev *dev = ns->dev; + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len, prp_len; + int status, write; + struct nvme_iod *iod; + dma_addr_t meta_dma = 0; + void *meta = NULL; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + + if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext) + return -EINVAL; + else if (meta_len && ns->ext) { + length += meta_len; + meta_len = 0; + } + + metadata = (void __user *)(unsigned long)io.metadata; + + write = io.opcode & 1; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + iod = nvme_map_user_pages(dev, write, io.addr, length); + break; + default: + return -EINVAL; + } + + if (IS_ERR(iod)) + return PTR_ERR(iod); + + prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL); + if (length != prp_len) { + status = -ENOMEM; + goto unmap; + } + if (meta_len) { + meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, + &meta_dma, GFP_KERNEL); + + if (!meta) { + status = -ENOMEM; + goto unmap; + } + if (write) { + if (copy_from_user(meta, metadata, meta_len)) { + status = -EFAULT; + goto unmap; + } + } + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.rw.prp2 = cpu_to_le64(iod->first_dma); + c.rw.metadata = cpu_to_le64(meta_dma); + status = nvme_submit_io_cmd(dev, ns, &c, NULL); + unmap: + nvme_unmap_user_pages(dev, write, iod); + nvme_free_iod(dev, iod); + if (meta) { + if (status == NVME_SC_SUCCESS && !write) { + if (copy_to_user(metadata, meta, meta_len)) + status = -EFAULT; + } + dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma); + } + return status; +} + +static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + int status, length; + struct nvme_iod *uninitialized_var(iod); + unsigned timeout; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + length = cmd.data_len; + if (cmd.data_len) { + iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, + length); + if (IS_ERR(iod)) + return PTR_ERR(iod); + length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); + c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.common.prp2 = cpu_to_le64(iod->first_dma); + } + + timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : + ADMIN_TIMEOUT; + + if (length != cmd.data_len) + status = -ENOMEM; + else if (ns) { + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, + (GFP_KERNEL|__GFP_WAIT), false); + if (IS_ERR(req)) + status = PTR_ERR(req); + else { + status = nvme_submit_sync_cmd(req, &c, &cmd.result, + timeout); + blk_mq_free_request(req); + } + } else + status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); + + if (cmd.data_len) { + nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); + nvme_free_iod(dev, iod); + } + + if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, + sizeof(cmd.result))) + status = -EFAULT; + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SG_IO: + return -ENOIOCTLCMD; + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct nvme_ns *ns; + + spin_lock(&dev_list_lock); + ns = bdev->bd_disk->private_data; + if (!ns) + ret = -ENXIO; + else if (!kref_get_unless_zero(&ns->dev->kref)) + ret = -ENXIO; + spin_unlock(&dev_list_lock); + + return ret; +} + +static void nvme_free_dev(struct kref *kref); + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_dev *dev = ns->dev; + + kref_put(&dev->kref, nvme_free_dev); +} + +static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_dev *dev = ns->dev; + struct nvme_id_ns *id; + dma_addr_t dma_addr; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; + + id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + if (!id) { + dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", + __func__); + return 0; + } + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { + dev_warn(&dev->pci_dev->dev, + "identify failed ns:%d, setting capacity to 0\n", + ns->ns_id); + memset(id, 0, sizeof(*id)); + } + + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? + id->dps & NVME_NS_DPS_PI_MASK : 0; + + if (blk_get_integrity(disk) && (ns->pi_type != pi_type || + ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) && + !ns->ext) + nvme_init_integrity(ns); + + if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk))) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; +} + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, +}; + +static int nvme_kthread(void *data) +{ + struct nvme_dev *dev, *next; + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock(&dev_list_lock); + list_for_each_entry_safe(dev, next, &dev_list, node) { + int i; + if (readl(&dev->bar->csts) & NVME_CSTS_CFS) { + if (work_busy(&dev->reset_work)) + continue; + list_del_init(&dev->node); + dev_warn(&dev->pci_dev->dev, + "Failed status: %x, reset controller\n", + readl(&dev->bar->csts)); + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + continue; + } + for (i = 0; i < dev->queue_count; i++) { + struct nvme_queue *nvmeq = dev->queues[i]; + if (!nvmeq) + continue; + spin_lock_irq(&nvmeq->q_lock); + nvme_process_cq(nvmeq); + + while ((i == 0) && (dev->event_limit > 0)) { + if (nvme_submit_async_admin_req(dev)) + break; + dev->event_limit--; + } + spin_unlock_irq(&nvmeq->q_lock); + } + } + spin_unlock(&dev_list_lock); + schedule_timeout(round_jiffies_relative(HZ)); + } + return 0; +} + +static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int node = dev_to_node(&dev->pci_dev->dev); + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) + return; + + ns->queue = blk_mq_init_queue(&dev->tagset); + if (IS_ERR(ns->queue)) + goto out_free_ns; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); + ns->dev = dev; + ns->queue->queuedata = ns; + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_queue; + + ns->ns_id = nsid; + ns->disk = disk; + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + list_add_tail(&ns->list, &dev->namespaces); + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (dev->max_hw_sectors) + blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); + if (dev->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); + + disk->major = nvme_major; + disk->first_minor = 0; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = dev->device; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); + + /* + * Initialize capacity to 0 until we establish the namespace format and + * setup integrity extentions if necessary. The revalidate_disk after + * add_disk allows the driver to register with integrity if the format + * requires it. + */ + set_capacity(disk, 0); + nvme_revalidate_disk(ns->disk); + add_disk(ns->disk); + if (ns->ms) + revalidate_disk(ns->disk); + return; + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); +} + +static void nvme_create_io_queues(struct nvme_dev *dev) +{ + unsigned i; + + for (i = dev->queue_count; i <= dev->max_qid; i++) + if (!nvme_alloc_queue(dev, i, dev->q_depth)) + break; + + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) + if (nvme_create_queue(dev->queues[i], i)) + break; +} + +static int set_queue_count(struct nvme_dev *dev, int count) +{ + int status; + u32 result; + u32 q_count = (count - 1) | ((count - 1) << 16); + + status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status < 0) + return status; + if (status > 0) { + dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", + status); + return 0; + } + return min(result & 0xffff, result >> 16) + 1; +} + +static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) +{ + return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); +} + +static int nvme_setup_io_queues(struct nvme_dev *dev) +{ + struct nvme_queue *adminq = dev->queues[0]; + struct pci_dev *pdev = dev->pci_dev; + int result, i, vecs, nr_io_queues, size; + + nr_io_queues = num_possible_cpus(); + result = set_queue_count(dev, nr_io_queues); + if (result <= 0) + return result; + if (result < nr_io_queues) + nr_io_queues = result; + + size = db_bar_size(dev, nr_io_queues); + if (size > 8192) { + iounmap(dev->bar); + do { + dev->bar = ioremap(pci_resource_start(pdev, 0), size); + if (dev->bar) + break; + if (!--nr_io_queues) + return -ENOMEM; + size = db_bar_size(dev, nr_io_queues); + } while (1); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + adminq->q_db = dev->dbs; + } + + /* Deregister the admin queue's interrupt */ + free_irq(dev->entry[0].vector, adminq); + + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + if (!pdev->irq) + pci_disable_msix(pdev); + + for (i = 0; i < nr_io_queues; i++) + dev->entry[i].entry = i; + vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); + if (vecs < 0) { + vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); + if (vecs < 0) { + vecs = 1; + } else { + for (i = 0; i < vecs; i++) + dev->entry[i].vector = i + pdev->irq; + } + } + + /* + * Should investigate if there's a performance win from allocating + * more queues than interrupt vectors; it might allow the submission + * path to scale better, even if the receive path is limited by the + * number of interrupts. + */ + nr_io_queues = vecs; + dev->max_qid = nr_io_queues; + + result = queue_request_irq(dev, adminq, adminq->irqname); + if (result) + goto free_queues; + + /* Free previously allocated queues that are no longer usable */ + nvme_free_queues(dev, nr_io_queues + 1); + nvme_create_io_queues(dev); + + return 0; + + free_queues: + nvme_free_queues(dev, 1); + return result; +} + +/* + * Return: error value if an error occurred setting up the queues or calling + * Identify Device. 0 if these succeeded, even if adding some of the + * namespaces failed. At the moment, these failures are silent. TBD which + * failures should be reported. + */ +static int nvme_dev_add(struct nvme_dev *dev) +{ + struct pci_dev *pdev = dev->pci_dev; + int res; + unsigned nn, i; + struct nvme_id_ctrl *ctrl; + void *mem; + dma_addr_t dma_addr; + int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; + + mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); + if (!mem) + return -ENOMEM; + + res = nvme_identify(dev, 0, 1, dma_addr); + if (res) { + dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + return -EIO; + } + + ctrl = mem; + nn = le32_to_cpup(&ctrl->nn); + dev->oncs = le16_to_cpup(&ctrl->oncs); + dev->abort_limit = ctrl->acl + 1; + dev->vwc = ctrl->vwc; + memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); + memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); + memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); + if (ctrl->mdts) + dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); + if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && + (pdev->device == 0x0953) && ctrl->vs[3]) { + unsigned int max_hw_sectors; + + dev->stripe_size = 1 << (ctrl->vs[3] + shift); + max_hw_sectors = dev->stripe_size >> (shift - 9); + if (dev->max_hw_sectors) { + dev->max_hw_sectors = min(max_hw_sectors, + dev->max_hw_sectors); + } else + dev->max_hw_sectors = max_hw_sectors; + } + dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + return 0; + + for (i = 1; i <= nn; i++) + nvme_alloc_ns(dev, i); + + return 0; +} + +static int nvme_dev_map(struct nvme_dev *dev) +{ + u64 cap; + int bars, result = -ENOMEM; + struct pci_dev *pdev = dev->pci_dev; + + if (pci_enable_device_mem(pdev)) + return result; + + dev->entry[0].vector = pdev->irq; + pci_set_master(pdev); + bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + goto disable_pci; + + if (pci_request_selected_regions(pdev, bars, "nvme")) + goto disable_pci; + + if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) + goto disable; + + dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); + if (!dev->bar) + goto disable; + + if (readl(&dev->bar->csts) == -1) { + result = -ENODEV; + goto unmap; + } + + /* + * Some devices don't advertse INTx interrupts, pre-enable a single + * MSIX vec for setup. We'll adjust this later. + */ + if (!pdev->irq) { + result = pci_enable_msix(pdev, dev->entry, 1); + if (result < 0) + goto unmap; + } + + cap = readq(&dev->bar->cap); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); + dev->db_stride = 1 << NVME_CAP_STRIDE(cap); + dev->dbs = ((void __iomem *)dev->bar) + 4096; + + return 0; + + unmap: + iounmap(dev->bar); + dev->bar = NULL; + disable: + pci_release_regions(pdev); + disable_pci: + pci_disable_device(pdev); + return result; +} + +static void nvme_dev_unmap(struct nvme_dev *dev) +{ + if (dev->pci_dev->msi_enabled) + pci_disable_msi(dev->pci_dev); + else if (dev->pci_dev->msix_enabled) + pci_disable_msix(dev->pci_dev); + + if (dev->bar) { + iounmap(dev->bar); + dev->bar = NULL; + pci_release_regions(dev->pci_dev); + } + + if (pci_is_enabled(dev->pci_dev)) + pci_disable_device(dev->pci_dev); +} + +struct nvme_delq_ctx { + struct task_struct *waiter; + struct kthread_worker *worker; + atomic_t refcount; +}; + +static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) +{ + dq->waiter = current; + mb(); + + for (;;) { + set_current_state(TASK_KILLABLE); + if (!atomic_read(&dq->refcount)) + break; + if (!schedule_timeout(ADMIN_TIMEOUT) || + fatal_signal_pending(current)) { + /* + * Disable the controller first since we can't trust it + * at this point, but leave the admin queue enabled + * until all queue deletion requests are flushed. + * FIXME: This may take a while if there are more h/w + * queues than admin tags. + */ + set_current_state(TASK_RUNNING); + nvme_disable_ctrl(dev, readq(&dev->bar->cap)); + nvme_clear_queue(dev->queues[0]); + flush_kthread_worker(dq->worker); + nvme_disable_queue(dev, 0); + return; + } + } + set_current_state(TASK_RUNNING); +} + +static void nvme_put_dq(struct nvme_delq_ctx *dq) +{ + atomic_dec(&dq->refcount); + if (dq->waiter) + wake_up_process(dq->waiter); +} + +static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) +{ + atomic_inc(&dq->refcount); + return dq; +} + +static void nvme_del_queue_end(struct nvme_queue *nvmeq) +{ + struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; + nvme_put_dq(dq); +} + +static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, + kthread_work_func_t fn) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.delete_queue.opcode = opcode; + c.delete_queue.qid = cpu_to_le16(nvmeq->qid); + + init_kthread_work(&nvmeq->cmdinfo.work, fn); + return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, + ADMIN_TIMEOUT); +} + +static void nvme_del_cq_work_handler(struct kthread_work *work) +{ + struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, + cmdinfo.work); + nvme_del_queue_end(nvmeq); +} + +static int nvme_delete_cq(struct nvme_queue *nvmeq) +{ + return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, + nvme_del_cq_work_handler); +} + +static void nvme_del_sq_work_handler(struct kthread_work *work) +{ + struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, + cmdinfo.work); + int status = nvmeq->cmdinfo.status; + + if (!status) + status = nvme_delete_cq(nvmeq); + if (status) + nvme_del_queue_end(nvmeq); +} + +static int nvme_delete_sq(struct nvme_queue *nvmeq) +{ + return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, + nvme_del_sq_work_handler); +} + +static void nvme_del_queue_start(struct kthread_work *work) +{ + struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, + cmdinfo.work); + if (nvme_delete_sq(nvmeq)) + nvme_del_queue_end(nvmeq); +} + +static void nvme_disable_io_queues(struct nvme_dev *dev) +{ + int i; + DEFINE_KTHREAD_WORKER_ONSTACK(worker); + struct nvme_delq_ctx dq; + struct task_struct *kworker_task = kthread_run(kthread_worker_fn, + &worker, "nvme%d", dev->instance); + + if (IS_ERR(kworker_task)) { + dev_err(&dev->pci_dev->dev, + "Failed to create queue del task\n"); + for (i = dev->queue_count - 1; i > 0; i--) + nvme_disable_queue(dev, i); + return; + } + + dq.waiter = NULL; + atomic_set(&dq.refcount, 0); + dq.worker = &worker; + for (i = dev->queue_count - 1; i > 0; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + if (nvme_suspend_queue(nvmeq)) + continue; + nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); + nvmeq->cmdinfo.worker = dq.worker; + init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); + queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); + } + nvme_wait_dq(&dq, dev); + kthread_stop(kworker_task); +} + +/* +* Remove the node from the device list and check +* for whether or not we need to stop the nvme_thread. +*/ +static void nvme_dev_list_remove(struct nvme_dev *dev) +{ + struct task_struct *tmp = NULL; + + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { + tmp = nvme_thread; + nvme_thread = NULL; + } + spin_unlock(&dev_list_lock); + + if (tmp) + kthread_stop(tmp); +} + +static void nvme_freeze_queues(struct nvme_dev *dev) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + blk_mq_freeze_queue_start(ns->queue); + + spin_lock(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } +} + +static void nvme_unfreeze_queues(struct nvme_dev *dev) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); + blk_mq_unfreeze_queue(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } +} + +static void nvme_dev_shutdown(struct nvme_dev *dev) +{ + int i; + u32 csts = -1; + + nvme_dev_list_remove(dev); + + if (dev->bar) { + nvme_freeze_queues(dev); + csts = readl(&dev->bar->csts); + } + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { + for (i = dev->queue_count - 1; i >= 0; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + nvme_suspend_queue(nvmeq); + } + } else { + nvme_disable_io_queues(dev); + nvme_shutdown_ctrl(dev); + nvme_disable_queue(dev, 0); + } + nvme_dev_unmap(dev); + + for (i = dev->queue_count - 1; i >= 0; i--) + nvme_clear_queue(dev->queues[i]); +} + +static void nvme_dev_remove(struct nvme_dev *dev) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + del_gendisk(ns->disk); + } + if (!blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } + } +} + +static int nvme_setup_prp_pools(struct nvme_dev *dev) +{ + struct device *dmadev = &dev->pci_dev->dev; + dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + PAGE_SIZE, PAGE_SIZE, 0); + if (!dev->prp_page_pool) + return -ENOMEM; + + /* Optimisation for I/Os between 4k and 128k */ + dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + 256, 256, 0); + if (!dev->prp_small_pool) { + dma_pool_destroy(dev->prp_page_pool); + return -ENOMEM; + } + return 0; +} + +static void nvme_release_prp_pools(struct nvme_dev *dev) +{ + dma_pool_destroy(dev->prp_page_pool); + dma_pool_destroy(dev->prp_small_pool); +} + +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_dev *dev) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + dev->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_dev *dev) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, dev->instance); + spin_unlock(&dev_list_lock); +} + +static void nvme_free_namespaces(struct nvme_dev *dev) +{ + struct nvme_ns *ns, *next; + + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + put_disk(ns->disk); + kfree(ns); + } +} + +static void nvme_free_dev(struct kref *kref) +{ + struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + + pci_dev_put(dev->pci_dev); + put_device(dev->device); + nvme_free_namespaces(dev); + nvme_release_instance(dev); + blk_mq_free_tag_set(&dev->tagset); + blk_put_queue(dev->admin_q); + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); +} + +static int nvme_dev_open(struct inode *inode, struct file *f) +{ + struct nvme_dev *dev; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(dev, &dev_list, node) { + if (dev->instance == instance) { + if (!dev->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&dev->kref)) + break; + f->private_data = dev; + ret = 0; + break; + } + } + spin_unlock(&dev_list_lock); + + return ret; +} + +static int nvme_dev_release(struct inode *inode, struct file *f) +{ + struct nvme_dev *dev = f->private_data; + kref_put(&dev->kref, nvme_free_dev); + return 0; +} + +static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) +{ + struct nvme_dev *dev = f->private_data; + struct nvme_ns *ns; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(dev, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + if (list_empty(&dev->namespaces)) + return -ENOTTY; + ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + return nvme_user_cmd(dev, ns, (void __user *)arg); + default: + return -ENOTTY; + } +} + +static const struct file_operations nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static void nvme_set_irq_hints(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->hctx) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->hctx->cpumask); + } +} + +static int nvme_dev_start(struct nvme_dev *dev) +{ + int result; + bool start_thread = false; + + result = nvme_dev_map(dev); + if (result) + return result; + + result = nvme_configure_admin_queue(dev); + if (result) + goto unmap; + + spin_lock(&dev_list_lock); + if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; + } + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); + + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up_all(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); + + if (IS_ERR_OR_NULL(nvme_thread)) { + result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + goto disable; + } + + nvme_init_queue(dev->queues[0], 0); + result = nvme_alloc_admin_tags(dev); + if (result) + goto disable; + + result = nvme_setup_io_queues(dev); + if (result) + goto free_tags; + + nvme_set_irq_hints(dev); + + dev->event_limit = 1; + return result; + + free_tags: + nvme_dev_remove_admin(dev); + disable: + nvme_disable_queue(dev, 0); + nvme_dev_list_remove(dev); + unmap: + nvme_dev_unmap(dev); + return result; +} + +static int nvme_remove_dead_ctrl(void *arg) +{ + struct nvme_dev *dev = (struct nvme_dev *)arg; + struct pci_dev *pdev = dev->pci_dev; + + if (pci_get_drvdata(pdev)) + pci_stop_and_remove_bus_device_locked(pdev); + kref_put(&dev->kref, nvme_free_dev); + return 0; +} + +static void nvme_remove_disks(struct work_struct *ws) +{ + struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); + + nvme_free_queues(dev, 1); + nvme_dev_remove(dev); +} + +static int nvme_dev_resume(struct nvme_dev *dev) +{ + int ret; + + ret = nvme_dev_start(dev); + if (ret) + return ret; + if (dev->online_queues < 2) { + spin_lock(&dev_list_lock); + dev->reset_workfn = nvme_remove_disks; + queue_work(nvme_workq, &dev->reset_work); + spin_unlock(&dev_list_lock); + } else { + nvme_unfreeze_queues(dev); + nvme_set_irq_hints(dev); + } + return 0; +} + +static void nvme_dev_reset(struct nvme_dev *dev) +{ + nvme_dev_shutdown(dev); + if (nvme_dev_resume(dev)) { + dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); + kref_get(&dev->kref); + if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", + dev->instance))) { + dev_err(&dev->pci_dev->dev, + "Failed to start controller remove task\n"); + kref_put(&dev->kref, nvme_free_dev); + } + } +} + +static void nvme_reset_failed_dev(struct work_struct *ws) +{ + struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); + nvme_dev_reset(dev); +} + +static void nvme_reset_workfn(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + dev->reset_workfn(work); +} + +static void nvme_async_probe(struct work_struct *work); +static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int node, result = -ENOMEM; + struct nvme_dev *dev; + + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, 0); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); + if (!dev) + return -ENOMEM; + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); + if (!dev->entry) + goto free; + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); + if (!dev->queues) + goto free; + + INIT_LIST_HEAD(&dev->namespaces); + dev->reset_workfn = nvme_reset_failed_dev; + INIT_WORK(&dev->reset_work, nvme_reset_workfn); + dev->pci_dev = pci_dev_get(pdev); + pci_set_drvdata(pdev, dev); + result = nvme_set_instance(dev); + if (result) + goto put_pci; + + result = nvme_setup_prp_pools(dev); + if (result) + goto release; + + kref_init(&dev->kref); + dev->device = device_create(nvme_class, &pdev->dev, + MKDEV(nvme_char_major, dev->instance), + dev, "nvme%d", dev->instance); + if (IS_ERR(dev->device)) { + result = PTR_ERR(dev->device); + goto release_pools; + } + get_device(dev->device); + + INIT_LIST_HEAD(&dev->node); + INIT_WORK(&dev->probe_work, nvme_async_probe); + schedule_work(&dev->probe_work); + return 0; + + release_pools: + nvme_release_prp_pools(dev); + release: + nvme_release_instance(dev); + put_pci: + pci_dev_put(dev->pci_dev); + free: + kfree(dev->queues); + kfree(dev->entry); + kfree(dev); + return result; +} + +static void nvme_async_probe(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); + int result; + + result = nvme_dev_start(dev); + if (result) + goto reset; + + if (dev->online_queues > 1) + result = nvme_dev_add(dev); + if (result) + goto reset; + + nvme_set_irq_hints(dev); + return; + reset: + if (!work_busy(&dev->reset_work)) { + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + } +} + +static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + if (prepare) + nvme_dev_shutdown(dev); + else + nvme_dev_resume(dev); +} + +static void nvme_shutdown(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + nvme_dev_shutdown(dev); +} + +static void nvme_remove(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + spin_lock(&dev_list_lock); + list_del_init(&dev->node); + spin_unlock(&dev_list_lock); + + pci_set_drvdata(pdev, NULL); + flush_work(&dev->probe_work); + flush_work(&dev->reset_work); + nvme_dev_shutdown(dev); + nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); + nvme_free_queues(dev, 0); + nvme_release_prp_pools(dev); + kref_put(&dev->kref, nvme_free_dev); +} + +/* These functions are yet to be implemented */ +#define nvme_error_detected NULL +#define nvme_dump_registers NULL +#define nvme_link_reset NULL +#define nvme_slot_reset NULL +#define nvme_error_resume NULL + +#ifdef CONFIG_PM_SLEEP +static int nvme_suspend(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + nvme_dev_shutdown(ndev); + return 0; +} + +static int nvme_resume(struct device *dev) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct nvme_dev *ndev = pci_get_drvdata(pdev); + + if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { + ndev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &ndev->reset_work); + } + return 0; +} +#endif + +static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); + +static const struct pci_error_handlers nvme_err_handler = { + .error_detected = nvme_error_detected, + .mmio_enabled = nvme_dump_registers, + .link_reset = nvme_link_reset, + .slot_reset = nvme_slot_reset, + .resume = nvme_error_resume, + .reset_notify = nvme_reset_notify, +}; + +/* Move to pci_ids.h later */ +#define PCI_CLASS_STORAGE_EXPRESS 0x010802 + +static const struct pci_device_id nvme_id_table[] = { + { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, nvme_id_table); + +static struct pci_driver nvme_driver = { + .name = "nvme", + .id_table = nvme_id_table, + .probe = nvme_probe, + .remove = nvme_remove, + .shutdown = nvme_shutdown, + .driver = { + .pm = &nvme_dev_pm_ops, + }, + .err_handler = &nvme_err_handler, +}; + +static int __init nvme_init(void) +{ + int result; + + init_waitqueue_head(&nvme_kthread_wait); + + nvme_workq = create_singlethread_workqueue("nvme"); + if (!nvme_workq) + return -ENOMEM; + + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) + goto kill_workq; + else if (result > 0) + nvme_major = result; + + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + + result = pci_register_driver(&nvme_driver); + if (result) + goto destroy_class; + return 0; + + destroy_class: + class_destroy(nvme_class); + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + kill_workq: + destroy_workqueue(nvme_workq); + return result; +} + +static void __exit nvme_exit(void) +{ + pci_unregister_driver(&nvme_driver); + unregister_blkdev(nvme_major, "nvme"); + destroy_workqueue(nvme_workq); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); + _nvme_check_size(); +} + +MODULE_AUTHOR("Matthew Wilcox "); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1.0"); +module_init(nvme_init); +module_exit(nvme_exit); diff --git a/kernel/drivers/block/nvme-scsi.c b/kernel/drivers/block/nvme-scsi.c new file mode 100644 index 000000000..44f2514fb --- /dev/null +++ b/kernel/drivers/block/nvme-scsi.c @@ -0,0 +1,3070 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +/* + * Refer to the SCSI-NVMe Translation spec for details on how + * each command is translated. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int sg_version_num = 30534; /* 2 digits for each component */ + +#define SNTI_TRANSLATION_SUCCESS 0 +#define SNTI_INTERNAL_ERROR 1 + +/* VPD Page Codes */ +#define VPD_SUPPORTED_PAGES 0x00 +#define VPD_SERIAL_NUMBER 0x80 +#define VPD_DEVICE_IDENTIFIERS 0x83 +#define VPD_EXTENDED_INQUIRY 0x86 +#define VPD_BLOCK_LIMITS 0xB0 +#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 + +/* CDB offsets */ +#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6 +#define REPORT_LUNS_SR_OFFSET 2 +#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10 +#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4 +#define REQUEST_SENSE_DESC_OFFSET 1 +#define REQUEST_SENSE_DESC_MASK 0x01 +#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1 +#define INQUIRY_EVPD_BYTE_OFFSET 1 +#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2 +#define INQUIRY_EVPD_BIT_MASK 1 +#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3 +#define START_STOP_UNIT_CDB_IMMED_OFFSET 1 +#define START_STOP_UNIT_CDB_IMMED_MASK 0x1 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF +#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4 +#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0 +#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4 +#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4 +#define START_STOP_UNIT_CDB_START_OFFSET 4 +#define START_STOP_UNIT_CDB_START_MASK 0x1 +#define WRITE_BUFFER_CDB_MODE_OFFSET 1 +#define WRITE_BUFFER_CDB_MODE_MASK 0x1F +#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2 +#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3 +#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6 +#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1 +#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20 +#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10 +#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 +#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 +#define FORMAT_UNIT_PROT_INT_OFFSET 3 +#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 +#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 +#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET 7 + +/* Misc. defines */ +#define NIBBLE_SHIFT 4 +#define FIXED_SENSE_DATA 0x70 +#define DESC_FORMAT_SENSE_DATA 0x72 +#define FIXED_SENSE_DATA_ADD_LENGTH 10 +#define LUN_ENTRY_SIZE 8 +#define LUN_DATA_HEADER_SIZE 8 +#define ALL_LUNS_RETURNED 0x02 +#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 +#define RESTRICTED_LUNS_RETURNED 0x00 +#define NVME_POWER_STATE_START_VALID 0x00 +#define NVME_POWER_STATE_ACTIVE 0x01 +#define NVME_POWER_STATE_IDLE 0x02 +#define NVME_POWER_STATE_STANDBY 0x03 +#define NVME_POWER_STATE_LU_CONTROL 0x07 +#define POWER_STATE_0 0 +#define POWER_STATE_1 1 +#define POWER_STATE_2 2 +#define POWER_STATE_3 3 +#define DOWNLOAD_SAVE_ACTIVATE 0x05 +#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E +#define ACTIVATE_DEFERRED_MICROCODE 0x0F +#define FORMAT_UNIT_IMMED_MASK 0x2 +#define FORMAT_UNIT_IMMED_OFFSET 1 +#define KELVIN_TEMP_FACTOR 273 +#define FIXED_FMT_SENSE_DATA_SIZE 18 +#define DESC_FMT_SENSE_DATA_SIZE 8 + +/* SCSI/NVMe defines and bit masks */ +#define INQ_STANDARD_INQUIRY_PAGE 0x00 +#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 +#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 +#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 +#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 +#define INQ_BDEV_LIMITS_PAGE 0xB0 +#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 +#define INQ_SERIAL_NUMBER_LENGTH 0x14 +#define INQ_NUM_SUPPORTED_VPD_PAGES 6 +#define VERSION_SPC_4 0x06 +#define ACA_UNSUPPORTED 0 +#define STANDARD_INQUIRY_LENGTH 36 +#define ADDITIONAL_STD_INQ_LENGTH 31 +#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C +#define RESERVED_FIELD 0 + +/* SCSI READ/WRITE Defines */ +#define IO_CDB_WP_MASK 0xE0 +#define IO_CDB_WP_SHIFT 5 +#define IO_CDB_FUA_MASK 0x8 +#define IO_6_CDB_LBA_OFFSET 0 +#define IO_6_CDB_LBA_MASK 0x001FFFFF +#define IO_6_CDB_TX_LEN_OFFSET 4 +#define IO_6_DEFAULT_TX_LEN 256 +#define IO_10_CDB_LBA_OFFSET 2 +#define IO_10_CDB_TX_LEN_OFFSET 7 +#define IO_10_CDB_WP_OFFSET 1 +#define IO_10_CDB_FUA_OFFSET 1 +#define IO_12_CDB_LBA_OFFSET 2 +#define IO_12_CDB_TX_LEN_OFFSET 6 +#define IO_12_CDB_WP_OFFSET 1 +#define IO_12_CDB_FUA_OFFSET 1 +#define IO_16_CDB_FUA_OFFSET 1 +#define IO_16_CDB_WP_OFFSET 1 +#define IO_16_CDB_LBA_OFFSET 2 +#define IO_16_CDB_TX_LEN_OFFSET 10 + +/* Mode Sense/Select defines */ +#define MODE_PAGE_INFO_EXCEP 0x1C +#define MODE_PAGE_CACHING 0x08 +#define MODE_PAGE_CONTROL 0x0A +#define MODE_PAGE_POWER_CONDITION 0x1A +#define MODE_PAGE_RETURN_ALL 0x3F +#define MODE_PAGE_BLK_DES_LEN 0x08 +#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 +#define MODE_PAGE_CACHING_LEN 0x14 +#define MODE_PAGE_CONTROL_LEN 0x0C +#define MODE_PAGE_POW_CND_LEN 0x28 +#define MODE_PAGE_INF_EXC_LEN 0x0C +#define MODE_PAGE_ALL_LEN 0x54 +#define MODE_SENSE6_MPH_SIZE 4 +#define MODE_SENSE6_ALLOC_LEN_OFFSET 4 +#define MODE_SENSE_PAGE_CONTROL_OFFSET 2 +#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 +#define MODE_SENSE_PAGE_CODE_OFFSET 2 +#define MODE_SENSE_PAGE_CODE_MASK 0x3F +#define MODE_SENSE_LLBAA_OFFSET 1 +#define MODE_SENSE_LLBAA_MASK 0x10 +#define MODE_SENSE_LLBAA_SHIFT 4 +#define MODE_SENSE_DBD_OFFSET 1 +#define MODE_SENSE_DBD_MASK 8 +#define MODE_SENSE_DBD_SHIFT 3 +#define MODE_SENSE10_MPH_SIZE 8 +#define MODE_SENSE10_ALLOC_LEN_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1 +#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1 +#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4 +#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 +#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 +#define MODE_SELECT_6_BD_OFFSET 3 +#define MODE_SELECT_10_BD_OFFSET 6 +#define MODE_SELECT_10_LLBAA_OFFSET 4 +#define MODE_SELECT_10_LLBAA_MASK 1 +#define MODE_SELECT_6_MPH_SIZE 4 +#define MODE_SELECT_10_MPH_SIZE 8 +#define CACHING_MODE_PAGE_WCE_MASK 0x04 +#define MODE_SENSE_BLK_DESC_ENABLED 0 +#define MODE_SENSE_BLK_DESC_COUNT 1 +#define MODE_SELECT_PAGE_CODE_MASK 0x3F +#define SHORT_DESC_BLOCK 8 +#define LONG_DESC_BLOCK 16 +#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 +#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A +#define MODE_PAGE_CACHING_LEN_FIELD 0x12 +#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A +#define MODE_SENSE_PC_CURRENT_VALUES 0 + +/* Log Sense defines */ +#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 +#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 +#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F +#define LOG_PAGE_TEMPERATURE_PAGE 0x0D +#define LOG_SENSE_CDB_SP_OFFSET 1 +#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 +#define LOG_SENSE_CDB_PC_OFFSET 2 +#define LOG_SENSE_CDB_PC_MASK 0xC0 +#define LOG_SENSE_CDB_PC_SHIFT 6 +#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 +#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F +#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7 +#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 +#define LOG_INFO_EXCP_PAGE_LENGTH 0xC +#define REMAINING_TEMP_PAGE_LENGTH 0xC +#define LOG_TEMP_PAGE_LENGTH 0x10 +#define LOG_TEMP_UNKNOWN 0xFF +#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 + +/* Read Capacity defines */ +#define READ_CAP_10_RESP_SIZE 8 +#define READ_CAP_16_RESP_SIZE 32 + +/* NVMe Namespace and Command Defines */ +#define BYTES_TO_DWORDS 4 +#define NVME_MAX_FIRMWARE_SLOT 7 + +/* Report LUNs defines */ +#define REPORT_LUNS_FIRST_LUN_OFFSET 8 + +/* SCSI ADDITIONAL SENSE Codes */ + +#define SCSI_ASC_NO_SENSE 0x00 +#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 +#define SCSI_ASC_LUN_NOT_READY 0x04 +#define SCSI_ASC_WARNING 0x0B +#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 +#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D +#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 +#define SCSI_ASC_ILLEGAL_COMMAND 0x20 +#define SCSI_ASC_ILLEGAL_BLOCK 0x21 +#define SCSI_ASC_INVALID_CDB 0x24 +#define SCSI_ASC_INVALID_LUN 0x25 +#define SCSI_ASC_INVALID_PARAMETER 0x26 +#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 +#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 + +/* SCSI ADDITIONAL SENSE Code Qualifiers */ + +#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 +#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 +#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 +#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 +#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 +#define SCSI_ASCQ_INVALID_LUN_ID 0x09 + +/** + * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to + * enable DPOFUA support type 0x10 value. + */ +#define DEVICE_SPECIFIC_PARAMETER 0 +#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR) + +/* MACROs to extract information from CDBs */ + +#define GET_OPCODE(cdb) cdb[0] + +#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0) + +#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0)) + +#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \ +(cdb[index + 1] << 8) | \ +(cdb[index + 2] << 0)) + +#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \ +(cdb[index + 1] << 16) | \ +(cdb[index + 2] << 8) | \ +(cdb[index + 3] << 0)) + +#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \ +(((u64)cdb[index + 1]) << 48) | \ +(((u64)cdb[index + 2]) << 40) | \ +(((u64)cdb[index + 3]) << 32) | \ +(((u64)cdb[index + 4]) << 24) | \ +(((u64)cdb[index + 5]) << 16) | \ +(((u64)cdb[index + 6]) << 8) | \ +(((u64)cdb[index + 7]) << 0)) + +/* Inquiry Helper Macros */ +#define GET_INQ_EVPD_BIT(cdb) \ +((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \ +INQUIRY_EVPD_BIT_MASK) ? 1 : 0) + +#define GET_INQ_PAGE_CODE(cdb) \ +(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET)) + +#define GET_INQ_ALLOC_LENGTH(cdb) \ +(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET)) + +/* Report LUNs Helper Macros */ +#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET)) + +/* Read Capacity Helper Macros */ +#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) + +#define IS_READ_CAP_16(cdb) \ +((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) + +/* Request Sense Helper Macros */ +#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ +(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET)) + +/* Mode Sense Helper Macros */ +#define GET_MODE_SENSE_DBD(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \ +MODE_SENSE_DBD_SHIFT) + +#define GET_MODE_SENSE_LLBAA(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \ +MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT) + +#define GET_MODE_SENSE_MPH_SIZE(cdb10) \ +(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE) + + +/* Struct to gather data that needs to be extracted from a SCSI CDB. + Not conforming to any particular CDB variant, but compatible with all. */ + +struct nvme_trans_io_cdb { + u8 fua; + u8 prot_info; + u64 lba; + u32 xfer_len; +}; + + +/* Internal Helper Functions */ + + +/* Copy data to userspace memory */ + +static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = from; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec sgl; + + for (i = 0; i < hdr->iovec_count; i++) { + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + xfer_len = min(remaining, sgl.iov_len); + not_copied = copy_to_user(sgl.iov_base, index, + xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + not_copied = copy_to_user(hdr->dxferp, from, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Copy data from userspace memory */ + +static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = to; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec sgl; + + for (i = 0; i < hdr->iovec_count; i++) { + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + xfer_len = min(remaining, sgl.iov_len); + not_copied = copy_from_user(index, sgl.iov_base, + xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + + not_copied = copy_from_user(to, hdr->dxferp, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Status/Sense Buffer Writeback */ + +static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, + u8 asc, u8 ascq) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 xfer_len; + u8 resp[DESC_FMT_SENSE_DATA_SIZE]; + + if (scsi_status_is_good(status)) { + hdr->status = SAM_STAT_GOOD; + hdr->masked_status = GOOD; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + hdr->sb_len_wr = 0; + } else { + hdr->status = status; + hdr->masked_status = status >> 1; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + + memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); + resp[0] = DESC_FORMAT_SENSE_DATA; + resp[1] = sense_key; + resp[2] = asc; + resp[3] = ascq; + + xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); + hdr->sb_len_wr = xfer_len; + if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) + res = -EFAULT; + } + + return res; +} + +static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) +{ + u8 status, sense_key, asc, ascq; + int res = SNTI_TRANSLATION_SUCCESS; + + /* For non-nvme (Linux) errors, simply return the error code */ + if (nvme_sc < 0) + return nvme_sc; + + /* Mask DNR, More, and reserved fields */ + nvme_sc &= 0x7FF; + + switch (nvme_sc) { + /* Generic Command Status */ + case NVME_SC_SUCCESS: + status = SAM_STAT_GOOD; + sense_key = NO_SENSE; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_OPCODE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_COMMAND; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_FIELD: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_DATA_XFER_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_POWER_LOSS: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_WARNING; + ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case NVME_SC_INTERNAL: + status = SAM_STAT_CHECK_CONDITION; + sense_key = HARDWARE_ERROR; + asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_REQ: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_QUEUE: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_FAIL: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_MISSING: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_NS: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + case NVME_SC_LBA_RANGE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_BLOCK; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_CAP_EXCEEDED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_NS_NOT_READY: + status = SAM_STAT_CHECK_CONDITION; + sense_key = NOT_READY; + asc = SCSI_ASC_LUN_NOT_READY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Command Specific Status */ + case NVME_SC_INVALID_FORMAT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_FORMAT_COMMAND_FAILED; + ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case NVME_SC_BAD_ATTRIBUTES: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Media Errors */ + case NVME_SC_WRITE_FAULT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_READ_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_UNRECOVERED_READ_ERROR; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_GUARD_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; + break; + case NVME_SC_APPTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; + break; + case NVME_SC_REFTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; + break; + case NVME_SC_COMPARE_FAILED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MISCOMPARE; + asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ACCESS_DENIED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + + /* Unspecified/Default */ + case NVME_SC_CMDID_CONFLICT: + case NVME_SC_CMD_SEQ_ERROR: + case NVME_SC_CQ_INVALID: + case NVME_SC_QID_INVALID: + case NVME_SC_QUEUE_SIZE: + case NVME_SC_ABORT_LIMIT: + case NVME_SC_ABORT_MISSING: + case NVME_SC_ASYNC_LIMIT: + case NVME_SC_FIRMWARE_SLOT: + case NVME_SC_FIRMWARE_IMAGE: + case NVME_SC_INVALID_VECTOR: + case NVME_SC_INVALID_LOG_PAGE: + default: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + + res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); + + return res; +} + +/* INQUIRY Helper Functions */ + +static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + int xfer_len; + u8 resp_data_format = 0x02; + u8 protect; + u8 cmdque = 0x01 << 1; + u8 fw_offset = sizeof(dev->firmware_rev); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify - use DPS value for PROTECT field */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + /* + * If nvme_sc was -ve, res will be -ve here. + * If nvme_sc was +ve, the status would bace been translated, and res + * can only be 0 or -ve. + * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc + * - If -ve, return because its a Linux error. + */ + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[2] = VERSION_SPC_4; + inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ + inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; + inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ + inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ + strncpy(&inq_response[8], "NVMe ", 8); + strncpy(&inq_response[16], dev->model, 16); + + while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + fw_offset--; + fw_offset -= 4; + strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ + inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ + inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; + inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; + inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; + inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; + inq_response[9] = INQ_BDEV_LIMITS_PAGE; + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + return res; +} + +static int nvme_trans_unit_serial_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ + inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ + strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + int xfer_len; + __be32 tmp_id = cpu_to_be32(ns->ns_id); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ + if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { + struct nvme_id_ns *id_ns = mem; + void *eui = id_ns->eui64; + int len = sizeof(id_ns->eui64); + + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + + if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { + if (bitmap_empty(eui, len * 8)) { + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); + } + } + if (bitmap_empty(eui, len * 8)) + goto scsi_string; + + inq_response[3] = 4 + len; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + } else { + scsi_string: + if (alloc_len < 72) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_free; + } + inq_response[3] = 0x48; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); + memcpy(&inq_response[12], dev->model, sizeof(dev->model)); + sprintf(&inq_response[52], "%04x", tmp_id); + memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + } + xfer_len = alloc_len; + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + struct nvme_id_ns *id_ns; + int xfer_len; + u8 microcode = 0x80; + u8 spt; + u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; + u8 grd_chk, app_chk, ref_chk, protect; + u8 uask_sup = 0x20; + u8 v_sup; + u8 luiclr = 0x01; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + spt = spt_lut[(id_ns->dpc) & 0x07] << 3; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + grd_chk = protect << 2; + app_chk = protect << 1; + ref_chk = protect; + + /* nvme controller identify */ + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ctrl = mem; + v_sup = id_ctrl->vwc; + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; + inq_response[5] = uask_sup; + inq_response[6] = v_sup; + inq_response[7] = luiclr; + inq_response[8] = 0; + inq_response[9] = 0; + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) +{ + __be32 max_sectors = cpu_to_be32( + nvme_block_nr(ns, queue_max_hw_sectors(ns->queue))); + __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors); + __be32 discard_desc_count = cpu_to_be32(0x100); + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = VPD_BLOCK_LIMITS; + inq_response[3] = 0x3c; /* Page Length */ + memcpy(&inq_response[8], &max_sectors, sizeof(u32)); + memcpy(&inq_response[20], &max_discard, sizeof(u32)); + + if (max_discard) + memcpy(&inq_response[24], &discard_desc_count, sizeof(u32)); + + return nvme_trans_copy_to_user(hdr, inq_response, 0x3c); +} + +static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ + inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ + inq_response[6] = 0x00; /* Form Factor */ + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + kfree(inq_response); + out_mem: + return res; +} + +/* LOG SENSE Helper Functions */ + +static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + + log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; + log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; + + xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, + struct sg_io_hdr *hdr, int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u8 temp_c; + u16 temp_k; + + log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c = temp_k - KELVIN_TEMP_FACTOR; + } + + log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; + /* Informational Exceptions Log Parameter 1 Start */ + /* Parameter Code=0x0000 bytes 4,5 */ + log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ + log_response[7] = 0x04; /* PARAMETER LENGTH */ + /* Add sense Code and qualifier = 0x00 each */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[10] = temp_c; + + xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u32 feature_resp; + u8 temp_c_cur, temp_c_thresh; + u16 temp_k; + + log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c_cur = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; + } + + /* Get Features for Temp Threshold */ + res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + &feature_resp); + if (res != NVME_SC_SUCCESS) + temp_c_thresh = LOG_TEMP_UNKNOWN; + else + temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; + + log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_TEMP_PAGE_LENGTH; + /* Temperature Log Parameter 1 (Temperature) Start */ + /* Parameter Code = 0x0000 */ + log_response[6] = 0x01; /* Format and Linking = 01b */ + log_response[7] = 0x02; /* Parameter Length */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[9] = temp_c_cur; + /* Temperature Log Parameter 2 (Reference Temperature) Start */ + log_response[11] = 0x01; /* Parameter Code = 0x0001 */ + log_response[12] = 0x01; /* Format and Linking = 01b */ + log_response[13] = 0x02; /* Parameter Length */ + /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ + log_response[15] = temp_c_thresh; + + xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +/* MODE SENSE Helper Functions */ + +static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, + u16 mode_data_length, u16 blk_desc_len) +{ + /* Quick check to make sure I don't stomp on my own memory... */ + if ((cdb10 && len < 8) || (!cdb10 && len < 4)) + return SNTI_INTERNAL_ERROR; + + if (cdb10) { + resp[0] = (mode_data_length & 0xFF00) >> 8; + resp[1] = (mode_data_length & 0x00FF); + /* resp[2] and [3] are zero */ + resp[4] = llbaa; + resp[5] = RESERVED_FIELD; + resp[6] = (blk_desc_len & 0xFF00) >> 8; + resp[7] = (blk_desc_len & 0x00FF); + } else { + resp[0] = (mode_data_length & 0x00FF); + /* resp[1] and [2] are zero */ + resp[3] = (blk_desc_len & 0x00FF); + } + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len, u8 llbaa) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + u32 lba_length; + + if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + + if (llbaa == 0) { + __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); + /* Byte 4 is reserved */ + __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); + + memcpy(resp, &tmp_cap, sizeof(u32)); + memcpy(&resp[4], &tmp_len, sizeof(u32)); + } else { + __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); + __be32 tmp_len = cpu_to_be32(lba_length); + + memcpy(resp, &tmp_cap, sizeof(u64)); + /* Bytes 8, 9, 10, 11 are reserved */ + memcpy(&resp[12], &tmp_len, sizeof(u32)); + } + + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_fill_control_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_CONTROL_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_CONTROL; + resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; + resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, + * D_SENSE=1, GLTSD=1, RLEC=0 */ + resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ + /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ + resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ + /* resp[6] and [7] are obsolete, thus zero */ + resp[8] = 0xFF; /* Busy timeout period = 0xffff */ + resp[9] = 0xFF; + /* Bytes 10,11: Extended selftest completion time = 0x0000 */ + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_caching_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + u32 feature_resp; + u8 vwc; + + if (len < MODE_PAGE_CACHING_LEN) + return SNTI_INTERNAL_ERROR; + + nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + &feature_resp); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + vwc = feature_resp & 0x00000001; + + resp[0] = MODE_PAGE_CACHING; + resp[1] = MODE_PAGE_CACHING_LEN_FIELD; + resp[2] = vwc << 2; + + out: + return res; +} + +static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + if (len < MODE_PAGE_POW_CND_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_POWER_CONDITION; + resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + if (len < MODE_PAGE_INF_EXC_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_INFO_EXCEP; + resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; + resp[2] = 0x88; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 mode_pages_offset_1 = 0; + u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; + + mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; + mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; + mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; + + res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], + MODE_PAGE_CACHING_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], + MODE_PAGE_CONTROL_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], + MODE_PAGE_POW_CND_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], + MODE_PAGE_INF_EXC_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) +{ + if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { + /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ + return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; + } else { + return 0; + } +} + +static int nvme_trans_mode_page_create(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd, + u16 alloc_len, u8 cdb10, + int (*mode_page_fill_func) + (struct nvme_ns *, + struct sg_io_hdr *hdr, u8 *, int), + u16 mode_pages_tot_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *response; + u8 dbd, llbaa; + u16 resp_size; + int mph_size; + u16 mode_pages_offset_1; + u16 blk_desc_len, blk_desc_offset, mode_data_length; + + dbd = GET_MODE_SENSE_DBD(cmd); + llbaa = GET_MODE_SENSE_LLBAA(cmd); + mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10); + blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); + + resp_size = mph_size + blk_desc_len + mode_pages_tot_len; + /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ + mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; + + blk_desc_offset = mph_size; + mode_pages_offset_1 = blk_desc_offset + blk_desc_len; + + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, + llbaa, mode_data_length, blk_desc_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + if (blk_desc_len > 0) { + res = nvme_trans_fill_blk_desc(ns, hdr, + &response[blk_desc_offset], + blk_desc_len, llbaa); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + } + res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], + mode_pages_tot_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + out_free: + kfree(response); + out_mem: + return res; +} + +/* Read Capacity Helper Functions */ + +static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, + u8 cdb16) +{ + u8 flbas; + u32 lba_length; + u64 rlba; + u8 prot_en; + u8 p_type_lut[4] = {0, 0, 1, 2}; + __be64 tmp_rlba; + __be32 tmp_rlba_32; + __be32 tmp_len; + + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + rlba = le64_to_cpup(&id_ns->nsze) - 1; + (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); + + if (!cdb16) { + if (rlba > 0xFFFFFFFF) + rlba = 0xFFFFFFFF; + tmp_rlba_32 = cpu_to_be32(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba_32, sizeof(u32)); + memcpy(&response[4], &tmp_len, sizeof(u32)); + } else { + tmp_rlba = cpu_to_be64(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba, sizeof(u64)); + memcpy(&response[8], &tmp_len, sizeof(u32)); + response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; + /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ + /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ + /* Bytes 16-31 - Reserved */ + } +} + +/* Start Stop Unit Helper Functions */ + +static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 pc, u8 pcmod, u8 start) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + int lowest_pow_st; /* max npss = lowest power consumption */ + unsigned ps_desired = 0; + + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); + + switch (pc) { + case NVME_POWER_STATE_START_VALID: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0 && start == 0x1) + ps_desired = POWER_STATE_0; + if (pcmod == 0 && start == 0x0) + ps_desired = lowest_pow_st; + break; + case NVME_POWER_STATE_ACTIVE: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0) + ps_desired = POWER_STATE_0; + break; + case NVME_POWER_STATE_IDLE: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ + if (pcmod == 0x0) + ps_desired = POWER_STATE_1; + else if (pcmod == 0x1) + ps_desired = POWER_STATE_2; + else if (pcmod == 0x2) + ps_desired = POWER_STATE_3; + break; + case NVME_POWER_STATE_STANDBY: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ + if (pcmod == 0x0) + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); + else if (pcmod == 0x1) + ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_LU_CONTROL: + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) + res = nvme_sc; + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +/* Write Buffer Helper Functions */ +/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */ + +static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 opcode, u32 tot_len, u32 offset, + u8 buffer_id) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + struct nvme_command c; + struct nvme_iod *iod = NULL; + unsigned length; + + memset(&c, 0, sizeof(c)); + c.common.opcode = opcode; + if (opcode == nvme_admin_download_fw) { + if (hdr->iovec_count > 0) { + /* Assuming SGL is not allowed for this command */ + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + iod = nvme_map_user_pages(dev, DMA_TO_DEVICE, + (unsigned long)hdr->dxferp, tot_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL); + if (length != tot_len) { + res = -ENOMEM; + goto out_unmap; + } + + c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.dlfw.prp2 = cpu_to_le64(iod->first_dma); + c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); + c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); + } else if (opcode == nvme_admin_activate_fw) { + u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV; + c.common.cdw10[0] = cpu_to_le32(cdw10); + } + + nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_unmap; + if (nvme_sc) + res = nvme_sc; + + out_unmap: + if (opcode == nvme_admin_download_fw) { + nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod); + nvme_free_iod(dev, iod); + } + out: + return res; +} + +/* Mode Select Helper Functions */ + +static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, + u16 *bd_len, u8 *llbaa) +{ + if (cdb10) { + /* 10 Byte CDB */ + *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + + parm_list[MODE_SELECT_10_BD_OFFSET + 1]; + *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & + MODE_SELECT_10_LLBAA_MASK; + } else { + /* 6 Byte CDB */ + *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; + } +} + +static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, + u16 idx, u16 bd_len, u8 llbaa) +{ + u16 bd_num; + + bd_num = bd_len / ((llbaa == 0) ? + SHORT_DESC_BLOCK : LONG_DESC_BLOCK); + /* Store block descriptor info if a FORMAT UNIT comes later */ + /* TODO Saving 1st BD info; what to do if multiple BD received? */ + if (llbaa == 0) { + /* Standard Block Descriptor - spc4r34 7.5.5.1 */ + ns->mode_select_num_blocks = + (parm_list[idx + 1] << 16) + + (parm_list[idx + 2] << 8) + + (parm_list[idx + 3]); + + ns->mode_select_block_len = + (parm_list[idx + 5] << 16) + + (parm_list[idx + 6] << 8) + + (parm_list[idx + 7]); + } else { + /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ + ns->mode_select_num_blocks = + (((u64)parm_list[idx + 0]) << 56) + + (((u64)parm_list[idx + 1]) << 48) + + (((u64)parm_list[idx + 2]) << 40) + + (((u64)parm_list[idx + 3]) << 32) + + (((u64)parm_list[idx + 4]) << 24) + + (((u64)parm_list[idx + 5]) << 16) + + (((u64)parm_list[idx + 6]) << 8) + + ((u64)parm_list[idx + 7]); + + ns->mode_select_block_len = + (parm_list[idx + 12] << 24) + + (parm_list[idx + 13] << 16) + + (parm_list[idx + 14] << 8) + + (parm_list[idx + 15]); + } +} + +static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *mode_page, u8 page_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + unsigned dword11; + + switch (page_code) { + case MODE_PAGE_CACHING: + dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); + nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, + 0, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + break; + if (nvme_sc) { + res = nvme_sc; + break; + } + break; + case MODE_PAGE_CONTROL: + break; + case MODE_PAGE_POWER_CONDITION: + /* Verify the OS is not trying to set timers */ + if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + + return res; +} + +static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd, u16 parm_list_len, u8 pf, + u8 sp, u8 cdb10) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u16 bd_len; + u8 llbaa = 0; + u16 index, saved_index; + u8 page_code; + u16 mp_size; + + /* Get parm list from data-in/out buffer */ + parm_list = kmalloc(parm_list_len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + + res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); + index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); + + if (bd_len != 0) { + /* Block Descriptors present, parse */ + nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); + index += bd_len; + } + saved_index = index; + + /* Multiple mode pages may be present; iterate through all */ + /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + if ((page_code != MODE_PAGE_CACHING) && + (page_code != MODE_PAGE_CONTROL) && + (page_code != MODE_PAGE_POWER_CONDITION)) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + index += mp_size; + } while (index < parm_list_len); + + /* In 2nd Iteration, do the NVME Commands */ + index = saved_index; + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], + page_code); + if (res != SNTI_TRANSLATION_SUCCESS) + break; + index += mp_size; + } while (index < parm_list_len); + + out_mem: + kfree(parm_list); + out: + return res; +} + +/* Format Unit Helper Functions */ + +static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, + struct sg_io_hdr *hdr) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + + /* + * SCSI Expects a MODE SELECT would have been issued prior to + * a FORMAT UNIT, and the block size and number would be used + * from the block descriptor in it. If a MODE SELECT had not + * been issued, FORMAT shall use the current values for both. + */ + + if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + if (ns->mode_select_num_blocks == 0) + ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); + if (ns->mode_select_block_len == 0) { + flbas = (id_ns->flbas) & 0x0F; + ns->mode_select_block_len = + (1 << (id_ns->lbaf[flbas].ds)); + } + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + mem, dma_addr); + } + out: + return res; +} + +static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, + u8 format_prot_info, u8 *nvme_pf_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u8 pf_usage, pf_code; + + parm_list = kmalloc(len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + res = nvme_trans_copy_from_user(hdr, parm_list, len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & + FORMAT_UNIT_IMMED_MASK) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + + if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && + (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & + FORMAT_UNIT_PROT_FIELD_USAGE_MASK; + pf_code = (pf_usage << 2) | format_prot_info; + switch (pf_code) { + case 0: + *nvme_pf_code = 0; + break; + case 2: + *nvme_pf_code = 1; + break; + case 3: + *nvme_pf_code = 2; + break; + case 7: + *nvme_pf_code = 3; + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out_mem: + kfree(parm_list); + out: + return res; +} + +static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 prot_info) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 i; + u8 flbas, nlbaf; + u8 selected_lbaf = 0xFF; + u32 cdw10 = 0; + struct nvme_command c; + + /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + nlbaf = id_ns->nlbaf; + + for (i = 0; i < nlbaf; i++) { + if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { + selected_lbaf = i; + break; + } + } + if (selected_lbaf > 0x0F) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + cdw10 |= prot_info << 5; + cdw10 |= selected_lbaf & 0x0F; + memset(&c, 0, sizeof(c)); + c.format.opcode = nvme_admin_format_nvm; + c.format.nsid = cpu_to_le32(ns->ns_id); + c.format.cdw10 = cpu_to_le32(cdw10); + + nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) + res = nvme_sc; + + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +/* Read/Write Helper Functions */ + +static inline void nvme_trans_get_io_cdb6(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = 0; + cdb_info->prot_info = 0; + cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) & + IO_6_CDB_LBA_MASK; + cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET); + + /* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */ + if (cdb_info->xfer_len == 0) + cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN; +} + +static inline void nvme_trans_get_io_cdb10(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) & + IO_CDB_FUA_MASK; + cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) & + IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; + cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET); + cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET); +} + +static inline void nvme_trans_get_io_cdb12(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) & + IO_CDB_FUA_MASK; + cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) & + IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; + cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET); + cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET); +} + +static inline void nvme_trans_get_io_cdb16(u8 *cmd, + struct nvme_trans_io_cdb *cdb_info) +{ + cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) & + IO_CDB_FUA_MASK; + cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) & + IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; + cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET); + cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET); +} + +static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, + struct nvme_trans_io_cdb *cdb_info, + u32 max_blocks) +{ + /* If using iovecs, send one nvme command per vector */ + if (hdr->iovec_count > 0) + return hdr->iovec_count; + else if (cdb_info->xfer_len > max_blocks) + return ((cdb_info->xfer_len - 1) / max_blocks) + 1; + else + return 1; +} + +static u16 nvme_trans_io_get_control(struct nvme_ns *ns, + struct nvme_trans_io_cdb *cdb_info) +{ + u16 control = 0; + + /* When Protection information support is added, implement here */ + + if (cdb_info->fua > 0) + control |= NVME_RW_FUA; + + return control; +} + +static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, + struct nvme_trans_io_cdb *cdb_info, u8 is_write) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + u32 num_cmds; + struct nvme_iod *iod; + u64 unit_len; + u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ + u32 retcode; + u32 i = 0; + u64 nvme_offset = 0; + void __user *next_mapping_addr; + struct nvme_command c; + u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read); + u16 control; + u32 max_blocks = queue_max_hw_sectors(ns->queue); + + num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); + + /* + * This loop handles two cases. + * First, when an SGL is used in the form of an iovec list: + * - Use iov_base as the next mapping address for the nvme command_id + * - Use iov_len as the data transfer length for the command. + * Second, when we have a single buffer + * - If larger than max_blocks, split into chunks, offset + * each nvme command accordingly. + */ + for (i = 0; i < num_cmds; i++) { + memset(&c, 0, sizeof(c)); + if (hdr->iovec_count > 0) { + struct sg_iovec sgl; + + retcode = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (retcode) + return -EFAULT; + unit_len = sgl.iov_len; + unit_num_blocks = unit_len >> ns->lba_shift; + next_mapping_addr = sgl.iov_base; + } else { + unit_num_blocks = min((u64)max_blocks, + (cdb_info->xfer_len - nvme_offset)); + unit_len = unit_num_blocks << ns->lba_shift; + next_mapping_addr = hdr->dxferp + + ((1 << ns->lba_shift) * nvme_offset); + } + + c.rw.opcode = opcode; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); + c.rw.length = cpu_to_le16(unit_num_blocks - 1); + control = nvme_trans_io_get_control(ns, cdb_info); + c.rw.control = cpu_to_le16(control); + + iod = nvme_map_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + (unsigned long)next_mapping_addr, unit_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL); + if (retcode != unit_len) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = -ENOMEM; + goto out; + } + c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + c.rw.prp2 = cpu_to_le64(iod->first_dma); + + nvme_offset += unit_num_blocks; + + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); + if (nvme_sc != NVME_SC_SUCCESS) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = nvme_trans_status_code(hdr, nvme_sc); + goto out; + } + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + } + res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + + out: + return res; +} + + +/* SCSI Command Translation Functions */ + +static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_trans_io_cdb cdb_info; + u8 opcode = cmd[0]; + u64 xfer_bytes; + u64 sum_iov_len = 0; + struct sg_iovec sgl; + int i; + size_t not_copied; + + /* Extract Fields from CDB */ + switch (opcode) { + case WRITE_6: + case READ_6: + nvme_trans_get_io_cdb6(cmd, &cdb_info); + break; + case WRITE_10: + case READ_10: + nvme_trans_get_io_cdb10(cmd, &cdb_info); + break; + case WRITE_12: + case READ_12: + nvme_trans_get_io_cdb12(cmd, &cdb_info); + break; + case WRITE_16: + case READ_16: + nvme_trans_get_io_cdb16(cmd, &cdb_info); + break; + default: + /* Will never really reach here */ + res = SNTI_INTERNAL_ERROR; + goto out; + } + + /* Calculate total length of transfer (in bytes) */ + if (hdr->iovec_count > 0) { + for (i = 0; i < hdr->iovec_count; i++) { + not_copied = copy_from_user(&sgl, hdr->dxferp + + i * sizeof(struct sg_iovec), + sizeof(struct sg_iovec)); + if (not_copied) + return -EFAULT; + sum_iov_len += sgl.iov_len; + /* IO vector sizes should be multiples of block size */ + if (sgl.iov_len % (1 << ns->lba_shift) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + } + } else { + sum_iov_len = hdr->dxfer_len; + } + + /* As Per sg ioctl howto, if the lengths differ, use the lower one */ + xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); + + /* If block count and actual data buffer size dont match, error out */ + if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { + res = -EINVAL; + goto out; + } + + /* Check for 0 length transfer - it is not illegal */ + if (cdb_info.xfer_len == 0) + goto out; + + /* Send NVMe IO Command(s) */ + res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 evpd; + u8 page_code; + int alloc_len; + u8 *inq_response; + + evpd = GET_INQ_EVPD_BIT(cmd); + page_code = GET_INQ_PAGE_CODE(cmd); + alloc_len = GET_INQ_ALLOC_LENGTH(cmd); + + inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH), + GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + if (evpd == 0) { + if (page_code == INQ_STANDARD_INQUIRY_PAGE) { + res = nvme_trans_standard_inquiry_page(ns, hdr, + inq_response, alloc_len); + } else { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } else { + switch (page_code) { + case VPD_SUPPORTED_PAGES: + res = nvme_trans_supported_vpd_pages(ns, hdr, + inq_response, alloc_len); + break; + case VPD_SERIAL_NUMBER: + res = nvme_trans_unit_serial_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_DEVICE_IDENTIFIERS: + res = nvme_trans_device_id_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_EXTENDED_INQUIRY: + res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); + break; + case VPD_BLOCK_LIMITS: + res = nvme_trans_bdev_limits_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_BLOCK_DEV_CHARACTERISTICS: + res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + } + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 alloc_len; + u8 sp; + u8 pc; + u8 page_code; + + sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET); + if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET); + page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK; + pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; + if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET); + switch (page_code) { + case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: + res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); + break; + case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: + res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); + break; + case LOG_PAGE_TEMPERATURE_PAGE: + res = nvme_trans_log_temperature(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 cdb10 = 0; + u16 parm_list_len; + u8 page_format; + u8 save_pages; + + page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET); + page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK; + + save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET); + save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK; + + if (GET_OPCODE(cmd) == MODE_SELECT) { + parm_list_len = GET_U8_FROM_CDB(cmd, + MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET); + } else { + parm_list_len = GET_U16_FROM_CDB(cmd, + MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET); + cdb10 = 1; + } + + if (parm_list_len != 0) { + /* + * According to SPC-4 r24, a paramter list length field of 0 + * shall not be considered an error + */ + res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, + page_format, save_pages, cdb10); + } + + return res; +} + +static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 alloc_len; + u8 cdb10 = 0; + u8 page_code; + u8 pc; + + if (GET_OPCODE(cmd) == MODE_SENSE) { + alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET); + } else { + alloc_len = GET_U16_FROM_CDB(cmd, + MODE_SENSE10_ALLOC_LEN_OFFSET); + cdb10 = 1; + } + + pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) & + MODE_SENSE_PAGE_CONTROL_MASK; + if (pc != MODE_SENSE_PC_CURRENT_VALUES) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) & + MODE_SENSE_PAGE_CODE_MASK; + switch (page_code) { + case MODE_PAGE_CACHING: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_caching_page, + MODE_PAGE_CACHING_LEN); + break; + case MODE_PAGE_CONTROL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_control_page, + MODE_PAGE_CONTROL_LEN); + break; + case MODE_PAGE_POWER_CONDITION: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_pow_cnd_page, + MODE_PAGE_POW_CND_LEN); + break; + case MODE_PAGE_INFO_EXCEP: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_inf_exc_page, + MODE_PAGE_INF_EXC_LEN); + break; + case MODE_PAGE_RETURN_ALL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_all_pages, + MODE_PAGE_ALL_LEN); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len = READ_CAP_10_RESP_SIZE; + u32 resp_size = READ_CAP_10_RESP_SIZE; + u32 xfer_len; + u8 cdb16; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 *response; + + cdb16 = IS_READ_CAP_16(cmd); + if (cdb16) { + alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd); + resp_size = READ_CAP_16_RESP_SIZE; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_dma; + } + nvme_trans_fill_read_cap(response, id_ns, cdb16); + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len, xfer_len, resp_size; + u8 select_report; + u8 *response; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + u32 ll_length, lun_id; + u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; + __be32 tmp_len; + + alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); + select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET); + + if ((select_report != ALL_LUNS_RETURNED) && + (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) && + (select_report != RESTRICTED_LUNS_RETURNED)) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } else { + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; + resp_size = ll_length + LUN_DATA_HEADER_SIZE; + + if (alloc_len < resp_size) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_dma; + } + + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* The first LUN ID will always be 0 per the SAM spec */ + for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { + /* + * Set the LUN Id and then increment to the next LUN + * location in the parameter data. + */ + __be64 tmp_id = cpu_to_be64(lun_id); + memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); + lun_id_offset += LUN_ENTRY_SIZE; + } + tmp_len = cpu_to_be32(ll_length); + memcpy(response, &tmp_len, sizeof(u32)); + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 alloc_len, xfer_len, resp_size; + u8 desc_format; + u8 *response; + + alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd); + desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET); + desc_format &= REQUEST_SENSE_DESC_MASK; + + resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : + (FIXED_FMT_SENSE_DATA_SIZE)); + response = kzalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out; + } + + if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { + /* Descriptor Format Sense Data */ + response[0] = DESC_FORMAT_SENSE_DATA; + response[1] = NO_SENSE; + /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ + response[2] = SCSI_ASC_NO_SENSE; + response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ + } else { + /* Fixed Format Sense Data */ + response[0] = FIXED_SENSE_DATA; + /* Byte 1 = Obsolete */ + response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ + /* Bytes 3-6 - Information - set to zero */ + response[7] = FIXED_SENSE_DATA_ADD_LENGTH; + /* Bytes 8-11 - Cmd Specific Information - set to zero */ + response[12] = SCSI_ASC_NO_SENSE; + response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* Byte 14 = Field Replaceable Unit Code = 0 */ + /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out: + return res; +} + +static int nvme_trans_security_protocol(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} + +static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_command c; + u8 immed, pcmod, pc, no_flush, start; + + immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); + pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET); + pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET); + no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET); + start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET); + + immed &= START_STOP_UNIT_CDB_IMMED_MASK; + pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK; + pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT; + no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK; + start &= START_STOP_UNIT_CDB_START_MASK; + + if (immed != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } else { + if (no_flush == 0) { + /* Issue NVME FLUSH command prior to START STOP UNIT */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_cmd_flush; + c.common.nsid = cpu_to_le32(ns->ns_id); + + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + } + /* Setup the expected power state transition */ + res = nvme_trans_power_state(ns, hdr, pc, pcmod, start); + } + + out: + return res; +} + +static int nvme_trans_synchronize_cache(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_cmd_flush; + c.common.nsid = cpu_to_le32(ns->ns_id); + + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); + + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) + res = nvme_sc; + + out: + return res; +} + +static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 parm_hdr_len = 0; + u8 nvme_pf_code = 0; + u8 format_prot_info, long_list, format_data; + + format_prot_info = GET_U8_FROM_CDB(cmd, + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET); + long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET); + format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET); + + format_prot_info = (format_prot_info & + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >> + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT; + long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK; + format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK; + + if (format_data != 0) { + if (format_prot_info != 0) { + if (long_list == 0) + parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; + else + parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; + } + } else if (format_data == 0 && format_prot_info != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + /* Get parm header from data-in/out buffer */ + /* + * According to the translation spec, the only fields in the parameter + * list we are concerned with are in the header. So allocate only that. + */ + if (parm_hdr_len > 0) { + res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, + format_prot_info, &nvme_pf_code); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + } + + /* Attempt to activate any previously downloaded firmware image */ + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0); + + /* Determine Block size and count and send format command */ + res = nvme_trans_fmt_set_blk_size_count(ns, hdr); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); + + out: + return res; +} + +static int nvme_trans_test_unit_ready(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_dev *dev = ns->dev; + + if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + NOT_READY, SCSI_ASC_LUN_NOT_READY, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + else + res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); + + return res; +} + +static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u32 buffer_offset, parm_list_length; + u8 buffer_id, mode; + + parm_list_length = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET); + if (parm_list_length % BYTES_TO_DWORDS != 0) { + /* NVMe expects Firmware file to be a whole number of DWORDS */ + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET); + if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) & + WRITE_BUFFER_CDB_MODE_MASK; + buffer_offset = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET); + + switch (mode) { + case DOWNLOAD_SAVE_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case DOWNLOAD_SAVE_DEFER_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case ACTIVATE_DEFERRED_MICROCODE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +struct scsi_unmap_blk_desc { + __be64 slba; + __be32 nlb; + u32 resv; +}; + +struct scsi_unmap_parm_list { + __be16 unmap_data_len; + __be16 unmap_blk_desc_data_len; + u32 resv; + struct scsi_unmap_blk_desc desc[0]; +}; + +static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + struct nvme_dev *dev = ns->dev; + struct scsi_unmap_parm_list *plist; + struct nvme_dsm_range *range; + struct nvme_command c; + int i, nvme_sc, res = -ENOMEM; + u16 ndesc, list_len; + dma_addr_t dma_addr; + + list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET); + if (!list_len) + return -EINVAL; + + plist = kmalloc(list_len, GFP_KERNEL); + if (!plist) + return -ENOMEM; + + res = nvme_trans_copy_from_user(hdr, plist, list_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; + if (!ndesc || ndesc > 256) { + res = -EINVAL; + goto out; + } + + range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), + &dma_addr, GFP_KERNEL); + if (!range) + goto out; + + for (i = 0; i < ndesc; i++) { + range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); + range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); + range[i].cattr = 0; + } + + memset(&c, 0, sizeof(c)); + c.dsm.opcode = nvme_cmd_dsm; + c.dsm.nsid = cpu_to_le32(ns->ns_id); + c.dsm.prp1 = cpu_to_le64(dma_addr); + c.dsm.nr = cpu_to_le32(ndesc - 1); + c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + + dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), + range, dma_addr); + out: + kfree(plist); + return res; +} + +static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) +{ + u8 cmd[BLK_MAX_CDB]; + int retcode; + unsigned int opcode; + + if (hdr->cmdp == NULL) + return -EMSGSIZE; + if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) + return -EFAULT; + + /* + * Prime the hdr with good status for scsi commands that don't require + * an nvme command for translation. + */ + retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + if (retcode) + return retcode; + + opcode = cmd[0]; + + switch (opcode) { + case READ_6: + case READ_10: + case READ_12: + case READ_16: + retcode = nvme_trans_io(ns, hdr, 0, cmd); + break; + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + retcode = nvme_trans_io(ns, hdr, 1, cmd); + break; + case INQUIRY: + retcode = nvme_trans_inquiry(ns, hdr, cmd); + break; + case LOG_SENSE: + retcode = nvme_trans_log_sense(ns, hdr, cmd); + break; + case MODE_SELECT: + case MODE_SELECT_10: + retcode = nvme_trans_mode_select(ns, hdr, cmd); + break; + case MODE_SENSE: + case MODE_SENSE_10: + retcode = nvme_trans_mode_sense(ns, hdr, cmd); + break; + case READ_CAPACITY: + retcode = nvme_trans_read_capacity(ns, hdr, cmd); + break; + case SERVICE_ACTION_IN_16: + if (IS_READ_CAP_16(cmd)) + retcode = nvme_trans_read_capacity(ns, hdr, cmd); + else + goto out; + break; + case REPORT_LUNS: + retcode = nvme_trans_report_luns(ns, hdr, cmd); + break; + case REQUEST_SENSE: + retcode = nvme_trans_request_sense(ns, hdr, cmd); + break; + case SECURITY_PROTOCOL_IN: + case SECURITY_PROTOCOL_OUT: + retcode = nvme_trans_security_protocol(ns, hdr, cmd); + break; + case START_STOP: + retcode = nvme_trans_start_stop(ns, hdr, cmd); + break; + case SYNCHRONIZE_CACHE: + retcode = nvme_trans_synchronize_cache(ns, hdr, cmd); + break; + case FORMAT_UNIT: + retcode = nvme_trans_format_unit(ns, hdr, cmd); + break; + case TEST_UNIT_READY: + retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); + break; + case WRITE_BUFFER: + retcode = nvme_trans_write_buffer(ns, hdr, cmd); + break; + case UNMAP: + retcode = nvme_trans_unmap(ns, hdr, cmd); + break; + default: + out: + retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + return retcode; +} + +int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) +{ + struct sg_io_hdr hdr; + int retcode; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) + return -EFAULT; + if (hdr.interface_id != 'S') + return -EINVAL; + if (hdr.cmd_len > BLK_MAX_CDB) + return -EINVAL; + + retcode = nvme_scsi_translate(ns, &hdr); + if (retcode < 0) + return retcode; + if (retcode > 0) + retcode = SNTI_TRANSLATION_SUCCESS; + if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) + return -EFAULT; + + return retcode; +} + +int nvme_sg_get_version_num(int __user *ip) +{ + return put_user(sg_version_num, ip); +} diff --git a/kernel/drivers/block/osdblk.c b/kernel/drivers/block/osdblk.c new file mode 100644 index 000000000..e22942596 --- /dev/null +++ b/kernel/drivers/block/osdblk.c @@ -0,0 +1,699 @@ + +/* + osdblk.c -- Export a single SCSI OSD object as a Linux block device + + + Copyright 2009 Red Hat, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + + Instructions for use + -------------------- + + 1) Map a Linux block device to an existing OSD object. + + In this example, we will use partition id 1234, object id 5678, + OSD device /dev/osd1. + + $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add + + + 2) List all active blkdev<->object mappings. + + In this example, we have performed step #1 twice, creating two blkdevs, + mapped to two separate OSD objects. + + $ cat /sys/class/osdblk/list + 0 174 1234 5678 /dev/osd1 + 1 179 1994 897123 /dev/osd0 + + The columns, in order, are: + - blkdev unique id + - blkdev assigned major + - OSD object partition id + - OSD object id + - OSD device + + + 3) Remove an active blkdev<->object mapping. + + In this example, we remove the mapping with blkdev unique id 1. + + $ echo 1 > /sys/class/osdblk/remove + + + NOTE: The actual creation and deletion of OSD objects is outside the scope + of this driver. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "osdblk" +#define PFX DRV_NAME ": " + +/* #define _OSDBLK_DEBUG */ +#ifdef _OSDBLK_DEBUG +#define OSDBLK_DEBUG(fmt, a...) \ + printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define OSDBLK_DEBUG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +MODULE_AUTHOR("Jeff Garzik "); +MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko"); +MODULE_LICENSE("GPL"); + +struct osdblk_device; + +enum { + OSDBLK_MINORS_PER_MAJOR = 256, /* max minors per blkdev */ + OSDBLK_MAX_REQ = 32, /* max parallel requests */ + OSDBLK_OP_TIMEOUT = 4 * 60, /* sync OSD req timeout */ +}; + +struct osdblk_request { + struct request *rq; /* blk layer request */ + struct bio *bio; /* cloned bio */ + struct osdblk_device *osdev; /* associated blkdev */ +}; + +struct osdblk_device { + int id; /* blkdev unique id */ + + int major; /* blkdev assigned major */ + struct gendisk *disk; /* blkdev's gendisk and rq */ + struct request_queue *q; + + struct osd_dev *osd; /* associated OSD */ + + char name[32]; /* blkdev name, e.g. osdblk34 */ + + spinlock_t lock; /* queue lock */ + + struct osd_obj_id obj; /* OSD partition, obj id */ + uint8_t obj_cred[OSD_CAP_LEN]; /* OSD cred */ + + struct osdblk_request req[OSDBLK_MAX_REQ]; /* request table */ + + struct list_head node; + + char osd_path[0]; /* OSD device path */ +}; + +static struct class *class_osdblk; /* /sys/class/osdblk */ +static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ +static LIST_HEAD(osdblkdev_list); + +static const struct block_device_operations osdblk_bd_ops = { + .owner = THIS_MODULE, +}; + +static const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); + +static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +/* copied from exofs; move to libosd? */ +/* + * Perform a synchronous OSD operation. copied from exofs; move to libosd? + */ +static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential) +{ + int ret; + + or->timeout = timeout; + ret = osd_finalize_request(or, 0, credential, NULL); + if (ret) + return ret; + + ret = osd_execute_request(or); + + /* osd_req_decode_sense(or, ret); */ + return ret; +} + +/* + * Perform an asynchronous OSD operation. copied from exofs; move to libosd? + */ +static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done, + void *caller_context, u8 *cred) +{ + int ret; + + ret = osd_finalize_request(or, 0, cred, NULL); + if (ret) + return ret; + + ret = osd_execute_request_async(or, async_done, caller_context); + + return ret; +} + +/* copied from exofs; move to libosd? */ +static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} + +static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out) +{ + struct osd_request *or; + struct osd_attr attr; + int ret; + + /* start request */ + or = osd_start_request(osdev->osd, GFP_KERNEL); + if (!or) + return -ENOMEM; + + /* create a get-attributes(length) request */ + osd_req_get_attributes(or, &osdev->obj); + + osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); + + /* execute op synchronously */ + ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred); + if (ret) + goto out; + + /* extract length from returned attribute info */ + attr = g_attr_logical_length; + ret = extract_attr_from_req(or, &attr); + if (ret) + goto out; + + *size_out = get_unaligned_be64(attr.val_ptr); + +out: + osd_end_request(or); + return ret; + +} + +static void osdblk_osd_complete(struct osd_request *or, void *private) +{ + struct osdblk_request *orq = private; + struct osd_sense_info osi; + int ret = osd_req_decode_sense(or, &osi); + + if (ret) { + ret = -EIO; + OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret); + } + + /* complete OSD request */ + osd_end_request(or); + + /* complete request passed to osdblk by block layer */ + __blk_end_request_all(orq->rq, ret); +} + +static void bio_chain_put(struct bio *chain) +{ + struct bio *tmp; + + while (chain) { + tmp = chain; + chain = chain->bi_next; + + bio_put(tmp); + } +} + +static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) +{ + struct bio *tmp, *new_chain = NULL, *tail = NULL; + + while (old_chain) { + tmp = bio_clone_kmalloc(old_chain, gfpmask); + if (!tmp) + goto err_out; + + tmp->bi_bdev = NULL; + gfpmask &= ~__GFP_WAIT; + tmp->bi_next = NULL; + + if (!new_chain) + new_chain = tail = tmp; + else { + tail->bi_next = tmp; + tail = tmp; + } + + old_chain = old_chain->bi_next; + } + + return new_chain; + +err_out: + OSDBLK_DEBUG("bio_chain_clone with err\n"); + bio_chain_put(new_chain); + return NULL; +} + +static void osdblk_rq_fn(struct request_queue *q) +{ + struct osdblk_device *osdev = q->queuedata; + + while (1) { + struct request *rq; + struct osdblk_request *orq; + struct osd_request *or; + struct bio *bio; + bool do_write, do_flush; + + /* peek at request from block layer */ + rq = blk_fetch_request(q); + if (!rq) + break; + + /* filter out block requests we don't understand */ + if (rq->cmd_type != REQ_TYPE_FS) { + blk_end_request_all(rq, 0); + continue; + } + + /* deduce our operation (read, write, flush) */ + /* I wish the block layer simplified cmd_type/cmd_flags/cmd[] + * into a clearly defined set of RPC commands: + * read, write, flush, scsi command, power mgmt req, + * driver-specific, etc. + */ + + do_flush = rq->cmd_flags & REQ_FLUSH; + do_write = (rq_data_dir(rq) == WRITE); + + if (!do_flush) { /* osd_flush does not use a bio */ + /* a bio clone to be passed down to OSD request */ + bio = bio_chain_clone(rq->bio, GFP_ATOMIC); + if (!bio) + break; + } else + bio = NULL; + + /* alloc internal OSD request, for OSD command execution */ + or = osd_start_request(osdev->osd, GFP_ATOMIC); + if (!or) { + bio_chain_put(bio); + OSDBLK_DEBUG("osd_start_request with err\n"); + break; + } + + orq = &osdev->req[rq->tag]; + orq->rq = rq; + orq->bio = bio; + orq->osdev = osdev; + + /* init OSD command: flush, write or read */ + if (do_flush) + osd_req_flush_object(or, &osdev->obj, + OSD_CDB_FLUSH_ALL, 0, 0); + else if (do_write) + osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL, + bio, blk_rq_bytes(rq)); + else + osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL, + bio, blk_rq_bytes(rq)); + + OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n", + do_flush ? "flush" : do_write ? + "write" : "read", blk_rq_bytes(rq), + blk_rq_pos(rq) * 512ULL); + + /* begin OSD command execution */ + if (osd_async_op(or, osdblk_osd_complete, orq, + osdev->obj_cred)) { + osd_end_request(or); + blk_requeue_request(q, rq); + bio_chain_put(bio); + OSDBLK_DEBUG("osd_execute_request_async with err\n"); + break; + } + + /* remove the special 'flush' marker, now that the command + * is executing + */ + rq->special = NULL; + } +} + +static void osdblk_free_disk(struct osdblk_device *osdev) +{ + struct gendisk *disk = osdev->disk; + + if (!disk) + return; + + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (disk->queue) + blk_cleanup_queue(disk->queue); + put_disk(disk); +} + +static int osdblk_init_disk(struct osdblk_device *osdev) +{ + struct gendisk *disk; + struct request_queue *q; + int rc; + u64 obj_size = 0; + + /* contact OSD, request size info about the object being mapped */ + rc = osdblk_get_obj_size(osdev, &obj_size); + if (rc) + return rc; + + /* create gendisk info */ + disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR); + if (!disk) + return -ENOMEM; + + sprintf(disk->disk_name, DRV_NAME "%d", osdev->id); + disk->major = osdev->major; + disk->first_minor = 0; + disk->fops = &osdblk_bd_ops; + disk->private_data = osdev; + + /* init rq */ + q = blk_init_queue(osdblk_rq_fn, &osdev->lock); + if (!q) { + put_disk(disk); + return -ENOMEM; + } + + /* switch queue to TCQ mode; allocate tag map */ + rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO); + if (rc) { + blk_cleanup_queue(q); + put_disk(disk); + return rc; + } + + /* Set our limits to the lower device limits, because osdblk cannot + * sleep when allocating a lower-request and therefore cannot be + * bouncing. + */ + blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); + + blk_queue_prep_rq(q, blk_queue_start_tag); + blk_queue_flush(q, REQ_FLUSH); + + disk->queue = q; + + q->queuedata = osdev; + + osdev->disk = disk; + osdev->q = q; + + /* finally, announce the disk to the world */ + set_capacity(disk, obj_size / 512ULL); + add_disk(disk); + + printk(KERN_INFO "%s: Added of size 0x%llx\n", + disk->disk_name, (unsigned long long)obj_size); + + return 0; +} + +/******************************************************************** + * /sys/class/osdblk/ + * add map OSD object to blkdev + * remove unmap OSD object + * list show mappings + *******************************************************************/ + +static void class_osdblk_release(struct class *cls) +{ + kfree(cls); +} + +static ssize_t class_osdblk_list(struct class *c, + struct class_attribute *attr, + char *data) +{ + int n = 0; + struct list_head *tmp; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + list_for_each(tmp, &osdblkdev_list) { + struct osdblk_device *osdev; + + osdev = list_entry(tmp, struct osdblk_device, node); + + n += sprintf(data+n, "%d %d %llu %llu %s\n", + osdev->id, + osdev->major, + osdev->obj.partition, + osdev->obj.id, + osdev->osd_path); + } + + mutex_unlock(&ctl_mutex); + return n; +} + +static ssize_t class_osdblk_add(struct class *c, + struct class_attribute *attr, + const char *buf, size_t count) +{ + struct osdblk_device *osdev; + ssize_t rc; + int irc, new_id = 0; + struct list_head *tmp; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + /* new osdblk_device object */ + osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL); + if (!osdev) { + rc = -ENOMEM; + goto err_out_mod; + } + + /* static osdblk_device initialization */ + spin_lock_init(&osdev->lock); + INIT_LIST_HEAD(&osdev->node); + + /* generate unique id: find highest unique id, add one */ + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + list_for_each(tmp, &osdblkdev_list) { + struct osdblk_device *osdev; + + osdev = list_entry(tmp, struct osdblk_device, node); + if (osdev->id > new_id) + new_id = osdev->id + 1; + } + + osdev->id = new_id; + + /* add to global list */ + list_add_tail(&osdev->node, &osdblkdev_list); + + mutex_unlock(&ctl_mutex); + + /* parse add command */ + if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id, + osdev->osd_path) != 3) { + rc = -EINVAL; + goto err_out_slot; + } + + /* initialize rest of new object */ + sprintf(osdev->name, DRV_NAME "%d", osdev->id); + + /* contact requested OSD */ + osdev->osd = osduld_path_lookup(osdev->osd_path); + if (IS_ERR(osdev->osd)) { + rc = PTR_ERR(osdev->osd); + goto err_out_slot; + } + + /* build OSD credential */ + osdblk_make_credential(osdev->obj_cred, &osdev->obj); + + /* register our block device */ + irc = register_blkdev(0, osdev->name); + if (irc < 0) { + rc = irc; + goto err_out_osd; + } + + osdev->major = irc; + + /* set up and announce blkdev mapping */ + rc = osdblk_init_disk(osdev); + if (rc) + goto err_out_blkdev; + + return count; + +err_out_blkdev: + unregister_blkdev(osdev->major, osdev->name); +err_out_osd: + osduld_put_device(osdev->osd); +err_out_slot: + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + list_del_init(&osdev->node); + mutex_unlock(&ctl_mutex); + + kfree(osdev); +err_out_mod: + OSDBLK_DEBUG("Error adding device %s\n", buf); + module_put(THIS_MODULE); + return rc; +} + +static ssize_t class_osdblk_remove(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + struct osdblk_device *osdev = NULL; + int target_id, rc; + unsigned long ul; + struct list_head *tmp; + + rc = kstrtoul(buf, 10, &ul); + if (rc) + return rc; + + /* convert to int; abort if we lost anything in the conversion */ + target_id = (int) ul; + if (target_id != ul) + return -EINVAL; + + /* remove object from list immediately */ + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + list_for_each(tmp, &osdblkdev_list) { + osdev = list_entry(tmp, struct osdblk_device, node); + if (osdev->id == target_id) { + list_del_init(&osdev->node); + break; + } + osdev = NULL; + } + + mutex_unlock(&ctl_mutex); + + if (!osdev) + return -ENOENT; + + /* clean up and free blkdev and associated OSD connection */ + osdblk_free_disk(osdev); + unregister_blkdev(osdev->major, osdev->name); + osduld_put_device(osdev->osd); + kfree(osdev); + + /* release module ref */ + module_put(THIS_MODULE); + + return count; +} + +static struct class_attribute class_osdblk_attrs[] = { + __ATTR(add, 0200, NULL, class_osdblk_add), + __ATTR(remove, 0200, NULL, class_osdblk_remove), + __ATTR(list, 0444, class_osdblk_list, NULL), + __ATTR_NULL +}; + +static int osdblk_sysfs_init(void) +{ + int ret = 0; + + /* + * create control files in sysfs + * /sys/class/osdblk/... + */ + class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL); + if (!class_osdblk) + return -ENOMEM; + + class_osdblk->name = DRV_NAME; + class_osdblk->owner = THIS_MODULE; + class_osdblk->class_release = class_osdblk_release; + class_osdblk->class_attrs = class_osdblk_attrs; + + ret = class_register(class_osdblk); + if (ret) { + kfree(class_osdblk); + class_osdblk = NULL; + printk(PFX "failed to create class osdblk\n"); + return ret; + } + + return 0; +} + +static void osdblk_sysfs_cleanup(void) +{ + if (class_osdblk) + class_destroy(class_osdblk); + class_osdblk = NULL; +} + +static int __init osdblk_init(void) +{ + int rc; + + rc = osdblk_sysfs_init(); + if (rc) + return rc; + + return 0; +} + +static void __exit osdblk_exit(void) +{ + osdblk_sysfs_cleanup(); +} + +module_init(osdblk_init); +module_exit(osdblk_exit); + diff --git a/kernel/drivers/block/paride/Kconfig b/kernel/drivers/block/paride/Kconfig new file mode 100644 index 000000000..efefb5ac3 --- /dev/null +++ b/kernel/drivers/block/paride/Kconfig @@ -0,0 +1,300 @@ +# +# PARIDE configuration +# +# PARIDE doesn't need PARPORT, but if PARPORT is configured as a module, +# PARIDE must also be a module. +# PARIDE only supports PC style parports. Tough for USB or other parports... + +comment "Parallel IDE high-level drivers" + depends on PARIDE + +config PARIDE_PD + tristate "Parallel port IDE disks" + depends on PARIDE + help + This option enables the high-level driver for IDE-type disk devices + connected through a parallel port. If you chose to build PARIDE + support into your kernel, you may answer Y here to build in the + parallel port IDE driver, otherwise you should answer M to build + it as a loadable module. The module will be called pd. You + must also have at least one parallel port protocol driver in your + system. Among the devices supported by this driver are the SyQuest + EZ-135, EZ-230 and SparQ drives, the Avatar Shark and the backpack + hard drives from MicroSolutions. + +config PARIDE_PCD + tristate "Parallel port ATAPI CD-ROMs" + depends on PARIDE + ---help--- + This option enables the high-level driver for ATAPI CD-ROM devices + connected through a parallel port. If you chose to build PARIDE + support into your kernel, you may answer Y here to build in the + parallel port ATAPI CD-ROM driver, otherwise you should answer M to + build it as a loadable module. The module will be called pcd. You + must also have at least one parallel port protocol driver in your + system. Among the devices supported by this driver are the + MicroSolutions backpack CD-ROM drives and the Freecom Power CD. If + you have such a CD-ROM drive, you should also say Y or M to "ISO + 9660 CD-ROM file system support" below, because that's the file + system used on CD-ROMs. + +config PARIDE_PF + tristate "Parallel port ATAPI disks" + depends on PARIDE + help + This option enables the high-level driver for ATAPI disk devices + connected through a parallel port. If you chose to build PARIDE + support into your kernel, you may answer Y here to build in the + parallel port ATAPI disk driver, otherwise you should answer M + to build it as a loadable module. The module will be called pf. + You must also have at least one parallel port protocol driver in + your system. Among the devices supported by this driver are the + MicroSolutions backpack PD/CD drive and the Imation Superdisk + LS-120 drive. + +config PARIDE_PT + tristate "Parallel port ATAPI tapes" + depends on PARIDE + help + This option enables the high-level driver for ATAPI tape devices + connected through a parallel port. If you chose to build PARIDE + support into your kernel, you may answer Y here to build in the + parallel port ATAPI disk driver, otherwise you should answer M + to build it as a loadable module. The module will be called pt. + You must also have at least one parallel port protocol driver in + your system. Among the devices supported by this driver is the + parallel port version of the HP 5GB drive. + +config PARIDE_PG + tristate "Parallel port generic ATAPI devices" + depends on PARIDE + ---help--- + This option enables a special high-level driver for generic ATAPI + devices connected through a parallel port. The driver allows user + programs, such as cdrtools, to send ATAPI commands directly to a + device. + + If you chose to build PARIDE support into your kernel, you may + answer Y here to build in the parallel port generic ATAPI driver, + otherwise you should answer M to build it as a loadable module. The + module will be called pg. + + You must also have at least one parallel port protocol driver in + your system. + + This driver implements an API loosely related to the generic SCSI + driver. See . for details. + + You can obtain the most recent version of cdrtools from + . Versions 1.6.1a3 and + later fully support this driver. + +comment "Parallel IDE protocol modules" + depends on PARIDE + +config PARIDE_ATEN + tristate "ATEN EH-100 protocol" + depends on PARIDE + help + This option enables support for the ATEN EH-100 parallel port IDE + protocol. This protocol is used in some inexpensive low performance + parallel port kits made in Hong Kong. If you chose to build PARIDE + support into your kernel, you may answer Y here to build in the + protocol driver, otherwise you should answer M to build it as a + loadable module. The module will be called aten. You must also + have a high-level driver for the type of device that you want to + support. + +config PARIDE_BPCK + tristate "MicroSolutions backpack (Series 5) protocol" + depends on PARIDE + ---help--- + This option enables support for the Micro Solutions BACKPACK + parallel port Series 5 IDE protocol. (Most BACKPACK drives made + before 1999 were Series 5) Series 5 drives will NOT always have the + Series noted on the bottom of the drive. Series 6 drivers will. + + In other words, if your BACKPACK drive doesn't say "Series 6" on the + bottom, enable this option. + + If you chose to build PARIDE support into your kernel, you may + answer Y here to build in the protocol driver, otherwise you should + answer M to build it as a loadable module. The module will be + called bpck. You must also have a high-level driver for the type + of device that you want to support. + +config PARIDE_BPCK6 + tristate "MicroSolutions backpack (Series 6) protocol" + depends on PARIDE && !64BIT + ---help--- + This option enables support for the Micro Solutions BACKPACK + parallel port Series 6 IDE protocol. (Most BACKPACK drives made + after 1999 were Series 6) Series 6 drives will have the Series noted + on the bottom of the drive. Series 5 drivers don't always have it + noted. + + In other words, if your BACKPACK drive says "Series 6" on the + bottom, enable this option. + + If you chose to build PARIDE support into your kernel, you may + answer Y here to build in the protocol driver, otherwise you should + answer M to build it as a loadable module. The module will be + called bpck6. You must also have a high-level driver for the type + of device that you want to support. + +config PARIDE_COMM + tristate "DataStor Commuter protocol" + depends on PARIDE + help + This option enables support for the Commuter parallel port IDE + protocol from DataStor. If you chose to build PARIDE support + into your kernel, you may answer Y here to build in the protocol + driver, otherwise you should answer M to build it as a loadable + module. The module will be called comm. You must also have + a high-level driver for the type of device that you want to support. + +config PARIDE_DSTR + tristate "DataStor EP-2000 protocol" + depends on PARIDE + help + This option enables support for the EP-2000 parallel port IDE + protocol from DataStor. If you chose to build PARIDE support + into your kernel, you may answer Y here to build in the protocol + driver, otherwise you should answer M to build it as a loadable + module. The module will be called dstr. You must also have + a high-level driver for the type of device that you want to support. + +config PARIDE_FIT2 + tristate "FIT TD-2000 protocol" + depends on PARIDE + help + This option enables support for the TD-2000 parallel port IDE + protocol from Fidelity International Technology. This is a simple + (low speed) adapter that is used in some portable hard drives. If + you chose to build PARIDE support into your kernel, you may answer Y + here to build in the protocol driver, otherwise you should answer M + to build it as a loadable module. The module will be called ktti. + You must also have a high-level driver for the type of device that + you want to support. + +config PARIDE_FIT3 + tristate "FIT TD-3000 protocol" + depends on PARIDE + help + This option enables support for the TD-3000 parallel port IDE + protocol from Fidelity International Technology. This protocol is + used in newer models of their portable disk, CD-ROM and PD/CD + devices. If you chose to build PARIDE support into your kernel, you + may answer Y here to build in the protocol driver, otherwise you + should answer M to build it as a loadable module. The module will be + called fit3. You must also have a high-level driver for the type + of device that you want to support. + +config PARIDE_EPAT + tristate "Shuttle EPAT/EPEZ protocol" + depends on PARIDE + help + This option enables support for the EPAT parallel port IDE protocol. + EPAT is a parallel port IDE adapter manufactured by Shuttle + Technology and widely used in devices from major vendors such as + Hewlett-Packard, SyQuest, Imation and Avatar. If you chose to build + PARIDE support into your kernel, you may answer Y here to build in + the protocol driver, otherwise you should answer M to build it as a + loadable module. The module will be called epat. You must also + have a high-level driver for the type of device that you want to + support. + +config PARIDE_EPATC8 + bool "Support c7/c8 chips" + depends on PARIDE_EPAT + help + This option enables support for the newer Shuttle EP1284 (aka c7 and + c8) chip. You need this if you are using any recent Imation SuperDisk + (LS-120) drive. + +config PARIDE_EPIA + tristate "Shuttle EPIA protocol" + depends on PARIDE + help + This option enables support for the (obsolete) EPIA parallel port + IDE protocol from Shuttle Technology. This adapter can still be + found in some no-name kits. If you chose to build PARIDE support + into your kernel, you may answer Y here to build in the protocol + driver, otherwise you should answer M to build it as a loadable + module. The module will be called epia. You must also have a + high-level driver for the type of device that you want to support. + +config PARIDE_FRIQ + tristate "Freecom IQ ASIC-2 protocol" + depends on PARIDE + help + This option enables support for version 2 of the Freecom IQ parallel + port IDE adapter. This adapter is used by the Maxell Superdisk + drive. If you chose to build PARIDE support into your kernel, you + may answer Y here to build in the protocol driver, otherwise you + should answer M to build it as a loadable module. The module will be + called friq. You must also have a high-level driver for the type + of device that you want to support. + +config PARIDE_FRPW + tristate "FreeCom power protocol" + depends on PARIDE + help + This option enables support for the Freecom power parallel port IDE + protocol. If you chose to build PARIDE support into your kernel, you + may answer Y here to build in the protocol driver, otherwise you + should answer M to build it as a loadable module. The module will be + called frpw. You must also have a high-level driver for the type + of device that you want to support. + +config PARIDE_KBIC + tristate "KingByte KBIC-951A/971A protocols" + depends on PARIDE + help + This option enables support for the KBIC-951A and KBIC-971A parallel + port IDE protocols from KingByte Information Corp. KingByte's + adapters appear in many no-name portable disk and CD-ROM products, + especially in Europe. If you chose to build PARIDE support into your + kernel, you may answer Y here to build in the protocol driver, + otherwise you should answer M to build it as a loadable module. The + module will be called kbic. You must also have a high-level driver + for the type of device that you want to support. + +config PARIDE_KTTI + tristate "KT PHd protocol" + depends on PARIDE + help + This option enables support for the "PHd" parallel port IDE protocol + from KT Technology. This is a simple (low speed) adapter that is + used in some 2.5" portable hard drives. If you chose to build PARIDE + support into your kernel, you may answer Y here to build in the + protocol driver, otherwise you should answer M to build it as a + loadable module. The module will be called ktti. You must also + have a high-level driver for the type of device that you want to + support. + +config PARIDE_ON20 + tristate "OnSpec 90c20 protocol" + depends on PARIDE + help + This option enables support for the (obsolete) 90c20 parallel port + IDE protocol from OnSpec (often marketed under the ValuStore brand + name). If you chose to build PARIDE support into your kernel, you + may answer Y here to build in the protocol driver, otherwise you + should answer M to build it as a loadable module. The module will + be called on20. You must also have a high-level driver for the + type of device that you want to support. + +config PARIDE_ON26 + tristate "OnSpec 90c26 protocol" + depends on PARIDE + help + This option enables support for the 90c26 parallel port IDE protocol + from OnSpec Electronics (often marketed under the ValuStore brand + name). If you chose to build PARIDE support into your kernel, you + may answer Y here to build in the protocol driver, otherwise you + should answer M to build it as a loadable module. The module will be + called on26. You must also have a high-level driver for the type + of device that you want to support. + +# diff --git a/kernel/drivers/block/paride/Makefile b/kernel/drivers/block/paride/Makefile new file mode 100644 index 000000000..a539e004b --- /dev/null +++ b/kernel/drivers/block/paride/Makefile @@ -0,0 +1,28 @@ +# +# Makefile for Parallel port IDE device drivers. +# +# 7 October 2000, Bartlomiej Zolnierkiewicz +# Rewritten to use lists instead of if-statements. +# + +obj-$(CONFIG_PARIDE) += paride.o +obj-$(CONFIG_PARIDE_ATEN) += aten.o +obj-$(CONFIG_PARIDE_BPCK) += bpck.o +obj-$(CONFIG_PARIDE_COMM) += comm.o +obj-$(CONFIG_PARIDE_DSTR) += dstr.o +obj-$(CONFIG_PARIDE_KBIC) += kbic.o +obj-$(CONFIG_PARIDE_EPAT) += epat.o +obj-$(CONFIG_PARIDE_EPIA) += epia.o +obj-$(CONFIG_PARIDE_FRPW) += frpw.o +obj-$(CONFIG_PARIDE_FRIQ) += friq.o +obj-$(CONFIG_PARIDE_FIT2) += fit2.o +obj-$(CONFIG_PARIDE_FIT3) += fit3.o +obj-$(CONFIG_PARIDE_ON20) += on20.o +obj-$(CONFIG_PARIDE_ON26) += on26.o +obj-$(CONFIG_PARIDE_KTTI) += ktti.o +obj-$(CONFIG_PARIDE_BPCK6) += bpck6.o +obj-$(CONFIG_PARIDE_PD) += pd.o +obj-$(CONFIG_PARIDE_PCD) += pcd.o +obj-$(CONFIG_PARIDE_PF) += pf.o +obj-$(CONFIG_PARIDE_PT) += pt.o +obj-$(CONFIG_PARIDE_PG) += pg.o diff --git a/kernel/drivers/block/paride/Transition-notes b/kernel/drivers/block/paride/Transition-notes new file mode 100644 index 000000000..70374907c --- /dev/null +++ b/kernel/drivers/block/paride/Transition-notes @@ -0,0 +1,128 @@ +Lemma 1: + If ps_tq is scheduled, ps_tq_active is 1. ps_tq_int() can be called + only when ps_tq_active is 1. +Proof: All assignments to ps_tq_active and all scheduling of ps_tq happen + under ps_spinlock. There are three places where that can happen: + one in ps_set_intr() (A) and two in ps_tq_int() (B and C). + Consider the sequnce of these events. A can not be preceded by + anything except B, since it is under if (!ps_tq_active) under + ps_spinlock. C is always preceded by B, since we can't reach it + other than through B and we don't drop ps_spinlock between them. + IOW, the sequence is A?(BA|BC|B)*. OTOH, number of B can not exceed + the sum of numbers of A and C, since each call of ps_tq_int() is + the result of ps_tq execution. Therefore, the sequence starts with + A and each B is preceded by either A or C. Moments when we enter + ps_tq_int() are sandwiched between {A,C} and B in that sequence, + since at any time number of B can not exceed the number of these + moments which, in turn, can not exceed the number of A and C. + In other words, the sequence of events is (A or C set ps_tq_active to + 1 and schedule ps_tq, ps_tq is executed, ps_tq_int() is entered, + B resets ps_tq_active)*. + + +consider the following area: + * in do_pd_request1(): to calls of pi_do_claimed() and return in + case when pd_req is NULL. + * in next_request(): to call of do_pd_request1() + * in do_pd_read(): to call of ps_set_intr() + * in do_pd_read_start(): to calls of pi_do_claimed(), next_request() +and ps_set_intr() + * in do_pd_read_drq(): to calls of pi_do_claimed() and next_request() + * in do_pd_write(): to call of ps_set_intr() + * in do_pd_write_start(): to calls of pi_do_claimed(), next_request() +and ps_set_intr() + * in do_pd_write_done(): to calls of pi_do_claimed() and next_request() + * in ps_set_intr(): to check for ps_tq_active and to scheduling + ps_tq if ps_tq_active was 0. + * in ps_tq_int(): from the moment when we get ps_spinlock() to the + return, call of con() or scheduling ps_tq. + * in pi_schedule_claimed() when called from pi_do_claimed() called from + pd.c, everything until returning 1 or setting or setting ->claim_cont + on the path that returns 0 + * in pi_do_claimed() when called from pd.c, everything until the call + of pi_do_claimed() plus the everything until the call of cont() if + pi_do_claimed() has returned 1. + * in pi_wake_up() called for PIA that belongs to pd.c, everything from + the moment when pi_spinlock has been acquired. + +Lemma 2: + 1) at any time at most one thread of execution can be in that area or + be preempted there. + 2) When there is such a thread, pd_busy is set or pd_lock is held by + that thread. + 3) When there is such a thread, ps_tq_active is 0 or ps_spinlock is + held by that thread. + 4) When there is such a thread, all PIA belonging to pd.c have NULL + ->claim_cont or pi_spinlock is held by thread in question. + +Proof: consider the first moment when the above is not true. + +(1) can become not true if some thread enters that area while another is there. + a) do_pd_request1() can be called from next_request() or do_pd_request() + In the first case the thread was already in the area. In the second, + the thread was holding pd_lock and found pd_busy not set, which would + mean that (2) was already not true. + b) ps_set_intr() and pi_schedule_claimed() can be called only from the + area. + c) pi_do_claimed() is called by pd.c only from the area. + d) ps_tq_int() can enter the area only when the thread is holding + ps_spinlock and ps_tq_active is 1 (due to Lemma 1). It means that + (3) was already not true. + e) do_pd_{read,write}* could be called only from the area. The only + case that needs consideration is call from pi_wake_up() and there + we would have to be called for the PIA that got ->claimed_cont + from pd.c. That could happen only if pi_do_claimed() had been + called from pd.c for that PIA, which happens only for PIA belonging + to pd.c. + f) pi_wake_up() can enter the area only when the thread is holding + pi_spinlock and ->claimed_cont is non-NULL for PIA belonging to + pd.c. It means that (4) was already not true. + +(2) can become not true only when pd_lock is released by the thread in question. + Indeed, pd_busy is reset only in the area and thread that resets + it is holding pd_lock. The only place within the area where we + release pd_lock is in pd_next_buf() (called from within the area). + But that code does not reset pd_busy, so pd_busy would have to be + 0 when pd_next_buf() had acquired pd_lock. If it become 0 while + we were acquiring the lock, (1) would be already false, since + the thread that had reset it would be in the area simulateously. + If it was 0 before we tried to acquire pd_lock, (2) would be + already false. + +For similar reasons, (3) can become not true only when ps_spinlock is released +by the thread in question. However, all such places within the area are right +after resetting ps_tq_active to 0. + +(4) is done the same way - all places where we release pi_spinlock within +the area are either after resetting ->claimed_cont to NULL while holding +pi_spinlock, or after not tocuhing ->claimed_cont since acquiring pi_spinlock +also in the area. The only place where ->claimed_cont is made non-NULL is +in the area, under pi_spinlock and we do not release it until after leaving +the area. + +QED. + + +Corollary 1: ps_tq_active can be killed. Indeed, the only place where we +check its value is in ps_set_intr() and if it had been non-zero at that +point, we would have violated either (2.1) (if it was set while ps_set_intr() +was acquiring ps_spinlock) or (2.3) (if it was set when we started to +acquire ps_spinlock). + +Corollary 2: ps_spinlock can be killed. Indeed, Lemma 1 and Lemma 2 show +that the only possible contention is between scheduling ps_tq followed by +immediate release of spinlock and beginning of execution of ps_tq on +another CPU. + +Corollary 3: assignment to pd_busy in do_pd_read_start() and do_pd_write_start() +can be killed. Indeed, we are not holding pd_lock and thus pd_busy is already +1 here. + +Corollary 4: in ps_tq_int() uses of con can be replaced with uses of +ps_continuation, since the latter is changed only from the area. +We don't need to reset it to NULL, since we are guaranteed that there +will be a call of ps_set_intr() before we look at ps_continuation again. +We can remove the check for ps_continuation being NULL for the same +reason - the value is guaranteed to be set by the last ps_set_intr() and +we never pass it NULL. Assignements in the beginning of ps_set_intr() +can be taken to callers as long as they remain within the area. diff --git a/kernel/drivers/block/paride/aten.c b/kernel/drivers/block/paride/aten.c new file mode 100644 index 000000000..269546556 --- /dev/null +++ b/kernel/drivers/block/paride/aten.c @@ -0,0 +1,162 @@ +/* + aten.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + aten.c is a low-level protocol driver for the ATEN EH-100 + parallel port adapter. The EH-100 supports 4-bit and 8-bit + modes only. There is also an EH-132 which supports EPP mode + transfers. The EH-132 is not yet supported. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.05 init_proto, release_proto + +*/ + +#define ATEN_VERSION "1.01" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define j44(a,b) ((((a>>4)&0x0f)|(b&0xf0))^0x88) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x08, 0x20 }; + +static void aten_write_regr( PIA *pi, int cont, int regr, int val) + +{ int r; + + r = regr + cont_map[cont] + 0x80; + + w0(r); w2(0xe); w2(6); w0(val); w2(7); w2(6); w2(0xc); +} + +static int aten_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + r = regr + cont_map[cont] + 0x40; + + switch (pi->mode) { + + case 0: w0(r); w2(0xe); w2(6); + w2(7); w2(6); w2(0); + a = r1(); w0(0x10); b = r1(); w2(0xc); + return j44(a,b); + + case 1: r |= 0x10; + w0(r); w2(0xe); w2(6); w0(0xff); + w2(0x27); w2(0x26); w2(0x20); + a = r0(); + w2(0x26); w2(0xc); + return a; + } + return -1; +} + +static void aten_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b, c, d; + + switch (pi->mode) { + + case 0: w0(0x48); w2(0xe); w2(6); + for (k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + w2(0xc); +} + +static void aten_disconnect ( PIA *pi ) + +{ w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void aten_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ char *mode_string[2] = {"4-bit","8-bit"}; + + printk("%s: aten %s, ATEN EH-100 at 0x%x, ", + pi->device,ATEN_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol aten = { + .owner = THIS_MODULE, + .name = "aten", + .max_mode = 2, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = aten_write_regr, + .read_regr = aten_read_regr, + .write_block = aten_write_block, + .read_block = aten_read_block, + .connect = aten_connect, + .disconnect = aten_disconnect, + .log_adapter = aten_log_adapter, +}; + +static int __init aten_init(void) +{ + return paride_register(&aten); +} + +static void __exit aten_exit(void) +{ + paride_unregister( &aten ); +} + +MODULE_LICENSE("GPL"); +module_init(aten_init) +module_exit(aten_exit) diff --git a/kernel/drivers/block/paride/bpck.c b/kernel/drivers/block/paride/bpck.c new file mode 100644 index 000000000..4f27e7392 --- /dev/null +++ b/kernel/drivers/block/paride/bpck.c @@ -0,0 +1,477 @@ +/* + bpck.c (c) 1996-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + bpck.c is a low-level protocol driver for the MicroSolutions + "backpack" parallel port IDE adapter. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.05 init_proto, release_proto, pi->delay + 1.02 GRG 1998.08.15 default pi->delay returned to 4 + +*/ + +#define BPCK_VERSION "1.02" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#undef r2 +#undef w2 + +#define PC pi->private +#define r2() (PC=(in_p(2) & 0xff)) +#define w2(byte) {out_p(2,byte); PC = byte;} +#define t2(pat) {PC ^= pat; out_p(2,PC);} +#define e2() {PC &= 0xfe; out_p(2,PC);} +#define o2() {PC |= 1; out_p(2,PC);} + +#define j44(l,h) (((l>>3)&0x7)|((l>>4)&0x8)|((h<<1)&0x70)|(h&0x80)) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set + cont = 2 - use internal bpck register addressing +*/ + +static int cont_map[3] = { 0x40, 0x48, 0 }; + +static int bpck_read_regr( PIA *pi, int cont, int regr ) + +{ int r, l, h; + + r = regr + cont_map[cont]; + + switch (pi->mode) { + + case 0: w0(r & 0xf); w0(r); t2(2); t2(4); + l = r1(); + t2(4); + h = r1(); + return j44(l,h); + + case 1: w0(r & 0xf); w0(r); t2(2); + e2(); t2(0x20); + t2(4); h = r0(); + t2(1); t2(0x20); + return h; + + case 2: + case 3: + case 4: w0(r); w2(9); w2(0); w2(0x20); + h = r4(); + w2(0); + return h; + + } + return -1; +} + +static void bpck_write_regr( PIA *pi, int cont, int regr, int val ) + +{ int r; + + r = regr + cont_map[cont]; + + switch (pi->mode) { + + case 0: + case 1: w0(r); + t2(2); + w0(val); + o2(); t2(4); t2(1); + break; + + case 2: + case 3: + case 4: w0(r); w2(9); w2(0); + w0(val); w2(1); w2(3); w2(0); + break; + + } +} + +/* These macros access the bpck registers in native addressing */ + +#define WR(r,v) bpck_write_regr(pi,2,r,v) +#define RR(r) (bpck_read_regr(pi,2,r)) + +static void bpck_write_block( PIA *pi, char * buf, int count ) + +{ int i; + + switch (pi->mode) { + + case 0: WR(4,0x40); + w0(0x40); t2(2); t2(1); + for (i=0;imode) { + + case 0: WR(4,0x40); + w0(0x40); t2(2); + for (i=0;iunit; + s = 0; + w2(4); w2(0xe); r2(); t2(2); + o1 = r1()&0xf8; + o0 = r0(); + w0(255-id); w2(4); w0(id); + t2(8); t2(8); t2(8); + t2(2); t = r1()&0xf8; + f7 = ((id % 8) == 7); + if ((f7) || (t != o1)) { t2(2); s = r1()&0xf8; } + if ((t == o1) && ((!f7) || (s == o1))) { + w2(0x4c); w0(o0); + return 0; + } + t2(8); w0(0); t2(2); w2(0x4c); w0(o0); + return 1; +} + +static void bpck_connect ( PIA *pi ) + +{ pi->saved_r0 = r0(); + w0(0xff-pi->unit); w2(4); w0(pi->unit); + t2(8); t2(8); t2(8); + t2(2); t2(2); + + switch (pi->mode) { + + case 0: t2(8); WR(4,0); + break; + + case 1: t2(8); WR(4,0x10); + break; + + case 2: + case 3: + case 4: w2(0); WR(4,8); + break; + + } + + WR(5,8); + + if (pi->devtype == PI_PCD) { + WR(0x46,0x10); /* fiddle with ESS logic ??? */ + WR(0x4c,0x38); + WR(0x4d,0x88); + WR(0x46,0xa0); + WR(0x41,0); + WR(0x4e,8); + } +} + +static void bpck_disconnect ( PIA *pi ) + +{ w0(0); + if (pi->mode >= 2) { w2(9); w2(0); } else t2(2); + w2(0x4c); w0(pi->saved_r0); +} + +static void bpck_force_spp ( PIA *pi ) + +/* This fakes the EPP protocol to turn off EPP ... */ + +{ pi->saved_r0 = r0(); + w0(0xff-pi->unit); w2(4); w0(pi->unit); + t2(8); t2(8); t2(8); + t2(2); t2(2); + + w2(0); + w0(4); w2(9); w2(0); + w0(0); w2(1); w2(3); w2(0); + w0(0); w2(9); w2(0); + w2(0x4c); w0(pi->saved_r0); +} + +#define TEST_LEN 16 + +static int bpck_test_proto( PIA *pi, char * scratch, int verbose ) + +{ int i, e, l, h, om; + char buf[TEST_LEN]; + + bpck_force_spp(pi); + + switch (pi->mode) { + + case 0: bpck_connect(pi); + WR(0x13,0x7f); + w0(0x13); t2(2); + for(i=0;imode; + pi->mode = 0; + bpck_connect(pi); + WR(7,3); + WR(4,8); + bpck_disconnect(pi); + + pi->mode = om; + bpck_connect(pi); + w0(0x13); w2(9); w2(1); w0(0); w2(3); w2(0); w2(0xe0); + + switch (pi->mode) { + case 2: for (i=0;idevice,pi->port,pi->unit,pi->mode); + for (i=0;imode; od = pi->delay; + pi->mode = 0; pi->delay = 6; + + bpck_connect(pi); + + n = 0; + WR(4,0); + for (i=0;i<64;i++) { + WR(6,8); + WR(6,0xc); + p = 0x100; + for (k=0;k<9;k++) { + f = (((i + 0x180) & p) != 0) * 2; + WR(6,f+0xc); + WR(6,f+0xd); + WR(6,f+0xc); + p = (p >> 1); + } + for (j=0;j<2;j++) { + v = 0; + for (k=0;k<8;k++) { + WR(6,0xc); + WR(6,0xd); + WR(6,0xc); + f = RR(0); + v = 2*v + (f == 0x84); + } + buf[2*i+1-j] = v; + } + } + WR(6,8); + WR(6,0); + WR(5,8); + + bpck_disconnect(pi); + + if (om >= 2) { + bpck_connect(pi); + WR(7,3); + WR(4,8); + bpck_disconnect(pi); + } + + pi->mode = om; pi->delay = od; +} + +static int bpck_test_port ( PIA *pi ) /* check for 8-bit port */ + +{ int i, r, m; + + w2(0x2c); i = r0(); w0(255-i); r = r0(); w0(i); + m = -1; + if (r == i) m = 2; + if (r == (255-i)) m = 0; + + w2(0xc); i = r0(); w0(255-i); r = r0(); w0(i); + if (r != (255-i)) m = -1; + + if (m == 0) { w2(6); w2(0xc); r = r0(); w0(0xaa); w0(r); w0(0xaa); } + if (m == 2) { w2(0x26); w2(0xc); } + + if (m == -1) return 0; + return 5; +} + +static void bpck_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ char *mode_string[5] = { "4-bit","8-bit","EPP-8", + "EPP-16","EPP-32" }; + +#ifdef DUMP_EEPROM + int i; +#endif + + bpck_read_eeprom(pi,scratch); + +#ifdef DUMP_EEPROM + if (verbose) { + for(i=0;i<128;i++) + if ((scratch[i] < ' ') || (scratch[i] > '~')) + scratch[i] = '.'; + printk("%s: bpck EEPROM: %64.64s\n",pi->device,scratch); + printk("%s: %64.64s\n",pi->device,&scratch[64]); + } +#endif + + printk("%s: bpck %s, backpack %8.8s unit %d", + pi->device,BPCK_VERSION,&scratch[110],pi->unit); + printk(" at 0x%x, mode %d (%s), delay %d\n",pi->port, + pi->mode,mode_string[pi->mode],pi->delay); +} + +static struct pi_protocol bpck = { + .owner = THIS_MODULE, + .name = "bpck", + .max_mode = 5, + .epp_first = 2, + .default_delay = 4, + .max_units = 255, + .write_regr = bpck_write_regr, + .read_regr = bpck_read_regr, + .write_block = bpck_write_block, + .read_block = bpck_read_block, + .connect = bpck_connect, + .disconnect = bpck_disconnect, + .test_port = bpck_test_port, + .probe_unit = bpck_probe_unit, + .test_proto = bpck_test_proto, + .log_adapter = bpck_log_adapter, +}; + +static int __init bpck_init(void) +{ + return paride_register(&bpck); +} + +static void __exit bpck_exit(void) +{ + paride_unregister(&bpck); +} + +MODULE_LICENSE("GPL"); +module_init(bpck_init) +module_exit(bpck_exit) diff --git a/kernel/drivers/block/paride/bpck6.c b/kernel/drivers/block/paride/bpck6.c new file mode 100644 index 000000000..ec64e7f5d --- /dev/null +++ b/kernel/drivers/block/paride/bpck6.c @@ -0,0 +1,267 @@ +/* + backpack.c (c) 2001 Micro Solutions Inc. + Released under the terms of the GNU General Public license + + backpack.c is a low-level protocol driver for the Micro Solutions + "BACKPACK" parallel port IDE adapter + (Works on Series 6 drives) + + Written by: Ken Hahn (linux-dev@micro-solutions.com) + Clive Turvey (linux-dev@micro-solutions.com) + +*/ + +/* + This is Ken's linux wrapper for the PPC library + Version 1.0.0 is the backpack driver for which source is not available + Version 2.0.0 is the first to have source released + Version 2.0.1 is the "Cox-ified" source code + Version 2.0.2 - fixed version string usage, and made ppc functions static +*/ + + +#define BACKPACK_VERSION "2.0.2" + +#include +#include +#include +#include +#include +#include +#include + +#include "ppc6lnx.c" +#include "paride.h" + +/* PARAMETERS */ +static bool verbose; /* set this to 1 to see debugging messages and whatnot */ + + +#define PPCSTRUCT(pi) ((Interface *)(pi->private)) + +/****************************************************************/ +/* + ATAPI CDROM DRIVE REGISTERS +*/ +#define ATAPI_DATA 0 /* data port */ +#define ATAPI_ERROR 1 /* error register (read) */ +#define ATAPI_FEATURES 1 /* feature register (write) */ +#define ATAPI_INT_REASON 2 /* interrupt reason register */ +#define ATAPI_COUNT_LOW 4 /* byte count register (low) */ +#define ATAPI_COUNT_HIGH 5 /* byte count register (high) */ +#define ATAPI_DRIVE_SEL 6 /* drive select register */ +#define ATAPI_STATUS 7 /* status port (read) */ +#define ATAPI_COMMAND 7 /* command port (write) */ +#define ATAPI_ALT_STATUS 0x0e /* alternate status reg (read) */ +#define ATAPI_DEVICE_CONTROL 0x0e /* device control (write) */ +/****************************************************************/ + +static int bpck6_read_regr(PIA *pi, int cont, int reg) +{ + unsigned int out; + + /* check for bad settings */ + if (reg<0 || reg>7 || cont<0 || cont>2) + { + return(-1); + } + out=ppc6_rd_port(PPCSTRUCT(pi),cont?reg|8:reg); + return(out); +} + +static void bpck6_write_regr(PIA *pi, int cont, int reg, int val) +{ + /* check for bad settings */ + if (reg>=0 && reg<=7 && cont>=0 && cont<=1) + { + ppc6_wr_port(PPCSTRUCT(pi),cont?reg|8:reg,(u8)val); + } +} + +static void bpck6_write_block( PIA *pi, char * buf, int len ) +{ + ppc6_wr_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1); +} + +static void bpck6_read_block( PIA *pi, char * buf, int len ) +{ + ppc6_rd_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1); +} + +static void bpck6_connect ( PIA *pi ) +{ + if(verbose) + { + printk(KERN_DEBUG "connect\n"); + } + + if(pi->mode >=2) + { + PPCSTRUCT(pi)->mode=4+pi->mode-2; + } + else if(pi->mode==1) + { + PPCSTRUCT(pi)->mode=3; + } + else + { + PPCSTRUCT(pi)->mode=1; + } + + ppc6_open(PPCSTRUCT(pi)); + ppc6_wr_extout(PPCSTRUCT(pi),0x3); +} + +static void bpck6_disconnect ( PIA *pi ) +{ + if(verbose) + { + printk("disconnect\n"); + } + ppc6_wr_extout(PPCSTRUCT(pi),0x0); + ppc6_close(PPCSTRUCT(pi)); +} + +static int bpck6_test_port ( PIA *pi ) /* check for 8-bit port */ +{ + if(verbose) + { + printk(KERN_DEBUG "PARPORT indicates modes=%x for lp=0x%lx\n", + ((struct pardevice*)(pi->pardev))->port->modes, + ((struct pardevice *)(pi->pardev))->port->base); + } + + /*copy over duplicate stuff.. initialize state info*/ + PPCSTRUCT(pi)->ppc_id=pi->unit; + PPCSTRUCT(pi)->lpt_addr=pi->port; + + /* look at the parport device to see if what modes we can use */ + if(((struct pardevice *)(pi->pardev))->port->modes & + (PARPORT_MODE_EPP) + ) + { + return 5; /* Can do EPP*/ + } + else if(((struct pardevice *)(pi->pardev))->port->modes & + (PARPORT_MODE_TRISTATE) + ) + { + return 2; + } + else /*Just flat SPP*/ + { + return 1; + } +} + +static int bpck6_probe_unit ( PIA *pi ) +{ + int out; + + if(verbose) + { + printk(KERN_DEBUG "PROBE UNIT %x on port:%x\n",pi->unit,pi->port); + } + + /*SET PPC UNIT NUMBER*/ + PPCSTRUCT(pi)->ppc_id=pi->unit; + + /*LOWER DOWN TO UNIDIRECTIONAL*/ + PPCSTRUCT(pi)->mode=1; + + out=ppc6_open(PPCSTRUCT(pi)); + + if(verbose) + { + printk(KERN_DEBUG "ppc_open returned %2x\n",out); + } + + if(out) + { + ppc6_close(PPCSTRUCT(pi)); + if(verbose) + { + printk(KERN_DEBUG "leaving probe\n"); + } + return(1); + } + else + { + if(verbose) + { + printk(KERN_DEBUG "Failed open\n"); + } + return(0); + } +} + +static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose ) +{ + char *mode_string[5]= + {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"}; + + printk("%s: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n",pi->device); + printk("%s: Copyright 2001 by Micro Solutions, Inc., DeKalb IL.\n",pi->device); + printk("%s: BACKPACK %s, Micro Solutions BACKPACK Drive at 0x%x\n", + pi->device,BACKPACK_VERSION,pi->port); + printk("%s: Unit: %d Mode:%d (%s) Delay %d\n",pi->device, + pi->unit,pi->mode,mode_string[pi->mode],pi->delay); +} + +static int bpck6_init_proto(PIA *pi) +{ + Interface *p = kzalloc(sizeof(Interface), GFP_KERNEL); + + if (p) { + pi->private = (unsigned long)p; + return 0; + } + + printk(KERN_ERR "%s: ERROR COULDN'T ALLOCATE MEMORY\n", pi->device); + return -1; +} + +static void bpck6_release_proto(PIA *pi) +{ + kfree((void *)(pi->private)); +} + +static struct pi_protocol bpck6 = { + .owner = THIS_MODULE, + .name = "bpck6", + .max_mode = 5, + .epp_first = 2, /* 2-5 use epp (need 8 ports) */ + .max_units = 255, + .write_regr = bpck6_write_regr, + .read_regr = bpck6_read_regr, + .write_block = bpck6_write_block, + .read_block = bpck6_read_block, + .connect = bpck6_connect, + .disconnect = bpck6_disconnect, + .test_port = bpck6_test_port, + .probe_unit = bpck6_probe_unit, + .log_adapter = bpck6_log_adapter, + .init_proto = bpck6_init_proto, + .release_proto = bpck6_release_proto, +}; + +static int __init bpck6_init(void) +{ + printk(KERN_INFO "bpck6: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n"); + printk(KERN_INFO "bpck6: Copyright 2001 by Micro Solutions, Inc., DeKalb IL. USA\n"); + if(verbose) + printk(KERN_DEBUG "bpck6: verbose debug enabled.\n"); + return paride_register(&bpck6); +} + +static void __exit bpck6_exit(void) +{ + paride_unregister(&bpck6); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Micro Solutions Inc."); +MODULE_DESCRIPTION("BACKPACK Protocol module, compatible with PARIDE"); +module_param(verbose, bool, 0644); +module_init(bpck6_init) +module_exit(bpck6_exit) diff --git a/kernel/drivers/block/paride/comm.c b/kernel/drivers/block/paride/comm.c new file mode 100644 index 000000000..9bcd35495 --- /dev/null +++ b/kernel/drivers/block/paride/comm.c @@ -0,0 +1,218 @@ +/* + comm.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + comm.c is a low-level protocol driver for some older models + of the DataStor "Commuter" parallel to IDE adapter. Some of + the parallel port devices marketed by Arista currently + use this adapter. +*/ + +/* Changes: + + 1.01 GRG 1998.05.05 init_proto, release_proto + +*/ + +#define COMM_VERSION "1.01" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +/* mode codes: 0 nybble reads, 8-bit writes + 1 8-bit reads and writes + 2 8-bit EPP mode +*/ + +#define j44(a,b) (((a>>3)&0x0f)|((b<<1)&0xf0)) + +#define P1 w2(5);w2(0xd);w2(0xd);w2(5);w2(4); +#define P2 w2(5);w2(7);w2(7);w2(5);w2(4); + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x08, 0x10 }; + +static int comm_read_regr( PIA *pi, int cont, int regr ) + +{ int l, h, r; + + r = regr + cont_map[cont]; + + switch (pi->mode) { + + case 0: w0(r); P1; w0(0); + w2(6); l = r1(); w0(0x80); h = r1(); w2(4); + return j44(l,h); + + case 1: w0(r+0x20); P1; + w0(0); w2(0x26); h = r0(); w2(4); + return h; + + case 2: + case 3: + case 4: w3(r+0x20); (void)r1(); + w2(0x24); h = r4(); w2(4); + return h; + + } + return -1; +} + +static void comm_write_regr( PIA *pi, int cont, int regr, int val ) + +{ int r; + + r = regr + cont_map[cont]; + + switch (pi->mode) { + + case 0: + case 1: w0(r); P1; w0(val); P2; + break; + + case 2: + case 3: + case 4: w3(r); (void)r1(); w4(val); + break; + } +} + +static void comm_connect ( PIA *pi ) + +{ pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + w2(4); w0(0xff); w2(6); + w2(4); w0(0xaa); w2(6); + w2(4); w0(0x00); w2(6); + w2(4); w0(0x87); w2(6); + w2(4); w0(0xe0); w2(0xc); w2(0xc); w2(4); +} + +static void comm_disconnect ( PIA *pi ) + +{ w2(0); w2(0); w2(0); w2(4); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void comm_read_block( PIA *pi, char * buf, int count ) + +{ int i, l, h; + + switch (pi->mode) { + + case 0: w0(0x48); P1; + for(i=0;imode) { + + case 0: + case 1: w0(0x68); P1; + for (k=0;kdevice,COMM_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol comm = { + .owner = THIS_MODULE, + .name = "comm", + .max_mode = 5, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = comm_write_regr, + .read_regr = comm_read_regr, + .write_block = comm_write_block, + .read_block = comm_read_block, + .connect = comm_connect, + .disconnect = comm_disconnect, + .log_adapter = comm_log_adapter, +}; + +static int __init comm_init(void) +{ + return paride_register(&comm); +} + +static void __exit comm_exit(void) +{ + paride_unregister(&comm); +} + +MODULE_LICENSE("GPL"); +module_init(comm_init) +module_exit(comm_exit) diff --git a/kernel/drivers/block/paride/dstr.c b/kernel/drivers/block/paride/dstr.c new file mode 100644 index 000000000..accc5c777 --- /dev/null +++ b/kernel/drivers/block/paride/dstr.c @@ -0,0 +1,233 @@ +/* + dstr.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + dstr.c is a low-level protocol driver for the + DataStor EP2000 parallel to IDE adapter chip. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + +*/ + +#define DSTR_VERSION "1.01" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +/* mode codes: 0 nybble reads, 8-bit writes + 1 8-bit reads and writes + 2 8-bit EPP mode + 3 EPP-16 + 4 EPP-32 +*/ + +#define j44(a,b) (((a>>3)&0x07)|((~a>>4)&0x08)|((b<<1)&0x70)|((~b)&0x80)) + +#define P1 w2(5);w2(0xd);w2(5);w2(4); +#define P2 w2(5);w2(7);w2(5);w2(4); +#define P3 w2(6);w2(4);w2(6);w2(4); + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x20, 0x40 }; + +static int dstr_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + r = regr + cont_map[cont]; + + w0(0x81); P1; + if (pi->mode) { w0(0x11); } else { w0(1); } + P2; w0(r); P1; + + switch (pi->mode) { + + case 0: w2(6); a = r1(); w2(4); w2(6); b = r1(); w2(4); + return j44(a,b); + + case 1: w0(0); w2(0x26); a = r0(); w2(4); + return a; + + case 2: + case 3: + case 4: w2(0x24); a = r4(); w2(4); + return a; + + } + return -1; +} + +static void dstr_write_regr( PIA *pi, int cont, int regr, int val ) + +{ int r; + + r = regr + cont_map[cont]; + + w0(0x81); P1; + if (pi->mode >= 2) { w0(0x11); } else { w0(1); } + P2; w0(r); P1; + + switch (pi->mode) { + + case 0: + case 1: w0(val); w2(5); w2(7); w2(5); w2(4); + break; + + case 2: + case 3: + case 4: w4(val); + break; + } +} + +#define CCP(x) w0(0xff);w2(0xc);w2(4);\ + w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);w0(0x78);\ + w0(x);w2(5);w2(4); + +static void dstr_connect ( PIA *pi ) + +{ pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + w2(4); CCP(0xe0); w0(0xff); +} + +static void dstr_disconnect ( PIA *pi ) + +{ CCP(0x30); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void dstr_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b; + + w0(0x81); P1; + if (pi->mode) { w0(0x19); } else { w0(9); } + P2; w0(0x82); P1; P3; w0(0x20); P1; + + switch (pi->mode) { + + case 0: for (k=0;kmode) { w0(0x19); } else { w0(9); } + P2; w0(0x82); P1; P3; w0(0x20); P1; + + switch (pi->mode) { + + case 0: + case 1: for (k=0;kdevice,DSTR_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol dstr = { + .owner = THIS_MODULE, + .name = "dstr", + .max_mode = 5, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = dstr_write_regr, + .read_regr = dstr_read_regr, + .write_block = dstr_write_block, + .read_block = dstr_read_block, + .connect = dstr_connect, + .disconnect = dstr_disconnect, + .log_adapter = dstr_log_adapter, +}; + +static int __init dstr_init(void) +{ + return paride_register(&dstr); +} + +static void __exit dstr_exit(void) +{ + paride_unregister(&dstr); +} + +MODULE_LICENSE("GPL"); +module_init(dstr_init) +module_exit(dstr_exit) diff --git a/kernel/drivers/block/paride/epat.c b/kernel/drivers/block/paride/epat.c new file mode 100644 index 000000000..1bcdff773 --- /dev/null +++ b/kernel/drivers/block/paride/epat.c @@ -0,0 +1,340 @@ +/* + epat.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is the low level protocol driver for the EPAT parallel + to IDE adapter from Shuttle Technologies. This adapter is + used in many popular parallel port disk products such as the + SyQuest EZ drives, the Avatar Shark and the Imation SuperDisk. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + 1.02 Joshua b. Jore CPP(renamed), epat_connect, epat_disconnect + +*/ + +#define EPAT_VERSION "1.02" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define j44(a,b) (((a>>4)&0x0f)+(b&0xf0)) +#define j53(a,b) (((a>>3)&0x1f)+((b<<4)&0xe0)) + +static int epatc8; + +module_param(epatc8, int, 0); +MODULE_PARM_DESC(epatc8, "support for the Shuttle EP1284 chip, " + "used in any recent Imation SuperDisk (LS-120) drive."); + +/* cont = 0 IDE register file + cont = 1 IDE control registers + cont = 2 internal EPAT registers +*/ + +static int cont_map[3] = { 0x18, 0x10, 0 }; + +static void epat_write_regr( PIA *pi, int cont, int regr, int val) + +{ int r; + + r = regr + cont_map[cont]; + + switch (pi->mode) { + + case 0: + case 1: + case 2: w0(0x60+r); w2(1); w0(val); w2(4); + break; + + case 3: + case 4: + case 5: w3(0x40+r); w4(val); + break; + + } +} + +static int epat_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + r = regr + cont_map[cont]; + + switch (pi->mode) { + + case 0: w0(r); w2(1); w2(3); + a = r1(); w2(4); b = r1(); + return j44(a,b); + + case 1: w0(0x40+r); w2(1); w2(4); + a = r1(); b = r2(); w0(0xff); + return j53(a,b); + + case 2: w0(0x20+r); w2(1); w2(0x25); + a = r0(); w2(4); + return a; + + case 3: + case 4: + case 5: w3(r); w2(0x24); a = r4(); w2(4); + return a; + + } + return -1; /* never gets here */ +} + +static void epat_read_block( PIA *pi, char * buf, int count ) + +{ int k, ph, a, b; + + switch (pi->mode) { + + case 0: w0(7); w2(1); w2(3); w0(0xff); + ph = 0; + for(k=0;kmode) { + + case 0: + case 1: + case 2: w0(0x67); w2(1); w2(5); + ph = 0; + for(k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + + /* Initialize the chip */ + CPP(0); + + if (epatc8) { + CPP(0x40);CPP(0xe0); + w0(0);w2(1);w2(4); + WR(0x8,0x12);WR(0xc,0x14);WR(0x12,0x10); + WR(0xe,0xf);WR(0xf,4); + /* WR(0xe,0xa);WR(0xf,4); */ + WR(0xe,0xd);WR(0xf,0); + /* CPP(0x30); */ + } + + /* Connect to the chip */ + CPP(0xe0); + w0(0);w2(1);w2(4); /* Idle into SPP */ + if (pi->mode >= 3) { + w0(0);w2(1);w2(4);w2(0xc); + /* Request EPP */ + w0(0x40);w2(6);w2(7);w2(4);w2(0xc);w2(4); + } + + if (!epatc8) { + WR(8,0x10); WR(0xc,0x14); WR(0xa,0x38); WR(0x12,0x10); + } +} + +static void epat_disconnect (PIA *pi) +{ CPP(0x30); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static int epat_test_proto( PIA *pi, char * scratch, int verbose ) + +{ int k, j, f, cc; + int e[2] = {0,0}; + + epat_connect(pi); + cc = RR(0xd); + epat_disconnect(pi); + + epat_connect(pi); + for (j=0;j<2;j++) { + WRi(6,0xa0+j*0x10); + for (k=0;k<256;k++) { + WRi(2,k^0xaa); + WRi(3,k^0x55); + if (RRi(2) != (k^0xaa)) e[j]++; + } + } + epat_disconnect(pi); + + f = 0; + epat_connect(pi); + WR(0x13,1); WR(0x13,0); WR(0xa,0x11); + epat_read_block(pi,scratch,512); + + for (k=0;k<256;k++) { + if ((scratch[2*k] & 0xff) != k) f++; + if ((scratch[2*k+1] & 0xff) != (0xff-k)) f++; + } + epat_disconnect(pi); + + if (verbose) { + printk("%s: epat: port 0x%x, mode %d, ccr %x, test=(%d,%d,%d)\n", + pi->device,pi->port,pi->mode,cc,e[0],e[1],f); + } + + return (e[0] && e[1]) || f; +} + +static void epat_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ int ver; + char *mode_string[6] = + {"4-bit","5/3","8-bit","EPP-8","EPP-16","EPP-32"}; + + epat_connect(pi); + WR(0xa,0x38); /* read the version code */ + ver = RR(0xb); + epat_disconnect(pi); + + printk("%s: epat %s, Shuttle EPAT chip %x at 0x%x, ", + pi->device,EPAT_VERSION,ver,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol epat = { + .owner = THIS_MODULE, + .name = "epat", + .max_mode = 6, + .epp_first = 3, + .default_delay = 1, + .max_units = 1, + .write_regr = epat_write_regr, + .read_regr = epat_read_regr, + .write_block = epat_write_block, + .read_block = epat_read_block, + .connect = epat_connect, + .disconnect = epat_disconnect, + .test_proto = epat_test_proto, + .log_adapter = epat_log_adapter, +}; + +static int __init epat_init(void) +{ +#ifdef CONFIG_PARIDE_EPATC8 + epatc8 = 1; +#endif + return paride_register(&epat); +} + +static void __exit epat_exit(void) +{ + paride_unregister(&epat); +} + +MODULE_LICENSE("GPL"); +module_init(epat_init) +module_exit(epat_exit) diff --git a/kernel/drivers/block/paride/epia.c b/kernel/drivers/block/paride/epia.c new file mode 100644 index 000000000..fb0e782d0 --- /dev/null +++ b/kernel/drivers/block/paride/epia.c @@ -0,0 +1,316 @@ +/* + epia.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + epia.c is a low-level protocol driver for Shuttle Technologies + EPIA parallel to IDE adapter chip. This device is now obsolete + and has been replaced with the EPAT chip, which is supported + by epat.c, however, some devices based on EPIA are still + available. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + 1.02 GRG 1998.06.17 support older versions of EPIA + +*/ + +#define EPIA_VERSION "1.02" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +/* mode codes: 0 nybble reads on port 1, 8-bit writes + 1 5/3 reads on ports 1 & 2, 8-bit writes + 2 8-bit reads and writes + 3 8-bit EPP mode + 4 16-bit EPP + 5 32-bit EPP +*/ + +#define j44(a,b) (((a>>4)&0x0f)+(b&0xf0)) +#define j53(a,b) (((a>>3)&0x1f)+((b<<4)&0xe0)) + +/* cont = 0 IDE register file + cont = 1 IDE control registers +*/ + +static int cont_map[2] = { 0, 0x80 }; + +static int epia_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + regr += cont_map[cont]; + + switch (pi->mode) { + + case 0: r = regr^0x39; + w0(r); w2(1); w2(3); w0(r); + a = r1(); w2(1); b = r1(); w2(4); + return j44(a,b); + + case 1: r = regr^0x31; + w0(r); w2(1); w0(r&0x37); + w2(3); w2(5); w0(r|0xf0); + a = r1(); b = r2(); w2(4); + return j53(a,b); + + case 2: r = regr^0x29; + w0(r); w2(1); w2(0X21); w2(0x23); + a = r0(); w2(4); + return a; + + case 3: + case 4: + case 5: w3(regr); w2(0x24); a = r4(); w2(4); + return a; + + } + return -1; +} + +static void epia_write_regr( PIA *pi, int cont, int regr, int val) + +{ int r; + + regr += cont_map[cont]; + + switch (pi->mode) { + + case 0: + case 1: + case 2: r = regr^0x19; + w0(r); w2(1); w0(val); w2(3); w2(4); + break; + + case 3: + case 4: + case 5: r = regr^0x40; + w3(r); w4(val); w2(4); + break; + } +} + +#define WR(r,v) epia_write_regr(pi,0,r,v) +#define RR(r) (epia_read_regr(pi,0,r)) + +/* The use of register 0x84 is entirely unclear - it seems to control + some EPP counters ... currently we know about 3 different block + sizes: the standard 512 byte reads and writes, 12 byte writes and + 2048 byte reads (the last two being used in the CDrom drivers. +*/ + +static void epia_connect ( PIA *pi ) + +{ pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + + w2(4); w0(0xa0); w0(0x50); w0(0xc0); w0(0x30); w0(0xa0); w0(0); + w2(1); w2(4); + if (pi->mode >= 3) { + w0(0xa); w2(1); w2(4); w0(0x82); w2(4); w2(0xc); w2(4); + w2(0x24); w2(0x26); w2(4); + } + WR(0x86,8); +} + +static void epia_disconnect ( PIA *pi ) + +{ /* WR(0x84,0x10); */ + w0(pi->saved_r0); + w2(1); w2(4); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void epia_read_block( PIA *pi, char * buf, int count ) + +{ int k, ph, a, b; + + switch (pi->mode) { + + case 0: w0(0x81); w2(1); w2(3); w0(0xc1); + ph = 1; + for (k=0;k 512) WR(0x84,3); + w3(0); w2(0x24); + for (k=0;k 512) WR(0x84,3); + w3(0); w2(0x24); + for (k=0;k 512) WR(0x84,3); + w3(0); w2(0x24); + for (k=0;kmode) { + + case 0: + case 1: + case 2: w0(0xa1); w2(1); w2(3); w2(1); w2(5); + ph = 0; last = 0x8000; + for (k=0;kdevice,pi->port,pi->mode,e[0],e[1],f); + } + + return (e[0] && e[1]) || f; + +} + + +static void epia_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ char *mode_string[6] = {"4-bit","5/3","8-bit", + "EPP-8","EPP-16","EPP-32"}; + + printk("%s: epia %s, Shuttle EPIA at 0x%x, ", + pi->device,EPIA_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol epia = { + .owner = THIS_MODULE, + .name = "epia", + .max_mode = 6, + .epp_first = 3, + .default_delay = 1, + .max_units = 1, + .write_regr = epia_write_regr, + .read_regr = epia_read_regr, + .write_block = epia_write_block, + .read_block = epia_read_block, + .connect = epia_connect, + .disconnect = epia_disconnect, + .test_proto = epia_test_proto, + .log_adapter = epia_log_adapter, +}; + +static int __init epia_init(void) +{ + return paride_register(&epia); +} + +static void __exit epia_exit(void) +{ + paride_unregister(&epia); +} + +MODULE_LICENSE("GPL"); +module_init(epia_init) +module_exit(epia_exit) diff --git a/kernel/drivers/block/paride/fit2.c b/kernel/drivers/block/paride/fit2.c new file mode 100644 index 000000000..381283753 --- /dev/null +++ b/kernel/drivers/block/paride/fit2.c @@ -0,0 +1,151 @@ +/* + fit2.c (c) 1998 Grant R. Guenther + Under the terms of the GNU General Public License. + + fit2.c is a low-level protocol driver for the older version + of the Fidelity International Technology parallel port adapter. + This adapter is used in their TransDisk 2000 and older TransDisk + 3000 portable hard-drives. As far as I can tell, this device + supports 4-bit mode _only_. + + Newer models of the FIT products use an enhanced protocol. + The "fit3" protocol module should support current drives. + +*/ + +#define FIT2_VERSION "1.0" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0)) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set + +NB: The FIT adapter does not appear to use the control registers. +So, we map ALT_STATUS to STATUS and NO-OP writes to the device +control register - this means that IDE reset will not work on these +devices. + +*/ + +static void fit2_write_regr( PIA *pi, int cont, int regr, int val) + +{ if (cont == 1) return; + w2(0xc); w0(regr); w2(4); w0(val); w2(5); w0(0); w2(4); +} + +static int fit2_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + if (cont) { + if (regr != 6) return 0xff; + r = 7; + } else r = regr + 0x10; + + w2(0xc); w0(r); w2(4); w2(5); + w0(0); a = r1(); + w0(1); b = r1(); + w2(4); + + return j44(a,b); + +} + +static void fit2_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b, c, d; + + w2(0xc); w0(0x10); + + for (k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + w2(0xcc); +} + +static void fit2_disconnect ( PIA *pi ) + +{ w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void fit2_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ printk("%s: fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n", + pi->device,FIT2_VERSION,pi->port,pi->delay); + +} + +static struct pi_protocol fit2 = { + .owner = THIS_MODULE, + .name = "fit2", + .max_mode = 1, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = fit2_write_regr, + .read_regr = fit2_read_regr, + .write_block = fit2_write_block, + .read_block = fit2_read_block, + .connect = fit2_connect, + .disconnect = fit2_disconnect, + .log_adapter = fit2_log_adapter, +}; + +static int __init fit2_init(void) +{ + return paride_register(&fit2); +} + +static void __exit fit2_exit(void) +{ + paride_unregister(&fit2); +} + +MODULE_LICENSE("GPL"); +module_init(fit2_init) +module_exit(fit2_exit) diff --git a/kernel/drivers/block/paride/fit3.c b/kernel/drivers/block/paride/fit3.c new file mode 100644 index 000000000..275d26945 --- /dev/null +++ b/kernel/drivers/block/paride/fit3.c @@ -0,0 +1,211 @@ +/* + fit3.c (c) 1998 Grant R. Guenther + Under the terms of the GNU General Public License. + + fit3.c is a low-level protocol driver for newer models + of the Fidelity International Technology parallel port adapter. + This adapter is used in their TransDisk 3000 portable + hard-drives, as well as CD-ROM, PD-CD and other devices. + + The TD-2000 and certain older devices use a different protocol. + Try the fit2 protocol module with them. + + NB: The FIT adapters do not appear to support the control + registers. So, we map ALT_STATUS to STATUS and NO-OP writes + to the device control register - this means that IDE reset + will not work on these devices. + +*/ + +#define FIT3_VERSION "1.0" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define j44(a,b) (((a>>3)&0x0f)|((b<<1)&0xf0)) + +#define w7(byte) {out_p(7,byte);} +#define r7() (in_p(7) & 0xff) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set + +*/ + +static void fit3_write_regr( PIA *pi, int cont, int regr, int val) + +{ if (cont == 1) return; + + switch (pi->mode) { + + case 0: + case 1: w2(0xc); w0(regr); w2(0x8); w2(0xc); + w0(val); w2(0xd); + w0(0); w2(0xc); + break; + + case 2: w2(0xc); w0(regr); w2(0x8); w2(0xc); + w4(val); w4(0); + w2(0xc); + break; + + } +} + +static int fit3_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b; + + if (cont) { + if (regr != 6) return 0xff; + regr = 7; + } + + switch (pi->mode) { + + case 0: w2(0xc); w0(regr + 0x10); w2(0x8); w2(0xc); + w2(0xd); a = r1(); + w2(0xf); b = r1(); + w2(0xc); + return j44(a,b); + + case 1: w2(0xc); w0(regr + 0x90); w2(0x8); w2(0xc); + w2(0xec); w2(0xee); w2(0xef); a = r0(); + w2(0xc); + return a; + + case 2: w2(0xc); w0(regr + 0x90); w2(0x8); w2(0xc); + w2(0xec); + a = r4(); b = r4(); + w2(0xc); + return a; + + } + return -1; + +} + +static void fit3_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b, c, d; + + switch (pi->mode) { + + case 0: w2(0xc); w0(0x10); w2(0x8); w2(0xc); + for (k=0;kmode) { + + case 0: + case 1: w2(0xc); w0(0); w2(0x8); w2(0xc); + for (k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + w2(0xc); w0(0); w2(0xa); + if (pi->mode == 2) { + w2(0xc); w0(0x9); w2(0x8); w2(0xc); + } +} + +static void fit3_disconnect ( PIA *pi ) + +{ w2(0xc); w0(0xa); w2(0x8); w2(0xc); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void fit3_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ char *mode_string[3] = {"4-bit","8-bit","EPP"}; + + printk("%s: fit3 %s, FIT 3000 adapter at 0x%x, " + "mode %d (%s), delay %d\n", + pi->device,FIT3_VERSION,pi->port, + pi->mode,mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol fit3 = { + .owner = THIS_MODULE, + .name = "fit3", + .max_mode = 3, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = fit3_write_regr, + .read_regr = fit3_read_regr, + .write_block = fit3_write_block, + .read_block = fit3_read_block, + .connect = fit3_connect, + .disconnect = fit3_disconnect, + .log_adapter = fit3_log_adapter, +}; + +static int __init fit3_init(void) +{ + return paride_register(&fit3); +} + +static void __exit fit3_exit(void) +{ + paride_unregister(&fit3); +} + +MODULE_LICENSE("GPL"); +module_init(fit3_init) +module_exit(fit3_exit) diff --git a/kernel/drivers/block/paride/friq.c b/kernel/drivers/block/paride/friq.c new file mode 100644 index 000000000..4f2ba2446 --- /dev/null +++ b/kernel/drivers/block/paride/friq.c @@ -0,0 +1,276 @@ +/* + friq.c (c) 1998 Grant R. Guenther + Under the terms of the GNU General Public License + + friq.c is a low-level protocol driver for the Freecom "IQ" + parallel port IDE adapter. Early versions of this adapter + use the 'frpw' protocol. + + Freecom uses this adapter in a battery powered external + CD-ROM drive. It is also used in LS-120 drives by + Maxell and Panasonic, and other devices. + + The battery powered drive requires software support to + control the power to the drive. This module enables the + drive power when the high level driver (pcd) is loaded + and disables it when the module is unloaded. Note, if + the friq module is built in to the kernel, the power + will never be switched off, so other means should be + used to conserve battery power. + +*/ + +/* Changes: + + 1.01 GRG 1998.12.20 Added support for soft power switch +*/ + +#define FRIQ_VERSION "1.01" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define CMD(x) w2(4);w0(0xff);w0(0xff);w0(0x73);w0(0x73);\ + w0(0xc9);w0(0xc9);w0(0x26);w0(0x26);w0(x);w0(x); + +#define j44(l,h) (((l>>4)&0x0f)|(h&0xf0)) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x08, 0x10 }; + +static int friq_read_regr( PIA *pi, int cont, int regr ) + +{ int h,l,r; + + r = regr + cont_map[cont]; + + CMD(r); + w2(6); l = r1(); + w2(4); h = r1(); + w2(4); + + return j44(l,h); + +} + +static void friq_write_regr( PIA *pi, int cont, int regr, int val) + +{ int r; + + r = regr + cont_map[cont]; + + CMD(r); + w0(val); + w2(5);w2(7);w2(5);w2(4); +} + +static void friq_read_block_int( PIA *pi, char * buf, int count, int regr ) + +{ int h, l, k, ph; + + switch(pi->mode) { + + case 0: CMD(regr); + for (k=0;kmode) { + + case 0: + case 1: CMD(8); w2(5); + for (k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + w2(4); +} + +static void friq_disconnect ( PIA *pi ) + +{ CMD(0x20); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static int friq_test_proto( PIA *pi, char * scratch, int verbose ) + +{ int j, k, r; + int e[2] = {0,0}; + + pi->saved_r0 = r0(); + w0(0xff); udelay(20); CMD(0x3d); /* turn the power on */ + udelay(500); + w0(pi->saved_r0); + + friq_connect(pi); + for (j=0;j<2;j++) { + friq_write_regr(pi,0,6,0xa0+j*0x10); + for (k=0;k<256;k++) { + friq_write_regr(pi,0,2,k^0xaa); + friq_write_regr(pi,0,3,k^0x55); + if (friq_read_regr(pi,0,2) != (k^0xaa)) e[j]++; + } + } + friq_disconnect(pi); + + friq_connect(pi); + friq_read_block_int(pi,scratch,512,0x10); + r = 0; + for (k=0;k<128;k++) if (scratch[k] != k) r++; + friq_disconnect(pi); + + if (verbose) { + printk("%s: friq: port 0x%x, mode %d, test=(%d,%d,%d)\n", + pi->device,pi->port,pi->mode,e[0],e[1],r); + } + + return (r || (e[0] && e[1])); +} + + +static void friq_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ char *mode_string[6] = {"4-bit","8-bit", + "EPP-8","EPP-16","EPP-32"}; + + printk("%s: friq %s, Freecom IQ ASIC-2 adapter at 0x%x, ", pi->device, + FRIQ_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + + pi->private = 1; + friq_connect(pi); + CMD(0x9e); /* disable sleep timer */ + friq_disconnect(pi); + +} + +static void friq_release_proto( PIA *pi) +{ + if (pi->private) { /* turn off the power */ + friq_connect(pi); + CMD(0x1d); CMD(0x1e); + friq_disconnect(pi); + pi->private = 0; + } +} + +static struct pi_protocol friq = { + .owner = THIS_MODULE, + .name = "friq", + .max_mode = 5, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = friq_write_regr, + .read_regr = friq_read_regr, + .write_block = friq_write_block, + .read_block = friq_read_block, + .connect = friq_connect, + .disconnect = friq_disconnect, + .test_proto = friq_test_proto, + .log_adapter = friq_log_adapter, + .release_proto = friq_release_proto, +}; + +static int __init friq_init(void) +{ + return paride_register(&friq); +} + +static void __exit friq_exit(void) +{ + paride_unregister(&friq); +} + +MODULE_LICENSE("GPL"); +module_init(friq_init) +module_exit(friq_exit) diff --git a/kernel/drivers/block/paride/frpw.c b/kernel/drivers/block/paride/frpw.c new file mode 100644 index 000000000..c3cde3646 --- /dev/null +++ b/kernel/drivers/block/paride/frpw.c @@ -0,0 +1,313 @@ +/* + frpw.c (c) 1996-8 Grant R. Guenther + Under the terms of the GNU General Public License + + frpw.c is a low-level protocol driver for the Freecom "Power" + parallel port IDE adapter. + + Some applications of this adapter may require a "printer" reset + prior to loading the driver. This can be done by loading and + unloading the "lp" driver, or it can be done by this driver + if you define FRPW_HARD_RESET. The latter is not recommended + as it may upset devices on other ports. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + fix chip detect + added EPP-16 and EPP-32 + 1.02 GRG 1998.09.23 added hard reset to initialisation process + 1.03 GRG 1998.12.14 made hard reset conditional + +*/ + +#define FRPW_VERSION "1.03" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define cec4 w2(0xc);w2(0xe);w2(0xe);w2(0xc);w2(4);w2(4);w2(4); +#define j44(l,h) (((l>>4)&0x0f)|(h&0xf0)) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x08, 0x10 }; + +static int frpw_read_regr( PIA *pi, int cont, int regr ) + +{ int h,l,r; + + r = regr + cont_map[cont]; + + w2(4); + w0(r); cec4; + w2(6); l = r1(); + w2(4); h = r1(); + w2(4); + + return j44(l,h); + +} + +static void frpw_write_regr( PIA *pi, int cont, int regr, int val) + +{ int r; + + r = regr + cont_map[cont]; + + w2(4); w0(r); cec4; + w0(val); + w2(5);w2(7);w2(5);w2(4); +} + +static void frpw_read_block_int( PIA *pi, char * buf, int count, int regr ) + +{ int h, l, k, ph; + + switch(pi->mode) { + + case 0: w2(4); w0(regr); cec4; + for (k=0;kmode) { + + case 0: + case 1: + case 2: w2(4); w0(8); cec4; w2(5); + for (k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + w2(4); +} + +static void frpw_disconnect ( PIA *pi ) + +{ w2(4); w0(0x20); cec4; + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +/* Stub logic to see if PNP string is available - used to distinguish + between the Xilinx and ASIC implementations of the Freecom adapter. +*/ + +static int frpw_test_pnp ( PIA *pi ) + +/* returns chip_type: 0 = Xilinx, 1 = ASIC */ + +{ int olddelay, a, b; + +#ifdef FRPW_HARD_RESET + w0(0); w2(8); udelay(50); w2(0xc); /* parallel bus reset */ + mdelay(1500); +#endif + + olddelay = pi->delay; + pi->delay = 10; + + pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + + w2(4); w0(4); w2(6); w2(7); + a = r1() & 0xff; w2(4); b = r1() & 0xff; + w2(0xc); w2(0xe); w2(4); + + pi->delay = olddelay; + w0(pi->saved_r0); + w2(pi->saved_r2); + + return ((~a&0x40) && (b&0x40)); +} + +/* We use the pi->private to remember the result of the PNP test. + To make this work, private = port*2 + chip. Yes, I know it's + a hack :-( +*/ + +static int frpw_test_proto( PIA *pi, char * scratch, int verbose ) + +{ int j, k, r; + int e[2] = {0,0}; + + if ((pi->private>>1) != pi->port) + pi->private = frpw_test_pnp(pi) + 2*pi->port; + + if (((pi->private%2) == 0) && (pi->mode > 2)) { + if (verbose) + printk("%s: frpw: Xilinx does not support mode %d\n", + pi->device, pi->mode); + return 1; + } + + if (((pi->private%2) == 1) && (pi->mode == 2)) { + if (verbose) + printk("%s: frpw: ASIC does not support mode 2\n", + pi->device); + return 1; + } + + frpw_connect(pi); + for (j=0;j<2;j++) { + frpw_write_regr(pi,0,6,0xa0+j*0x10); + for (k=0;k<256;k++) { + frpw_write_regr(pi,0,2,k^0xaa); + frpw_write_regr(pi,0,3,k^0x55); + if (frpw_read_regr(pi,0,2) != (k^0xaa)) e[j]++; + } + } + frpw_disconnect(pi); + + frpw_connect(pi); + frpw_read_block_int(pi,scratch,512,0x10); + r = 0; + for (k=0;k<128;k++) if (scratch[k] != k) r++; + frpw_disconnect(pi); + + if (verbose) { + printk("%s: frpw: port 0x%x, chip %ld, mode %d, test=(%d,%d,%d)\n", + pi->device,pi->port,(pi->private%2),pi->mode,e[0],e[1],r); + } + + return (r || (e[0] && e[1])); +} + + +static void frpw_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ char *mode_string[6] = {"4-bit","8-bit","EPP", + "EPP-8","EPP-16","EPP-32"}; + + printk("%s: frpw %s, Freecom (%s) adapter at 0x%x, ", pi->device, + FRPW_VERSION,((pi->private%2) == 0)?"Xilinx":"ASIC",pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol frpw = { + .owner = THIS_MODULE, + .name = "frpw", + .max_mode = 6, + .epp_first = 2, + .default_delay = 2, + .max_units = 1, + .write_regr = frpw_write_regr, + .read_regr = frpw_read_regr, + .write_block = frpw_write_block, + .read_block = frpw_read_block, + .connect = frpw_connect, + .disconnect = frpw_disconnect, + .test_proto = frpw_test_proto, + .log_adapter = frpw_log_adapter, +}; + +static int __init frpw_init(void) +{ + return paride_register(&frpw); +} + +static void __exit frpw_exit(void) +{ + paride_unregister(&frpw); +} + +MODULE_LICENSE("GPL"); +module_init(frpw_init) +module_exit(frpw_exit) diff --git a/kernel/drivers/block/paride/kbic.c b/kernel/drivers/block/paride/kbic.c new file mode 100644 index 000000000..35999c415 --- /dev/null +++ b/kernel/drivers/block/paride/kbic.c @@ -0,0 +1,305 @@ +/* + kbic.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is a low-level driver for the KBIC-951A and KBIC-971A + parallel to IDE adapter chips from KingByte Information Systems. + + The chips are almost identical, however, the wakeup code + required for the 971A interferes with the correct operation of + the 951A, so this driver registers itself twice, once for + each chip. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + +*/ + +#define KBIC_VERSION "1.01" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define r12w() (delay_p,inw(pi->port+1)&0xffff) + +#define j44(a,b) ((((a>>4)&0x0f)|(b&0xf0))^0x88) +#define j53(w) (((w>>3)&0x1f)|((w>>4)&0xe0)) + + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x80, 0x40 }; + +static int kbic_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, s; + + s = cont_map[cont]; + + switch (pi->mode) { + + case 0: w0(regr|0x18|s); w2(4); w2(6); w2(4); w2(1); w0(8); + a = r1(); w0(0x28); b = r1(); w2(4); + return j44(a,b); + + case 1: w0(regr|0x38|s); w2(4); w2(6); w2(4); w2(5); w0(8); + a = r12w(); w2(4); + return j53(a); + + case 2: w0(regr|0x08|s); w2(4); w2(6); w2(4); w2(0xa5); w2(0xa1); + a = r0(); w2(4); + return a; + + case 3: + case 4: + case 5: w0(0x20|s); w2(4); w2(6); w2(4); w3(regr); + a = r4(); b = r4(); w2(4); w2(0); w2(4); + return a; + + } + return -1; +} + +static void kbic_write_regr( PIA *pi, int cont, int regr, int val) + +{ int s; + + s = cont_map[cont]; + + switch (pi->mode) { + + case 0: + case 1: + case 2: w0(regr|0x10|s); w2(4); w2(6); w2(4); + w0(val); w2(5); w2(4); + break; + + case 3: + case 4: + case 5: w0(0x20|s); w2(4); w2(6); w2(4); w3(regr); + w4(val); w4(val); + w2(4); w2(0); w2(4); + break; + + } +} + +static void k951_connect ( PIA *pi ) + +{ pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + w2(4); +} + +static void k951_disconnect ( PIA *pi ) + +{ w0(pi->saved_r0); + w2(pi->saved_r2); +} + +#define CCP(x) w2(0xc4);w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);\ + w0(0x78);w0(x);w2(0xc5);w2(0xc4);w0(0xff); + +static void k971_connect ( PIA *pi ) + +{ pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + CCP(0x20); + w2(4); +} + +static void k971_disconnect ( PIA *pi ) + +{ CCP(0x30); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +/* counts must be congruent to 0 MOD 4, but all known applications + have this property. +*/ + +static void kbic_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b; + + switch (pi->mode) { + + case 0: w0(0x98); w2(4); w2(6); w2(4); + for (k=0;kmode) { + + case 0: + case 1: + case 2: w0(0x90); w2(4); w2(6); w2(4); + for(k=0;kdevice,KBIC_VERSION,chip,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static void k951_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ kbic_log_adapter(pi,scratch,verbose,"KBIC-951A"); +} + +static void k971_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ kbic_log_adapter(pi,scratch,verbose,"KBIC-971A"); +} + +static struct pi_protocol k951 = { + .owner = THIS_MODULE, + .name = "k951", + .max_mode = 6, + .epp_first = 3, + .default_delay = 1, + .max_units = 1, + .write_regr = kbic_write_regr, + .read_regr = kbic_read_regr, + .write_block = kbic_write_block, + .read_block = kbic_read_block, + .connect = k951_connect, + .disconnect = k951_disconnect, + .log_adapter = k951_log_adapter, +}; + +static struct pi_protocol k971 = { + .owner = THIS_MODULE, + .name = "k971", + .max_mode = 6, + .epp_first = 3, + .default_delay = 1, + .max_units = 1, + .write_regr = kbic_write_regr, + .read_regr = kbic_read_regr, + .write_block = kbic_write_block, + .read_block = kbic_read_block, + .connect = k971_connect, + .disconnect = k971_disconnect, + .log_adapter = k971_log_adapter, +}; + +static int __init kbic_init(void) +{ + int rv; + + rv = paride_register(&k951); + if (rv < 0) + return rv; + rv = paride_register(&k971); + if (rv < 0) + paride_unregister(&k951); + return rv; +} + +static void __exit kbic_exit(void) +{ + paride_unregister(&k951); + paride_unregister(&k971); +} + +MODULE_LICENSE("GPL"); +module_init(kbic_init) +module_exit(kbic_exit) diff --git a/kernel/drivers/block/paride/ktti.c b/kernel/drivers/block/paride/ktti.c new file mode 100644 index 000000000..117ab0e8c --- /dev/null +++ b/kernel/drivers/block/paride/ktti.c @@ -0,0 +1,128 @@ +/* + ktti.c (c) 1998 Grant R. Guenther + Under the terms of the GNU General Public License. + + ktti.c is a low-level protocol driver for the KT Technology + parallel port adapter. This adapter is used in the "PHd" + portable hard-drives. As far as I can tell, this device + supports 4-bit mode _only_. + +*/ + +#define KTTI_VERSION "1.0" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0)) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int cont_map[2] = { 0x10, 0x08 }; + +static void ktti_write_regr( PIA *pi, int cont, int regr, int val) + +{ int r; + + r = regr + cont_map[cont]; + + w0(r); w2(0xb); w2(0xa); w2(3); w2(6); + w0(val); w2(3); w0(0); w2(6); w2(0xb); +} + +static int ktti_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + r = regr + cont_map[cont]; + + w0(r); w2(0xb); w2(0xa); w2(9); w2(0xc); w2(9); + a = r1(); w2(0xc); b = r1(); w2(9); w2(0xc); w2(9); + return j44(a,b); + +} + +static void ktti_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b; + + for (k=0;ksaved_r0 = r0(); + pi->saved_r2 = r2(); + w2(0xb); w2(0xa); w0(0); w2(3); w2(6); +} + +static void ktti_disconnect ( PIA *pi ) + +{ w2(0xb); w2(0xa); w0(0xa0); w2(3); w2(4); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void ktti_log_adapter( PIA *pi, char * scratch, int verbose ) + +{ printk("%s: ktti %s, KT adapter at 0x%x, delay %d\n", + pi->device,KTTI_VERSION,pi->port,pi->delay); + +} + +static struct pi_protocol ktti = { + .owner = THIS_MODULE, + .name = "ktti", + .max_mode = 1, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = ktti_write_regr, + .read_regr = ktti_read_regr, + .write_block = ktti_write_block, + .read_block = ktti_read_block, + .connect = ktti_connect, + .disconnect = ktti_disconnect, + .log_adapter = ktti_log_adapter, +}; + +static int __init ktti_init(void) +{ + return paride_register(&ktti); +} + +static void __exit ktti_exit(void) +{ + paride_unregister(&ktti); +} + +MODULE_LICENSE("GPL"); +module_init(ktti_init) +module_exit(ktti_exit) diff --git a/kernel/drivers/block/paride/mkd b/kernel/drivers/block/paride/mkd new file mode 100644 index 000000000..971f099b4 --- /dev/null +++ b/kernel/drivers/block/paride/mkd @@ -0,0 +1,30 @@ +#!/bin/bash +# +# mkd -- a script to create the device special files for the PARIDE subsystem +# +# block devices: pd (45), pcd (46), pf (47) +# character devices: pt (96), pg (97) +# +function mkdev { + mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1 +} +# +function pd { + D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) ) + mkdev pd$D b 45 $[ $1 * 16 ] + for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + do mkdev pd$D$P b 45 $[ $1 * 16 + $P ] + done +} +# +cd /dev +# +for u in 0 1 2 3 ; do pd $u ; done +for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done +for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done +for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done +for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done +for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done +# +# end of mkd + diff --git a/kernel/drivers/block/paride/on20.c b/kernel/drivers/block/paride/on20.c new file mode 100644 index 000000000..0173697a1 --- /dev/null +++ b/kernel/drivers/block/paride/on20.c @@ -0,0 +1,153 @@ +/* + on20.c (c) 1996-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + on20.c is a low-level protocol driver for the + Onspec 90c20 parallel to IDE adapter. +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + +*/ + +#define ON20_VERSION "1.01" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +#define op(f) w2(4);w0(f);w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4); +#define vl(v) w2(4);w0(v);w2(5);w2(7);w2(5);w2(4); + +#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0)) + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int on20_read_regr( PIA *pi, int cont, int regr ) + +{ int h,l, r ; + + r = (regr<<2) + 1 + cont; + + op(1); vl(r); op(0); + + switch (pi->mode) { + + case 0: w2(4); w2(6); l = r1(); + w2(4); w2(6); h = r1(); + w2(4); w2(6); w2(4); w2(6); w2(4); + return j44(l,h); + + case 1: w2(4); w2(0x26); r = r0(); + w2(4); w2(0x26); w2(4); + return r; + + } + return -1; +} + +static void on20_write_regr( PIA *pi, int cont, int regr, int val ) + +{ int r; + + r = (regr<<2) + 1 + cont; + + op(1); vl(r); + op(0); vl(val); + op(0); vl(val); +} + +static void on20_connect ( PIA *pi) + +{ pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + + w2(4);w0(0);w2(0xc);w2(4);w2(6);w2(4);w2(6);w2(4); + if (pi->mode) { op(2); vl(8); op(2); vl(9); } + else { op(2); vl(0); op(2); vl(8); } +} + +static void on20_disconnect ( PIA *pi ) + +{ w2(4);w0(7);w2(4);w2(0xc);w2(4); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +static void on20_read_block( PIA *pi, char * buf, int count ) + +{ int k, l, h; + + op(1); vl(1); op(0); + + for (k=0;kmode) { + w2(4); w2(0x26); buf[k] = r0(); + } else { + w2(6); l = r1(); w2(4); + w2(6); h = r1(); w2(4); + buf[k] = j44(l,h); + } + w2(4); +} + +static void on20_write_block( PIA *pi, char * buf, int count ) + +{ int k; + + op(1); vl(1); op(0); + + for (k=0;kdevice,ON20_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol on20 = { + .owner = THIS_MODULE, + .name = "on20", + .max_mode = 2, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = on20_write_regr, + .read_regr = on20_read_regr, + .write_block = on20_write_block, + .read_block = on20_read_block, + .connect = on20_connect, + .disconnect = on20_disconnect, + .log_adapter = on20_log_adapter, +}; + +static int __init on20_init(void) +{ + return paride_register(&on20); +} + +static void __exit on20_exit(void) +{ + paride_unregister(&on20); +} + +MODULE_LICENSE("GPL"); +module_init(on20_init) +module_exit(on20_exit) diff --git a/kernel/drivers/block/paride/on26.c b/kernel/drivers/block/paride/on26.c new file mode 100644 index 000000000..95ba25692 --- /dev/null +++ b/kernel/drivers/block/paride/on26.c @@ -0,0 +1,319 @@ +/* + on26.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + on26.c is a low-level protocol driver for the + OnSpec 90c26 parallel to IDE adapter chip. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 init_proto, release_proto + 1.02 GRG 1998.09.23 updates for the -E rev chip + 1.03 GRG 1998.12.14 fix for slave drives + 1.04 GRG 1998.12.20 yet another bug fix + +*/ + +#define ON26_VERSION "1.04" + +#include +#include +#include +#include +#include +#include +#include + +#include "paride.h" + +/* mode codes: 0 nybble reads, 8-bit writes + 1 8-bit reads and writes + 2 8-bit EPP mode + 3 EPP-16 + 4 EPP-32 +*/ + +#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0)) + +#define P1 w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4); +#define P2 w2(5);w2(7);w2(5);w2(4); + +/* cont = 0 - access the IDE register file + cont = 1 - access the IDE command set +*/ + +static int on26_read_regr( PIA *pi, int cont, int regr ) + +{ int a, b, r; + + r = (regr<<2) + 1 + cont; + + switch (pi->mode) { + + case 0: w0(1); P1; w0(r); P2; w0(0); P1; + w2(6); a = r1(); w2(4); + w2(6); b = r1(); w2(4); + w2(6); w2(4); w2(6); w2(4); + return j44(a,b); + + case 1: w0(1); P1; w0(r); P2; w0(0); P1; + w2(0x26); a = r0(); w2(4); w2(0x26); w2(4); + return a; + + case 2: + case 3: + case 4: w3(1); w3(1); w2(5); w4(r); w2(4); + w3(0); w3(0); w2(0x24); a = r4(); w2(4); + w2(0x24); (void)r4(); w2(4); + return a; + + } + return -1; +} + +static void on26_write_regr( PIA *pi, int cont, int regr, int val ) + +{ int r; + + r = (regr<<2) + 1 + cont; + + switch (pi->mode) { + + case 0: + case 1: w0(1); P1; w0(r); P2; w0(0); P1; + w0(val); P2; w0(val); P2; + break; + + case 2: + case 3: + case 4: w3(1); w3(1); w2(5); w4(r); w2(4); + w3(0); w3(0); + w2(5); w4(val); w2(4); + w2(5); w4(val); w2(4); + break; + } +} + +#define CCP(x) w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff);\ + w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff); + +static void on26_connect ( PIA *pi ) + +{ int x; + + pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + + CCP(0x20); + x = 8; if (pi->mode) x = 9; + + w0(2); P1; w0(8); P2; + w0(2); P1; w0(x); P2; +} + +static void on26_disconnect ( PIA *pi ) + +{ if (pi->mode >= 2) { w3(4); w3(4); w3(4); w3(4); } + else { w0(4); P1; w0(4); P1; } + CCP(0x30); + w0(pi->saved_r0); + w2(pi->saved_r2); +} + +#define RESET_WAIT 200 + +static int on26_test_port( PIA *pi) /* hard reset */ + +{ int i, m, d, x=0, y=0; + + pi->saved_r0 = r0(); + pi->saved_r2 = r2(); + + d = pi->delay; + m = pi->mode; + pi->delay = 5; + pi->mode = 0; + + w2(0xc); + + CCP(0x30); CCP(0); + + w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff); + i = ((r1() & 0xf0) << 4); w0(0x87); + i |= (r1() & 0xf0); w0(0x78); + w0(0x20);w2(4);w2(5); + i |= ((r1() & 0xf0) >> 4); + w2(4);w0(0xff); + + if (i == 0xb5f) { + + w0(2); P1; w0(0); P2; + w0(3); P1; w0(0); P2; + w0(2); P1; w0(8); P2; udelay(100); + w0(2); P1; w0(0xa); P2; udelay(100); + w0(2); P1; w0(8); P2; udelay(1000); + + on26_write_regr(pi,0,6,0xa0); + + for (i=0;idelay = d; + pi->mode = m; + w0(pi->saved_r0); + w2(pi->saved_r2); + + return 5; +} + + +static void on26_read_block( PIA *pi, char * buf, int count ) + +{ int k, a, b; + + switch (pi->mode) { + + case 0: w0(1); P1; w0(1); P2; w0(2); P1; w0(0x18); P2; w0(0); P1; + udelay(10); + for (k=0;kmode) { + + case 0: + case 1: w0(1); P1; w0(1); P2; + w0(2); P1; w0(0x18+pi->mode); P2; w0(0); P1; + udelay(10); + for (k=0;kmode); P2; + break; + + case 2: w3(1); w3(1); w2(5); w4(1); w2(4); + w3(0); w3(0); w2(0xc5); + udelay(10); + for (k=0;kdevice,ON26_VERSION,pi->port); + printk("mode %d (%s), delay %d\n",pi->mode, + mode_string[pi->mode],pi->delay); + +} + +static struct pi_protocol on26 = { + .owner = THIS_MODULE, + .name = "on26", + .max_mode = 5, + .epp_first = 2, + .default_delay = 1, + .max_units = 1, + .write_regr = on26_write_regr, + .read_regr = on26_read_regr, + .write_block = on26_write_block, + .read_block = on26_read_block, + .connect = on26_connect, + .disconnect = on26_disconnect, + .test_port = on26_test_port, + .log_adapter = on26_log_adapter, +}; + +static int __init on26_init(void) +{ + return paride_register(&on26); +} + +static void __exit on26_exit(void) +{ + paride_unregister(&on26); +} + +MODULE_LICENSE("GPL"); +module_init(on26_init) +module_exit(on26_exit) diff --git a/kernel/drivers/block/paride/paride.c b/kernel/drivers/block/paride/paride.c new file mode 100644 index 000000000..48c50f11f --- /dev/null +++ b/kernel/drivers/block/paride/paride.c @@ -0,0 +1,434 @@ +/* + paride.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is the base module for the family of device drivers + that support parallel port IDE devices. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.03 Use spinlocks + 1.02 GRG 1998.05.05 init_proto, release_proto, ktti + 1.03 GRG 1998.08.15 eliminate compiler warning + 1.04 GRG 1998.11.28 added support for FRIQ + 1.05 TMW 2000.06.06 use parport_find_number instead of + parport_enumerate + 1.06 TMW 2001.03.26 more sane parport-or-not resource management +*/ + +#define PI_VERSION "1.06" + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* TASK_* */ +#include + +#include "paride.h" + +MODULE_LICENSE("GPL"); + +#define MAX_PROTOS 32 + +static struct pi_protocol *protocols[MAX_PROTOS]; + +static DEFINE_SPINLOCK(pi_spinlock); + +void pi_write_regr(PIA * pi, int cont, int regr, int val) +{ + pi->proto->write_regr(pi, cont, regr, val); +} + +EXPORT_SYMBOL(pi_write_regr); + +int pi_read_regr(PIA * pi, int cont, int regr) +{ + return pi->proto->read_regr(pi, cont, regr); +} + +EXPORT_SYMBOL(pi_read_regr); + +void pi_write_block(PIA * pi, char *buf, int count) +{ + pi->proto->write_block(pi, buf, count); +} + +EXPORT_SYMBOL(pi_write_block); + +void pi_read_block(PIA * pi, char *buf, int count) +{ + pi->proto->read_block(pi, buf, count); +} + +EXPORT_SYMBOL(pi_read_block); + +static void pi_wake_up(void *p) +{ + PIA *pi = (PIA *) p; + unsigned long flags; + void (*cont) (void) = NULL; + + spin_lock_irqsave(&pi_spinlock, flags); + + if (pi->claim_cont && !parport_claim(pi->pardev)) { + cont = pi->claim_cont; + pi->claim_cont = NULL; + pi->claimed = 1; + } + + spin_unlock_irqrestore(&pi_spinlock, flags); + + wake_up(&(pi->parq)); + + if (cont) + cont(); +} + +int pi_schedule_claimed(PIA * pi, void (*cont) (void)) +{ + unsigned long flags; + + spin_lock_irqsave(&pi_spinlock, flags); + if (pi->pardev && parport_claim(pi->pardev)) { + pi->claim_cont = cont; + spin_unlock_irqrestore(&pi_spinlock, flags); + return 0; + } + pi->claimed = 1; + spin_unlock_irqrestore(&pi_spinlock, flags); + return 1; +} +EXPORT_SYMBOL(pi_schedule_claimed); + +void pi_do_claimed(PIA * pi, void (*cont) (void)) +{ + if (pi_schedule_claimed(pi, cont)) + cont(); +} + +EXPORT_SYMBOL(pi_do_claimed); + +static void pi_claim(PIA * pi) +{ + if (pi->claimed) + return; + pi->claimed = 1; + if (pi->pardev) + wait_event(pi->parq, + !parport_claim((struct pardevice *) pi->pardev)); +} + +static void pi_unclaim(PIA * pi) +{ + pi->claimed = 0; + if (pi->pardev) + parport_release((struct pardevice *) (pi->pardev)); +} + +void pi_connect(PIA * pi) +{ + pi_claim(pi); + pi->proto->connect(pi); +} + +EXPORT_SYMBOL(pi_connect); + +void pi_disconnect(PIA * pi) +{ + pi->proto->disconnect(pi); + pi_unclaim(pi); +} + +EXPORT_SYMBOL(pi_disconnect); + +static void pi_unregister_parport(PIA * pi) +{ + if (pi->pardev) { + parport_unregister_device((struct pardevice *) (pi->pardev)); + pi->pardev = NULL; + } +} + +void pi_release(PIA * pi) +{ + pi_unregister_parport(pi); + if (pi->proto->release_proto) + pi->proto->release_proto(pi); + module_put(pi->proto->owner); +} + +EXPORT_SYMBOL(pi_release); + +static int default_test_proto(PIA * pi, char *scratch, int verbose) +{ + int j, k; + int e[2] = { 0, 0 }; + + pi->proto->connect(pi); + + for (j = 0; j < 2; j++) { + pi_write_regr(pi, 0, 6, 0xa0 + j * 0x10); + for (k = 0; k < 256; k++) { + pi_write_regr(pi, 0, 2, k ^ 0xaa); + pi_write_regr(pi, 0, 3, k ^ 0x55); + if (pi_read_regr(pi, 0, 2) != (k ^ 0xaa)) + e[j]++; + } + } + pi->proto->disconnect(pi); + + if (verbose) + printk("%s: %s: port 0x%x, mode %d, test=(%d,%d)\n", + pi->device, pi->proto->name, pi->port, + pi->mode, e[0], e[1]); + + return (e[0] && e[1]); /* not here if both > 0 */ +} + +static int pi_test_proto(PIA * pi, char *scratch, int verbose) +{ + int res; + + pi_claim(pi); + if (pi->proto->test_proto) + res = pi->proto->test_proto(pi, scratch, verbose); + else + res = default_test_proto(pi, scratch, verbose); + pi_unclaim(pi); + + return res; +} + +int paride_register(PIP * pr) +{ + int k; + + for (k = 0; k < MAX_PROTOS; k++) + if (protocols[k] && !strcmp(pr->name, protocols[k]->name)) { + printk("paride: %s protocol already registered\n", + pr->name); + return -1; + } + k = 0; + while ((k < MAX_PROTOS) && (protocols[k])) + k++; + if (k == MAX_PROTOS) { + printk("paride: protocol table full\n"); + return -1; + } + protocols[k] = pr; + pr->index = k; + printk("paride: %s registered as protocol %d\n", pr->name, k); + return 0; +} + +EXPORT_SYMBOL(paride_register); + +void paride_unregister(PIP * pr) +{ + if (!pr) + return; + if (protocols[pr->index] != pr) { + printk("paride: %s not registered\n", pr->name); + return; + } + protocols[pr->index] = NULL; +} + +EXPORT_SYMBOL(paride_unregister); + +static int pi_register_parport(PIA * pi, int verbose) +{ + struct parport *port; + + port = parport_find_base(pi->port); + if (!port) + return 0; + + pi->pardev = parport_register_device(port, + pi->device, NULL, + pi_wake_up, NULL, 0, (void *) pi); + parport_put_port(port); + if (!pi->pardev) + return 0; + + init_waitqueue_head(&pi->parq); + + if (verbose) + printk("%s: 0x%x is %s\n", pi->device, pi->port, port->name); + + pi->parname = (char *) port->name; + + return 1; +} + +static int pi_probe_mode(PIA * pi, int max, char *scratch, int verbose) +{ + int best, range; + + if (pi->mode != -1) { + if (pi->mode >= max) + return 0; + range = 3; + if (pi->mode >= pi->proto->epp_first) + range = 8; + if ((range == 8) && (pi->port % 8)) + return 0; + pi->reserved = range; + return (!pi_test_proto(pi, scratch, verbose)); + } + best = -1; + for (pi->mode = 0; pi->mode < max; pi->mode++) { + range = 3; + if (pi->mode >= pi->proto->epp_first) + range = 8; + if ((range == 8) && (pi->port % 8)) + break; + pi->reserved = range; + if (!pi_test_proto(pi, scratch, verbose)) + best = pi->mode; + } + pi->mode = best; + return (best > -1); +} + +static int pi_probe_unit(PIA * pi, int unit, char *scratch, int verbose) +{ + int max, s, e; + + s = unit; + e = s + 1; + + if (s == -1) { + s = 0; + e = pi->proto->max_units; + } + + if (!pi_register_parport(pi, verbose)) + return 0; + + if (pi->proto->test_port) { + pi_claim(pi); + max = pi->proto->test_port(pi); + pi_unclaim(pi); + } else + max = pi->proto->max_mode; + + if (pi->proto->probe_unit) { + pi_claim(pi); + for (pi->unit = s; pi->unit < e; pi->unit++) + if (pi->proto->probe_unit(pi)) { + pi_unclaim(pi); + if (pi_probe_mode(pi, max, scratch, verbose)) + return 1; + pi_unregister_parport(pi); + return 0; + } + pi_unclaim(pi); + pi_unregister_parport(pi); + return 0; + } + + if (!pi_probe_mode(pi, max, scratch, verbose)) { + pi_unregister_parport(pi); + return 0; + } + return 1; + +} + +int pi_init(PIA * pi, int autoprobe, int port, int mode, + int unit, int protocol, int delay, char *scratch, + int devtype, int verbose, char *device) +{ + int p, k, s, e; + int lpts[7] = { 0x3bc, 0x378, 0x278, 0x268, 0x27c, 0x26c, 0 }; + + s = protocol; + e = s + 1; + + if (!protocols[0]) + request_module("paride_protocol"); + + if (autoprobe) { + s = 0; + e = MAX_PROTOS; + } else if ((s < 0) || (s >= MAX_PROTOS) || (port <= 0) || + (!protocols[s]) || (unit < 0) || + (unit >= protocols[s]->max_units)) { + printk("%s: Invalid parameters\n", device); + return 0; + } + + for (p = s; p < e; p++) { + struct pi_protocol *proto = protocols[p]; + if (!proto) + continue; + /* still racy */ + if (!try_module_get(proto->owner)) + continue; + pi->proto = proto; + pi->private = 0; + if (proto->init_proto && proto->init_proto(pi) < 0) { + pi->proto = NULL; + module_put(proto->owner); + continue; + } + if (delay == -1) + pi->delay = pi->proto->default_delay; + else + pi->delay = delay; + pi->devtype = devtype; + pi->device = device; + + pi->parname = NULL; + pi->pardev = NULL; + init_waitqueue_head(&pi->parq); + pi->claimed = 0; + pi->claim_cont = NULL; + + pi->mode = mode; + if (port != -1) { + pi->port = port; + if (pi_probe_unit(pi, unit, scratch, verbose)) + break; + pi->port = 0; + } else { + k = 0; + while ((pi->port = lpts[k++])) + if (pi_probe_unit + (pi, unit, scratch, verbose)) + break; + if (pi->port) + break; + } + if (pi->proto->release_proto) + pi->proto->release_proto(pi); + module_put(proto->owner); + } + + if (!pi->port) { + if (autoprobe) + printk("%s: Autoprobe failed\n", device); + else + printk("%s: Adapter not found\n", device); + return 0; + } + + if (pi->parname) + printk("%s: Sharing %s at 0x%x\n", pi->device, + pi->parname, pi->port); + + pi->proto->log_adapter(pi, scratch, verbose); + + return 1; +} + +EXPORT_SYMBOL(pi_init); diff --git a/kernel/drivers/block/paride/paride.h b/kernel/drivers/block/paride/paride.h new file mode 100644 index 000000000..2bddbf455 --- /dev/null +++ b/kernel/drivers/block/paride/paride.h @@ -0,0 +1,170 @@ +#ifndef __DRIVERS_PARIDE_H__ +#define __DRIVERS_PARIDE_H__ + +/* + paride.h (c) 1997-8 Grant R. Guenther + Under the terms of the GPL. + + This file defines the interface between the high-level parallel + IDE device drivers (pd, pf, pcd, pt) and the adapter chips. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.05 init_proto, release_proto +*/ + +#define PARIDE_H_VERSION "1.01" + +/* Some adapters need to know what kind of device they are in + + Values for devtype: +*/ + +#define PI_PD 0 /* IDE disk */ +#define PI_PCD 1 /* ATAPI CDrom */ +#define PI_PF 2 /* ATAPI disk */ +#define PI_PT 3 /* ATAPI tape */ +#define PI_PG 4 /* ATAPI generic */ + +/* The paride module contains no state, instead the drivers allocate + a pi_adapter data structure and pass it to paride in every operation. + +*/ + +struct pi_adapter { + + struct pi_protocol *proto; /* adapter protocol */ + int port; /* base address of parallel port */ + int mode; /* transfer mode in use */ + int delay; /* adapter delay setting */ + int devtype; /* device type: PI_PD etc. */ + char *device; /* name of driver */ + int unit; /* unit number for chained adapters */ + int saved_r0; /* saved port state */ + int saved_r2; /* saved port state */ + int reserved; /* number of ports reserved */ + unsigned long private; /* for protocol module */ + + wait_queue_head_t parq; /* semaphore for parport sharing */ + void *pardev; /* pointer to pardevice */ + char *parname; /* parport name */ + int claimed; /* parport has already been claimed */ + void (*claim_cont)(void); /* continuation for parport wait */ +}; + +typedef struct pi_adapter PIA; + +/* functions exported by paride to the high level drivers */ + +extern int pi_init(PIA *pi, + int autoprobe, /* 1 to autoprobe */ + int port, /* base port address */ + int mode, /* -1 for autoprobe */ + int unit, /* unit number, if supported */ + int protocol, /* protocol to use */ + int delay, /* -1 to use adapter specific default */ + char * scratch, /* address of 512 byte buffer */ + int devtype, /* device type: PI_PD, PI_PCD, etc ... */ + int verbose, /* log verbose data while probing */ + char *device /* name of the driver */ + ); /* returns 0 on failure, 1 on success */ + +extern void pi_release(PIA *pi); + +/* registers are addressed as (cont,regr) + + cont: 0 for command register file, 1 for control register(s) + regr: 0-7 for register number. + +*/ + +extern void pi_write_regr(PIA *pi, int cont, int regr, int val); + +extern int pi_read_regr(PIA *pi, int cont, int regr); + +extern void pi_write_block(PIA *pi, char * buf, int count); + +extern void pi_read_block(PIA *pi, char * buf, int count); + +extern void pi_connect(PIA *pi); + +extern void pi_disconnect(PIA *pi); + +extern void pi_do_claimed(PIA *pi, void (*cont)(void)); +extern int pi_schedule_claimed(PIA *pi, void (*cont)(void)); + +/* macros and functions exported to the protocol modules */ + +#define delay_p (pi->delay?udelay(pi->delay):(void)0) +#define out_p(offs,byte) outb(byte,pi->port+offs); delay_p; +#define in_p(offs) (delay_p,inb(pi->port+offs)) + +#define w0(byte) {out_p(0,byte);} +#define r0() (in_p(0) & 0xff) +#define w1(byte) {out_p(1,byte);} +#define r1() (in_p(1) & 0xff) +#define w2(byte) {out_p(2,byte);} +#define r2() (in_p(2) & 0xff) +#define w3(byte) {out_p(3,byte);} +#define w4(byte) {out_p(4,byte);} +#define r4() (in_p(4) & 0xff) +#define w4w(data) {outw(data,pi->port+4); delay_p;} +#define w4l(data) {outl(data,pi->port+4); delay_p;} +#define r4w() (delay_p,inw(pi->port+4)&0xffff) +#define r4l() (delay_p,inl(pi->port+4)&0xffffffff) + +static inline u16 pi_swab16( char *b, int k) + +{ union { u16 u; char t[2]; } r; + + r.t[0]=b[2*k+1]; r.t[1]=b[2*k]; + return r.u; +} + +static inline u32 pi_swab32( char *b, int k) + +{ union { u32 u; char f[4]; } r; + + r.f[0]=b[4*k+1]; r.f[1]=b[4*k]; + r.f[2]=b[4*k+3]; r.f[3]=b[4*k+2]; + return r.u; +} + +struct pi_protocol { + + char name[8]; /* name for this protocol */ + int index; /* index into protocol table */ + + int max_mode; /* max mode number */ + int epp_first; /* modes >= this use 8 ports */ + + int default_delay; /* delay parameter if not specified */ + int max_units; /* max chained units probed for */ + + void (*write_regr)(PIA *,int,int,int); + int (*read_regr)(PIA *,int,int); + void (*write_block)(PIA *,char *,int); + void (*read_block)(PIA *,char *,int); + + void (*connect)(PIA *); + void (*disconnect)(PIA *); + + int (*test_port)(PIA *); + int (*probe_unit)(PIA *); + int (*test_proto)(PIA *,char *,int); + void (*log_adapter)(PIA *,char *,int); + + int (*init_proto)(PIA *); + void (*release_proto)(PIA *); + struct module *owner; +}; + +typedef struct pi_protocol PIP; + +extern int paride_register( PIP * ); +extern void paride_unregister ( PIP * ); + +#endif /* __DRIVERS_PARIDE_H__ */ +/* end of paride.h */ diff --git a/kernel/drivers/block/paride/pcd.c b/kernel/drivers/block/paride/pcd.c new file mode 100644 index 000000000..3b7c9f1be --- /dev/null +++ b/kernel/drivers/block/paride/pcd.c @@ -0,0 +1,991 @@ +/* + pcd.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is a high-level driver for parallel port ATAPI CD-ROM + drives based on chips supported by the paride module. + + By default, the driver will autoprobe for a single parallel + port ATAPI CD-ROM drive, but if their individual parameters are + specified, the driver can handle up to 4 drives. + + The behaviour of the pcd driver can be altered by setting + some parameters from the insmod command line. The following + parameters are adjustable: + + drive0 These four arguments can be arrays of + drive1 1-6 integers as follows: + drive2 + drive3 ,,,,, + + Where, + + is the base of the parallel port address for + the corresponding drive. (required) + + is the protocol number for the adapter that + supports this drive. These numbers are + logged by 'paride' when the protocol modules + are initialised. (0 if not given) + + for those adapters that support chained + devices, this is the unit selector for the + chain of devices on the given port. It should + be zero for devices that don't support chaining. + (0 if not given) + + this can be -1 to choose the best mode, or one + of the mode numbers supported by the adapter. + (-1 if not given) + + ATAPI CD-ROMs can be jumpered to master or slave. + Set this to 0 to choose the master drive, 1 to + choose the slave, -1 (the default) to choose the + first drive found. + + some parallel ports require the driver to + go more slowly. -1 sets a default value that + should work with the chosen protocol. Otherwise, + set this to a small integer, the larger it is + the slower the port i/o. In some cases, setting + this to zero will speed up the device. (default -1) + + major You may use this parameter to overide the + default major number (46) that this driver + will use. Be sure to change the device + name as well. + + name This parameter is a character string that + contains the name the kernel will use for this + device (in /proc output, for instance). + (default "pcd") + + verbose This parameter controls the amount of logging + that the driver will do. Set it to 0 for + normal operation, 1 to see autoprobe progress + messages, or 2 to see additional debugging + output. (default 0) + + nice This parameter controls the driver's use of + idle CPU time, at the expense of some speed. + + If this driver is built into the kernel, you can use the + following kernel command line parameters, with the same values + as the corresponding module parameters listed above: + + pcd.drive0 + pcd.drive1 + pcd.drive2 + pcd.drive3 + pcd.nice + + In addition, you can use the parameter pcd.disable to disable + the driver entirely. + +*/ + +/* Changes: + + 1.01 GRG 1998.01.24 Added test unit ready support + 1.02 GRG 1998.05.06 Changes to pcd_completion, ready_wait, + and loosen interpretation of ATAPI + standard for clearing error status. + Use spinlocks. Eliminate sti(). + 1.03 GRG 1998.06.16 Eliminated an Ugh + 1.04 GRG 1998.08.15 Added extra debugging, improvements to + pcd_completion, use HZ in loop timing + 1.05 GRG 1998.08.16 Conformed to "Uniform CD-ROM" standard + 1.06 GRG 1998.08.19 Added audio ioctl support + 1.07 GRG 1998.09.24 Increased reset timeout, added jumbo support + +*/ + +#define PCD_VERSION "1.07" +#define PCD_MAJOR 46 +#define PCD_NAME "pcd" +#define PCD_UNITS 4 + +/* Here are things one can override from the insmod command. + Most are autoprobed by paride unless set here. Verbose is off + by default. + +*/ + +static int verbose = 0; +static int major = PCD_MAJOR; +static char *name = PCD_NAME; +static int nice = 0; +static int disable = 0; + +static int drive0[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive1[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive2[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive3[6] = { 0, 0, 0, -1, -1, -1 }; + +static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; +static int pcd_drive_count; + +enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; + +/* end of parameters */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(pcd_mutex); +static DEFINE_SPINLOCK(pcd_lock); + +module_param(verbose, int, 0644); +module_param(major, int, 0); +module_param(name, charp, 0); +module_param(nice, int, 0); +module_param_array(drive0, int, NULL, 0); +module_param_array(drive1, int, NULL, 0); +module_param_array(drive2, int, NULL, 0); +module_param_array(drive3, int, NULL, 0); + +#include "paride.h" +#include "pseudo.h" + +#define PCD_RETRIES 5 +#define PCD_TMO 800 /* timeout in jiffies */ +#define PCD_DELAY 50 /* spin delay in uS */ +#define PCD_READY_TMO 20 /* in seconds */ +#define PCD_RESET_TMO 100 /* in tenths of a second */ + +#define PCD_SPIN (1000000*PCD_TMO)/(HZ*PCD_DELAY) + +#define IDE_ERR 0x01 +#define IDE_DRQ 0x08 +#define IDE_READY 0x40 +#define IDE_BUSY 0x80 + +static int pcd_open(struct cdrom_device_info *cdi, int purpose); +static void pcd_release(struct cdrom_device_info *cdi); +static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr); +static unsigned int pcd_check_events(struct cdrom_device_info *cdi, + unsigned int clearing, int slot_nr); +static int pcd_tray_move(struct cdrom_device_info *cdi, int position); +static int pcd_lock_door(struct cdrom_device_info *cdi, int lock); +static int pcd_drive_reset(struct cdrom_device_info *cdi); +static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn); +static int pcd_audio_ioctl(struct cdrom_device_info *cdi, + unsigned int cmd, void *arg); +static int pcd_packet(struct cdrom_device_info *cdi, + struct packet_command *cgc); + +static int pcd_detect(void); +static void pcd_probe_capabilities(void); +static void do_pcd_read_drq(void); +static void do_pcd_request(struct request_queue * q); +static void do_pcd_read(void); + +struct pcd_unit { + struct pi_adapter pia; /* interface to paride layer */ + struct pi_adapter *pi; + int drive; /* master/slave */ + int last_sense; /* result of last request sense */ + int changed; /* media change seen */ + int present; /* does this unit exist ? */ + char *name; /* pcd0, pcd1, etc */ + struct cdrom_device_info info; /* uniform cdrom interface */ + struct gendisk *disk; +}; + +static struct pcd_unit pcd[PCD_UNITS]; + +static char pcd_scratch[64]; +static char pcd_buffer[2048]; /* raw block buffer */ +static int pcd_bufblk = -1; /* block in buffer, in CD units, + -1 for nothing there. See also + pd_unit. + */ + +/* the variables below are used mainly in the I/O request engine, which + processes only one request at a time. +*/ + +static struct pcd_unit *pcd_current; /* current request's drive */ +static struct request *pcd_req; +static int pcd_retries; /* retries on current request */ +static int pcd_busy; /* request being processed ? */ +static int pcd_sector; /* address of next requested sector */ +static int pcd_count; /* number of blocks still to do */ +static char *pcd_buf; /* buffer for request in progress */ + +/* kernel glue structures */ + +static int pcd_block_open(struct block_device *bdev, fmode_t mode) +{ + struct pcd_unit *cd = bdev->bd_disk->private_data; + int ret; + + mutex_lock(&pcd_mutex); + ret = cdrom_open(&cd->info, bdev, mode); + mutex_unlock(&pcd_mutex); + + return ret; +} + +static void pcd_block_release(struct gendisk *disk, fmode_t mode) +{ + struct pcd_unit *cd = disk->private_data; + mutex_lock(&pcd_mutex); + cdrom_release(&cd->info, mode); + mutex_unlock(&pcd_mutex); +} + +static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + struct pcd_unit *cd = bdev->bd_disk->private_data; + int ret; + + mutex_lock(&pcd_mutex); + ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg); + mutex_unlock(&pcd_mutex); + + return ret; +} + +static unsigned int pcd_block_check_events(struct gendisk *disk, + unsigned int clearing) +{ + struct pcd_unit *cd = disk->private_data; + return cdrom_check_events(&cd->info, clearing); +} + +static const struct block_device_operations pcd_bdops = { + .owner = THIS_MODULE, + .open = pcd_block_open, + .release = pcd_block_release, + .ioctl = pcd_block_ioctl, + .check_events = pcd_block_check_events, +}; + +static struct cdrom_device_ops pcd_dops = { + .open = pcd_open, + .release = pcd_release, + .drive_status = pcd_drive_status, + .check_events = pcd_check_events, + .tray_move = pcd_tray_move, + .lock_door = pcd_lock_door, + .get_mcn = pcd_get_mcn, + .reset = pcd_drive_reset, + .audio_ioctl = pcd_audio_ioctl, + .generic_packet = pcd_packet, + .capability = CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK | + CDC_MCN | CDC_MEDIA_CHANGED | CDC_RESET | + CDC_PLAY_AUDIO | CDC_GENERIC_PACKET | CDC_CD_R | + CDC_CD_RW, +}; + +static void pcd_init_units(void) +{ + struct pcd_unit *cd; + int unit; + + pcd_drive_count = 0; + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { + struct gendisk *disk = alloc_disk(1); + if (!disk) + continue; + cd->disk = disk; + cd->pi = &cd->pia; + cd->present = 0; + cd->last_sense = 0; + cd->changed = 1; + cd->drive = (*drives[unit])[D_SLV]; + if ((*drives[unit])[D_PRT]) + pcd_drive_count++; + + cd->name = &cd->info.name[0]; + snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit); + cd->info.ops = &pcd_dops; + cd->info.handle = cd; + cd->info.speed = 0; + cd->info.capacity = 1; + cd->info.mask = 0; + disk->major = major; + disk->first_minor = unit; + strcpy(disk->disk_name, cd->name); /* umm... */ + disk->fops = &pcd_bdops; + disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + } +} + +static int pcd_open(struct cdrom_device_info *cdi, int purpose) +{ + struct pcd_unit *cd = cdi->handle; + if (!cd->present) + return -ENODEV; + return 0; +} + +static void pcd_release(struct cdrom_device_info *cdi) +{ +} + +static inline int status_reg(struct pcd_unit *cd) +{ + return pi_read_regr(cd->pi, 1, 6); +} + +static inline int read_reg(struct pcd_unit *cd, int reg) +{ + return pi_read_regr(cd->pi, 0, reg); +} + +static inline void write_reg(struct pcd_unit *cd, int reg, int val) +{ + pi_write_regr(cd->pi, 0, reg, val); +} + +static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg) +{ + int j, r, e, s, p; + + j = 0; + while ((((r = status_reg(cd)) & go) || (stop && (!(r & stop)))) + && (j++ < PCD_SPIN)) + udelay(PCD_DELAY); + + if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) { + s = read_reg(cd, 7); + e = read_reg(cd, 1); + p = read_reg(cd, 2); + if (j > PCD_SPIN) + e |= 0x100; + if (fun) + printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" + " loop=%d phase=%d\n", + cd->name, fun, msg, r, s, e, j, p); + return (s << 8) + r; + } + return 0; +} + +static int pcd_command(struct pcd_unit *cd, char *cmd, int dlen, char *fun) +{ + pi_connect(cd->pi); + + write_reg(cd, 6, 0xa0 + 0x10 * cd->drive); + + if (pcd_wait(cd, IDE_BUSY | IDE_DRQ, 0, fun, "before command")) { + pi_disconnect(cd->pi); + return -1; + } + + write_reg(cd, 4, dlen % 256); + write_reg(cd, 5, dlen / 256); + write_reg(cd, 7, 0xa0); /* ATAPI packet command */ + + if (pcd_wait(cd, IDE_BUSY, IDE_DRQ, fun, "command DRQ")) { + pi_disconnect(cd->pi); + return -1; + } + + if (read_reg(cd, 2) != 1) { + printk("%s: %s: command phase error\n", cd->name, fun); + pi_disconnect(cd->pi); + return -1; + } + + pi_write_block(cd->pi, cmd, 12); + + return 0; +} + +static int pcd_completion(struct pcd_unit *cd, char *buf, char *fun) +{ + int r, d, p, n, k, j; + + r = -1; + k = 0; + j = 0; + + if (!pcd_wait(cd, IDE_BUSY, IDE_DRQ | IDE_READY | IDE_ERR, + fun, "completion")) { + r = 0; + while (read_reg(cd, 7) & IDE_DRQ) { + d = read_reg(cd, 4) + 256 * read_reg(cd, 5); + n = (d + 3) & 0xfffc; + p = read_reg(cd, 2) & 3; + + if ((p == 2) && (n > 0) && (j == 0)) { + pi_read_block(cd->pi, buf, n); + if (verbose > 1) + printk("%s: %s: Read %d bytes\n", + cd->name, fun, n); + r = 0; + j++; + } else { + if (verbose > 1) + printk + ("%s: %s: Unexpected phase %d, d=%d, k=%d\n", + cd->name, fun, p, d, k); + if (verbose < 2) + printk_once( + "%s: WARNING: ATAPI phase errors\n", + cd->name); + mdelay(1); + } + if (k++ > PCD_TMO) { + printk("%s: Stuck DRQ\n", cd->name); + break; + } + if (pcd_wait + (cd, IDE_BUSY, IDE_DRQ | IDE_READY | IDE_ERR, fun, + "completion")) { + r = -1; + break; + } + } + } + + pi_disconnect(cd->pi); + + return r; +} + +static void pcd_req_sense(struct pcd_unit *cd, char *fun) +{ + char rs_cmd[12] = { 0x03, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 }; + char buf[16]; + int r, c; + + r = pcd_command(cd, rs_cmd, 16, "Request sense"); + mdelay(1); + if (!r) + pcd_completion(cd, buf, "Request sense"); + + cd->last_sense = -1; + c = 2; + if (!r) { + if (fun) + printk("%s: %s: Sense key: %x, ASC: %x, ASQ: %x\n", + cd->name, fun, buf[2] & 0xf, buf[12], buf[13]); + c = buf[2] & 0xf; + cd->last_sense = + c | ((buf[12] & 0xff) << 8) | ((buf[13] & 0xff) << 16); + } + if ((c == 2) || (c == 6)) + cd->changed = 1; +} + +static int pcd_atapi(struct pcd_unit *cd, char *cmd, int dlen, char *buf, char *fun) +{ + int r; + + r = pcd_command(cd, cmd, dlen, fun); + mdelay(1); + if (!r) + r = pcd_completion(cd, buf, fun); + if (r) + pcd_req_sense(cd, fun); + + return r; +} + +static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc) +{ + return pcd_atapi(cdi->handle, cgc->cmd, cgc->buflen, cgc->buffer, + "generic packet"); +} + +#define DBMSG(msg) ((verbose>1)?(msg):NULL) + +static unsigned int pcd_check_events(struct cdrom_device_info *cdi, + unsigned int clearing, int slot_nr) +{ + struct pcd_unit *cd = cdi->handle; + int res = cd->changed; + if (res) + cd->changed = 0; + return res ? DISK_EVENT_MEDIA_CHANGE : 0; +} + +static int pcd_lock_door(struct cdrom_device_info *cdi, int lock) +{ + char un_cmd[12] = { 0x1e, 0, 0, 0, lock, 0, 0, 0, 0, 0, 0, 0 }; + + return pcd_atapi(cdi->handle, un_cmd, 0, pcd_scratch, + lock ? "lock door" : "unlock door"); +} + +static int pcd_tray_move(struct cdrom_device_info *cdi, int position) +{ + char ej_cmd[12] = { 0x1b, 0, 0, 0, 3 - position, 0, 0, 0, 0, 0, 0, 0 }; + + return pcd_atapi(cdi->handle, ej_cmd, 0, pcd_scratch, + position ? "eject" : "close tray"); +} + +static void pcd_sleep(int cs) +{ + schedule_timeout_interruptible(cs); +} + +static int pcd_reset(struct pcd_unit *cd) +{ + int i, k, flg; + int expect[5] = { 1, 1, 1, 0x14, 0xeb }; + + pi_connect(cd->pi); + write_reg(cd, 6, 0xa0 + 0x10 * cd->drive); + write_reg(cd, 7, 8); + + pcd_sleep(20 * HZ / 1000); /* delay a bit */ + + k = 0; + while ((k++ < PCD_RESET_TMO) && (status_reg(cd) & IDE_BUSY)) + pcd_sleep(HZ / 10); + + flg = 1; + for (i = 0; i < 5; i++) + flg &= (read_reg(cd, i + 1) == expect[i]); + + if (verbose) { + printk("%s: Reset (%d) signature = ", cd->name, k); + for (i = 0; i < 5; i++) + printk("%3x", read_reg(cd, i + 1)); + if (!flg) + printk(" (incorrect)"); + printk("\n"); + } + + pi_disconnect(cd->pi); + return flg - 1; +} + +static int pcd_drive_reset(struct cdrom_device_info *cdi) +{ + return pcd_reset(cdi->handle); +} + +static int pcd_ready_wait(struct pcd_unit *cd, int tmo) +{ + char tr_cmd[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int k, p; + + k = 0; + while (k < tmo) { + cd->last_sense = 0; + pcd_atapi(cd, tr_cmd, 0, NULL, DBMSG("test unit ready")); + p = cd->last_sense; + if (!p) + return 0; + if (!(((p & 0xffff) == 0x0402) || ((p & 0xff) == 6))) + return p; + k++; + pcd_sleep(HZ); + } + return 0x000020; /* timeout */ +} + +static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr) +{ + char rc_cmd[12] = { 0x25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + struct pcd_unit *cd = cdi->handle; + + if (pcd_ready_wait(cd, PCD_READY_TMO)) + return CDS_DRIVE_NOT_READY; + if (pcd_atapi(cd, rc_cmd, 8, pcd_scratch, DBMSG("check media"))) + return CDS_NO_DISC; + return CDS_DISC_OK; +} + +static int pcd_identify(struct pcd_unit *cd, char *id) +{ + int k, s; + char id_cmd[12] = { 0x12, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 }; + + pcd_bufblk = -1; + + s = pcd_atapi(cd, id_cmd, 36, pcd_buffer, "identify"); + + if (s) + return -1; + if ((pcd_buffer[0] & 0x1f) != 5) { + if (verbose) + printk("%s: %s is not a CD-ROM\n", + cd->name, cd->drive ? "Slave" : "Master"); + return -1; + } + memcpy(id, pcd_buffer + 16, 16); + id[16] = 0; + k = 16; + while ((k >= 0) && (id[k] <= 0x20)) { + id[k] = 0; + k--; + } + + printk("%s: %s: %s\n", cd->name, cd->drive ? "Slave" : "Master", id); + + return 0; +} + +/* + * returns 0, with id set if drive is detected + * -1, if drive detection failed + */ +static int pcd_probe(struct pcd_unit *cd, int ms, char *id) +{ + if (ms == -1) { + for (cd->drive = 0; cd->drive <= 1; cd->drive++) + if (!pcd_reset(cd) && !pcd_identify(cd, id)) + return 0; + } else { + cd->drive = ms; + if (!pcd_reset(cd) && !pcd_identify(cd, id)) + return 0; + } + return -1; +} + +static void pcd_probe_capabilities(void) +{ + int unit, r; + char buffer[32]; + char cmd[12] = { 0x5a, 1 << 3, 0x2a, 0, 0, 0, 0, 18, 0, 0, 0, 0 }; + struct pcd_unit *cd; + + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { + if (!cd->present) + continue; + r = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities"); + if (r) + continue; + /* we should now have the cap page */ + if ((buffer[11] & 1) == 0) + cd->info.mask |= CDC_CD_R; + if ((buffer[11] & 2) == 0) + cd->info.mask |= CDC_CD_RW; + if ((buffer[12] & 1) == 0) + cd->info.mask |= CDC_PLAY_AUDIO; + if ((buffer[14] & 1) == 0) + cd->info.mask |= CDC_LOCK; + if ((buffer[14] & 8) == 0) + cd->info.mask |= CDC_OPEN_TRAY; + if ((buffer[14] >> 6) == 0) + cd->info.mask |= CDC_CLOSE_TRAY; + } +} + +static int pcd_detect(void) +{ + char id[18]; + int k, unit; + struct pcd_unit *cd; + + printk("%s: %s version %s, major %d, nice %d\n", + name, name, PCD_VERSION, major, nice); + + k = 0; + if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ + cd = pcd; + if (pi_init(cd->pi, 1, -1, -1, -1, -1, -1, pcd_buffer, + PI_PCD, verbose, cd->name)) { + if (!pcd_probe(cd, -1, id) && cd->disk) { + cd->present = 1; + k++; + } else + pi_release(cd->pi); + } + } else { + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { + int *conf = *drives[unit]; + if (!conf[D_PRT]) + continue; + if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD], + conf[D_UNI], conf[D_PRO], conf[D_DLY], + pcd_buffer, PI_PCD, verbose, cd->name)) + continue; + if (!pcd_probe(cd, conf[D_SLV], id) && cd->disk) { + cd->present = 1; + k++; + } else + pi_release(cd->pi); + } + } + if (k) + return 0; + + printk("%s: No CD-ROM drive found\n", name); + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) + put_disk(cd->disk); + return -1; +} + +/* I/O request processing */ +static struct request_queue *pcd_queue; + +static void do_pcd_request(struct request_queue * q) +{ + if (pcd_busy) + return; + while (1) { + if (!pcd_req) { + pcd_req = blk_fetch_request(q); + if (!pcd_req) + return; + } + + if (rq_data_dir(pcd_req) == READ) { + struct pcd_unit *cd = pcd_req->rq_disk->private_data; + if (cd != pcd_current) + pcd_bufblk = -1; + pcd_current = cd; + pcd_sector = blk_rq_pos(pcd_req); + pcd_count = blk_rq_cur_sectors(pcd_req); + pcd_buf = bio_data(pcd_req->bio); + pcd_busy = 1; + ps_set_intr(do_pcd_read, NULL, 0, nice); + return; + } else { + __blk_end_request_all(pcd_req, -EIO); + pcd_req = NULL; + } + } +} + +static inline void next_request(int err) +{ + unsigned long saved_flags; + + spin_lock_irqsave(&pcd_lock, saved_flags); + if (!__blk_end_request_cur(pcd_req, err)) + pcd_req = NULL; + pcd_busy = 0; + do_pcd_request(pcd_queue); + spin_unlock_irqrestore(&pcd_lock, saved_flags); +} + +static int pcd_ready(void) +{ + return (((status_reg(pcd_current) & (IDE_BUSY | IDE_DRQ)) == IDE_DRQ)); +} + +static void pcd_transfer(void) +{ + + while (pcd_count && (pcd_sector / 4 == pcd_bufblk)) { + int o = (pcd_sector % 4) * 512; + memcpy(pcd_buf, pcd_buffer + o, 512); + pcd_count--; + pcd_buf += 512; + pcd_sector++; + } +} + +static void pcd_start(void) +{ + int b, i; + char rd_cmd[12] = { 0xa8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 }; + + pcd_bufblk = pcd_sector / 4; + b = pcd_bufblk; + for (i = 0; i < 4; i++) { + rd_cmd[5 - i] = b & 0xff; + b = b >> 8; + } + + if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) { + pcd_bufblk = -1; + next_request(-EIO); + return; + } + + mdelay(1); + + ps_set_intr(do_pcd_read_drq, pcd_ready, PCD_TMO, nice); +} + +static void do_pcd_read(void) +{ + pcd_busy = 1; + pcd_retries = 0; + pcd_transfer(); + if (!pcd_count) { + next_request(0); + return; + } + + pi_do_claimed(pcd_current->pi, pcd_start); +} + +static void do_pcd_read_drq(void) +{ + unsigned long saved_flags; + + if (pcd_completion(pcd_current, pcd_buffer, "read block")) { + if (pcd_retries < PCD_RETRIES) { + mdelay(1); + pcd_retries++; + pi_do_claimed(pcd_current->pi, pcd_start); + return; + } + pcd_bufblk = -1; + next_request(-EIO); + return; + } + + do_pcd_read(); + spin_lock_irqsave(&pcd_lock, saved_flags); + do_pcd_request(pcd_queue); + spin_unlock_irqrestore(&pcd_lock, saved_flags); +} + +/* the audio_ioctl stuff is adapted from sr_ioctl.c */ + +static int pcd_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, void *arg) +{ + struct pcd_unit *cd = cdi->handle; + + switch (cmd) { + + case CDROMREADTOCHDR: + + { + char cmd[12] = + { GPCMD_READ_TOC_PMA_ATIP, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0 }; + struct cdrom_tochdr *tochdr = + (struct cdrom_tochdr *) arg; + char buffer[32]; + int r; + + r = pcd_atapi(cd, cmd, 12, buffer, "read toc header"); + + tochdr->cdth_trk0 = buffer[2]; + tochdr->cdth_trk1 = buffer[3]; + + return r ? -EIO : 0; + } + + case CDROMREADTOCENTRY: + + { + char cmd[12] = + { GPCMD_READ_TOC_PMA_ATIP, 0, 0, 0, 0, 0, 0, 0, 12, + 0, 0, 0 }; + + struct cdrom_tocentry *tocentry = + (struct cdrom_tocentry *) arg; + unsigned char buffer[32]; + int r; + + cmd[1] = + (tocentry->cdte_format == CDROM_MSF ? 0x02 : 0); + cmd[6] = tocentry->cdte_track; + + r = pcd_atapi(cd, cmd, 12, buffer, "read toc entry"); + + tocentry->cdte_ctrl = buffer[5] & 0xf; + tocentry->cdte_adr = buffer[5] >> 4; + tocentry->cdte_datamode = + (tocentry->cdte_ctrl & 0x04) ? 1 : 0; + if (tocentry->cdte_format == CDROM_MSF) { + tocentry->cdte_addr.msf.minute = buffer[9]; + tocentry->cdte_addr.msf.second = buffer[10]; + tocentry->cdte_addr.msf.frame = buffer[11]; + } else + tocentry->cdte_addr.lba = + (((((buffer[8] << 8) + buffer[9]) << 8) + + buffer[10]) << 8) + buffer[11]; + + return r ? -EIO : 0; + } + + default: + + return -ENOSYS; + } +} + +static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn) +{ + char cmd[12] = + { GPCMD_READ_SUBCHANNEL, 0, 0x40, 2, 0, 0, 0, 0, 24, 0, 0, 0 }; + char buffer[32]; + + if (pcd_atapi(cdi->handle, cmd, 24, buffer, "get mcn")) + return -EIO; + + memcpy(mcn->medium_catalog_number, buffer + 9, 13); + mcn->medium_catalog_number[13] = 0; + + return 0; +} + +static int __init pcd_init(void) +{ + struct pcd_unit *cd; + int unit; + + if (disable) + return -EINVAL; + + pcd_init_units(); + + if (pcd_detect()) + return -ENODEV; + + /* get the atapi capabilities page */ + pcd_probe_capabilities(); + + if (register_blkdev(major, name)) { + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) + put_disk(cd->disk); + return -EBUSY; + } + + pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock); + if (!pcd_queue) { + unregister_blkdev(major, name); + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) + put_disk(cd->disk); + return -ENOMEM; + } + + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { + if (cd->present) { + register_cdrom(&cd->info); + cd->disk->private_data = cd; + cd->disk->queue = pcd_queue; + add_disk(cd->disk); + } + } + + return 0; +} + +static void __exit pcd_exit(void) +{ + struct pcd_unit *cd; + int unit; + + for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { + if (cd->present) { + del_gendisk(cd->disk); + pi_release(cd->pi); + unregister_cdrom(&cd->info); + } + put_disk(cd->disk); + } + blk_cleanup_queue(pcd_queue); + unregister_blkdev(major, name); +} + +MODULE_LICENSE("GPL"); +module_init(pcd_init) +module_exit(pcd_exit) diff --git a/kernel/drivers/block/paride/pd.c b/kernel/drivers/block/paride/pd.c new file mode 100644 index 000000000..d48715b28 --- /dev/null +++ b/kernel/drivers/block/paride/pd.c @@ -0,0 +1,958 @@ +/* + pd.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is the high-level driver for parallel port IDE hard + drives based on chips supported by the paride module. + + By default, the driver will autoprobe for a single parallel + port IDE drive, but if their individual parameters are + specified, the driver can handle up to 4 drives. + + The behaviour of the pd driver can be altered by setting + some parameters from the insmod command line. The following + parameters are adjustable: + + drive0 These four arguments can be arrays of + drive1 1-8 integers as follows: + drive2 + drive3 ,,,,,,, + + Where, + + is the base of the parallel port address for + the corresponding drive. (required) + + is the protocol number for the adapter that + supports this drive. These numbers are + logged by 'paride' when the protocol modules + are initialised. (0 if not given) + + for those adapters that support chained + devices, this is the unit selector for the + chain of devices on the given port. It should + be zero for devices that don't support chaining. + (0 if not given) + + this can be -1 to choose the best mode, or one + of the mode numbers supported by the adapter. + (-1 if not given) + + this defaults to 0 to indicate that the driver + should use the CHS geometry provided by the drive + itself. If set to 1, the driver will provide + a logical geometry with 64 heads and 32 sectors + per track, to be consistent with most SCSI + drivers. (0 if not given) + + set this to zero to disable the power saving + standby mode, if needed. (1 if not given) + + some parallel ports require the driver to + go more slowly. -1 sets a default value that + should work with the chosen protocol. Otherwise, + set this to a small integer, the larger it is + the slower the port i/o. In some cases, setting + this to zero will speed up the device. (default -1) + + IDE disks can be jumpered to master or slave. + Set this to 0 to choose the master drive, 1 to + choose the slave, -1 (the default) to choose the + first drive found. + + + major You may use this parameter to overide the + default major number (45) that this driver + will use. Be sure to change the device + name as well. + + name This parameter is a character string that + contains the name the kernel will use for this + device (in /proc output, for instance). + (default "pd") + + cluster The driver will attempt to aggregate requests + for adjacent blocks into larger multi-block + clusters. The maximum cluster size (in 512 + byte sectors) is set with this parameter. + (default 64) + + verbose This parameter controls the amount of logging + that the driver will do. Set it to 0 for + normal operation, 1 to see autoprobe progress + messages, or 2 to see additional debugging + output. (default 0) + + nice This parameter controls the driver's use of + idle CPU time, at the expense of some speed. + + If this driver is built into the kernel, you can use kernel + the following command line parameters, with the same values + as the corresponding module parameters listed above: + + pd.drive0 + pd.drive1 + pd.drive2 + pd.drive3 + pd.cluster + pd.nice + + In addition, you can use the parameter pd.disable to disable + the driver entirely. + +*/ + +/* Changes: + + 1.01 GRG 1997.01.24 Restored pd_reset() + Added eject ioctl + 1.02 GRG 1998.05.06 SMP spinlock changes, + Added slave support + 1.03 GRG 1998.06.16 Eliminate an Ugh. + 1.04 GRG 1998.08.15 Extra debugging, use HZ in loop timing + 1.05 GRG 1998.09.24 Added jumbo support + +*/ + +#define PD_VERSION "1.05" +#define PD_MAJOR 45 +#define PD_NAME "pd" +#define PD_UNITS 4 + +/* Here are things one can override from the insmod command. + Most are autoprobed by paride unless set here. Verbose is off + by default. + +*/ +#include + +static bool verbose = 0; +static int major = PD_MAJOR; +static char *name = PD_NAME; +static int cluster = 64; +static int nice = 0; +static int disable = 0; + +static int drive0[8] = { 0, 0, 0, -1, 0, 1, -1, -1 }; +static int drive1[8] = { 0, 0, 0, -1, 0, 1, -1, -1 }; +static int drive2[8] = { 0, 0, 0, -1, 0, 1, -1, -1 }; +static int drive3[8] = { 0, 0, 0, -1, 0, 1, -1, -1 }; + +static int (*drives[4])[8] = {&drive0, &drive1, &drive2, &drive3}; + +enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV}; + +/* end of parameters */ + +#include +#include +#include +#include +#include +#include +#include /* for the eject ioctl */ +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(pd_mutex); +static DEFINE_SPINLOCK(pd_lock); + +module_param(verbose, bool, 0); +module_param(major, int, 0); +module_param(name, charp, 0); +module_param(cluster, int, 0); +module_param(nice, int, 0); +module_param_array(drive0, int, NULL, 0); +module_param_array(drive1, int, NULL, 0); +module_param_array(drive2, int, NULL, 0); +module_param_array(drive3, int, NULL, 0); + +#include "paride.h" + +#define PD_BITS 4 + +/* numbers for "SCSI" geometry */ + +#define PD_LOG_HEADS 64 +#define PD_LOG_SECTS 32 + +#define PD_ID_OFF 54 +#define PD_ID_LEN 14 + +#define PD_MAX_RETRIES 5 +#define PD_TMO 800 /* interrupt timeout in jiffies */ +#define PD_SPIN_DEL 50 /* spin delay in micro-seconds */ + +#define PD_SPIN (1000000*PD_TMO)/(HZ*PD_SPIN_DEL) + +#define STAT_ERR 0x00001 +#define STAT_INDEX 0x00002 +#define STAT_ECC 0x00004 +#define STAT_DRQ 0x00008 +#define STAT_SEEK 0x00010 +#define STAT_WRERR 0x00020 +#define STAT_READY 0x00040 +#define STAT_BUSY 0x00080 + +#define ERR_AMNF 0x00100 +#define ERR_TK0NF 0x00200 +#define ERR_ABRT 0x00400 +#define ERR_MCR 0x00800 +#define ERR_IDNF 0x01000 +#define ERR_MC 0x02000 +#define ERR_UNC 0x04000 +#define ERR_TMO 0x10000 + +#define IDE_READ 0x20 +#define IDE_WRITE 0x30 +#define IDE_READ_VRFY 0x40 +#define IDE_INIT_DEV_PARMS 0x91 +#define IDE_STANDBY 0x96 +#define IDE_ACKCHANGE 0xdb +#define IDE_DOORLOCK 0xde +#define IDE_DOORUNLOCK 0xdf +#define IDE_IDENTIFY 0xec +#define IDE_EJECT 0xed + +#define PD_NAMELEN 8 + +struct pd_unit { + struct pi_adapter pia; /* interface to paride layer */ + struct pi_adapter *pi; + int access; /* count of active opens ... */ + int capacity; /* Size of this volume in sectors */ + int heads; /* physical geometry */ + int sectors; + int cylinders; + int can_lba; + int drive; /* master=0 slave=1 */ + int changed; /* Have we seen a disk change ? */ + int removable; /* removable media device ? */ + int standby; + int alt_geom; + char name[PD_NAMELEN]; /* pda, pdb, etc ... */ + struct gendisk *gd; +}; + +static struct pd_unit pd[PD_UNITS]; + +static char pd_scratch[512]; /* scratch block buffer */ + +static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR", + "READY", "BUSY", "AMNF", "TK0NF", "ABRT", "MCR", + "IDNF", "MC", "UNC", "???", "TMO" +}; + +static inline int status_reg(struct pd_unit *disk) +{ + return pi_read_regr(disk->pi, 1, 6); +} + +static inline int read_reg(struct pd_unit *disk, int reg) +{ + return pi_read_regr(disk->pi, 0, reg); +} + +static inline void write_status(struct pd_unit *disk, int val) +{ + pi_write_regr(disk->pi, 1, 6, val); +} + +static inline void write_reg(struct pd_unit *disk, int reg, int val) +{ + pi_write_regr(disk->pi, 0, reg, val); +} + +static inline u8 DRIVE(struct pd_unit *disk) +{ + return 0xa0+0x10*disk->drive; +} + +/* ide command interface */ + +static void pd_print_error(struct pd_unit *disk, char *msg, int status) +{ + int i; + + printk("%s: %s: status = 0x%x =", disk->name, msg, status); + for (i = 0; i < ARRAY_SIZE(pd_errs); i++) + if (status & (1 << i)) + printk(" %s", pd_errs[i]); + printk("\n"); +} + +static void pd_reset(struct pd_unit *disk) +{ /* called only for MASTER drive */ + write_status(disk, 4); + udelay(50); + write_status(disk, 0); + udelay(250); +} + +#define DBMSG(msg) ((verbose>1)?(msg):NULL) + +static int pd_wait_for(struct pd_unit *disk, int w, char *msg) +{ /* polled wait */ + int k, r, e; + + k = 0; + while (k < PD_SPIN) { + r = status_reg(disk); + k++; + if (((r & w) == w) && !(r & STAT_BUSY)) + break; + udelay(PD_SPIN_DEL); + } + e = (read_reg(disk, 1) << 8) + read_reg(disk, 7); + if (k >= PD_SPIN) + e |= ERR_TMO; + if ((e & (STAT_ERR | ERR_TMO)) && (msg != NULL)) + pd_print_error(disk, msg, e); + return e; +} + +static void pd_send_command(struct pd_unit *disk, int n, int s, int h, int c0, int c1, int func) +{ + write_reg(disk, 6, DRIVE(disk) + h); + write_reg(disk, 1, 0); /* the IDE task file */ + write_reg(disk, 2, n); + write_reg(disk, 3, s); + write_reg(disk, 4, c0); + write_reg(disk, 5, c1); + write_reg(disk, 7, func); + + udelay(1); +} + +static void pd_ide_command(struct pd_unit *disk, int func, int block, int count) +{ + int c1, c0, h, s; + + if (disk->can_lba) { + s = block & 255; + c0 = (block >>= 8) & 255; + c1 = (block >>= 8) & 255; + h = ((block >>= 8) & 15) + 0x40; + } else { + s = (block % disk->sectors) + 1; + h = (block /= disk->sectors) % disk->heads; + c0 = (block /= disk->heads) % 256; + c1 = (block >>= 8); + } + pd_send_command(disk, count, s, h, c0, c1, func); +} + +/* The i/o request engine */ + +enum action {Fail = 0, Ok = 1, Hold, Wait}; + +static struct request *pd_req; /* current request */ +static enum action (*phase)(void); + +static void run_fsm(void); + +static void ps_tq_int(struct work_struct *work); + +static DECLARE_DELAYED_WORK(fsm_tq, ps_tq_int); + +static void schedule_fsm(void) +{ + if (!nice) + schedule_delayed_work(&fsm_tq, 0); + else + schedule_delayed_work(&fsm_tq, nice-1); +} + +static void ps_tq_int(struct work_struct *work) +{ + run_fsm(); +} + +static enum action do_pd_io_start(void); +static enum action pd_special(void); +static enum action do_pd_read_start(void); +static enum action do_pd_write_start(void); +static enum action do_pd_read_drq(void); +static enum action do_pd_write_done(void); + +static struct request_queue *pd_queue; +static int pd_claimed; + +static struct pd_unit *pd_current; /* current request's drive */ +static PIA *pi_current; /* current request's PIA */ + +static void run_fsm(void) +{ + while (1) { + enum action res; + unsigned long saved_flags; + int stop = 0; + + if (!phase) { + pd_current = pd_req->rq_disk->private_data; + pi_current = pd_current->pi; + phase = do_pd_io_start; + } + + switch (pd_claimed) { + case 0: + pd_claimed = 1; + if (!pi_schedule_claimed(pi_current, run_fsm)) + return; + case 1: + pd_claimed = 2; + pi_current->proto->connect(pi_current); + } + + switch(res = phase()) { + case Ok: case Fail: + pi_disconnect(pi_current); + pd_claimed = 0; + phase = NULL; + spin_lock_irqsave(&pd_lock, saved_flags); + if (!__blk_end_request_cur(pd_req, + res == Ok ? 0 : -EIO)) { + pd_req = blk_fetch_request(pd_queue); + if (!pd_req) + stop = 1; + } + spin_unlock_irqrestore(&pd_lock, saved_flags); + if (stop) + return; + case Hold: + schedule_fsm(); + return; + case Wait: + pi_disconnect(pi_current); + pd_claimed = 0; + } + } +} + +static int pd_retries = 0; /* i/o error retry count */ +static int pd_block; /* address of next requested block */ +static int pd_count; /* number of blocks still to do */ +static int pd_run; /* sectors in current cluster */ +static int pd_cmd; /* current command READ/WRITE */ +static char *pd_buf; /* buffer for request in progress */ + +static enum action do_pd_io_start(void) +{ + if (pd_req->cmd_type == REQ_TYPE_SPECIAL) { + phase = pd_special; + return pd_special(); + } + + pd_cmd = rq_data_dir(pd_req); + if (pd_cmd == READ || pd_cmd == WRITE) { + pd_block = blk_rq_pos(pd_req); + pd_count = blk_rq_cur_sectors(pd_req); + if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) + return Fail; + pd_run = blk_rq_sectors(pd_req); + pd_buf = bio_data(pd_req->bio); + pd_retries = 0; + if (pd_cmd == READ) + return do_pd_read_start(); + else + return do_pd_write_start(); + } + return Fail; +} + +static enum action pd_special(void) +{ + enum action (*func)(struct pd_unit *) = pd_req->special; + return func(pd_current); +} + +static int pd_next_buf(void) +{ + unsigned long saved_flags; + + pd_count--; + pd_run--; + pd_buf += 512; + pd_block++; + if (!pd_run) + return 1; + if (pd_count) + return 0; + spin_lock_irqsave(&pd_lock, saved_flags); + __blk_end_request_cur(pd_req, 0); + pd_count = blk_rq_cur_sectors(pd_req); + pd_buf = bio_data(pd_req->bio); + spin_unlock_irqrestore(&pd_lock, saved_flags); + return 0; +} + +static unsigned long pd_timeout; + +static enum action do_pd_read_start(void) +{ + if (pd_wait_for(pd_current, STAT_READY, "do_pd_read") & STAT_ERR) { + if (pd_retries < PD_MAX_RETRIES) { + pd_retries++; + return Wait; + } + return Fail; + } + pd_ide_command(pd_current, IDE_READ, pd_block, pd_run); + phase = do_pd_read_drq; + pd_timeout = jiffies + PD_TMO; + return Hold; +} + +static enum action do_pd_write_start(void) +{ + if (pd_wait_for(pd_current, STAT_READY, "do_pd_write") & STAT_ERR) { + if (pd_retries < PD_MAX_RETRIES) { + pd_retries++; + return Wait; + } + return Fail; + } + pd_ide_command(pd_current, IDE_WRITE, pd_block, pd_run); + while (1) { + if (pd_wait_for(pd_current, STAT_DRQ, "do_pd_write_drq") & STAT_ERR) { + if (pd_retries < PD_MAX_RETRIES) { + pd_retries++; + return Wait; + } + return Fail; + } + pi_write_block(pd_current->pi, pd_buf, 512); + if (pd_next_buf()) + break; + } + phase = do_pd_write_done; + pd_timeout = jiffies + PD_TMO; + return Hold; +} + +static inline int pd_ready(void) +{ + return !(status_reg(pd_current) & STAT_BUSY); +} + +static enum action do_pd_read_drq(void) +{ + if (!pd_ready() && !time_after_eq(jiffies, pd_timeout)) + return Hold; + + while (1) { + if (pd_wait_for(pd_current, STAT_DRQ, "do_pd_read_drq") & STAT_ERR) { + if (pd_retries < PD_MAX_RETRIES) { + pd_retries++; + phase = do_pd_read_start; + return Wait; + } + return Fail; + } + pi_read_block(pd_current->pi, pd_buf, 512); + if (pd_next_buf()) + break; + } + return Ok; +} + +static enum action do_pd_write_done(void) +{ + if (!pd_ready() && !time_after_eq(jiffies, pd_timeout)) + return Hold; + + if (pd_wait_for(pd_current, STAT_READY, "do_pd_write_done") & STAT_ERR) { + if (pd_retries < PD_MAX_RETRIES) { + pd_retries++; + phase = do_pd_write_start; + return Wait; + } + return Fail; + } + return Ok; +} + +/* special io requests */ + +/* According to the ATA standard, the default CHS geometry should be + available following a reset. Some Western Digital drives come up + in a mode where only LBA addresses are accepted until the device + parameters are initialised. +*/ + +static void pd_init_dev_parms(struct pd_unit *disk) +{ + pd_wait_for(disk, 0, DBMSG("before init_dev_parms")); + pd_send_command(disk, disk->sectors, 0, disk->heads - 1, 0, 0, + IDE_INIT_DEV_PARMS); + udelay(300); + pd_wait_for(disk, 0, "Initialise device parameters"); +} + +static enum action pd_door_lock(struct pd_unit *disk) +{ + if (!(pd_wait_for(disk, STAT_READY, "Lock") & STAT_ERR)) { + pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORLOCK); + pd_wait_for(disk, STAT_READY, "Lock done"); + } + return Ok; +} + +static enum action pd_door_unlock(struct pd_unit *disk) +{ + if (!(pd_wait_for(disk, STAT_READY, "Lock") & STAT_ERR)) { + pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORUNLOCK); + pd_wait_for(disk, STAT_READY, "Lock done"); + } + return Ok; +} + +static enum action pd_eject(struct pd_unit *disk) +{ + pd_wait_for(disk, 0, DBMSG("before unlock on eject")); + pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORUNLOCK); + pd_wait_for(disk, 0, DBMSG("after unlock on eject")); + pd_wait_for(disk, 0, DBMSG("before eject")); + pd_send_command(disk, 0, 0, 0, 0, 0, IDE_EJECT); + pd_wait_for(disk, 0, DBMSG("after eject")); + return Ok; +} + +static enum action pd_media_check(struct pd_unit *disk) +{ + int r = pd_wait_for(disk, STAT_READY, DBMSG("before media_check")); + if (!(r & STAT_ERR)) { + pd_send_command(disk, 1, 1, 0, 0, 0, IDE_READ_VRFY); + r = pd_wait_for(disk, STAT_READY, DBMSG("RDY after READ_VRFY")); + } else + disk->changed = 1; /* say changed if other error */ + if (r & ERR_MC) { + disk->changed = 1; + pd_send_command(disk, 1, 0, 0, 0, 0, IDE_ACKCHANGE); + pd_wait_for(disk, STAT_READY, DBMSG("RDY after ACKCHANGE")); + pd_send_command(disk, 1, 1, 0, 0, 0, IDE_READ_VRFY); + r = pd_wait_for(disk, STAT_READY, DBMSG("RDY after VRFY")); + } + return Ok; +} + +static void pd_standby_off(struct pd_unit *disk) +{ + pd_wait_for(disk, 0, DBMSG("before STANDBY")); + pd_send_command(disk, 0, 0, 0, 0, 0, IDE_STANDBY); + pd_wait_for(disk, 0, DBMSG("after STANDBY")); +} + +static enum action pd_identify(struct pd_unit *disk) +{ + int j; + char id[PD_ID_LEN + 1]; + +/* WARNING: here there may be dragons. reset() applies to both drives, + but we call it only on probing the MASTER. This should allow most + common configurations to work, but be warned that a reset can clear + settings on the SLAVE drive. +*/ + + if (disk->drive == 0) + pd_reset(disk); + + write_reg(disk, 6, DRIVE(disk)); + pd_wait_for(disk, 0, DBMSG("before IDENT")); + pd_send_command(disk, 1, 0, 0, 0, 0, IDE_IDENTIFY); + + if (pd_wait_for(disk, STAT_DRQ, DBMSG("IDENT DRQ")) & STAT_ERR) + return Fail; + pi_read_block(disk->pi, pd_scratch, 512); + disk->can_lba = pd_scratch[99] & 2; + disk->sectors = le16_to_cpu(*(__le16 *) (pd_scratch + 12)); + disk->heads = le16_to_cpu(*(__le16 *) (pd_scratch + 6)); + disk->cylinders = le16_to_cpu(*(__le16 *) (pd_scratch + 2)); + if (disk->can_lba) + disk->capacity = le32_to_cpu(*(__le32 *) (pd_scratch + 120)); + else + disk->capacity = disk->sectors * disk->heads * disk->cylinders; + + for (j = 0; j < PD_ID_LEN; j++) + id[j ^ 1] = pd_scratch[j + PD_ID_OFF]; + j = PD_ID_LEN - 1; + while ((j >= 0) && (id[j] <= 0x20)) + j--; + j++; + id[j] = 0; + + disk->removable = pd_scratch[0] & 0x80; + + printk("%s: %s, %s, %d blocks [%dM], (%d/%d/%d), %s media\n", + disk->name, id, + disk->drive ? "slave" : "master", + disk->capacity, disk->capacity / 2048, + disk->cylinders, disk->heads, disk->sectors, + disk->removable ? "removable" : "fixed"); + + if (disk->capacity) + pd_init_dev_parms(disk); + if (!disk->standby) + pd_standby_off(disk); + + return Ok; +} + +/* end of io request engine */ + +static void do_pd_request(struct request_queue * q) +{ + if (pd_req) + return; + pd_req = blk_fetch_request(q); + if (!pd_req) + return; + + schedule_fsm(); +} + +static int pd_special_command(struct pd_unit *disk, + enum action (*func)(struct pd_unit *disk)) +{ + struct request *rq; + int err = 0; + + rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + rq->cmd_type = REQ_TYPE_SPECIAL; + rq->special = func; + + err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); + + blk_put_request(rq); + return err; +} + +/* kernel glue structures */ + +static int pd_open(struct block_device *bdev, fmode_t mode) +{ + struct pd_unit *disk = bdev->bd_disk->private_data; + + mutex_lock(&pd_mutex); + disk->access++; + + if (disk->removable) { + pd_special_command(disk, pd_media_check); + pd_special_command(disk, pd_door_lock); + } + mutex_unlock(&pd_mutex); + return 0; +} + +static int pd_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct pd_unit *disk = bdev->bd_disk->private_data; + + if (disk->alt_geom) { + geo->heads = PD_LOG_HEADS; + geo->sectors = PD_LOG_SECTS; + geo->cylinders = disk->capacity / (geo->heads * geo->sectors); + } else { + geo->heads = disk->heads; + geo->sectors = disk->sectors; + geo->cylinders = disk->cylinders; + } + + return 0; +} + +static int pd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct pd_unit *disk = bdev->bd_disk->private_data; + + switch (cmd) { + case CDROMEJECT: + mutex_lock(&pd_mutex); + if (disk->access == 1) + pd_special_command(disk, pd_eject); + mutex_unlock(&pd_mutex); + return 0; + default: + return -EINVAL; + } +} + +static void pd_release(struct gendisk *p, fmode_t mode) +{ + struct pd_unit *disk = p->private_data; + + mutex_lock(&pd_mutex); + if (!--disk->access && disk->removable) + pd_special_command(disk, pd_door_unlock); + mutex_unlock(&pd_mutex); +} + +static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing) +{ + struct pd_unit *disk = p->private_data; + int r; + if (!disk->removable) + return 0; + pd_special_command(disk, pd_media_check); + r = disk->changed; + disk->changed = 0; + return r ? DISK_EVENT_MEDIA_CHANGE : 0; +} + +static int pd_revalidate(struct gendisk *p) +{ + struct pd_unit *disk = p->private_data; + if (pd_special_command(disk, pd_identify) == 0) + set_capacity(p, disk->capacity); + else + set_capacity(p, 0); + return 0; +} + +static const struct block_device_operations pd_fops = { + .owner = THIS_MODULE, + .open = pd_open, + .release = pd_release, + .ioctl = pd_ioctl, + .getgeo = pd_getgeo, + .check_events = pd_check_events, + .revalidate_disk= pd_revalidate +}; + +/* probing */ + +static void pd_probe_drive(struct pd_unit *disk) +{ + struct gendisk *p = alloc_disk(1 << PD_BITS); + if (!p) + return; + strcpy(p->disk_name, disk->name); + p->fops = &pd_fops; + p->major = major; + p->first_minor = (disk - pd) << PD_BITS; + disk->gd = p; + p->private_data = disk; + p->queue = pd_queue; + + if (disk->drive == -1) { + for (disk->drive = 0; disk->drive <= 1; disk->drive++) + if (pd_special_command(disk, pd_identify) == 0) + return; + } else if (pd_special_command(disk, pd_identify) == 0) + return; + disk->gd = NULL; + put_disk(p); +} + +static int pd_detect(void) +{ + int found = 0, unit, pd_drive_count = 0; + struct pd_unit *disk; + + for (unit = 0; unit < PD_UNITS; unit++) { + int *parm = *drives[unit]; + struct pd_unit *disk = pd + unit; + disk->pi = &disk->pia; + disk->access = 0; + disk->changed = 1; + disk->capacity = 0; + disk->drive = parm[D_SLV]; + snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a'+unit); + disk->alt_geom = parm[D_GEO]; + disk->standby = parm[D_SBY]; + if (parm[D_PRT]) + pd_drive_count++; + } + + if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ + disk = pd; + if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch, + PI_PD, verbose, disk->name)) { + pd_probe_drive(disk); + if (!disk->gd) + pi_release(disk->pi); + } + + } else { + for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) { + int *parm = *drives[unit]; + if (!parm[D_PRT]) + continue; + if (pi_init(disk->pi, 0, parm[D_PRT], parm[D_MOD], + parm[D_UNI], parm[D_PRO], parm[D_DLY], + pd_scratch, PI_PD, verbose, disk->name)) { + pd_probe_drive(disk); + if (!disk->gd) + pi_release(disk->pi); + } + } + } + for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) { + if (disk->gd) { + set_capacity(disk->gd, disk->capacity); + add_disk(disk->gd); + found = 1; + } + } + if (!found) + printk("%s: no valid drive found\n", name); + return found; +} + +static int __init pd_init(void) +{ + if (disable) + goto out1; + + pd_queue = blk_init_queue(do_pd_request, &pd_lock); + if (!pd_queue) + goto out1; + + blk_queue_max_hw_sectors(pd_queue, cluster); + + if (register_blkdev(major, name)) + goto out2; + + printk("%s: %s version %s, major %d, cluster %d, nice %d\n", + name, name, PD_VERSION, major, cluster, nice); + if (!pd_detect()) + goto out3; + + return 0; + +out3: + unregister_blkdev(major, name); +out2: + blk_cleanup_queue(pd_queue); +out1: + return -ENODEV; +} + +static void __exit pd_exit(void) +{ + struct pd_unit *disk; + int unit; + unregister_blkdev(major, name); + for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) { + struct gendisk *p = disk->gd; + if (p) { + disk->gd = NULL; + del_gendisk(p); + put_disk(p); + pi_release(disk->pi); + } + } + blk_cleanup_queue(pd_queue); +} + +MODULE_LICENSE("GPL"); +module_init(pd_init) +module_exit(pd_exit) diff --git a/kernel/drivers/block/paride/pf.c b/kernel/drivers/block/paride/pf.c new file mode 100644 index 000000000..9a15fd3c9 --- /dev/null +++ b/kernel/drivers/block/paride/pf.c @@ -0,0 +1,1007 @@ +/* + pf.c (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is the high-level driver for parallel port ATAPI disk + drives based on chips supported by the paride module. + + By default, the driver will autoprobe for a single parallel + port ATAPI disk drive, but if their individual parameters are + specified, the driver can handle up to 4 drives. + + The behaviour of the pf driver can be altered by setting + some parameters from the insmod command line. The following + parameters are adjustable: + + drive0 These four arguments can be arrays of + drive1 1-7 integers as follows: + drive2 + drive3 ,,,,,, + + Where, + + is the base of the parallel port address for + the corresponding drive. (required) + + is the protocol number for the adapter that + supports this drive. These numbers are + logged by 'paride' when the protocol modules + are initialised. (0 if not given) + + for those adapters that support chained + devices, this is the unit selector for the + chain of devices on the given port. It should + be zero for devices that don't support chaining. + (0 if not given) + + this can be -1 to choose the best mode, or one + of the mode numbers supported by the adapter. + (-1 if not given) + + ATAPI CDroms can be jumpered to master or slave. + Set this to 0 to choose the master drive, 1 to + choose the slave, -1 (the default) to choose the + first drive found. + + Some ATAPI devices support multiple LUNs. + One example is the ATAPI PD/CD drive from + Matshita/Panasonic. This device has a + CD drive on LUN 0 and a PD drive on LUN 1. + By default, the driver will search for the + first LUN with a supported device. Set + this parameter to force it to use a specific + LUN. (default -1) + + some parallel ports require the driver to + go more slowly. -1 sets a default value that + should work with the chosen protocol. Otherwise, + set this to a small integer, the larger it is + the slower the port i/o. In some cases, setting + this to zero will speed up the device. (default -1) + + major You may use this parameter to overide the + default major number (47) that this driver + will use. Be sure to change the device + name as well. + + name This parameter is a character string that + contains the name the kernel will use for this + device (in /proc output, for instance). + (default "pf"). + + cluster The driver will attempt to aggregate requests + for adjacent blocks into larger multi-block + clusters. The maximum cluster size (in 512 + byte sectors) is set with this parameter. + (default 64) + + verbose This parameter controls the amount of logging + that the driver will do. Set it to 0 for + normal operation, 1 to see autoprobe progress + messages, or 2 to see additional debugging + output. (default 0) + + nice This parameter controls the driver's use of + idle CPU time, at the expense of some speed. + + If this driver is built into the kernel, you can use the + following command line parameters, with the same values + as the corresponding module parameters listed above: + + pf.drive0 + pf.drive1 + pf.drive2 + pf.drive3 + pf.cluster + pf.nice + + In addition, you can use the parameter pf.disable to disable + the driver entirely. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.03 Changes for SMP. Eliminate sti(). + Fix for drives that don't clear STAT_ERR + until after next CDB delivered. + Small change in pf_completion to round + up transfer size. + 1.02 GRG 1998.06.16 Eliminated an Ugh + 1.03 GRG 1998.08.16 Use HZ in loop timings, extra debugging + 1.04 GRG 1998.09.24 Added jumbo support + +*/ + +#define PF_VERSION "1.04" +#define PF_MAJOR 47 +#define PF_NAME "pf" +#define PF_UNITS 4 + +#include + +/* Here are things one can override from the insmod command. + Most are autoprobed by paride unless set here. Verbose is off + by default. + +*/ + +static bool verbose = 0; +static int major = PF_MAJOR; +static char *name = PF_NAME; +static int cluster = 64; +static int nice = 0; +static int disable = 0; + +static int drive0[7] = { 0, 0, 0, -1, -1, -1, -1 }; +static int drive1[7] = { 0, 0, 0, -1, -1, -1, -1 }; +static int drive2[7] = { 0, 0, 0, -1, -1, -1, -1 }; +static int drive3[7] = { 0, 0, 0, -1, -1, -1, -1 }; + +static int (*drives[4])[7] = {&drive0, &drive1, &drive2, &drive3}; +static int pf_drive_count; + +enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY}; + +/* end of parameters */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_MUTEX(pf_mutex); +static DEFINE_SPINLOCK(pf_spin_lock); + +module_param(verbose, bool, 0644); +module_param(major, int, 0); +module_param(name, charp, 0); +module_param(cluster, int, 0); +module_param(nice, int, 0); +module_param_array(drive0, int, NULL, 0); +module_param_array(drive1, int, NULL, 0); +module_param_array(drive2, int, NULL, 0); +module_param_array(drive3, int, NULL, 0); + +#include "paride.h" +#include "pseudo.h" + +/* constants for faking geometry numbers */ + +#define PF_FD_MAX 8192 /* use FD geometry under this size */ +#define PF_FD_HDS 2 +#define PF_FD_SPT 18 +#define PF_HD_HDS 64 +#define PF_HD_SPT 32 + +#define PF_MAX_RETRIES 5 +#define PF_TMO 800 /* interrupt timeout in jiffies */ +#define PF_SPIN_DEL 50 /* spin delay in micro-seconds */ + +#define PF_SPIN (1000000*PF_TMO)/(HZ*PF_SPIN_DEL) + +#define STAT_ERR 0x00001 +#define STAT_INDEX 0x00002 +#define STAT_ECC 0x00004 +#define STAT_DRQ 0x00008 +#define STAT_SEEK 0x00010 +#define STAT_WRERR 0x00020 +#define STAT_READY 0x00040 +#define STAT_BUSY 0x00080 + +#define ATAPI_REQ_SENSE 0x03 +#define ATAPI_LOCK 0x1e +#define ATAPI_DOOR 0x1b +#define ATAPI_MODE_SENSE 0x5a +#define ATAPI_CAPACITY 0x25 +#define ATAPI_IDENTIFY 0x12 +#define ATAPI_READ_10 0x28 +#define ATAPI_WRITE_10 0x2a + +static int pf_open(struct block_device *bdev, fmode_t mode); +static void do_pf_request(struct request_queue * q); +static int pf_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); +static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo); + +static void pf_release(struct gendisk *disk, fmode_t mode); + +static int pf_detect(void); +static void do_pf_read(void); +static void do_pf_read_start(void); +static void do_pf_write(void); +static void do_pf_write_start(void); +static void do_pf_read_drq(void); +static void do_pf_write_done(void); + +#define PF_NM 0 +#define PF_RO 1 +#define PF_RW 2 + +#define PF_NAMELEN 8 + +struct pf_unit { + struct pi_adapter pia; /* interface to paride layer */ + struct pi_adapter *pi; + int removable; /* removable media device ? */ + int media_status; /* media present ? WP ? */ + int drive; /* drive */ + int lun; + int access; /* count of active opens ... */ + int present; /* device present ? */ + char name[PF_NAMELEN]; /* pf0, pf1, ... */ + struct gendisk *disk; +}; + +static struct pf_unit units[PF_UNITS]; + +static int pf_identify(struct pf_unit *pf); +static void pf_lock(struct pf_unit *pf, int func); +static void pf_eject(struct pf_unit *pf); +static unsigned int pf_check_events(struct gendisk *disk, + unsigned int clearing); + +static char pf_scratch[512]; /* scratch block buffer */ + +/* the variables below are used mainly in the I/O request engine, which + processes only one request at a time. +*/ + +static int pf_retries = 0; /* i/o error retry count */ +static int pf_busy = 0; /* request being processed ? */ +static struct request *pf_req; /* current request */ +static int pf_block; /* address of next requested block */ +static int pf_count; /* number of blocks still to do */ +static int pf_run; /* sectors in current cluster */ +static int pf_cmd; /* current command READ/WRITE */ +static struct pf_unit *pf_current;/* unit of current request */ +static int pf_mask; /* stopper for pseudo-int */ +static char *pf_buf; /* buffer for request in progress */ + +/* kernel glue structures */ + +static const struct block_device_operations pf_fops = { + .owner = THIS_MODULE, + .open = pf_open, + .release = pf_release, + .ioctl = pf_ioctl, + .getgeo = pf_getgeo, + .check_events = pf_check_events, +}; + +static void __init pf_init_units(void) +{ + struct pf_unit *pf; + int unit; + + pf_drive_count = 0; + for (unit = 0, pf = units; unit < PF_UNITS; unit++, pf++) { + struct gendisk *disk = alloc_disk(1); + if (!disk) + continue; + pf->disk = disk; + pf->pi = &pf->pia; + pf->media_status = PF_NM; + pf->drive = (*drives[unit])[D_SLV]; + pf->lun = (*drives[unit])[D_LUN]; + snprintf(pf->name, PF_NAMELEN, "%s%d", name, unit); + disk->major = major; + disk->first_minor = unit; + strcpy(disk->disk_name, pf->name); + disk->fops = &pf_fops; + if (!(*drives[unit])[D_PRT]) + pf_drive_count++; + } +} + +static int pf_open(struct block_device *bdev, fmode_t mode) +{ + struct pf_unit *pf = bdev->bd_disk->private_data; + int ret; + + mutex_lock(&pf_mutex); + pf_identify(pf); + + ret = -ENODEV; + if (pf->media_status == PF_NM) + goto out; + + ret = -EROFS; + if ((pf->media_status == PF_RO) && (mode & FMODE_WRITE)) + goto out; + + ret = 0; + pf->access++; + if (pf->removable) + pf_lock(pf, 1); +out: + mutex_unlock(&pf_mutex); + return ret; +} + +static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct pf_unit *pf = bdev->bd_disk->private_data; + sector_t capacity = get_capacity(pf->disk); + + if (capacity < PF_FD_MAX) { + geo->cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT); + geo->heads = PF_FD_HDS; + geo->sectors = PF_FD_SPT; + } else { + geo->cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT); + geo->heads = PF_HD_HDS; + geo->sectors = PF_HD_SPT; + } + + return 0; +} + +static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +{ + struct pf_unit *pf = bdev->bd_disk->private_data; + + if (cmd != CDROMEJECT) + return -EINVAL; + + if (pf->access != 1) + return -EBUSY; + mutex_lock(&pf_mutex); + pf_eject(pf); + mutex_unlock(&pf_mutex); + + return 0; +} + +static void pf_release(struct gendisk *disk, fmode_t mode) +{ + struct pf_unit *pf = disk->private_data; + + mutex_lock(&pf_mutex); + if (pf->access <= 0) { + mutex_unlock(&pf_mutex); + WARN_ON(1); + return; + } + + pf->access--; + + if (!pf->access && pf->removable) + pf_lock(pf, 0); + + mutex_unlock(&pf_mutex); +} + +static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing) +{ + return DISK_EVENT_MEDIA_CHANGE; +} + +static inline int status_reg(struct pf_unit *pf) +{ + return pi_read_regr(pf->pi, 1, 6); +} + +static inline int read_reg(struct pf_unit *pf, int reg) +{ + return pi_read_regr(pf->pi, 0, reg); +} + +static inline void write_reg(struct pf_unit *pf, int reg, int val) +{ + pi_write_regr(pf->pi, 0, reg, val); +} + +static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg) +{ + int j, r, e, s, p; + + j = 0; + while ((((r = status_reg(pf)) & go) || (stop && (!(r & stop)))) + && (j++ < PF_SPIN)) + udelay(PF_SPIN_DEL); + + if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) { + s = read_reg(pf, 7); + e = read_reg(pf, 1); + p = read_reg(pf, 2); + if (j > PF_SPIN) + e |= 0x100; + if (fun) + printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" + " loop=%d phase=%d\n", + pf->name, fun, msg, r, s, e, j, p); + return (e << 8) + s; + } + return 0; +} + +static int pf_command(struct pf_unit *pf, char *cmd, int dlen, char *fun) +{ + pi_connect(pf->pi); + + write_reg(pf, 6, 0xa0+0x10*pf->drive); + + if (pf_wait(pf, STAT_BUSY | STAT_DRQ, 0, fun, "before command")) { + pi_disconnect(pf->pi); + return -1; + } + + write_reg(pf, 4, dlen % 256); + write_reg(pf, 5, dlen / 256); + write_reg(pf, 7, 0xa0); /* ATAPI packet command */ + + if (pf_wait(pf, STAT_BUSY, STAT_DRQ, fun, "command DRQ")) { + pi_disconnect(pf->pi); + return -1; + } + + if (read_reg(pf, 2) != 1) { + printk("%s: %s: command phase error\n", pf->name, fun); + pi_disconnect(pf->pi); + return -1; + } + + pi_write_block(pf->pi, cmd, 12); + + return 0; +} + +static int pf_completion(struct pf_unit *pf, char *buf, char *fun) +{ + int r, s, n; + + r = pf_wait(pf, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR, + fun, "completion"); + + if ((read_reg(pf, 2) & 2) && (read_reg(pf, 7) & STAT_DRQ)) { + n = (((read_reg(pf, 4) + 256 * read_reg(pf, 5)) + + 3) & 0xfffc); + pi_read_block(pf->pi, buf, n); + } + + s = pf_wait(pf, STAT_BUSY, STAT_READY | STAT_ERR, fun, "data done"); + + pi_disconnect(pf->pi); + + return (r ? r : s); +} + +static void pf_req_sense(struct pf_unit *pf, int quiet) +{ + char rs_cmd[12] = + { ATAPI_REQ_SENSE, pf->lun << 5, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 }; + char buf[16]; + int r; + + r = pf_command(pf, rs_cmd, 16, "Request sense"); + mdelay(1); + if (!r) + pf_completion(pf, buf, "Request sense"); + + if ((!r) && (!quiet)) + printk("%s: Sense key: %x, ASC: %x, ASQ: %x\n", + pf->name, buf[2] & 0xf, buf[12], buf[13]); +} + +static int pf_atapi(struct pf_unit *pf, char *cmd, int dlen, char *buf, char *fun) +{ + int r; + + r = pf_command(pf, cmd, dlen, fun); + mdelay(1); + if (!r) + r = pf_completion(pf, buf, fun); + if (r) + pf_req_sense(pf, !fun); + + return r; +} + +static void pf_lock(struct pf_unit *pf, int func) +{ + char lo_cmd[12] = { ATAPI_LOCK, pf->lun << 5, 0, 0, func, 0, 0, 0, 0, 0, 0, 0 }; + + pf_atapi(pf, lo_cmd, 0, pf_scratch, func ? "lock" : "unlock"); +} + +static void pf_eject(struct pf_unit *pf) +{ + char ej_cmd[12] = { ATAPI_DOOR, pf->lun << 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 }; + + pf_lock(pf, 0); + pf_atapi(pf, ej_cmd, 0, pf_scratch, "eject"); +} + +#define PF_RESET_TMO 30 /* in tenths of a second */ + +static void pf_sleep(int cs) +{ + schedule_timeout_interruptible(cs); +} + +/* the ATAPI standard actually specifies the contents of all 7 registers + after a reset, but the specification is ambiguous concerning the last + two bytes, and different drives interpret the standard differently. + */ + +static int pf_reset(struct pf_unit *pf) +{ + int i, k, flg; + int expect[5] = { 1, 1, 1, 0x14, 0xeb }; + + pi_connect(pf->pi); + write_reg(pf, 6, 0xa0+0x10*pf->drive); + write_reg(pf, 7, 8); + + pf_sleep(20 * HZ / 1000); + + k = 0; + while ((k++ < PF_RESET_TMO) && (status_reg(pf) & STAT_BUSY)) + pf_sleep(HZ / 10); + + flg = 1; + for (i = 0; i < 5; i++) + flg &= (read_reg(pf, i + 1) == expect[i]); + + if (verbose) { + printk("%s: Reset (%d) signature = ", pf->name, k); + for (i = 0; i < 5; i++) + printk("%3x", read_reg(pf, i + 1)); + if (!flg) + printk(" (incorrect)"); + printk("\n"); + } + + pi_disconnect(pf->pi); + return flg - 1; +} + +static void pf_mode_sense(struct pf_unit *pf) +{ + char ms_cmd[12] = + { ATAPI_MODE_SENSE, pf->lun << 5, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0 }; + char buf[8]; + + pf_atapi(pf, ms_cmd, 8, buf, "mode sense"); + pf->media_status = PF_RW; + if (buf[3] & 0x80) + pf->media_status = PF_RO; +} + +static void xs(char *buf, char *targ, int offs, int len) +{ + int j, k, l; + + j = 0; + l = 0; + for (k = 0; k < len; k++) + if ((buf[k + offs] != 0x20) || (buf[k + offs] != l)) + l = targ[j++] = buf[k + offs]; + if (l == 0x20) + j--; + targ[j] = 0; +} + +static int xl(char *buf, int offs) +{ + int v, k; + + v = 0; + for (k = 0; k < 4; k++) + v = v * 256 + (buf[k + offs] & 0xff); + return v; +} + +static void pf_get_capacity(struct pf_unit *pf) +{ + char rc_cmd[12] = { ATAPI_CAPACITY, pf->lun << 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + char buf[8]; + int bs; + + if (pf_atapi(pf, rc_cmd, 8, buf, "get capacity")) { + pf->media_status = PF_NM; + return; + } + set_capacity(pf->disk, xl(buf, 0) + 1); + bs = xl(buf, 4); + if (bs != 512) { + set_capacity(pf->disk, 0); + if (verbose) + printk("%s: Drive %d, LUN %d," + " unsupported block size %d\n", + pf->name, pf->drive, pf->lun, bs); + } +} + +static int pf_identify(struct pf_unit *pf) +{ + int dt, s; + char *ms[2] = { "master", "slave" }; + char mf[10], id[18]; + char id_cmd[12] = + { ATAPI_IDENTIFY, pf->lun << 5, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 }; + char buf[36]; + + s = pf_atapi(pf, id_cmd, 36, buf, "identify"); + if (s) + return -1; + + dt = buf[0] & 0x1f; + if ((dt != 0) && (dt != 7)) { + if (verbose) + printk("%s: Drive %d, LUN %d, unsupported type %d\n", + pf->name, pf->drive, pf->lun, dt); + return -1; + } + + xs(buf, mf, 8, 8); + xs(buf, id, 16, 16); + + pf->removable = (buf[1] & 0x80); + + pf_mode_sense(pf); + pf_mode_sense(pf); + pf_mode_sense(pf); + + pf_get_capacity(pf); + + printk("%s: %s %s, %s LUN %d, type %d", + pf->name, mf, id, ms[pf->drive], pf->lun, dt); + if (pf->removable) + printk(", removable"); + if (pf->media_status == PF_NM) + printk(", no media\n"); + else { + if (pf->media_status == PF_RO) + printk(", RO"); + printk(", %llu blocks\n", + (unsigned long long)get_capacity(pf->disk)); + } + return 0; +} + +/* returns 0, with id set if drive is detected + -1, if drive detection failed +*/ +static int pf_probe(struct pf_unit *pf) +{ + if (pf->drive == -1) { + for (pf->drive = 0; pf->drive <= 1; pf->drive++) + if (!pf_reset(pf)) { + if (pf->lun != -1) + return pf_identify(pf); + else + for (pf->lun = 0; pf->lun < 8; pf->lun++) + if (!pf_identify(pf)) + return 0; + } + } else { + if (pf_reset(pf)) + return -1; + if (pf->lun != -1) + return pf_identify(pf); + for (pf->lun = 0; pf->lun < 8; pf->lun++) + if (!pf_identify(pf)) + return 0; + } + return -1; +} + +static int pf_detect(void) +{ + struct pf_unit *pf = units; + int k, unit; + + printk("%s: %s version %s, major %d, cluster %d, nice %d\n", + name, name, PF_VERSION, major, cluster, nice); + + k = 0; + if (pf_drive_count == 0) { + if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF, + verbose, pf->name)) { + if (!pf_probe(pf) && pf->disk) { + pf->present = 1; + k++; + } else + pi_release(pf->pi); + } + + } else + for (unit = 0; unit < PF_UNITS; unit++, pf++) { + int *conf = *drives[unit]; + if (!conf[D_PRT]) + continue; + if (pi_init(pf->pi, 0, conf[D_PRT], conf[D_MOD], + conf[D_UNI], conf[D_PRO], conf[D_DLY], + pf_scratch, PI_PF, verbose, pf->name)) { + if (pf->disk && !pf_probe(pf)) { + pf->present = 1; + k++; + } else + pi_release(pf->pi); + } + } + if (k) + return 0; + + printk("%s: No ATAPI disk detected\n", name); + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) + put_disk(pf->disk); + return -1; +} + +/* The i/o request engine */ + +static int pf_start(struct pf_unit *pf, int cmd, int b, int c) +{ + int i; + char io_cmd[12] = { cmd, pf->lun << 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + for (i = 0; i < 4; i++) { + io_cmd[5 - i] = b & 0xff; + b = b >> 8; + } + + io_cmd[8] = c & 0xff; + io_cmd[7] = (c >> 8) & 0xff; + + i = pf_command(pf, io_cmd, c * 512, "start i/o"); + + mdelay(1); + + return i; +} + +static int pf_ready(void) +{ + return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask)); +} + +static struct request_queue *pf_queue; + +static void pf_end_request(int err) +{ + if (pf_req && !__blk_end_request_cur(pf_req, err)) + pf_req = NULL; +} + +static void do_pf_request(struct request_queue * q) +{ + if (pf_busy) + return; +repeat: + if (!pf_req) { + pf_req = blk_fetch_request(q); + if (!pf_req) + return; + } + + pf_current = pf_req->rq_disk->private_data; + pf_block = blk_rq_pos(pf_req); + pf_run = blk_rq_sectors(pf_req); + pf_count = blk_rq_cur_sectors(pf_req); + + if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { + pf_end_request(-EIO); + goto repeat; + } + + pf_cmd = rq_data_dir(pf_req); + pf_buf = bio_data(pf_req->bio); + pf_retries = 0; + + pf_busy = 1; + if (pf_cmd == READ) + pi_do_claimed(pf_current->pi, do_pf_read); + else if (pf_cmd == WRITE) + pi_do_claimed(pf_current->pi, do_pf_write); + else { + pf_busy = 0; + pf_end_request(-EIO); + goto repeat; + } +} + +static int pf_next_buf(void) +{ + unsigned long saved_flags; + + pf_count--; + pf_run--; + pf_buf += 512; + pf_block++; + if (!pf_run) + return 1; + if (!pf_count) { + spin_lock_irqsave(&pf_spin_lock, saved_flags); + pf_end_request(0); + spin_unlock_irqrestore(&pf_spin_lock, saved_flags); + if (!pf_req) + return 1; + pf_count = blk_rq_cur_sectors(pf_req); + pf_buf = bio_data(pf_req->bio); + } + return 0; +} + +static inline void next_request(int err) +{ + unsigned long saved_flags; + + spin_lock_irqsave(&pf_spin_lock, saved_flags); + pf_end_request(err); + pf_busy = 0; + do_pf_request(pf_queue); + spin_unlock_irqrestore(&pf_spin_lock, saved_flags); +} + +/* detach from the calling context - in case the spinlock is held */ +static void do_pf_read(void) +{ + ps_set_intr(do_pf_read_start, NULL, 0, nice); +} + +static void do_pf_read_start(void) +{ + pf_busy = 1; + + if (pf_start(pf_current, ATAPI_READ_10, pf_block, pf_run)) { + pi_disconnect(pf_current->pi); + if (pf_retries < PF_MAX_RETRIES) { + pf_retries++; + pi_do_claimed(pf_current->pi, do_pf_read_start); + return; + } + next_request(-EIO); + return; + } + pf_mask = STAT_DRQ; + ps_set_intr(do_pf_read_drq, pf_ready, PF_TMO, nice); +} + +static void do_pf_read_drq(void) +{ + while (1) { + if (pf_wait(pf_current, STAT_BUSY, STAT_DRQ | STAT_ERR, + "read block", "completion") & STAT_ERR) { + pi_disconnect(pf_current->pi); + if (pf_retries < PF_MAX_RETRIES) { + pf_req_sense(pf_current, 0); + pf_retries++; + pi_do_claimed(pf_current->pi, do_pf_read_start); + return; + } + next_request(-EIO); + return; + } + pi_read_block(pf_current->pi, pf_buf, 512); + if (pf_next_buf()) + break; + } + pi_disconnect(pf_current->pi); + next_request(0); +} + +static void do_pf_write(void) +{ + ps_set_intr(do_pf_write_start, NULL, 0, nice); +} + +static void do_pf_write_start(void) +{ + pf_busy = 1; + + if (pf_start(pf_current, ATAPI_WRITE_10, pf_block, pf_run)) { + pi_disconnect(pf_current->pi); + if (pf_retries < PF_MAX_RETRIES) { + pf_retries++; + pi_do_claimed(pf_current->pi, do_pf_write_start); + return; + } + next_request(-EIO); + return; + } + + while (1) { + if (pf_wait(pf_current, STAT_BUSY, STAT_DRQ | STAT_ERR, + "write block", "data wait") & STAT_ERR) { + pi_disconnect(pf_current->pi); + if (pf_retries < PF_MAX_RETRIES) { + pf_retries++; + pi_do_claimed(pf_current->pi, do_pf_write_start); + return; + } + next_request(-EIO); + return; + } + pi_write_block(pf_current->pi, pf_buf, 512); + if (pf_next_buf()) + break; + } + pf_mask = 0; + ps_set_intr(do_pf_write_done, pf_ready, PF_TMO, nice); +} + +static void do_pf_write_done(void) +{ + if (pf_wait(pf_current, STAT_BUSY, 0, "write block", "done") & STAT_ERR) { + pi_disconnect(pf_current->pi); + if (pf_retries < PF_MAX_RETRIES) { + pf_retries++; + pi_do_claimed(pf_current->pi, do_pf_write_start); + return; + } + next_request(-EIO); + return; + } + pi_disconnect(pf_current->pi); + next_request(0); +} + +static int __init pf_init(void) +{ /* preliminary initialisation */ + struct pf_unit *pf; + int unit; + + if (disable) + return -EINVAL; + + pf_init_units(); + + if (pf_detect()) + return -ENODEV; + pf_busy = 0; + + if (register_blkdev(major, name)) { + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) + put_disk(pf->disk); + return -EBUSY; + } + pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock); + if (!pf_queue) { + unregister_blkdev(major, name); + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) + put_disk(pf->disk); + return -ENOMEM; + } + + blk_queue_max_segments(pf_queue, cluster); + + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { + struct gendisk *disk = pf->disk; + + if (!pf->present) + continue; + disk->private_data = pf; + disk->queue = pf_queue; + add_disk(disk); + } + return 0; +} + +static void __exit pf_exit(void) +{ + struct pf_unit *pf; + int unit; + unregister_blkdev(major, name); + for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { + if (!pf->present) + continue; + del_gendisk(pf->disk); + put_disk(pf->disk); + pi_release(pf->pi); + } + blk_cleanup_queue(pf_queue); +} + +MODULE_LICENSE("GPL"); +module_init(pf_init) +module_exit(pf_exit) diff --git a/kernel/drivers/block/paride/pg.c b/kernel/drivers/block/paride/pg.c new file mode 100644 index 000000000..876d0c3ea --- /dev/null +++ b/kernel/drivers/block/paride/pg.c @@ -0,0 +1,726 @@ +/* + pg.c (c) 1998 Grant R. Guenther + Under the terms of the GNU General Public License. + + The pg driver provides a simple character device interface for + sending ATAPI commands to a device. With the exception of the + ATAPI reset operation, all operations are performed by a pair + of read and write operations to the appropriate /dev/pgN device. + A write operation delivers a command and any outbound data in + a single buffer. Normally, the write will succeed unless the + device is offline or malfunctioning, or there is already another + command pending. If the write succeeds, it should be followed + immediately by a read operation, to obtain any returned data and + status information. A read will fail if there is no operation + in progress. + + As a special case, the device can be reset with a write operation, + and in this case, no following read is expected, or permitted. + + There are no ioctl() operations. Any single operation + may transfer at most PG_MAX_DATA bytes. Note that the driver must + copy the data through an internal buffer. In keeping with all + current ATAPI devices, command packets are assumed to be exactly + 12 bytes in length. + + To permit future changes to this interface, the headers in the + read and write buffers contain a single character "magic" flag. + Currently this flag must be the character "P". + + By default, the driver will autoprobe for a single parallel + port ATAPI device, but if their individual parameters are + specified, the driver can handle up to 4 devices. + + To use this device, you must have the following device + special files defined: + + /dev/pg0 c 97 0 + /dev/pg1 c 97 1 + /dev/pg2 c 97 2 + /dev/pg3 c 97 3 + + (You'll need to change the 97 to something else if you use + the 'major' parameter to install the driver on a different + major number.) + + The behaviour of the pg driver can be altered by setting + some parameters from the insmod command line. The following + parameters are adjustable: + + drive0 These four arguments can be arrays of + drive1 1-6 integers as follows: + drive2 + drive3 ,,,,, + + Where, + + is the base of the parallel port address for + the corresponding drive. (required) + + is the protocol number for the adapter that + supports this drive. These numbers are + logged by 'paride' when the protocol modules + are initialised. (0 if not given) + + for those adapters that support chained + devices, this is the unit selector for the + chain of devices on the given port. It should + be zero for devices that don't support chaining. + (0 if not given) + + this can be -1 to choose the best mode, or one + of the mode numbers supported by the adapter. + (-1 if not given) + + ATAPI devices can be jumpered to master or slave. + Set this to 0 to choose the master drive, 1 to + choose the slave, -1 (the default) to choose the + first drive found. + + some parallel ports require the driver to + go more slowly. -1 sets a default value that + should work with the chosen protocol. Otherwise, + set this to a small integer, the larger it is + the slower the port i/o. In some cases, setting + this to zero will speed up the device. (default -1) + + major You may use this parameter to overide the + default major number (97) that this driver + will use. Be sure to change the device + name as well. + + name This parameter is a character string that + contains the name the kernel will use for this + device (in /proc output, for instance). + (default "pg"). + + verbose This parameter controls the amount of logging + that is done by the driver. Set it to 0 for + quiet operation, to 1 to enable progress + messages while the driver probes for devices, + or to 2 for full debug logging. (default 0) + + If this driver is built into the kernel, you can use + the following command line parameters, with the same values + as the corresponding module parameters listed above: + + pg.drive0 + pg.drive1 + pg.drive2 + pg.drive3 + + In addition, you can use the parameter pg.disable to disable + the driver entirely. + +*/ + +/* Changes: + + 1.01 GRG 1998.06.16 Bug fixes + 1.02 GRG 1998.09.24 Added jumbo support + +*/ + +#define PG_VERSION "1.02" +#define PG_MAJOR 97 +#define PG_NAME "pg" +#define PG_UNITS 4 + +#ifndef PI_PG +#define PI_PG 4 +#endif + +#include +/* Here are things one can override from the insmod command. + Most are autoprobed by paride unless set here. Verbose is 0 + by default. + +*/ + +static int verbose; +static int major = PG_MAJOR; +static char *name = PG_NAME; +static int disable = 0; + +static int drive0[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive1[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive2[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive3[6] = { 0, 0, 0, -1, -1, -1 }; + +static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; +static int pg_drive_count; + +enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY}; + +/* end of parameters */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* current, TASK_* */ +#include +#include + +#include + +module_param(verbose, int, 0644); +module_param(major, int, 0); +module_param(name, charp, 0); +module_param_array(drive0, int, NULL, 0); +module_param_array(drive1, int, NULL, 0); +module_param_array(drive2, int, NULL, 0); +module_param_array(drive3, int, NULL, 0); + +#include "paride.h" + +#define PG_SPIN_DEL 50 /* spin delay in micro-seconds */ +#define PG_SPIN 200 +#define PG_TMO HZ +#define PG_RESET_TMO 10*HZ + +#define STAT_ERR 0x01 +#define STAT_INDEX 0x02 +#define STAT_ECC 0x04 +#define STAT_DRQ 0x08 +#define STAT_SEEK 0x10 +#define STAT_WRERR 0x20 +#define STAT_READY 0x40 +#define STAT_BUSY 0x80 + +#define ATAPI_IDENTIFY 0x12 + +static DEFINE_MUTEX(pg_mutex); +static int pg_open(struct inode *inode, struct file *file); +static int pg_release(struct inode *inode, struct file *file); +static ssize_t pg_read(struct file *filp, char __user *buf, + size_t count, loff_t * ppos); +static ssize_t pg_write(struct file *filp, const char __user *buf, + size_t count, loff_t * ppos); +static int pg_detect(void); + +#define PG_NAMELEN 8 + +struct pg { + struct pi_adapter pia; /* interface to paride layer */ + struct pi_adapter *pi; + int busy; /* write done, read expected */ + int start; /* jiffies at command start */ + int dlen; /* transfer size requested */ + unsigned long timeout; /* timeout requested */ + int status; /* last sense key */ + int drive; /* drive */ + unsigned long access; /* count of active opens ... */ + int present; /* device present ? */ + char *bufptr; + char name[PG_NAMELEN]; /* pg0, pg1, ... */ +}; + +static struct pg devices[PG_UNITS]; + +static int pg_identify(struct pg *dev, int log); + +static char pg_scratch[512]; /* scratch block buffer */ + +static struct class *pg_class; + +/* kernel glue structures */ + +static const struct file_operations pg_fops = { + .owner = THIS_MODULE, + .read = pg_read, + .write = pg_write, + .open = pg_open, + .release = pg_release, + .llseek = noop_llseek, +}; + +static void pg_init_units(void) +{ + int unit; + + pg_drive_count = 0; + for (unit = 0; unit < PG_UNITS; unit++) { + int *parm = *drives[unit]; + struct pg *dev = &devices[unit]; + dev->pi = &dev->pia; + clear_bit(0, &dev->access); + dev->busy = 0; + dev->present = 0; + dev->bufptr = NULL; + dev->drive = parm[D_SLV]; + snprintf(dev->name, PG_NAMELEN, "%s%c", name, 'a'+unit); + if (parm[D_PRT]) + pg_drive_count++; + } +} + +static inline int status_reg(struct pg *dev) +{ + return pi_read_regr(dev->pi, 1, 6); +} + +static inline int read_reg(struct pg *dev, int reg) +{ + return pi_read_regr(dev->pi, 0, reg); +} + +static inline void write_reg(struct pg *dev, int reg, int val) +{ + pi_write_regr(dev->pi, 0, reg, val); +} + +static inline u8 DRIVE(struct pg *dev) +{ + return 0xa0+0x10*dev->drive; +} + +static void pg_sleep(int cs) +{ + schedule_timeout_interruptible(cs); +} + +static int pg_wait(struct pg *dev, int go, int stop, unsigned long tmo, char *msg) +{ + int j, r, e, s, p, to; + + dev->status = 0; + + j = 0; + while ((((r = status_reg(dev)) & go) || (stop && (!(r & stop)))) + && time_before(jiffies, tmo)) { + if (j++ < PG_SPIN) + udelay(PG_SPIN_DEL); + else + pg_sleep(1); + } + + to = time_after_eq(jiffies, tmo); + + if ((r & (STAT_ERR & stop)) || to) { + s = read_reg(dev, 7); + e = read_reg(dev, 1); + p = read_reg(dev, 2); + if (verbose > 1) + printk("%s: %s: stat=0x%x err=0x%x phase=%d%s\n", + dev->name, msg, s, e, p, to ? " timeout" : ""); + if (to) + e |= 0x100; + dev->status = (e >> 4) & 0xff; + return -1; + } + return 0; +} + +static int pg_command(struct pg *dev, char *cmd, int dlen, unsigned long tmo) +{ + int k; + + pi_connect(dev->pi); + + write_reg(dev, 6, DRIVE(dev)); + + if (pg_wait(dev, STAT_BUSY | STAT_DRQ, 0, tmo, "before command")) + goto fail; + + write_reg(dev, 4, dlen % 256); + write_reg(dev, 5, dlen / 256); + write_reg(dev, 7, 0xa0); /* ATAPI packet command */ + + if (pg_wait(dev, STAT_BUSY, STAT_DRQ, tmo, "command DRQ")) + goto fail; + + if (read_reg(dev, 2) != 1) { + printk("%s: command phase error\n", dev->name); + goto fail; + } + + pi_write_block(dev->pi, cmd, 12); + + if (verbose > 1) { + printk("%s: Command sent, dlen=%d packet= ", dev->name, dlen); + for (k = 0; k < 12; k++) + printk("%02x ", cmd[k] & 0xff); + printk("\n"); + } + return 0; +fail: + pi_disconnect(dev->pi); + return -1; +} + +static int pg_completion(struct pg *dev, char *buf, unsigned long tmo) +{ + int r, d, n, p; + + r = pg_wait(dev, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR, + tmo, "completion"); + + dev->dlen = 0; + + while (read_reg(dev, 7) & STAT_DRQ) { + d = (read_reg(dev, 4) + 256 * read_reg(dev, 5)); + n = ((d + 3) & 0xfffc); + p = read_reg(dev, 2) & 3; + if (p == 0) + pi_write_block(dev->pi, buf, n); + if (p == 2) + pi_read_block(dev->pi, buf, n); + if (verbose > 1) + printk("%s: %s %d bytes\n", dev->name, + p ? "Read" : "Write", n); + dev->dlen += (1 - p) * d; + buf += d; + r = pg_wait(dev, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR, + tmo, "completion"); + } + + pi_disconnect(dev->pi); + + return r; +} + +static int pg_reset(struct pg *dev) +{ + int i, k, err; + int expect[5] = { 1, 1, 1, 0x14, 0xeb }; + int got[5]; + + pi_connect(dev->pi); + write_reg(dev, 6, DRIVE(dev)); + write_reg(dev, 7, 8); + + pg_sleep(20 * HZ / 1000); + + k = 0; + while ((k++ < PG_RESET_TMO) && (status_reg(dev) & STAT_BUSY)) + pg_sleep(1); + + for (i = 0; i < 5; i++) + got[i] = read_reg(dev, i + 1); + + err = memcmp(expect, got, sizeof(got)) ? -1 : 0; + + if (verbose) { + printk("%s: Reset (%d) signature = ", dev->name, k); + for (i = 0; i < 5; i++) + printk("%3x", got[i]); + if (err) + printk(" (incorrect)"); + printk("\n"); + } + + pi_disconnect(dev->pi); + return err; +} + +static void xs(char *buf, char *targ, int len) +{ + char l = '\0'; + int k; + + for (k = 0; k < len; k++) { + char c = *buf++; + if (c != ' ' && c != l) + l = *targ++ = c; + } + if (l == ' ') + targ--; + *targ = '\0'; +} + +static int pg_identify(struct pg *dev, int log) +{ + int s; + char *ms[2] = { "master", "slave" }; + char mf[10], id[18]; + char id_cmd[12] = { ATAPI_IDENTIFY, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 }; + char buf[36]; + + s = pg_command(dev, id_cmd, 36, jiffies + PG_TMO); + if (s) + return -1; + s = pg_completion(dev, buf, jiffies + PG_TMO); + if (s) + return -1; + + if (log) { + xs(buf + 8, mf, 8); + xs(buf + 16, id, 16); + printk("%s: %s %s, %s\n", dev->name, mf, id, ms[dev->drive]); + } + + return 0; +} + +/* + * returns 0, with id set if drive is detected + * -1, if drive detection failed + */ +static int pg_probe(struct pg *dev) +{ + if (dev->drive == -1) { + for (dev->drive = 0; dev->drive <= 1; dev->drive++) + if (!pg_reset(dev)) + return pg_identify(dev, 1); + } else { + if (!pg_reset(dev)) + return pg_identify(dev, 1); + } + return -1; +} + +static int pg_detect(void) +{ + struct pg *dev = &devices[0]; + int k, unit; + + printk("%s: %s version %s, major %d\n", name, name, PG_VERSION, major); + + k = 0; + if (pg_drive_count == 0) { + if (pi_init(dev->pi, 1, -1, -1, -1, -1, -1, pg_scratch, + PI_PG, verbose, dev->name)) { + if (!pg_probe(dev)) { + dev->present = 1; + k++; + } else + pi_release(dev->pi); + } + + } else + for (unit = 0; unit < PG_UNITS; unit++, dev++) { + int *parm = *drives[unit]; + if (!parm[D_PRT]) + continue; + if (pi_init(dev->pi, 0, parm[D_PRT], parm[D_MOD], + parm[D_UNI], parm[D_PRO], parm[D_DLY], + pg_scratch, PI_PG, verbose, dev->name)) { + if (!pg_probe(dev)) { + dev->present = 1; + k++; + } else + pi_release(dev->pi); + } + } + + if (k) + return 0; + + printk("%s: No ATAPI device detected\n", name); + return -1; +} + +static int pg_open(struct inode *inode, struct file *file) +{ + int unit = iminor(inode) & 0x7f; + struct pg *dev = &devices[unit]; + int ret = 0; + + mutex_lock(&pg_mutex); + if ((unit >= PG_UNITS) || (!dev->present)) { + ret = -ENODEV; + goto out; + } + + if (test_and_set_bit(0, &dev->access)) { + ret = -EBUSY; + goto out; + } + + if (dev->busy) { + pg_reset(dev); + dev->busy = 0; + } + + pg_identify(dev, (verbose > 1)); + + dev->bufptr = kmalloc(PG_MAX_DATA, GFP_KERNEL); + if (dev->bufptr == NULL) { + clear_bit(0, &dev->access); + printk("%s: buffer allocation failed\n", dev->name); + ret = -ENOMEM; + goto out; + } + + file->private_data = dev; + +out: + mutex_unlock(&pg_mutex); + return ret; +} + +static int pg_release(struct inode *inode, struct file *file) +{ + struct pg *dev = file->private_data; + + kfree(dev->bufptr); + dev->bufptr = NULL; + clear_bit(0, &dev->access); + + return 0; +} + +static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +{ + struct pg *dev = filp->private_data; + struct pg_write_hdr hdr; + int hs = sizeof (hdr); + + if (dev->busy) + return -EBUSY; + if (count < hs) + return -EINVAL; + + if (copy_from_user(&hdr, buf, hs)) + return -EFAULT; + + if (hdr.magic != PG_MAGIC) + return -EINVAL; + if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA) + return -EINVAL; + if ((count - hs) > PG_MAX_DATA) + return -EINVAL; + + if (hdr.func == PG_RESET) { + if (count != hs) + return -EINVAL; + if (pg_reset(dev)) + return -EIO; + return count; + } + + if (hdr.func != PG_COMMAND) + return -EINVAL; + + dev->start = jiffies; + dev->timeout = hdr.timeout * HZ + HZ / 2 + jiffies; + + if (pg_command(dev, hdr.packet, hdr.dlen, jiffies + PG_TMO)) { + if (dev->status & 0x10) + return -ETIME; + return -EIO; + } + + dev->busy = 1; + + if (copy_from_user(dev->bufptr, buf + hs, count - hs)) + return -EFAULT; + return count; +} + +static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + struct pg *dev = filp->private_data; + struct pg_read_hdr hdr; + int hs = sizeof (hdr); + int copy; + + if (!dev->busy) + return -EINVAL; + if (count < hs) + return -EINVAL; + + dev->busy = 0; + + if (pg_completion(dev, dev->bufptr, dev->timeout)) + if (dev->status & 0x10) + return -ETIME; + + memset(&hdr, 0, sizeof(hdr)); + hdr.magic = PG_MAGIC; + hdr.dlen = dev->dlen; + copy = 0; + + if (hdr.dlen < 0) { + hdr.dlen = -1 * hdr.dlen; + copy = hdr.dlen; + if (copy > (count - hs)) + copy = count - hs; + } + + hdr.duration = (jiffies - dev->start + HZ / 2) / HZ; + hdr.scsi = dev->status & 0x0f; + + if (copy_to_user(buf, &hdr, hs)) + return -EFAULT; + if (copy > 0) + if (copy_to_user(buf + hs, dev->bufptr, copy)) + return -EFAULT; + return copy + hs; +} + +static int __init pg_init(void) +{ + int unit; + int err; + + if (disable){ + err = -EINVAL; + goto out; + } + + pg_init_units(); + + if (pg_detect()) { + err = -ENODEV; + goto out; + } + + err = register_chrdev(major, name, &pg_fops); + if (err < 0) { + printk("pg_init: unable to get major number %d\n", major); + for (unit = 0; unit < PG_UNITS; unit++) { + struct pg *dev = &devices[unit]; + if (dev->present) + pi_release(dev->pi); + } + goto out; + } + major = err; /* In case the user specified `major=0' (dynamic) */ + pg_class = class_create(THIS_MODULE, "pg"); + if (IS_ERR(pg_class)) { + err = PTR_ERR(pg_class); + goto out_chrdev; + } + for (unit = 0; unit < PG_UNITS; unit++) { + struct pg *dev = &devices[unit]; + if (dev->present) + device_create(pg_class, NULL, MKDEV(major, unit), NULL, + "pg%u", unit); + } + err = 0; + goto out; + +out_chrdev: + unregister_chrdev(major, "pg"); +out: + return err; +} + +static void __exit pg_exit(void) +{ + int unit; + + for (unit = 0; unit < PG_UNITS; unit++) { + struct pg *dev = &devices[unit]; + if (dev->present) + device_destroy(pg_class, MKDEV(major, unit)); + } + class_destroy(pg_class); + unregister_chrdev(major, name); + + for (unit = 0; unit < PG_UNITS; unit++) { + struct pg *dev = &devices[unit]; + if (dev->present) + pi_release(dev->pi); + } +} + +MODULE_LICENSE("GPL"); +module_init(pg_init) +module_exit(pg_exit) diff --git a/kernel/drivers/block/paride/ppc6lnx.c b/kernel/drivers/block/paride/ppc6lnx.c new file mode 100644 index 000000000..5e5521d3b --- /dev/null +++ b/kernel/drivers/block/paride/ppc6lnx.c @@ -0,0 +1,726 @@ +/* + ppc6lnx.c (c) 2001 Micro Solutions Inc. + Released under the terms of the GNU General Public license + + ppc6lnx.c is a par of the protocol driver for the Micro Solutions + "BACKPACK" parallel port IDE adapter + (Works on Series 6 drives) + +*/ + +//*************************************************************************** + +// PPC 6 Code in C sanitized for LINUX +// Original x86 ASM by Ron, Converted to C by Clive + +//*************************************************************************** + + +#define port_stb 1 +#define port_afd 2 +#define cmd_stb port_afd +#define port_init 4 +#define data_stb port_init +#define port_sel 8 +#define port_int 16 +#define port_dir 0x20 + +#define ECR_EPP 0x80 +#define ECR_BI 0x20 + +//*************************************************************************** + +// 60772 Commands + +#define ACCESS_REG 0x00 +#define ACCESS_PORT 0x40 + +#define ACCESS_READ 0x00 +#define ACCESS_WRITE 0x20 + +// 60772 Command Prefix + +#define CMD_PREFIX_SET 0xe0 // Special command that modifies the next command's operation +#define CMD_PREFIX_RESET 0xc0 // Resets current cmd modifier reg bits + #define PREFIX_IO16 0x01 // perform 16-bit wide I/O + #define PREFIX_FASTWR 0x04 // enable PPC mode fast-write + #define PREFIX_BLK 0x08 // enable block transfer mode + +// 60772 Registers + +#define REG_STATUS 0x00 // status register + #define STATUS_IRQA 0x01 // Peripheral IRQA line + #define STATUS_EEPROM_DO 0x40 // Serial EEPROM data bit +#define REG_VERSION 0x01 // PPC version register (read) +#define REG_HWCFG 0x02 // Hardware Config register +#define REG_RAMSIZE 0x03 // Size of RAM Buffer + #define RAMSIZE_128K 0x02 +#define REG_EEPROM 0x06 // EEPROM control register + #define EEPROM_SK 0x01 // eeprom SK bit + #define EEPROM_DI 0x02 // eeprom DI bit + #define EEPROM_CS 0x04 // eeprom CS bit + #define EEPROM_EN 0x08 // eeprom output enable +#define REG_BLKSIZE 0x08 // Block transfer len (24 bit) + +//*************************************************************************** + +typedef struct ppc_storage { + u16 lpt_addr; // LPT base address + u8 ppc_id; + u8 mode; // operating mode + // 0 = PPC Uni SW + // 1 = PPC Uni FW + // 2 = PPC Bi SW + // 3 = PPC Bi FW + // 4 = EPP Byte + // 5 = EPP Word + // 6 = EPP Dword + u8 ppc_flags; + u8 org_data; // original LPT data port contents + u8 org_ctrl; // original LPT control port contents + u8 cur_ctrl; // current control port contents +} Interface; + +//*************************************************************************** + +// ppc_flags + +#define fifo_wait 0x10 + +//*************************************************************************** + +// DONT CHANGE THESE LEST YOU BREAK EVERYTHING - BIT FIELD DEPENDENCIES + +#define PPCMODE_UNI_SW 0 +#define PPCMODE_UNI_FW 1 +#define PPCMODE_BI_SW 2 +#define PPCMODE_BI_FW 3 +#define PPCMODE_EPP_BYTE 4 +#define PPCMODE_EPP_WORD 5 +#define PPCMODE_EPP_DWORD 6 + +//*************************************************************************** + +static int ppc6_select(Interface *ppc); +static void ppc6_deselect(Interface *ppc); +static void ppc6_send_cmd(Interface *ppc, u8 cmd); +static void ppc6_wr_data_byte(Interface *ppc, u8 data); +static u8 ppc6_rd_data_byte(Interface *ppc); +static u8 ppc6_rd_port(Interface *ppc, u8 port); +static void ppc6_wr_port(Interface *ppc, u8 port, u8 data); +static void ppc6_rd_data_blk(Interface *ppc, u8 *data, long count); +static void ppc6_wait_for_fifo(Interface *ppc); +static void ppc6_wr_data_blk(Interface *ppc, u8 *data, long count); +static void ppc6_rd_port16_blk(Interface *ppc, u8 port, u8 *data, long length); +static void ppc6_wr_port16_blk(Interface *ppc, u8 port, u8 *data, long length); +static void ppc6_wr_extout(Interface *ppc, u8 regdata); +static int ppc6_open(Interface *ppc); +static void ppc6_close(Interface *ppc); + +//*************************************************************************** + +static int ppc6_select(Interface *ppc) +{ + u8 i, j, k; + + i = inb(ppc->lpt_addr + 1); + + if (i & 1) + outb(i, ppc->lpt_addr + 1); + + ppc->org_data = inb(ppc->lpt_addr); + + ppc->org_ctrl = inb(ppc->lpt_addr + 2) & 0x5F; // readback ctrl + + ppc->cur_ctrl = ppc->org_ctrl; + + ppc->cur_ctrl |= port_sel; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + if (ppc->org_data == 'b') + outb('x', ppc->lpt_addr); + + outb('b', ppc->lpt_addr); + outb('p', ppc->lpt_addr); + outb(ppc->ppc_id, ppc->lpt_addr); + outb(~ppc->ppc_id,ppc->lpt_addr); + + ppc->cur_ctrl &= ~port_sel; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + ppc->cur_ctrl = (ppc->cur_ctrl & port_int) | port_init; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + i = ppc->mode & 0x0C; + + if (i == 0) + i = (ppc->mode & 2) | 1; + + outb(i, ppc->lpt_addr); + + ppc->cur_ctrl |= port_sel; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + // DELAY + + ppc->cur_ctrl |= port_afd; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + j = ((i & 0x08) << 4) | ((i & 0x07) << 3); + + k = inb(ppc->lpt_addr + 1) & 0xB8; + + if (j == k) + { + ppc->cur_ctrl &= ~port_afd; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + k = (inb(ppc->lpt_addr + 1) & 0xB8) ^ 0xB8; + + if (j == k) + { + if (i & 4) // EPP + ppc->cur_ctrl &= ~(port_sel | port_init); + else // PPC/ECP + ppc->cur_ctrl &= ~port_sel; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + return(1); + } + } + + outb(ppc->org_ctrl, ppc->lpt_addr + 2); + + outb(ppc->org_data, ppc->lpt_addr); + + return(0); // FAIL +} + +//*************************************************************************** + +static void ppc6_deselect(Interface *ppc) +{ + if (ppc->mode & 4) // EPP + ppc->cur_ctrl |= port_init; + else // PPC/ECP + ppc->cur_ctrl |= port_sel; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + outb(ppc->org_data, ppc->lpt_addr); + + outb((ppc->org_ctrl | port_sel), ppc->lpt_addr + 2); + + outb(ppc->org_ctrl, ppc->lpt_addr + 2); +} + +//*************************************************************************** + +static void ppc6_send_cmd(Interface *ppc, u8 cmd) +{ + switch(ppc->mode) + { + case PPCMODE_UNI_SW : + case PPCMODE_UNI_FW : + case PPCMODE_BI_SW : + case PPCMODE_BI_FW : + { + outb(cmd, ppc->lpt_addr); + + ppc->cur_ctrl ^= cmd_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + + case PPCMODE_EPP_BYTE : + case PPCMODE_EPP_WORD : + case PPCMODE_EPP_DWORD : + { + outb(cmd, ppc->lpt_addr + 3); + + break; + } + } +} + +//*************************************************************************** + +static void ppc6_wr_data_byte(Interface *ppc, u8 data) +{ + switch(ppc->mode) + { + case PPCMODE_UNI_SW : + case PPCMODE_UNI_FW : + case PPCMODE_BI_SW : + case PPCMODE_BI_FW : + { + outb(data, ppc->lpt_addr); + + ppc->cur_ctrl ^= data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + + case PPCMODE_EPP_BYTE : + case PPCMODE_EPP_WORD : + case PPCMODE_EPP_DWORD : + { + outb(data, ppc->lpt_addr + 4); + + break; + } + } +} + +//*************************************************************************** + +static u8 ppc6_rd_data_byte(Interface *ppc) +{ + u8 data = 0; + + switch(ppc->mode) + { + case PPCMODE_UNI_SW : + case PPCMODE_UNI_FW : + { + ppc->cur_ctrl = (ppc->cur_ctrl & ~port_stb) ^ data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + // DELAY + + data = inb(ppc->lpt_addr + 1); + + data = ((data & 0x80) >> 1) | ((data & 0x38) >> 3); + + ppc->cur_ctrl |= port_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + // DELAY + + data |= inb(ppc->lpt_addr + 1) & 0xB8; + + break; + } + + case PPCMODE_BI_SW : + case PPCMODE_BI_FW : + { + ppc->cur_ctrl |= port_dir; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + ppc->cur_ctrl = (ppc->cur_ctrl | port_stb) ^ data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + data = inb(ppc->lpt_addr); + + ppc->cur_ctrl &= ~port_stb; + + outb(ppc->cur_ctrl,ppc->lpt_addr + 2); + + ppc->cur_ctrl &= ~port_dir; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + + case PPCMODE_EPP_BYTE : + case PPCMODE_EPP_WORD : + case PPCMODE_EPP_DWORD : + { + outb((ppc->cur_ctrl | port_dir),ppc->lpt_addr + 2); + + data = inb(ppc->lpt_addr + 4); + + outb(ppc->cur_ctrl,ppc->lpt_addr + 2); + + break; + } + } + + return(data); +} + +//*************************************************************************** + +static u8 ppc6_rd_port(Interface *ppc, u8 port) +{ + ppc6_send_cmd(ppc,(u8)(port | ACCESS_PORT | ACCESS_READ)); + + return(ppc6_rd_data_byte(ppc)); +} + +//*************************************************************************** + +static void ppc6_wr_port(Interface *ppc, u8 port, u8 data) +{ + ppc6_send_cmd(ppc,(u8)(port | ACCESS_PORT | ACCESS_WRITE)); + + ppc6_wr_data_byte(ppc, data); +} + +//*************************************************************************** + +static void ppc6_rd_data_blk(Interface *ppc, u8 *data, long count) +{ + switch(ppc->mode) + { + case PPCMODE_UNI_SW : + case PPCMODE_UNI_FW : + { + while(count) + { + u8 d; + + ppc->cur_ctrl = (ppc->cur_ctrl & ~port_stb) ^ data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + // DELAY + + d = inb(ppc->lpt_addr + 1); + + d = ((d & 0x80) >> 1) | ((d & 0x38) >> 3); + + ppc->cur_ctrl |= port_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + // DELAY + + d |= inb(ppc->lpt_addr + 1) & 0xB8; + + *data++ = d; + count--; + } + + break; + } + + case PPCMODE_BI_SW : + case PPCMODE_BI_FW : + { + ppc->cur_ctrl |= port_dir; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + ppc->cur_ctrl |= port_stb; + + while(count) + { + ppc->cur_ctrl ^= data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + *data++ = inb(ppc->lpt_addr); + count--; + } + + ppc->cur_ctrl &= ~port_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + ppc->cur_ctrl &= ~port_dir; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + + case PPCMODE_EPP_BYTE : + { + outb((ppc->cur_ctrl | port_dir), ppc->lpt_addr + 2); + + // DELAY + + while(count) + { + *data++ = inb(ppc->lpt_addr + 4); + count--; + } + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + + case PPCMODE_EPP_WORD : + { + outb((ppc->cur_ctrl | port_dir), ppc->lpt_addr + 2); + + // DELAY + + while(count > 1) + { + *((u16 *)data) = inw(ppc->lpt_addr + 4); + data += 2; + count -= 2; + } + + while(count) + { + *data++ = inb(ppc->lpt_addr + 4); + count--; + } + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + + case PPCMODE_EPP_DWORD : + { + outb((ppc->cur_ctrl | port_dir),ppc->lpt_addr + 2); + + // DELAY + + while(count > 3) + { + *((u32 *)data) = inl(ppc->lpt_addr + 4); + data += 4; + count -= 4; + } + + while(count) + { + *data++ = inb(ppc->lpt_addr + 4); + count--; + } + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + break; + } + } + +} + +//*************************************************************************** + +static void ppc6_wait_for_fifo(Interface *ppc) +{ + int i; + + if (ppc->ppc_flags & fifo_wait) + { + for(i=0; i<20; i++) + inb(ppc->lpt_addr + 1); + } +} + +//*************************************************************************** + +static void ppc6_wr_data_blk(Interface *ppc, u8 *data, long count) +{ + switch(ppc->mode) + { + case PPCMODE_UNI_SW : + case PPCMODE_BI_SW : + { + while(count--) + { + outb(*data++, ppc->lpt_addr); + + ppc->cur_ctrl ^= data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + } + + break; + } + + case PPCMODE_UNI_FW : + case PPCMODE_BI_FW : + { + u8 this, last; + + ppc6_send_cmd(ppc,(CMD_PREFIX_SET | PREFIX_FASTWR)); + + ppc->cur_ctrl |= port_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + last = *data; + + outb(last, ppc->lpt_addr); + + while(count) + { + this = *data++; + count--; + + if (this == last) + { + ppc->cur_ctrl ^= data_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + } + else + { + outb(this, ppc->lpt_addr); + + last = this; + } + } + + ppc->cur_ctrl &= ~port_stb; + + outb(ppc->cur_ctrl, ppc->lpt_addr + 2); + + ppc6_send_cmd(ppc,(CMD_PREFIX_RESET | PREFIX_FASTWR)); + + break; + } + + case PPCMODE_EPP_BYTE : + { + while(count) + { + outb(*data++,ppc->lpt_addr + 4); + count--; + } + + ppc6_wait_for_fifo(ppc); + + break; + } + + case PPCMODE_EPP_WORD : + { + while(count > 1) + { + outw(*((u16 *)data),ppc->lpt_addr + 4); + data += 2; + count -= 2; + } + + while(count) + { + outb(*data++,ppc->lpt_addr + 4); + count--; + } + + ppc6_wait_for_fifo(ppc); + + break; + } + + case PPCMODE_EPP_DWORD : + { + while(count > 3) + { + outl(*((u32 *)data),ppc->lpt_addr + 4); + data += 4; + count -= 4; + } + + while(count) + { + outb(*data++,ppc->lpt_addr + 4); + count--; + } + + ppc6_wait_for_fifo(ppc); + + break; + } + } +} + +//*************************************************************************** + +static void ppc6_rd_port16_blk(Interface *ppc, u8 port, u8 *data, long length) +{ + length = length << 1; + + ppc6_send_cmd(ppc, (REG_BLKSIZE | ACCESS_REG | ACCESS_WRITE)); + ppc6_wr_data_byte(ppc,(u8)length); + ppc6_wr_data_byte(ppc,(u8)(length >> 8)); + ppc6_wr_data_byte(ppc,0); + + ppc6_send_cmd(ppc, (CMD_PREFIX_SET | PREFIX_IO16 | PREFIX_BLK)); + + ppc6_send_cmd(ppc, (u8)(port | ACCESS_PORT | ACCESS_READ)); + + ppc6_rd_data_blk(ppc, data, length); + + ppc6_send_cmd(ppc, (CMD_PREFIX_RESET | PREFIX_IO16 | PREFIX_BLK)); +} + +//*************************************************************************** + +static void ppc6_wr_port16_blk(Interface *ppc, u8 port, u8 *data, long length) +{ + length = length << 1; + + ppc6_send_cmd(ppc, (REG_BLKSIZE | ACCESS_REG | ACCESS_WRITE)); + ppc6_wr_data_byte(ppc,(u8)length); + ppc6_wr_data_byte(ppc,(u8)(length >> 8)); + ppc6_wr_data_byte(ppc,0); + + ppc6_send_cmd(ppc, (CMD_PREFIX_SET | PREFIX_IO16 | PREFIX_BLK)); + + ppc6_send_cmd(ppc, (u8)(port | ACCESS_PORT | ACCESS_WRITE)); + + ppc6_wr_data_blk(ppc, data, length); + + ppc6_send_cmd(ppc, (CMD_PREFIX_RESET | PREFIX_IO16 | PREFIX_BLK)); +} + +//*************************************************************************** + +static void ppc6_wr_extout(Interface *ppc, u8 regdata) +{ + ppc6_send_cmd(ppc,(REG_VERSION | ACCESS_REG | ACCESS_WRITE)); + + ppc6_wr_data_byte(ppc, (u8)((regdata & 0x03) << 6)); +} + +//*************************************************************************** + +static int ppc6_open(Interface *ppc) +{ + int ret; + + ret = ppc6_select(ppc); + + if (ret == 0) + return(ret); + + ppc->ppc_flags &= ~fifo_wait; + + ppc6_send_cmd(ppc, (ACCESS_REG | ACCESS_WRITE | REG_RAMSIZE)); + ppc6_wr_data_byte(ppc, RAMSIZE_128K); + + ppc6_send_cmd(ppc, (ACCESS_REG | ACCESS_READ | REG_VERSION)); + + if ((ppc6_rd_data_byte(ppc) & 0x3F) == 0x0C) + ppc->ppc_flags |= fifo_wait; + + return(ret); +} + +//*************************************************************************** + +static void ppc6_close(Interface *ppc) +{ + ppc6_deselect(ppc); +} + +//*************************************************************************** + diff --git a/kernel/drivers/block/paride/pseudo.h b/kernel/drivers/block/paride/pseudo.h new file mode 100644 index 000000000..bc3703294 --- /dev/null +++ b/kernel/drivers/block/paride/pseudo.h @@ -0,0 +1,102 @@ +/* + pseudo.h (c) 1997-8 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is the "pseudo-interrupt" logic for parallel port drivers. + + This module is #included into each driver. It makes one + function available: + + ps_set_intr( void (*continuation)(void), + int (*ready)(void), + int timeout, + int nice ) + + Which will arrange for ready() to be evaluated frequently and + when either it returns true, or timeout jiffies have passed, + continuation() will be invoked. + + If nice is 1, the test will done approximately once a + jiffy. If nice is 0, the test will also be done whenever + the scheduler runs (by adding it to a task queue). If + nice is greater than 1, the test will be done once every + (nice-1) jiffies. + +*/ + +/* Changes: + + 1.01 1998.05.03 Switched from cli()/sti() to spinlocks + 1.02 1998.12.14 Added support for nice > 1 +*/ + +#define PS_VERSION "1.02" + +#include +#include + +static void ps_tq_int(struct work_struct *work); + +static void (* ps_continuation)(void); +static int (* ps_ready)(void); +static unsigned long ps_timeout; +static int ps_tq_active = 0; +static int ps_nice = 0; + +static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); + +static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); + +static void ps_set_intr(void (*continuation)(void), + int (*ready)(void), + int timeout, int nice) +{ + unsigned long flags; + + spin_lock_irqsave(&ps_spinlock,flags); + + ps_continuation = continuation; + ps_ready = ready; + ps_timeout = jiffies + timeout; + ps_nice = nice; + + if (!ps_tq_active) { + ps_tq_active = 1; + if (!ps_nice) + schedule_delayed_work(&ps_tq, 0); + else + schedule_delayed_work(&ps_tq, ps_nice-1); + } + spin_unlock_irqrestore(&ps_spinlock,flags); +} + +static void ps_tq_int(struct work_struct *work) +{ + void (*con)(void); + unsigned long flags; + + spin_lock_irqsave(&ps_spinlock,flags); + + con = ps_continuation; + ps_tq_active = 0; + + if (!con) { + spin_unlock_irqrestore(&ps_spinlock,flags); + return; + } + if (!ps_ready || ps_ready() || time_after_eq(jiffies, ps_timeout)) { + ps_continuation = NULL; + spin_unlock_irqrestore(&ps_spinlock,flags); + con(); + return; + } + ps_tq_active = 1; + if (!ps_nice) + schedule_delayed_work(&ps_tq, 0); + else + schedule_delayed_work(&ps_tq, ps_nice-1); + spin_unlock_irqrestore(&ps_spinlock,flags); +} + +/* end of pseudo.h */ + diff --git a/kernel/drivers/block/paride/pt.c b/kernel/drivers/block/paride/pt.c new file mode 100644 index 000000000..2596042eb --- /dev/null +++ b/kernel/drivers/block/paride/pt.c @@ -0,0 +1,1016 @@ +/* + pt.c (c) 1998 Grant R. Guenther + Under the terms of the GNU General Public License. + + This is the high-level driver for parallel port ATAPI tape + drives based on chips supported by the paride module. + + The driver implements both rewinding and non-rewinding + devices, filemarks, and the rewind ioctl. It allocates + a small internal "bounce buffer" for each open device, but + otherwise expects buffering and blocking to be done at the + user level. As with most block-structured tapes, short + writes are padded to full tape blocks, so reading back a file + may return more data than was actually written. + + By default, the driver will autoprobe for a single parallel + port ATAPI tape drive, but if their individual parameters are + specified, the driver can handle up to 4 drives. + + The rewinding devices are named /dev/pt0, /dev/pt1, ... + while the non-rewinding devices are /dev/npt0, /dev/npt1, etc. + + The behaviour of the pt driver can be altered by setting + some parameters from the insmod command line. The following + parameters are adjustable: + + drive0 These four arguments can be arrays of + drive1 1-6 integers as follows: + drive2 + drive3 ,,,,, + + Where, + + is the base of the parallel port address for + the corresponding drive. (required) + + is the protocol number for the adapter that + supports this drive. These numbers are + logged by 'paride' when the protocol modules + are initialised. (0 if not given) + + for those adapters that support chained + devices, this is the unit selector for the + chain of devices on the given port. It should + be zero for devices that don't support chaining. + (0 if not given) + + this can be -1 to choose the best mode, or one + of the mode numbers supported by the adapter. + (-1 if not given) + + ATAPI devices can be jumpered to master or slave. + Set this to 0 to choose the master drive, 1 to + choose the slave, -1 (the default) to choose the + first drive found. + + some parallel ports require the driver to + go more slowly. -1 sets a default value that + should work with the chosen protocol. Otherwise, + set this to a small integer, the larger it is + the slower the port i/o. In some cases, setting + this to zero will speed up the device. (default -1) + + major You may use this parameter to overide the + default major number (96) that this driver + will use. Be sure to change the device + name as well. + + name This parameter is a character string that + contains the name the kernel will use for this + device (in /proc output, for instance). + (default "pt"). + + verbose This parameter controls the amount of logging + that the driver will do. Set it to 0 for + normal operation, 1 to see autoprobe progress + messages, or 2 to see additional debugging + output. (default 0) + + If this driver is built into the kernel, you can use + the following command line parameters, with the same values + as the corresponding module parameters listed above: + + pt.drive0 + pt.drive1 + pt.drive2 + pt.drive3 + + In addition, you can use the parameter pt.disable to disable + the driver entirely. + +*/ + +/* Changes: + + 1.01 GRG 1998.05.06 Round up transfer size, fix ready_wait, + loosed interpretation of ATAPI standard + for clearing error status. + Eliminate sti(); + 1.02 GRG 1998.06.16 Eliminate an Ugh. + 1.03 GRG 1998.08.15 Adjusted PT_TMO, use HZ in loop timing, + extra debugging + 1.04 GRG 1998.09.24 Repair minor coding error, added jumbo support + +*/ + +#define PT_VERSION "1.04" +#define PT_MAJOR 96 +#define PT_NAME "pt" +#define PT_UNITS 4 + +#include + +/* Here are things one can override from the insmod command. + Most are autoprobed by paride unless set here. Verbose is on + by default. + +*/ + +static bool verbose = 0; +static int major = PT_MAJOR; +static char *name = PT_NAME; +static int disable = 0; + +static int drive0[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive1[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive2[6] = { 0, 0, 0, -1, -1, -1 }; +static int drive3[6] = { 0, 0, 0, -1, -1, -1 }; + +static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3}; + +#define D_PRT 0 +#define D_PRO 1 +#define D_UNI 2 +#define D_MOD 3 +#define D_SLV 4 +#define D_DLY 5 + +#define DU (*drives[unit]) + +/* end of parameters */ + +#include +#include +#include +#include +#include +#include +#include +#include /* current, TASK_*, schedule_timeout() */ +#include + +#include + +module_param(verbose, bool, 0); +module_param(major, int, 0); +module_param(name, charp, 0); +module_param_array(drive0, int, NULL, 0); +module_param_array(drive1, int, NULL, 0); +module_param_array(drive2, int, NULL, 0); +module_param_array(drive3, int, NULL, 0); + +#include "paride.h" + +#define PT_MAX_RETRIES 5 +#define PT_TMO 3000 /* interrupt timeout in jiffies */ +#define PT_SPIN_DEL 50 /* spin delay in micro-seconds */ +#define PT_RESET_TMO 30 /* 30 seconds */ +#define PT_READY_TMO 60 /* 60 seconds */ +#define PT_REWIND_TMO 1200 /* 20 minutes */ + +#define PT_SPIN ((1000000/(HZ*PT_SPIN_DEL))*PT_TMO) + +#define STAT_ERR 0x00001 +#define STAT_INDEX 0x00002 +#define STAT_ECC 0x00004 +#define STAT_DRQ 0x00008 +#define STAT_SEEK 0x00010 +#define STAT_WRERR 0x00020 +#define STAT_READY 0x00040 +#define STAT_BUSY 0x00080 +#define STAT_SENSE 0x1f000 + +#define ATAPI_TEST_READY 0x00 +#define ATAPI_REWIND 0x01 +#define ATAPI_REQ_SENSE 0x03 +#define ATAPI_READ_6 0x08 +#define ATAPI_WRITE_6 0x0a +#define ATAPI_WFM 0x10 +#define ATAPI_IDENTIFY 0x12 +#define ATAPI_MODE_SENSE 0x1a +#define ATAPI_LOG_SENSE 0x4d + +static DEFINE_MUTEX(pt_mutex); +static int pt_open(struct inode *inode, struct file *file); +static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +static int pt_release(struct inode *inode, struct file *file); +static ssize_t pt_read(struct file *filp, char __user *buf, + size_t count, loff_t * ppos); +static ssize_t pt_write(struct file *filp, const char __user *buf, + size_t count, loff_t * ppos); +static int pt_detect(void); + +/* bits in tape->flags */ + +#define PT_MEDIA 1 +#define PT_WRITE_OK 2 +#define PT_REWIND 4 +#define PT_WRITING 8 +#define PT_READING 16 +#define PT_EOF 32 + +#define PT_NAMELEN 8 +#define PT_BUFSIZE 16384 + +struct pt_unit { + struct pi_adapter pia; /* interface to paride layer */ + struct pi_adapter *pi; + int flags; /* various state flags */ + int last_sense; /* result of last request sense */ + int drive; /* drive */ + atomic_t available; /* 1 if access is available 0 otherwise */ + int bs; /* block size */ + int capacity; /* Size of tape in KB */ + int present; /* device present ? */ + char *bufptr; + char name[PT_NAMELEN]; /* pf0, pf1, ... */ +}; + +static int pt_identify(struct pt_unit *tape); + +static struct pt_unit pt[PT_UNITS]; + +static char pt_scratch[512]; /* scratch block buffer */ + +/* kernel glue structures */ + +static const struct file_operations pt_fops = { + .owner = THIS_MODULE, + .read = pt_read, + .write = pt_write, + .unlocked_ioctl = pt_ioctl, + .open = pt_open, + .release = pt_release, + .llseek = noop_llseek, +}; + +/* sysfs class support */ +static struct class *pt_class; + +static inline int status_reg(struct pi_adapter *pi) +{ + return pi_read_regr(pi, 1, 6); +} + +static inline int read_reg(struct pi_adapter *pi, int reg) +{ + return pi_read_regr(pi, 0, reg); +} + +static inline void write_reg(struct pi_adapter *pi, int reg, int val) +{ + pi_write_regr(pi, 0, reg, val); +} + +static inline u8 DRIVE(struct pt_unit *tape) +{ + return 0xa0+0x10*tape->drive; +} + +static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg) +{ + int j, r, e, s, p; + struct pi_adapter *pi = tape->pi; + + j = 0; + while ((((r = status_reg(pi)) & go) || (stop && (!(r & stop)))) + && (j++ < PT_SPIN)) + udelay(PT_SPIN_DEL); + + if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) { + s = read_reg(pi, 7); + e = read_reg(pi, 1); + p = read_reg(pi, 2); + if (j > PT_SPIN) + e |= 0x100; + if (fun) + printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x" + " loop=%d phase=%d\n", + tape->name, fun, msg, r, s, e, j, p); + return (e << 8) + s; + } + return 0; +} + +static int pt_command(struct pt_unit *tape, char *cmd, int dlen, char *fun) +{ + struct pi_adapter *pi = tape->pi; + pi_connect(pi); + + write_reg(pi, 6, DRIVE(tape)); + + if (pt_wait(tape, STAT_BUSY | STAT_DRQ, 0, fun, "before command")) { + pi_disconnect(pi); + return -1; + } + + write_reg(pi, 4, dlen % 256); + write_reg(pi, 5, dlen / 256); + write_reg(pi, 7, 0xa0); /* ATAPI packet command */ + + if (pt_wait(tape, STAT_BUSY, STAT_DRQ, fun, "command DRQ")) { + pi_disconnect(pi); + return -1; + } + + if (read_reg(pi, 2) != 1) { + printk("%s: %s: command phase error\n", tape->name, fun); + pi_disconnect(pi); + return -1; + } + + pi_write_block(pi, cmd, 12); + + return 0; +} + +static int pt_completion(struct pt_unit *tape, char *buf, char *fun) +{ + struct pi_adapter *pi = tape->pi; + int r, s, n, p; + + r = pt_wait(tape, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR, + fun, "completion"); + + if (read_reg(pi, 7) & STAT_DRQ) { + n = (((read_reg(pi, 4) + 256 * read_reg(pi, 5)) + + 3) & 0xfffc); + p = read_reg(pi, 2) & 3; + if (p == 0) + pi_write_block(pi, buf, n); + if (p == 2) + pi_read_block(pi, buf, n); + } + + s = pt_wait(tape, STAT_BUSY, STAT_READY | STAT_ERR, fun, "data done"); + + pi_disconnect(pi); + + return (r ? r : s); +} + +static void pt_req_sense(struct pt_unit *tape, int quiet) +{ + char rs_cmd[12] = { ATAPI_REQ_SENSE, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 }; + char buf[16]; + int r; + + r = pt_command(tape, rs_cmd, 16, "Request sense"); + mdelay(1); + if (!r) + pt_completion(tape, buf, "Request sense"); + + tape->last_sense = -1; + if (!r) { + if (!quiet) + printk("%s: Sense key: %x, ASC: %x, ASQ: %x\n", + tape->name, buf[2] & 0xf, buf[12], buf[13]); + tape->last_sense = (buf[2] & 0xf) | ((buf[12] & 0xff) << 8) + | ((buf[13] & 0xff) << 16); + } +} + +static int pt_atapi(struct pt_unit *tape, char *cmd, int dlen, char *buf, char *fun) +{ + int r; + + r = pt_command(tape, cmd, dlen, fun); + mdelay(1); + if (!r) + r = pt_completion(tape, buf, fun); + if (r) + pt_req_sense(tape, !fun); + + return r; +} + +static void pt_sleep(int cs) +{ + schedule_timeout_interruptible(cs); +} + +static int pt_poll_dsc(struct pt_unit *tape, int pause, int tmo, char *msg) +{ + struct pi_adapter *pi = tape->pi; + int k, e, s; + + k = 0; + e = 0; + s = 0; + while (k < tmo) { + pt_sleep(pause); + k++; + pi_connect(pi); + write_reg(pi, 6, DRIVE(tape)); + s = read_reg(pi, 7); + e = read_reg(pi, 1); + pi_disconnect(pi); + if (s & (STAT_ERR | STAT_SEEK)) + break; + } + if ((k >= tmo) || (s & STAT_ERR)) { + if (k >= tmo) + printk("%s: %s DSC timeout\n", tape->name, msg); + else + printk("%s: %s stat=0x%x err=0x%x\n", tape->name, msg, s, + e); + pt_req_sense(tape, 0); + return 0; + } + return 1; +} + +static void pt_media_access_cmd(struct pt_unit *tape, int tmo, char *cmd, char *fun) +{ + if (pt_command(tape, cmd, 0, fun)) { + pt_req_sense(tape, 0); + return; + } + pi_disconnect(tape->pi); + pt_poll_dsc(tape, HZ, tmo, fun); +} + +static void pt_rewind(struct pt_unit *tape) +{ + char rw_cmd[12] = { ATAPI_REWIND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + pt_media_access_cmd(tape, PT_REWIND_TMO, rw_cmd, "rewind"); +} + +static void pt_write_fm(struct pt_unit *tape) +{ + char wm_cmd[12] = { ATAPI_WFM, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 }; + + pt_media_access_cmd(tape, PT_TMO, wm_cmd, "write filemark"); +} + +#define DBMSG(msg) ((verbose>1)?(msg):NULL) + +static int pt_reset(struct pt_unit *tape) +{ + struct pi_adapter *pi = tape->pi; + int i, k, flg; + int expect[5] = { 1, 1, 1, 0x14, 0xeb }; + + pi_connect(pi); + write_reg(pi, 6, DRIVE(tape)); + write_reg(pi, 7, 8); + + pt_sleep(20 * HZ / 1000); + + k = 0; + while ((k++ < PT_RESET_TMO) && (status_reg(pi) & STAT_BUSY)) + pt_sleep(HZ / 10); + + flg = 1; + for (i = 0; i < 5; i++) + flg &= (read_reg(pi, i + 1) == expect[i]); + + if (verbose) { + printk("%s: Reset (%d) signature = ", tape->name, k); + for (i = 0; i < 5; i++) + printk("%3x", read_reg(pi, i + 1)); + if (!flg) + printk(" (incorrect)"); + printk("\n"); + } + + pi_disconnect(pi); + return flg - 1; +} + +static int pt_ready_wait(struct pt_unit *tape, int tmo) +{ + char tr_cmd[12] = { ATAPI_TEST_READY, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int k, p; + + k = 0; + while (k < tmo) { + tape->last_sense = 0; + pt_atapi(tape, tr_cmd, 0, NULL, DBMSG("test unit ready")); + p = tape->last_sense; + if (!p) + return 0; + if (!(((p & 0xffff) == 0x0402) || ((p & 0xff) == 6))) + return p; + k++; + pt_sleep(HZ); + } + return 0x000020; /* timeout */ +} + +static void xs(char *buf, char *targ, int offs, int len) +{ + int j, k, l; + + j = 0; + l = 0; + for (k = 0; k < len; k++) + if ((buf[k + offs] != 0x20) || (buf[k + offs] != l)) + l = targ[j++] = buf[k + offs]; + if (l == 0x20) + j--; + targ[j] = 0; +} + +static int xn(char *buf, int offs, int size) +{ + int v, k; + + v = 0; + for (k = 0; k < size; k++) + v = v * 256 + (buf[k + offs] & 0xff); + return v; +} + +static int pt_identify(struct pt_unit *tape) +{ + int dt, s; + char *ms[2] = { "master", "slave" }; + char mf[10], id[18]; + char id_cmd[12] = { ATAPI_IDENTIFY, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 }; + char ms_cmd[12] = + { ATAPI_MODE_SENSE, 0, 0x2a, 0, 36, 0, 0, 0, 0, 0, 0, 0 }; + char ls_cmd[12] = + { ATAPI_LOG_SENSE, 0, 0x71, 0, 0, 0, 0, 0, 36, 0, 0, 0 }; + char buf[36]; + + s = pt_atapi(tape, id_cmd, 36, buf, "identify"); + if (s) + return -1; + + dt = buf[0] & 0x1f; + if (dt != 1) { + if (verbose) + printk("%s: Drive %d, unsupported type %d\n", + tape->name, tape->drive, dt); + return -1; + } + + xs(buf, mf, 8, 8); + xs(buf, id, 16, 16); + + tape->flags = 0; + tape->capacity = 0; + tape->bs = 0; + + if (!pt_ready_wait(tape, PT_READY_TMO)) + tape->flags |= PT_MEDIA; + + if (!pt_atapi(tape, ms_cmd, 36, buf, "mode sense")) { + if (!(buf[2] & 0x80)) + tape->flags |= PT_WRITE_OK; + tape->bs = xn(buf, 10, 2); + } + + if (!pt_atapi(tape, ls_cmd, 36, buf, "log sense")) + tape->capacity = xn(buf, 24, 4); + + printk("%s: %s %s, %s", tape->name, mf, id, ms[tape->drive]); + if (!(tape->flags & PT_MEDIA)) + printk(", no media\n"); + else { + if (!(tape->flags & PT_WRITE_OK)) + printk(", RO"); + printk(", blocksize %d, %d MB\n", tape->bs, tape->capacity / 1024); + } + + return 0; +} + + +/* + * returns 0, with id set if drive is detected + * -1, if drive detection failed + */ +static int pt_probe(struct pt_unit *tape) +{ + if (tape->drive == -1) { + for (tape->drive = 0; tape->drive <= 1; tape->drive++) + if (!pt_reset(tape)) + return pt_identify(tape); + } else { + if (!pt_reset(tape)) + return pt_identify(tape); + } + return -1; +} + +static int pt_detect(void) +{ + struct pt_unit *tape; + int specified = 0, found = 0; + int unit; + + printk("%s: %s version %s, major %d\n", name, name, PT_VERSION, major); + + specified = 0; + for (unit = 0; unit < PT_UNITS; unit++) { + struct pt_unit *tape = &pt[unit]; + tape->pi = &tape->pia; + atomic_set(&tape->available, 1); + tape->flags = 0; + tape->last_sense = 0; + tape->present = 0; + tape->bufptr = NULL; + tape->drive = DU[D_SLV]; + snprintf(tape->name, PT_NAMELEN, "%s%d", name, unit); + if (!DU[D_PRT]) + continue; + specified++; + if (pi_init(tape->pi, 0, DU[D_PRT], DU[D_MOD], DU[D_UNI], + DU[D_PRO], DU[D_DLY], pt_scratch, PI_PT, + verbose, tape->name)) { + if (!pt_probe(tape)) { + tape->present = 1; + found++; + } else + pi_release(tape->pi); + } + } + if (specified == 0) { + tape = pt; + if (pi_init(tape->pi, 1, -1, -1, -1, -1, -1, pt_scratch, + PI_PT, verbose, tape->name)) { + if (!pt_probe(tape)) { + tape->present = 1; + found++; + } else + pi_release(tape->pi); + } + + } + if (found) + return 0; + + printk("%s: No ATAPI tape drive detected\n", name); + return -1; +} + +static int pt_open(struct inode *inode, struct file *file) +{ + int unit = iminor(inode) & 0x7F; + struct pt_unit *tape = pt + unit; + int err; + + mutex_lock(&pt_mutex); + if (unit >= PT_UNITS || (!tape->present)) { + mutex_unlock(&pt_mutex); + return -ENODEV; + } + + err = -EBUSY; + if (!atomic_dec_and_test(&tape->available)) + goto out; + + pt_identify(tape); + + err = -ENODEV; + if (!(tape->flags & PT_MEDIA)) + goto out; + + err = -EROFS; + if ((!(tape->flags & PT_WRITE_OK)) && (file->f_mode & FMODE_WRITE)) + goto out; + + if (!(iminor(inode) & 128)) + tape->flags |= PT_REWIND; + + err = -ENOMEM; + tape->bufptr = kmalloc(PT_BUFSIZE, GFP_KERNEL); + if (tape->bufptr == NULL) { + printk("%s: buffer allocation failed\n", tape->name); + goto out; + } + + file->private_data = tape; + mutex_unlock(&pt_mutex); + return 0; + +out: + atomic_inc(&tape->available); + mutex_unlock(&pt_mutex); + return err; +} + +static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct pt_unit *tape = file->private_data; + struct mtop __user *p = (void __user *)arg; + struct mtop mtop; + + switch (cmd) { + case MTIOCTOP: + if (copy_from_user(&mtop, p, sizeof(struct mtop))) + return -EFAULT; + + switch (mtop.mt_op) { + + case MTREW: + mutex_lock(&pt_mutex); + pt_rewind(tape); + mutex_unlock(&pt_mutex); + return 0; + + case MTWEOF: + mutex_lock(&pt_mutex); + pt_write_fm(tape); + mutex_unlock(&pt_mutex); + return 0; + + default: + /* FIXME: rate limit ?? */ + printk(KERN_DEBUG "%s: Unimplemented mt_op %d\n", tape->name, + mtop.mt_op); + return -EINVAL; + } + + default: + return -ENOTTY; + } +} + +static int +pt_release(struct inode *inode, struct file *file) +{ + struct pt_unit *tape = file->private_data; + + if (atomic_read(&tape->available) > 1) + return -EINVAL; + + if (tape->flags & PT_WRITING) + pt_write_fm(tape); + + if (tape->flags & PT_REWIND) + pt_rewind(tape); + + kfree(tape->bufptr); + tape->bufptr = NULL; + + atomic_inc(&tape->available); + + return 0; + +} + +static ssize_t pt_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos) +{ + struct pt_unit *tape = filp->private_data; + struct pi_adapter *pi = tape->pi; + char rd_cmd[12] = { ATAPI_READ_6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int k, n, r, p, s, t, b; + + if (!(tape->flags & (PT_READING | PT_WRITING))) { + tape->flags |= PT_READING; + if (pt_atapi(tape, rd_cmd, 0, NULL, "start read-ahead")) + return -EIO; + } else if (tape->flags & PT_WRITING) + return -EIO; + + if (tape->flags & PT_EOF) + return 0; + + t = 0; + + while (count > 0) { + + if (!pt_poll_dsc(tape, HZ / 100, PT_TMO, "read")) + return -EIO; + + n = count; + if (n > 32768) + n = 32768; /* max per command */ + b = (n - 1 + tape->bs) / tape->bs; + n = b * tape->bs; /* rounded up to even block */ + + rd_cmd[4] = b; + + r = pt_command(tape, rd_cmd, n, "read"); + + mdelay(1); + + if (r) { + pt_req_sense(tape, 0); + return -EIO; + } + + while (1) { + + r = pt_wait(tape, STAT_BUSY, + STAT_DRQ | STAT_ERR | STAT_READY, + DBMSG("read DRQ"), ""); + + if (r & STAT_SENSE) { + pi_disconnect(pi); + pt_req_sense(tape, 0); + return -EIO; + } + + if (r) + tape->flags |= PT_EOF; + + s = read_reg(pi, 7); + + if (!(s & STAT_DRQ)) + break; + + n = (read_reg(pi, 4) + 256 * read_reg(pi, 5)); + p = (read_reg(pi, 2) & 3); + if (p != 2) { + pi_disconnect(pi); + printk("%s: Phase error on read: %d\n", tape->name, + p); + return -EIO; + } + + while (n > 0) { + k = n; + if (k > PT_BUFSIZE) + k = PT_BUFSIZE; + pi_read_block(pi, tape->bufptr, k); + n -= k; + b = k; + if (b > count) + b = count; + if (copy_to_user(buf + t, tape->bufptr, b)) { + pi_disconnect(pi); + return -EFAULT; + } + t += b; + count -= b; + } + + } + pi_disconnect(pi); + if (tape->flags & PT_EOF) + break; + } + + return t; + +} + +static ssize_t pt_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) +{ + struct pt_unit *tape = filp->private_data; + struct pi_adapter *pi = tape->pi; + char wr_cmd[12] = { ATAPI_WRITE_6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int k, n, r, p, s, t, b; + + if (!(tape->flags & PT_WRITE_OK)) + return -EROFS; + + if (!(tape->flags & (PT_READING | PT_WRITING))) { + tape->flags |= PT_WRITING; + if (pt_atapi + (tape, wr_cmd, 0, NULL, "start buffer-available mode")) + return -EIO; + } else if (tape->flags & PT_READING) + return -EIO; + + if (tape->flags & PT_EOF) + return -ENOSPC; + + t = 0; + + while (count > 0) { + + if (!pt_poll_dsc(tape, HZ / 100, PT_TMO, "write")) + return -EIO; + + n = count; + if (n > 32768) + n = 32768; /* max per command */ + b = (n - 1 + tape->bs) / tape->bs; + n = b * tape->bs; /* rounded up to even block */ + + wr_cmd[4] = b; + + r = pt_command(tape, wr_cmd, n, "write"); + + mdelay(1); + + if (r) { /* error delivering command only */ + pt_req_sense(tape, 0); + return -EIO; + } + + while (1) { + + r = pt_wait(tape, STAT_BUSY, + STAT_DRQ | STAT_ERR | STAT_READY, + DBMSG("write DRQ"), NULL); + + if (r & STAT_SENSE) { + pi_disconnect(pi); + pt_req_sense(tape, 0); + return -EIO; + } + + if (r) + tape->flags |= PT_EOF; + + s = read_reg(pi, 7); + + if (!(s & STAT_DRQ)) + break; + + n = (read_reg(pi, 4) + 256 * read_reg(pi, 5)); + p = (read_reg(pi, 2) & 3); + if (p != 0) { + pi_disconnect(pi); + printk("%s: Phase error on write: %d \n", + tape->name, p); + return -EIO; + } + + while (n > 0) { + k = n; + if (k > PT_BUFSIZE) + k = PT_BUFSIZE; + b = k; + if (b > count) + b = count; + if (copy_from_user(tape->bufptr, buf + t, b)) { + pi_disconnect(pi); + return -EFAULT; + } + pi_write_block(pi, tape->bufptr, k); + t += b; + count -= b; + n -= k; + } + + } + pi_disconnect(pi); + if (tape->flags & PT_EOF) + break; + } + + return t; +} + +static int __init pt_init(void) +{ + int unit; + int err; + + if (disable) { + err = -EINVAL; + goto out; + } + + if (pt_detect()) { + err = -ENODEV; + goto out; + } + + err = register_chrdev(major, name, &pt_fops); + if (err < 0) { + printk("pt_init: unable to get major number %d\n", major); + for (unit = 0; unit < PT_UNITS; unit++) + if (pt[unit].present) + pi_release(pt[unit].pi); + goto out; + } + major = err; + pt_class = class_create(THIS_MODULE, "pt"); + if (IS_ERR(pt_class)) { + err = PTR_ERR(pt_class); + goto out_chrdev; + } + + for (unit = 0; unit < PT_UNITS; unit++) + if (pt[unit].present) { + device_create(pt_class, NULL, MKDEV(major, unit), NULL, + "pt%d", unit); + device_create(pt_class, NULL, MKDEV(major, unit + 128), + NULL, "pt%dn", unit); + } + goto out; + +out_chrdev: + unregister_chrdev(major, "pt"); +out: + return err; +} + +static void __exit pt_exit(void) +{ + int unit; + for (unit = 0; unit < PT_UNITS; unit++) + if (pt[unit].present) { + device_destroy(pt_class, MKDEV(major, unit)); + device_destroy(pt_class, MKDEV(major, unit + 128)); + } + class_destroy(pt_class); + unregister_chrdev(major, name); + for (unit = 0; unit < PT_UNITS; unit++) + if (pt[unit].present) + pi_release(pt[unit].pi); +} + +MODULE_LICENSE("GPL"); +module_init(pt_init) +module_exit(pt_exit) diff --git a/kernel/drivers/block/pktcdvd.c b/kernel/drivers/block/pktcdvd.c new file mode 100644 index 000000000..09e628daf --- /dev/null +++ b/kernel/drivers/block/pktcdvd.c @@ -0,0 +1,3029 @@ +/* + * Copyright (C) 2000 Jens Axboe + * Copyright (C) 2001-2004 Peter Osterlund + * Copyright (C) 2006 Thomas Maier + * + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. + * + * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and + * DVD-RAM devices. + * + * Theory of operation: + * + * At the lowest level, there is the standard driver for the CD/DVD device, + * typically ide-cd.c or sr.c. This driver can handle read and write requests, + * but it doesn't know anything about the special restrictions that apply to + * packet writing. One restriction is that write requests must be aligned to + * packet boundaries on the physical media, and the size of a write request + * must be equal to the packet size. Another restriction is that a + * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read + * command, if the previous command was a write. + * + * The purpose of the packet writing driver is to hide these restrictions from + * higher layers, such as file systems, and present a block device that can be + * randomly read and written using 2kB-sized blocks. + * + * The lowest layer in the packet writing driver is the packet I/O scheduler. + * Its data is defined by the struct packet_iosched and includes two bio + * queues with pending read and write requests. These queues are processed + * by the pkt_iosched_process_queue() function. The write requests in this + * queue are already properly aligned and sized. This layer is responsible for + * issuing the flush cache commands and scheduling the I/O in a good order. + * + * The next layer transforms unaligned write requests to aligned writes. This + * transformation requires reading missing pieces of data from the underlying + * block device, assembling the pieces to full packets and queuing them to the + * packet I/O scheduler. + * + * At the top layer there is a custom make_request_fn function that forwards + * read requests directly to the iosched queue and puts write requests in the + * unaligned write queue. A kernel thread performs the necessary read + * gathering to convert the unaligned writes to aligned writes and then feeds + * them to the packet I/O scheduler. + * + *************************************************************************/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DRIVER_NAME "pktcdvd" + +#define pkt_err(pd, fmt, ...) \ + pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_notice(pd, fmt, ...) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_info(pd, fmt, ...) \ + pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) + +#define pkt_dbg(level, pd, fmt, ...) \ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: %s():" fmt, \ + pd->name, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ +} while (0) + +#define MAX_SPEED 0xffff + +static DEFINE_MUTEX(pktcdvd_mutex); +static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; +static struct proc_dir_entry *pkt_proc; +static int pktdev_major; +static int write_congestion_on = PKT_WRITE_CONGESTION_ON; +static int write_congestion_off = PKT_WRITE_CONGESTION_OFF; +static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */ +static mempool_t *psd_pool; + +static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */ +static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */ + +/* forward declaration */ +static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); +static int pkt_remove_dev(dev_t pkt_dev); +static int pkt_seq_show(struct seq_file *m, void *p); + +static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) +{ + return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); +} + +/* + * create and register a pktcdvd kernel object. + */ +static struct pktcdvd_kobj* pkt_kobj_create(struct pktcdvd_device *pd, + const char* name, + struct kobject* parent, + struct kobj_type* ktype) +{ + struct pktcdvd_kobj *p; + int error; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return NULL; + p->pd = pd; + error = kobject_init_and_add(&p->kobj, ktype, parent, "%s", name); + if (error) { + kobject_put(&p->kobj); + return NULL; + } + kobject_uevent(&p->kobj, KOBJ_ADD); + return p; +} +/* + * remove a pktcdvd kernel object. + */ +static void pkt_kobj_remove(struct pktcdvd_kobj *p) +{ + if (p) + kobject_put(&p->kobj); +} +/* + * default release function for pktcdvd kernel objects. + */ +static void pkt_kobj_release(struct kobject *kobj) +{ + kfree(to_pktcdvdkobj(kobj)); +} + + +/********************************************************** + * + * sysfs interface for pktcdvd + * by (C) 2006 Thomas Maier + * + **********************************************************/ + +#define DEF_ATTR(_obj,_name,_mode) \ + static struct attribute _obj = { .name = _name, .mode = _mode } + +/********************************************************** + /sys/class/pktcdvd/pktcdvd[0-7]/ + stat/reset + stat/packets_started + stat/packets_finished + stat/kb_written + stat/kb_read + stat/kb_read_gather + write_queue/size + write_queue/congestion_off + write_queue/congestion_on + **********************************************************/ + +DEF_ATTR(kobj_pkt_attr_st1, "reset", 0200); +DEF_ATTR(kobj_pkt_attr_st2, "packets_started", 0444); +DEF_ATTR(kobj_pkt_attr_st3, "packets_finished", 0444); +DEF_ATTR(kobj_pkt_attr_st4, "kb_written", 0444); +DEF_ATTR(kobj_pkt_attr_st5, "kb_read", 0444); +DEF_ATTR(kobj_pkt_attr_st6, "kb_read_gather", 0444); + +static struct attribute *kobj_pkt_attrs_stat[] = { + &kobj_pkt_attr_st1, + &kobj_pkt_attr_st2, + &kobj_pkt_attr_st3, + &kobj_pkt_attr_st4, + &kobj_pkt_attr_st5, + &kobj_pkt_attr_st6, + NULL +}; + +DEF_ATTR(kobj_pkt_attr_wq1, "size", 0444); +DEF_ATTR(kobj_pkt_attr_wq2, "congestion_off", 0644); +DEF_ATTR(kobj_pkt_attr_wq3, "congestion_on", 0644); + +static struct attribute *kobj_pkt_attrs_wqueue[] = { + &kobj_pkt_attr_wq1, + &kobj_pkt_attr_wq2, + &kobj_pkt_attr_wq3, + NULL +}; + +static ssize_t kobj_pkt_show(struct kobject *kobj, + struct attribute *attr, char *data) +{ + struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd; + int n = 0; + int v; + if (strcmp(attr->name, "packets_started") == 0) { + n = sprintf(data, "%lu\n", pd->stats.pkt_started); + + } else if (strcmp(attr->name, "packets_finished") == 0) { + n = sprintf(data, "%lu\n", pd->stats.pkt_ended); + + } else if (strcmp(attr->name, "kb_written") == 0) { + n = sprintf(data, "%lu\n", pd->stats.secs_w >> 1); + + } else if (strcmp(attr->name, "kb_read") == 0) { + n = sprintf(data, "%lu\n", pd->stats.secs_r >> 1); + + } else if (strcmp(attr->name, "kb_read_gather") == 0) { + n = sprintf(data, "%lu\n", pd->stats.secs_rg >> 1); + + } else if (strcmp(attr->name, "size") == 0) { + spin_lock(&pd->lock); + v = pd->bio_queue_size; + spin_unlock(&pd->lock); + n = sprintf(data, "%d\n", v); + + } else if (strcmp(attr->name, "congestion_off") == 0) { + spin_lock(&pd->lock); + v = pd->write_congestion_off; + spin_unlock(&pd->lock); + n = sprintf(data, "%d\n", v); + + } else if (strcmp(attr->name, "congestion_on") == 0) { + spin_lock(&pd->lock); + v = pd->write_congestion_on; + spin_unlock(&pd->lock); + n = sprintf(data, "%d\n", v); + } + return n; +} + +static void init_write_congestion_marks(int* lo, int* hi) +{ + if (*hi > 0) { + *hi = max(*hi, 500); + *hi = min(*hi, 1000000); + if (*lo <= 0) + *lo = *hi - 100; + else { + *lo = min(*lo, *hi - 100); + *lo = max(*lo, 100); + } + } else { + *hi = -1; + *lo = -1; + } +} + +static ssize_t kobj_pkt_store(struct kobject *kobj, + struct attribute *attr, + const char *data, size_t len) +{ + struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd; + int val; + + if (strcmp(attr->name, "reset") == 0 && len > 0) { + pd->stats.pkt_started = 0; + pd->stats.pkt_ended = 0; + pd->stats.secs_w = 0; + pd->stats.secs_rg = 0; + pd->stats.secs_r = 0; + + } else if (strcmp(attr->name, "congestion_off") == 0 + && sscanf(data, "%d", &val) == 1) { + spin_lock(&pd->lock); + pd->write_congestion_off = val; + init_write_congestion_marks(&pd->write_congestion_off, + &pd->write_congestion_on); + spin_unlock(&pd->lock); + + } else if (strcmp(attr->name, "congestion_on") == 0 + && sscanf(data, "%d", &val) == 1) { + spin_lock(&pd->lock); + pd->write_congestion_on = val; + init_write_congestion_marks(&pd->write_congestion_off, + &pd->write_congestion_on); + spin_unlock(&pd->lock); + } + return len; +} + +static const struct sysfs_ops kobj_pkt_ops = { + .show = kobj_pkt_show, + .store = kobj_pkt_store +}; +static struct kobj_type kobj_pkt_type_stat = { + .release = pkt_kobj_release, + .sysfs_ops = &kobj_pkt_ops, + .default_attrs = kobj_pkt_attrs_stat +}; +static struct kobj_type kobj_pkt_type_wqueue = { + .release = pkt_kobj_release, + .sysfs_ops = &kobj_pkt_ops, + .default_attrs = kobj_pkt_attrs_wqueue +}; + +static void pkt_sysfs_dev_new(struct pktcdvd_device *pd) +{ + if (class_pktcdvd) { + pd->dev = device_create(class_pktcdvd, NULL, MKDEV(0, 0), NULL, + "%s", pd->name); + if (IS_ERR(pd->dev)) + pd->dev = NULL; + } + if (pd->dev) { + pd->kobj_stat = pkt_kobj_create(pd, "stat", + &pd->dev->kobj, + &kobj_pkt_type_stat); + pd->kobj_wqueue = pkt_kobj_create(pd, "write_queue", + &pd->dev->kobj, + &kobj_pkt_type_wqueue); + } +} + +static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd) +{ + pkt_kobj_remove(pd->kobj_stat); + pkt_kobj_remove(pd->kobj_wqueue); + if (class_pktcdvd) + device_unregister(pd->dev); +} + + +/******************************************************************** + /sys/class/pktcdvd/ + add map block device + remove unmap packet dev + device_map show mappings + *******************************************************************/ + +static void class_pktcdvd_release(struct class *cls) +{ + kfree(cls); +} +static ssize_t class_pktcdvd_show_map(struct class *c, + struct class_attribute *attr, + char *data) +{ + int n = 0; + int idx; + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + for (idx = 0; idx < MAX_WRITERS; idx++) { + struct pktcdvd_device *pd = pkt_devs[idx]; + if (!pd) + continue; + n += sprintf(data+n, "%s %u:%u %u:%u\n", + pd->name, + MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev), + MAJOR(pd->bdev->bd_dev), + MINOR(pd->bdev->bd_dev)); + } + mutex_unlock(&ctl_mutex); + return n; +} + +static ssize_t class_pktcdvd_store_add(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int major, minor; + + if (sscanf(buf, "%u:%u", &major, &minor) == 2) { + /* pkt_setup_dev() expects caller to hold reference to self */ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + pkt_setup_dev(MKDEV(major, minor), NULL); + + module_put(THIS_MODULE); + + return count; + } + + return -EINVAL; +} + +static ssize_t class_pktcdvd_store_remove(struct class *c, + struct class_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int major, minor; + if (sscanf(buf, "%u:%u", &major, &minor) == 2) { + pkt_remove_dev(MKDEV(major, minor)); + return count; + } + return -EINVAL; +} + +static struct class_attribute class_pktcdvd_attrs[] = { + __ATTR(add, 0200, NULL, class_pktcdvd_store_add), + __ATTR(remove, 0200, NULL, class_pktcdvd_store_remove), + __ATTR(device_map, 0444, class_pktcdvd_show_map, NULL), + __ATTR_NULL +}; + + +static int pkt_sysfs_init(void) +{ + int ret = 0; + + /* + * create control files in sysfs + * /sys/class/pktcdvd/... + */ + class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL); + if (!class_pktcdvd) + return -ENOMEM; + class_pktcdvd->name = DRIVER_NAME; + class_pktcdvd->owner = THIS_MODULE; + class_pktcdvd->class_release = class_pktcdvd_release; + class_pktcdvd->class_attrs = class_pktcdvd_attrs; + ret = class_register(class_pktcdvd); + if (ret) { + kfree(class_pktcdvd); + class_pktcdvd = NULL; + pr_err("failed to create class pktcdvd\n"); + return ret; + } + return 0; +} + +static void pkt_sysfs_cleanup(void) +{ + if (class_pktcdvd) + class_destroy(class_pktcdvd); + class_pktcdvd = NULL; +} + +/******************************************************************** + entries in debugfs + + /sys/kernel/debug/pktcdvd[0-7]/ + info + + *******************************************************************/ + +static int pkt_debugfs_seq_show(struct seq_file *m, void *p) +{ + return pkt_seq_show(m, p); +} + +static int pkt_debugfs_fops_open(struct inode *inode, struct file *file) +{ + return single_open(file, pkt_debugfs_seq_show, inode->i_private); +} + +static const struct file_operations debug_fops = { + .open = pkt_debugfs_fops_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static void pkt_debugfs_dev_new(struct pktcdvd_device *pd) +{ + if (!pkt_debugfs_root) + return; + pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root); + if (!pd->dfs_d_root) + return; + + pd->dfs_f_info = debugfs_create_file("info", S_IRUGO, + pd->dfs_d_root, pd, &debug_fops); +} + +static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd) +{ + if (!pkt_debugfs_root) + return; + debugfs_remove(pd->dfs_f_info); + debugfs_remove(pd->dfs_d_root); + pd->dfs_f_info = NULL; + pd->dfs_d_root = NULL; +} + +static void pkt_debugfs_init(void) +{ + pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL); +} + +static void pkt_debugfs_cleanup(void) +{ + debugfs_remove(pkt_debugfs_root); + pkt_debugfs_root = NULL; +} + +/* ----------------------------------------------------------*/ + + +static void pkt_bio_finished(struct pktcdvd_device *pd) +{ + BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); + if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { + pkt_dbg(2, pd, "queue empty\n"); + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); + } +} + +/* + * Allocate a packet_data struct + */ +static struct packet_data *pkt_alloc_packet_data(int frames) +{ + int i; + struct packet_data *pkt; + + pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL); + if (!pkt) + goto no_pkt; + + pkt->frames = frames; + pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames); + if (!pkt->w_bio) + goto no_bio; + + for (i = 0; i < frames / FRAMES_PER_PAGE; i++) { + pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!pkt->pages[i]) + goto no_page; + } + + spin_lock_init(&pkt->lock); + bio_list_init(&pkt->orig_bios); + + for (i = 0; i < frames; i++) { + struct bio *bio = bio_kmalloc(GFP_KERNEL, 1); + if (!bio) + goto no_rd_bio; + + pkt->r_bios[i] = bio; + } + + return pkt; + +no_rd_bio: + for (i = 0; i < frames; i++) { + struct bio *bio = pkt->r_bios[i]; + if (bio) + bio_put(bio); + } + +no_page: + for (i = 0; i < frames / FRAMES_PER_PAGE; i++) + if (pkt->pages[i]) + __free_page(pkt->pages[i]); + bio_put(pkt->w_bio); +no_bio: + kfree(pkt); +no_pkt: + return NULL; +} + +/* + * Free a packet_data struct + */ +static void pkt_free_packet_data(struct packet_data *pkt) +{ + int i; + + for (i = 0; i < pkt->frames; i++) { + struct bio *bio = pkt->r_bios[i]; + if (bio) + bio_put(bio); + } + for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++) + __free_page(pkt->pages[i]); + bio_put(pkt->w_bio); + kfree(pkt); +} + +static void pkt_shrink_pktlist(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + BUG_ON(!list_empty(&pd->cdrw.pkt_active_list)); + + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) { + pkt_free_packet_data(pkt); + } + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); +} + +static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets) +{ + struct packet_data *pkt; + + BUG_ON(!list_empty(&pd->cdrw.pkt_free_list)); + + while (nr_packets > 0) { + pkt = pkt_alloc_packet_data(pd->settings.size >> 2); + if (!pkt) { + pkt_shrink_pktlist(pd); + return 0; + } + pkt->id = nr_packets; + pkt->pd = pd; + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + nr_packets--; + } + return 1; +} + +static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node) +{ + struct rb_node *n = rb_next(&node->rb_node); + if (!n) + return NULL; + return rb_entry(n, struct pkt_rb_node, rb_node); +} + +static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + rb_erase(&node->rb_node, &pd->bio_queue); + mempool_free(node, pd->rb_pool); + pd->bio_queue_size--; + BUG_ON(pd->bio_queue_size < 0); +} + +/* + * Find the first node in the pd->bio_queue rb tree with a starting sector >= s. + */ +static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s) +{ + struct rb_node *n = pd->bio_queue.rb_node; + struct rb_node *next; + struct pkt_rb_node *tmp; + + if (!n) { + BUG_ON(pd->bio_queue_size > 0); + return NULL; + } + + for (;;) { + tmp = rb_entry(n, struct pkt_rb_node, rb_node); + if (s <= tmp->bio->bi_iter.bi_sector) + next = n->rb_left; + else + next = n->rb_right; + if (!next) + break; + n = next; + } + + if (s > tmp->bio->bi_iter.bi_sector) { + tmp = pkt_rbtree_next(tmp); + if (!tmp) + return NULL; + } + BUG_ON(s > tmp->bio->bi_iter.bi_sector); + return tmp; +} + +/* + * Insert a node into the pd->bio_queue rb tree. + */ +static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node) +{ + struct rb_node **p = &pd->bio_queue.rb_node; + struct rb_node *parent = NULL; + sector_t s = node->bio->bi_iter.bi_sector; + struct pkt_rb_node *tmp; + + while (*p) { + parent = *p; + tmp = rb_entry(parent, struct pkt_rb_node, rb_node); + if (s < tmp->bio->bi_iter.bi_sector) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&node->rb_node, parent, p); + rb_insert_color(&node->rb_node, &pd->bio_queue); + pd->bio_queue_size++; +} + +/* + * Send a packet_command to the underlying block device and + * wait for completion. + */ +static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + struct request_queue *q = bdev_get_queue(pd->bdev); + struct request *rq; + int ret = 0; + + rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? + WRITE : READ, __GFP_WAIT); + if (IS_ERR(rq)) + return PTR_ERR(rq); + blk_rq_set_block_pc(rq); + + if (cgc->buflen) { + ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, + __GFP_WAIT); + if (ret) + goto out; + } + + rq->cmd_len = COMMAND_SIZE(cgc->cmd[0]); + memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE); + + rq->timeout = 60*HZ; + if (cgc->quiet) + rq->cmd_flags |= REQ_QUIET; + + blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0); + if (rq->errors) + ret = -EIO; +out: + blk_put_request(rq); + return ret; +} + +static const char *sense_key_string(__u8 index) +{ + static const char * const info[] = { + "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check", + }; + + return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; +} + +/* + * A generic sense dump / resolve mechanism should be implemented across + * all ATAPI + SCSI devices. + */ +static void pkt_dump_sense(struct pktcdvd_device *pd, + struct packet_command *cgc) +{ + struct request_sense *sense = cgc->sense; + + if (sense) + pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sense->sense_key, sense->asc, sense->ascq, + sense_key_string(sense->sense_key)); + else + pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); +} + +/* + * flush the drive cache to media + */ +static int pkt_flush_cache(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_FLUSH_CACHE; + cgc.quiet = 1; + + /* + * the IMMED bit -- we default to not setting it, although that + * would allow a much faster close, this is safer + */ +#if 0 + cgc.cmd[1] = 1 << 1; +#endif + return pkt_generic_packet(pd, &cgc); +} + +/* + * speed is given as the normal factor, e.g. 4 for 4x + */ +static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, + unsigned write_speed, unsigned read_speed) +{ + struct packet_command cgc; + struct request_sense sense; + int ret; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_SET_SPEED; + cgc.cmd[2] = (read_speed >> 8) & 0xff; + cgc.cmd[3] = read_speed & 0xff; + cgc.cmd[4] = (write_speed >> 8) & 0xff; + cgc.cmd[5] = write_speed & 0xff; + + if ((ret = pkt_generic_packet(pd, &cgc))) + pkt_dump_sense(pd, &cgc); + + return ret; +} + +/* + * Queue a bio for processing by the low-level CD device. Must be called + * from process context. + */ +static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) +{ + spin_lock(&pd->iosched.lock); + if (bio_data_dir(bio) == READ) + bio_list_add(&pd->iosched.read_queue, bio); + else + bio_list_add(&pd->iosched.write_queue, bio); + spin_unlock(&pd->iosched.lock); + + atomic_set(&pd->iosched.attention, 1); + wake_up(&pd->wqueue); +} + +/* + * Process the queued read/write requests. This function handles special + * requirements for CDRW drives: + * - A cache flush command must be inserted before a read request if the + * previous request was a write. + * - Switching between reading and writing is slow, so don't do it more often + * than necessary. + * - Optimize for throughput at the expense of latency. This means that streaming + * writes will never be interrupted by a read, but if the drive has to seek + * before the next write, switch to reading instead if there are any pending + * read requests. + * - Set the read speed according to current usage pattern. When only reading + * from the device, it's best to use the highest possible read speed, but + * when switching often between reading and writing, it's better to have the + * same read and write speeds. + */ +static void pkt_iosched_process_queue(struct pktcdvd_device *pd) +{ + + if (atomic_read(&pd->iosched.attention) == 0) + return; + atomic_set(&pd->iosched.attention, 0); + + for (;;) { + struct bio *bio; + int reads_queued, writes_queued; + + spin_lock(&pd->iosched.lock); + reads_queued = !bio_list_empty(&pd->iosched.read_queue); + writes_queued = !bio_list_empty(&pd->iosched.write_queue); + spin_unlock(&pd->iosched.lock); + + if (!reads_queued && !writes_queued) + break; + + if (pd->iosched.writing) { + int need_write_seek = 1; + spin_lock(&pd->iosched.lock); + bio = bio_list_peek(&pd->iosched.write_queue); + spin_unlock(&pd->iosched.lock); + if (bio && (bio->bi_iter.bi_sector == + pd->iosched.last_write)) + need_write_seek = 0; + if (need_write_seek && reads_queued) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + pkt_dbg(2, pd, "write, waiting\n"); + break; + } + pkt_flush_cache(pd); + pd->iosched.writing = 0; + } + } else { + if (!reads_queued && writes_queued) { + if (atomic_read(&pd->cdrw.pending_bios) > 0) { + pkt_dbg(2, pd, "read, waiting\n"); + break; + } + pd->iosched.writing = 1; + } + } + + spin_lock(&pd->iosched.lock); + if (pd->iosched.writing) + bio = bio_list_pop(&pd->iosched.write_queue); + else + bio = bio_list_pop(&pd->iosched.read_queue); + spin_unlock(&pd->iosched.lock); + + if (!bio) + continue; + + if (bio_data_dir(bio) == READ) + pd->iosched.successive_reads += + bio->bi_iter.bi_size >> 10; + else { + pd->iosched.successive_reads = 0; + pd->iosched.last_write = bio_end_sector(bio); + } + if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { + if (pd->read_speed == pd->write_speed) { + pd->read_speed = MAX_SPEED; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } else { + if (pd->read_speed != pd->write_speed) { + pd->read_speed = pd->write_speed; + pkt_set_speed(pd, pd->write_speed, pd->read_speed); + } + } + + atomic_inc(&pd->cdrw.pending_bios); + generic_make_request(bio); + } +} + +/* + * Special care is needed if the underlying block device has a small + * max_phys_segments value. + */ +static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) +{ + if ((pd->settings.size << 9) / CD_FRAMESIZE + <= queue_max_segments(q)) { + /* + * The cdrom device can handle one segment/frame + */ + clear_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else if ((pd->settings.size << 9) / PAGE_SIZE + <= queue_max_segments(q)) { + /* + * We can handle this case at the expense of some extra memory + * copies during write operations + */ + set_bit(PACKET_MERGE_SEGS, &pd->flags); + return 0; + } else { + pkt_err(pd, "cdrom max_phys_segments too small\n"); + return -EIO; + } +} + +/* + * Copy all data for this packet to pkt->pages[], so that + * a) The number of required segments for the write bio is minimized, which + * is necessary for some scsi controllers. + * b) The data can be used as cache to avoid read requests if we receive a + * new write request for the same zone. + */ +static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec) +{ + int f, p, offs; + + /* Copy all data to pkt->pages[] */ + p = 0; + offs = 0; + for (f = 0; f < pkt->frames; f++) { + if (bvec[f].bv_page != pkt->pages[p]) { + void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset; + void *vto = page_address(pkt->pages[p]) + offs; + memcpy(vto, vfrom, CD_FRAMESIZE); + kunmap_atomic(vfrom); + bvec[f].bv_page = pkt->pages[p]; + bvec[f].bv_offset = offs; + } else { + BUG_ON(bvec[f].bv_offset != offs); + } + offs += CD_FRAMESIZE; + if (offs >= PAGE_SIZE) { + offs = 0; + p++; + } + } +} + +static void pkt_end_io_read(struct bio *bio, int err) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", + bio, (unsigned long long)pkt->sector, + (unsigned long long)bio->bi_iter.bi_sector, err); + + if (err) + atomic_inc(&pkt->io_errors); + if (atomic_dec_and_test(&pkt->io_wait)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + pkt_bio_finished(pd); +} + +static void pkt_end_io_packet_write(struct bio *bio, int err) +{ + struct packet_data *pkt = bio->bi_private; + struct pktcdvd_device *pd = pkt->pd; + BUG_ON(!pd); + + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err); + + pd->stats.pkt_ended++; + + pkt_bio_finished(pd); + atomic_dec(&pkt->io_wait); + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); +} + +/* + * Schedule reads for the holes in a packet + */ +static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int frames_read = 0; + struct bio *bio; + int f; + char written[PACKET_MAX_SIZE]; + + BUG_ON(bio_list_empty(&pkt->orig_bios)); + + atomic_set(&pkt->io_wait, 0); + atomic_set(&pkt->io_errors, 0); + + /* + * Figure out which frames we need to read before we can write. + */ + memset(written, 0, sizeof(written)); + spin_lock(&pkt->lock); + bio_list_for_each(bio, &pkt->orig_bios) { + int first_frame = (bio->bi_iter.bi_sector - pkt->sector) / + (CD_FRAMESIZE >> 9); + int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE; + pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9); + BUG_ON(first_frame < 0); + BUG_ON(first_frame + num_frames > pkt->frames); + for (f = first_frame; f < first_frame + num_frames; f++) + written[f] = 1; + } + spin_unlock(&pkt->lock); + + if (pkt->cache_valid) { + pkt_dbg(2, pd, "zone %llx cached\n", + (unsigned long long)pkt->sector); + goto out_account; + } + + /* + * Schedule reads for missing parts of the packet. + */ + for (f = 0; f < pkt->frames; f++) { + int p, offset; + + if (written[f]) + continue; + + bio = pkt->r_bios[f]; + bio_reset(bio); + bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9); + bio->bi_bdev = pd->bdev; + bio->bi_end_io = pkt_end_io_read; + bio->bi_private = pkt; + + p = (f * CD_FRAMESIZE) / PAGE_SIZE; + offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", + f, pkt->pages[p], offset); + if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) + BUG(); + + atomic_inc(&pkt->io_wait); + bio->bi_rw = READ; + pkt_queue_bio(pd, bio); + frames_read++; + } + +out_account: + pkt_dbg(2, pd, "need %d frames for zone %llx\n", + frames_read, (unsigned long long)pkt->sector); + pd->stats.pkt_started++; + pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); +} + +/* + * Find a packet matching zone, or the least recently used packet if + * there is no match. + */ +static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone) +{ + struct packet_data *pkt; + + list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) { + if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) { + list_del_init(&pkt->list); + if (pkt->sector != zone) + pkt->cache_valid = 0; + return pkt; + } + } + BUG(); + return NULL; +} + +static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + if (pkt->cache_valid) { + list_add(&pkt->list, &pd->cdrw.pkt_free_list); + } else { + list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list); + } +} + +/* + * recover a failed write, query for relocation if possible + * + * returns 1 if recovery is possible, or 0 if not + * + */ +static int pkt_start_recovery(struct packet_data *pkt) +{ + /* + * FIXME. We need help from the file system to implement + * recovery handling. + */ + return 0; +#if 0 + struct request *rq = pkt->rq; + struct pktcdvd_device *pd = rq->rq_disk->private_data; + struct block_device *pkt_bdev; + struct super_block *sb = NULL; + unsigned long old_block, new_block; + sector_t new_sector; + + pkt_bdev = bdget(kdev_t_to_nr(pd->pkt_dev)); + if (pkt_bdev) { + sb = get_super(pkt_bdev); + bdput(pkt_bdev); + } + + if (!sb) + return 0; + + if (!sb->s_op->relocate_blocks) + goto out; + + old_block = pkt->sector / (CD_FRAMESIZE >> 9); + if (sb->s_op->relocate_blocks(sb, old_block, &new_block)) + goto out; + + new_sector = new_block * (CD_FRAMESIZE >> 9); + pkt->sector = new_sector; + + bio_reset(pkt->bio); + pkt->bio->bi_bdev = pd->bdev; + pkt->bio->bi_rw = REQ_WRITE; + pkt->bio->bi_iter.bi_sector = new_sector; + pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE; + pkt->bio->bi_vcnt = pkt->frames; + + pkt->bio->bi_end_io = pkt_end_io_packet_write; + pkt->bio->bi_private = pkt; + + drop_super(sb); + return 1; + +out: + drop_super(sb); + return 0; +#endif +} + +static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state) +{ +#if PACKET_DEBUG > 1 + static const char *state_name[] = { + "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" + }; + enum packet_data_state old_state = pkt->state; + pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", + pkt->id, (unsigned long long)pkt->sector, + state_name[old_state], state_name[state]); +#endif + pkt->state = state; +} + +/* + * Scan the work queue to see if we can start a new packet. + * returns non-zero if any work was done. + */ +static int pkt_handle_queue(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *p; + struct bio *bio = NULL; + sector_t zone = 0; /* Suppress gcc warning */ + struct pkt_rb_node *node, *first_node; + struct rb_node *n; + int wakeup; + + atomic_set(&pd->scan_queue, 0); + + if (list_empty(&pd->cdrw.pkt_free_list)) { + pkt_dbg(2, pd, "no pkt\n"); + return 0; + } + + /* + * Try to find a zone we are not already working on. + */ + spin_lock(&pd->lock); + first_node = pkt_rbtree_find(pd, pd->current_sector); + if (!first_node) { + n = rb_first(&pd->bio_queue); + if (n) + first_node = rb_entry(n, struct pkt_rb_node, rb_node); + } + node = first_node; + while (node) { + bio = node->bio; + zone = get_zone(bio->bi_iter.bi_sector, pd); + list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { + if (p->sector == zone) { + bio = NULL; + goto try_next_bio; + } + } + break; +try_next_bio: + node = pkt_rbtree_next(node); + if (!node) { + n = rb_first(&pd->bio_queue); + if (n) + node = rb_entry(n, struct pkt_rb_node, rb_node); + } + if (node == first_node) + node = NULL; + } + spin_unlock(&pd->lock); + if (!bio) { + pkt_dbg(2, pd, "no bio\n"); + return 0; + } + + pkt = pkt_get_packet_data(pd, zone); + + pd->current_sector = zone + pd->settings.size; + pkt->sector = zone; + BUG_ON(pkt->frames != pd->settings.size >> 2); + pkt->write_size = 0; + + /* + * Scan work queue for bios in the same zone and link them + * to this packet. + */ + spin_lock(&pd->lock); + pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); + while ((node = pkt_rbtree_find(pd, zone)) != NULL) { + bio = node->bio; + pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long) + get_zone(bio->bi_iter.bi_sector, pd)); + if (get_zone(bio->bi_iter.bi_sector, pd) != zone) + break; + pkt_rbtree_erase(pd, node); + spin_lock(&pkt->lock); + bio_list_add(&pkt->orig_bios, bio); + pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE; + spin_unlock(&pkt->lock); + } + /* check write congestion marks, and if bio_queue_size is + below, wake up any waiters */ + wakeup = (pd->write_congestion_on > 0 + && pd->bio_queue_size <= pd->write_congestion_off); + spin_unlock(&pd->lock); + if (wakeup) { + clear_bdi_congested(&pd->disk->queue->backing_dev_info, + BLK_RW_ASYNC); + } + + pkt->sleep_time = max(PACKET_WAIT_TIME, 1); + pkt_set_state(pkt, PACKET_WAITING_STATE); + atomic_set(&pkt->run_sm, 1); + + spin_lock(&pd->cdrw.active_list_lock); + list_add(&pkt->list, &pd->cdrw.pkt_active_list); + spin_unlock(&pd->cdrw.active_list_lock); + + return 1; +} + +/* + * Assemble a bio to write one packet and queue the bio for processing + * by the underlying block device. + */ +static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int f; + struct bio_vec *bvec = pkt->w_bio->bi_io_vec; + + bio_reset(pkt->w_bio); + pkt->w_bio->bi_iter.bi_sector = pkt->sector; + pkt->w_bio->bi_bdev = pd->bdev; + pkt->w_bio->bi_end_io = pkt_end_io_packet_write; + pkt->w_bio->bi_private = pkt; + + /* XXX: locking? */ + for (f = 0; f < pkt->frames; f++) { + bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE]; + bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE; + if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) + BUG(); + } + pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); + + /* + * Fill-in bvec with data from orig_bios. + */ + spin_lock(&pkt->lock); + bio_copy_data(pkt->w_bio, pkt->orig_bios.head); + + pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); + spin_unlock(&pkt->lock); + + pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", + pkt->write_size, (unsigned long long)pkt->sector); + + if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { + pkt_make_local_copy(pkt, bvec); + pkt->cache_valid = 1; + } else { + pkt->cache_valid = 0; + } + + /* Start the write request */ + atomic_set(&pkt->io_wait, 1); + pkt->w_bio->bi_rw = WRITE; + pkt_queue_bio(pd, pkt->w_bio); +} + +static void pkt_finish_packet(struct packet_data *pkt, int uptodate) +{ + struct bio *bio; + + if (!uptodate) + pkt->cache_valid = 0; + + /* Finish all bios corresponding to this packet */ + while ((bio = bio_list_pop(&pkt->orig_bios))) + bio_endio(bio, uptodate ? 0 : -EIO); +} + +static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) +{ + int uptodate; + + pkt_dbg(2, pd, "pkt %d\n", pkt->id); + + for (;;) { + switch (pkt->state) { + case PACKET_WAITING_STATE: + if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0)) + return; + + pkt->sleep_time = 0; + pkt_gather_data(pd, pkt); + pkt_set_state(pkt, PACKET_READ_WAIT_STATE); + break; + + case PACKET_READ_WAIT_STATE: + if (atomic_read(&pkt->io_wait) > 0) + return; + + if (atomic_read(&pkt->io_errors) > 0) { + pkt_set_state(pkt, PACKET_RECOVERY_STATE); + } else { + pkt_start_write(pd, pkt); + } + break; + + case PACKET_WRITE_WAIT_STATE: + if (atomic_read(&pkt->io_wait) > 0) + return; + + if (test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags)) { + pkt_set_state(pkt, PACKET_FINISHED_STATE); + } else { + pkt_set_state(pkt, PACKET_RECOVERY_STATE); + } + break; + + case PACKET_RECOVERY_STATE: + if (pkt_start_recovery(pkt)) { + pkt_start_write(pd, pkt); + } else { + pkt_dbg(2, pd, "No recovery possible\n"); + pkt_set_state(pkt, PACKET_FINISHED_STATE); + } + break; + + case PACKET_FINISHED_STATE: + uptodate = test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags); + pkt_finish_packet(pkt, uptodate); + return; + + default: + BUG(); + break; + } + } +} + +static void pkt_handle_packets(struct pktcdvd_device *pd) +{ + struct packet_data *pkt, *next; + + /* + * Run state machine for active packets + */ + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (atomic_read(&pkt->run_sm) > 0) { + atomic_set(&pkt->run_sm, 0); + pkt_run_state_machine(pd, pkt); + } + } + + /* + * Move no longer active packets to the free list + */ + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) { + if (pkt->state == PACKET_FINISHED_STATE) { + list_del(&pkt->list); + pkt_put_packet_data(pd, pkt); + pkt_set_state(pkt, PACKET_IDLE_STATE); + atomic_set(&pd->scan_queue, 1); + } + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +static void pkt_count_states(struct pktcdvd_device *pd, int *states) +{ + struct packet_data *pkt; + int i; + + for (i = 0; i < PACKET_NUM_STATES; i++) + states[i] = 0; + + spin_lock(&pd->cdrw.active_list_lock); + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + states[pkt->state]++; + } + spin_unlock(&pd->cdrw.active_list_lock); +} + +/* + * kcdrwd is woken up when writes have been queued for one of our + * registered devices + */ +static int kcdrwd(void *foobar) +{ + struct pktcdvd_device *pd = foobar; + struct packet_data *pkt; + long min_sleep_time, residue; + + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + DECLARE_WAITQUEUE(wait, current); + + /* + * Wait until there is something to do + */ + add_wait_queue(&pd->wqueue, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + /* Check if we need to run pkt_handle_queue */ + if (atomic_read(&pd->scan_queue) > 0) + goto work_to_do; + + /* Check if we need to run the state machine for some packet */ + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (atomic_read(&pkt->run_sm) > 0) + goto work_to_do; + } + + /* Check if we need to process the iosched queues */ + if (atomic_read(&pd->iosched.attention) != 0) + goto work_to_do; + + /* Otherwise, go to sleep */ + if (PACKET_DEBUG > 1) { + int states[PACKET_NUM_STATES]; + pkt_count_states(pd, states); + pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], + states[3], states[4], states[5]); + } + + min_sleep_time = MAX_SCHEDULE_TIMEOUT; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sleep_time && pkt->sleep_time < min_sleep_time) + min_sleep_time = pkt->sleep_time; + } + + pkt_dbg(2, pd, "sleeping\n"); + residue = schedule_timeout(min_sleep_time); + pkt_dbg(2, pd, "wake up\n"); + + /* make swsusp happy with our thread */ + try_to_freeze(); + + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (!pkt->sleep_time) + continue; + pkt->sleep_time -= min_sleep_time - residue; + if (pkt->sleep_time <= 0) { + pkt->sleep_time = 0; + atomic_inc(&pkt->run_sm); + } + } + + if (kthread_should_stop()) + break; + } +work_to_do: + set_current_state(TASK_RUNNING); + remove_wait_queue(&pd->wqueue, &wait); + + if (kthread_should_stop()) + break; + + /* + * if pkt_handle_queue returns true, we can queue + * another request. + */ + while (pkt_handle_queue(pd)) + ; + + /* + * Handle packet state machine + */ + pkt_handle_packets(pd); + + /* + * Handle iosched queues + */ + pkt_iosched_process_queue(pd); + } + + return 0; +} + +static void pkt_print_settings(struct pktcdvd_device *pd) +{ + pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? '1' : '2'); +} + +static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + + cgc->cmd[0] = GPCMD_MODE_SENSE_10; + cgc->cmd[2] = page_code | (page_control << 6); + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_READ; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc) +{ + memset(cgc->cmd, 0, sizeof(cgc->cmd)); + memset(cgc->buffer, 0, 2); + cgc->cmd[0] = GPCMD_MODE_SELECT_10; + cgc->cmd[1] = 0x10; /* PF */ + cgc->cmd[7] = cgc->buflen >> 8; + cgc->cmd[8] = cgc->buflen & 0xff; + cgc->data_direction = CGC_DATA_WRITE; + return pkt_generic_packet(pd, cgc); +} + +static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di) +{ + struct packet_command cgc; + int ret; + + /* set up command and get the disc info */ + init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_DISC_INFO; + cgc.cmd[8] = cgc.buflen = 2; + cgc.quiet = 1; + + if ((ret = pkt_generic_packet(pd, &cgc))) + return ret; + + /* not all drives have the same disc_info length, so requeue + * packet with the length the drive tells us it can supply + */ + cgc.buflen = be16_to_cpu(di->disc_information_length) + + sizeof(di->disc_information_length); + + if (cgc.buflen > sizeof(disc_information)) + cgc.buflen = sizeof(disc_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti) +{ + struct packet_command cgc; + int ret; + + init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ); + cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO; + cgc.cmd[1] = type & 3; + cgc.cmd[4] = (track & 0xff00) >> 8; + cgc.cmd[5] = track & 0xff; + cgc.cmd[8] = 8; + cgc.quiet = 1; + + if ((ret = pkt_generic_packet(pd, &cgc))) + return ret; + + cgc.buflen = be16_to_cpu(ti->track_information_length) + + sizeof(ti->track_information_length); + + if (cgc.buflen > sizeof(track_information)) + cgc.buflen = sizeof(track_information); + + cgc.cmd[8] = cgc.buflen; + return pkt_generic_packet(pd, &cgc); +} + +static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd, + long *last_written) +{ + disc_information di; + track_information ti; + __u32 last_track; + int ret = -1; + + if ((ret = pkt_get_disc_info(pd, &di))) + return ret; + + last_track = (di.last_track_msb << 8) | di.last_track_lsb; + if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + return ret; + + /* if this track is blank, try the previous. */ + if (ti.blank) { + last_track--; + if ((ret = pkt_get_track_info(pd, last_track, 1, &ti))) + return ret; + } + + /* if last recorded field is valid, return it. */ + if (ti.lra_v) { + *last_written = be32_to_cpu(ti.last_rec_address); + } else { + /* make it up instead */ + *last_written = be32_to_cpu(ti.track_start) + + be32_to_cpu(ti.track_size); + if (ti.free_blocks) + *last_written -= (be32_to_cpu(ti.free_blocks) + 7); + } + return 0; +} + +/* + * write mode select package based on pd->settings + */ +static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct request_sense sense; + write_param_page *wp; + char buffer[128]; + int ret, size; + + /* doesn't apply to DVD+RW or DVD-RAM */ + if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12)) + return 0; + + memset(buffer, 0, sizeof(buffer)); + init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); + cgc.sense = &sense; + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff)); + pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff); + if (size > sizeof(buffer)) + size = sizeof(buffer); + + /* + * now get it all + */ + init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); + cgc.sense = &sense; + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + /* + * write page is offset header + block descriptor length + */ + wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset]; + + wp->fp = pd->settings.fp; + wp->track_mode = pd->settings.track_mode; + wp->write_type = pd->settings.write_type; + wp->data_block_type = pd->settings.block_mode; + + wp->multi_session = 0; + +#ifdef PACKET_USE_LS + wp->link_size = 7; + wp->ls_v = 1; +#endif + + if (wp->data_block_type == PACKET_BLOCK_MODE1) { + wp->session_format = 0; + wp->subhdr2 = 0x20; + } else if (wp->data_block_type == PACKET_BLOCK_MODE2) { + wp->session_format = 0x20; + wp->subhdr2 = 8; +#if 0 + wp->mcn[0] = 0x80; + memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1); +#endif + } else { + /* + * paranoia + */ + pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); + return 1; + } + wp->packet_size = cpu_to_be32(pd->settings.size >> 2); + + cgc.buflen = cgc.cmd[8] = size; + if ((ret = pkt_mode_select(pd, &cgc))) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + pkt_print_settings(pd); + return 0; +} + +/* + * 1 -- we can write to this track, 0 -- we can't + */ +static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) +{ + switch (pd->mmc3_profile) { + case 0x1a: /* DVD+RW */ + case 0x12: /* DVD-RAM */ + /* The track is always writable on DVD+RW/DVD-RAM */ + return 1; + default: + break; + } + + if (!ti->packet || !ti->fp) + return 0; + + /* + * "good" settings as per Mt Fuji. + */ + if (ti->rt == 0 && ti->blank == 0) + return 1; + + if (ti->rt == 0 && ti->blank == 1) + return 1; + + if (ti->rt == 1 && ti->blank == 0) + return 1; + + pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + return 0; +} + +/* + * 1 -- we can write to this disc, 0 -- we can't + */ +static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) +{ + switch (pd->mmc3_profile) { + case 0x0a: /* CD-RW */ + case 0xffff: /* MMC3 not supported */ + break; + case 0x1a: /* DVD+RW */ + case 0x13: /* DVD-RW */ + case 0x12: /* DVD-RAM */ + return 1; + default: + pkt_dbg(2, pd, "Wrong disc profile (%x)\n", + pd->mmc3_profile); + return 0; + } + + /* + * for disc type 0xff we should probably reserve a new track. + * but i'm not sure, should we leave this to user apps? probably. + */ + if (di->disc_type == 0xff) { + pkt_notice(pd, "unknown disc - no track?\n"); + return 0; + } + + if (di->disc_type != 0x20 && di->disc_type != 0) { + pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); + return 0; + } + + if (di->erasable == 0) { + pkt_notice(pd, "disc not erasable\n"); + return 0; + } + + if (di->border_status == PACKET_SESSION_RESERVED) { + pkt_err(pd, "can't write to last track (reserved)\n"); + return 0; + } + + return 1; +} + +static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + unsigned char buf[12]; + disc_information di; + track_information ti; + int ret, track; + + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.cmd[0] = GPCMD_GET_CONFIGURATION; + cgc.cmd[8] = 8; + ret = pkt_generic_packet(pd, &cgc); + pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7]; + + memset(&di, 0, sizeof(disc_information)); + memset(&ti, 0, sizeof(track_information)); + + if ((ret = pkt_get_disc_info(pd, &di))) { + pkt_err(pd, "failed get_disc\n"); + return ret; + } + + if (!pkt_writable_disc(pd, &di)) + return -EROFS; + + pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR; + + track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ + if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { + pkt_err(pd, "failed get_track\n"); + return ret; + } + + if (!pkt_writable_track(pd, &ti)) { + pkt_err(pd, "can't write to this track\n"); + return -EROFS; + } + + /* + * we keep packet size in 512 byte units, makes it easier to + * deal with request calculations. + */ + pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; + if (pd->settings.size == 0) { + pkt_notice(pd, "detected zero packet size!\n"); + return -ENXIO; + } + if (pd->settings.size > PACKET_MAX_SECTORS) { + pkt_err(pd, "packet size is too big\n"); + return -EROFS; + } + pd->settings.fp = ti.fp; + pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1); + + if (ti.nwa_v) { + pd->nwa = be32_to_cpu(ti.next_writable); + set_bit(PACKET_NWA_VALID, &pd->flags); + } + + /* + * in theory we could use lra on -RW media as well and just zero + * blocks that haven't been written yet, but in practice that + * is just a no-go. we'll use that for -R, naturally. + */ + if (ti.lra_v) { + pd->lra = be32_to_cpu(ti.last_rec_address); + set_bit(PACKET_LRA_VALID, &pd->flags); + } else { + pd->lra = 0xffffffff; + set_bit(PACKET_LRA_VALID, &pd->flags); + } + + /* + * fine for now + */ + pd->settings.link_loss = 7; + pd->settings.write_type = 0; /* packet */ + pd->settings.track_mode = ti.track_mode; + + /* + * mode1 or mode2 disc + */ + switch (ti.data_mode) { + case PACKET_MODE1: + pd->settings.block_mode = PACKET_BLOCK_MODE1; + break; + case PACKET_MODE2: + pd->settings.block_mode = PACKET_BLOCK_MODE2; + break; + default: + pkt_err(pd, "unknown data mode\n"); + return -EROFS; + } + return 0; +} + +/* + * enable/disable write caching on drive + */ +static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, + int set) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[64]; + int ret; + + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ); + cgc.sense = &sense; + cgc.buflen = pd->mode_offset + 12; + + /* + * caching mode page might not be there, so quiet this command + */ + cgc.quiet = 1; + + if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0))) + return ret; + + buf[pd->mode_offset + 10] |= (!!set << 2); + + cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); + ret = pkt_mode_select(pd, &cgc); + if (ret) { + pkt_err(pd, "write caching control failed\n"); + pkt_dump_sense(pd, &cgc); + } else if (!ret && set) + pkt_notice(pd, "enabled write caching\n"); + return ret; +} + +static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag) +{ + struct packet_command cgc; + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL; + cgc.cmd[4] = lockflag ? 1 : 0; + return pkt_generic_packet(pd, &cgc); +} + +/* + * Returns drive maximum write speed + */ +static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, + unsigned *write_speed) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[256+18]; + unsigned char *cap_buf; + int ret, offset; + + cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset]; + init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN); + cgc.sense = &sense; + + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + cgc.buflen = pd->mode_offset + cap_buf[1] + 2 + + sizeof(struct mode_page_header); + ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + } + + offset = 20; /* Obsoleted field, used by older drives */ + if (cap_buf[1] >= 28) + offset = 28; /* Current write speed selected */ + if (cap_buf[1] >= 30) { + /* If the drive reports at least one "Logical Unit Write + * Speed Performance Descriptor Block", use the information + * in the first block. (contains the highest speed) + */ + int num_spdb = (cap_buf[30] << 8) + cap_buf[31]; + if (num_spdb > 0) + offset = 34; + } + + *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1]; + return 0; +} + +/* These tables from cdrecord - I don't have orange book */ +/* standard speed CD-RW (1-4x) */ +static char clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* high speed CD-RW (-10x) */ +static char hs_clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +/* ultra high speed CD-RW */ +static char us_clv_to_speed[16] = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ + 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0 +}; + +/* + * reads the maximum media speed from ATIP + */ +static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, + unsigned *speed) +{ + struct packet_command cgc; + struct request_sense sense; + unsigned char buf[64]; + unsigned int size, st, sp; + int ret; + + init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; + cgc.cmd[1] = 2; + cgc.cmd[2] = 4; /* READ ATIP */ + cgc.cmd[8] = 2; + ret = pkt_generic_packet(pd, &cgc); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + size = ((unsigned int) buf[0]<<8) + buf[1] + 2; + if (size > sizeof(buf)) + size = sizeof(buf); + + init_cdrom_command(&cgc, buf, size, CGC_DATA_READ); + cgc.sense = &sense; + cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP; + cgc.cmd[1] = 2; + cgc.cmd[2] = 4; + cgc.cmd[8] = size; + ret = pkt_generic_packet(pd, &cgc); + if (ret) { + pkt_dump_sense(pd, &cgc); + return ret; + } + + if (!(buf[6] & 0x40)) { + pkt_notice(pd, "disc type is not CD-RW\n"); + return 1; + } + if (!(buf[6] & 0x4)) { + pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); + return 1; + } + + st = (buf[6] >> 3) & 0x7; /* disc sub-type */ + + sp = buf[16] & 0xf; /* max speed from ATIP A1 field */ + + /* Info from cdrecord */ + switch (st) { + case 0: /* standard speed */ + *speed = clv_to_speed[sp]; + break; + case 1: /* high speed */ + *speed = hs_clv_to_speed[sp]; + break; + case 2: /* ultra high speed */ + *speed = us_clv_to_speed[sp]; + break; + default: + pkt_notice(pd, "unknown disc sub-type %d\n", st); + return 1; + } + if (*speed) { + pkt_info(pd, "maximum media speed: %d\n", *speed); + return 0; + } else { + pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); + return 1; + } +} + +static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) +{ + struct packet_command cgc; + struct request_sense sense; + int ret; + + pkt_dbg(2, pd, "Performing OPC\n"); + + init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); + cgc.sense = &sense; + cgc.timeout = 60*HZ; + cgc.cmd[0] = GPCMD_SEND_OPC; + cgc.cmd[1] = 1; + if ((ret = pkt_generic_packet(pd, &cgc))) + pkt_dump_sense(pd, &cgc); + return ret; +} + +static int pkt_open_write(struct pktcdvd_device *pd) +{ + int ret; + unsigned int write_speed, media_write_speed, read_speed; + + if ((ret = pkt_probe_settings(pd))) { + pkt_dbg(2, pd, "failed probe\n"); + return ret; + } + + if ((ret = pkt_set_write_settings(pd))) { + pkt_dbg(1, pd, "failed saving write settings\n"); + return -EIO; + } + + pkt_write_caching(pd, USE_WCACHING); + + if ((ret = pkt_get_max_speed(pd, &write_speed))) + write_speed = 16 * 177; + switch (pd->mmc3_profile) { + case 0x13: /* DVD-RW */ + case 0x1a: /* DVD+RW */ + case 0x12: /* DVD-RAM */ + pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); + break; + default: + if ((ret = pkt_media_speed(pd, &media_write_speed))) + media_write_speed = 16; + write_speed = min(write_speed, media_write_speed * 177); + pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); + break; + } + read_speed = write_speed; + + if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { + pkt_dbg(1, pd, "couldn't set write speed\n"); + return -EIO; + } + pd->write_speed = write_speed; + pd->read_speed = read_speed; + + if ((ret = pkt_perform_opc(pd))) { + pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); + } + + return 0; +} + +/* + * called at open time. + */ +static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) +{ + int ret; + long lba; + struct request_queue *q; + + /* + * We need to re-open the cdrom device without O_NONBLOCK to be able + * to read/write from/to it. It is already opened in O_NONBLOCK mode + * so bdget() can't fail. + */ + bdget(pd->bdev->bd_dev); + if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) + goto out; + + if ((ret = pkt_get_last_written(pd, &lba))) { + pkt_err(pd, "pkt_get_last_written failed\n"); + goto out_putdev; + } + + set_capacity(pd->disk, lba << 2); + set_capacity(pd->bdev->bd_disk, lba << 2); + bd_set_size(pd->bdev, (loff_t)lba << 11); + + q = bdev_get_queue(pd->bdev); + if (write) { + if ((ret = pkt_open_write(pd))) + goto out_putdev; + /* + * Some CDRW drives can not handle writes larger than one packet, + * even if the size is a multiple of the packet size. + */ + spin_lock_irq(q->queue_lock); + blk_queue_max_hw_sectors(q, pd->settings.size); + spin_unlock_irq(q->queue_lock); + set_bit(PACKET_WRITABLE, &pd->flags); + } else { + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + clear_bit(PACKET_WRITABLE, &pd->flags); + } + + if ((ret = pkt_set_segment_merging(pd, q))) + goto out_putdev; + + if (write) { + if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { + pkt_err(pd, "not enough memory for buffers\n"); + ret = -ENOMEM; + goto out_putdev; + } + pkt_info(pd, "%lukB available on disc\n", lba << 1); + } + + return 0; + +out_putdev: + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); +out: + return ret; +} + +/* + * called when the device is closed. makes sure that the device flushes + * the internal cache before we close. + */ +static void pkt_release_dev(struct pktcdvd_device *pd, int flush) +{ + if (flush && pkt_flush_cache(pd)) + pkt_dbg(1, pd, "not flushing cache\n"); + + pkt_lock_door(pd, 0); + + pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); + + pkt_shrink_pktlist(pd); +} + +static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor) +{ + if (dev_minor >= MAX_WRITERS) + return NULL; + return pkt_devs[dev_minor]; +} + +static int pkt_open(struct block_device *bdev, fmode_t mode) +{ + struct pktcdvd_device *pd = NULL; + int ret; + + mutex_lock(&pktcdvd_mutex); + mutex_lock(&ctl_mutex); + pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); + if (!pd) { + ret = -ENODEV; + goto out; + } + BUG_ON(pd->refcnt < 0); + + pd->refcnt++; + if (pd->refcnt > 1) { + if ((mode & FMODE_WRITE) && + !test_bit(PACKET_WRITABLE, &pd->flags)) { + ret = -EBUSY; + goto out_dec; + } + } else { + ret = pkt_open_dev(pd, mode & FMODE_WRITE); + if (ret) + goto out_dec; + /* + * needed here as well, since ext2 (among others) may change + * the blocksize at mount time + */ + set_blocksize(bdev, CD_FRAMESIZE); + } + + mutex_unlock(&ctl_mutex); + mutex_unlock(&pktcdvd_mutex); + return 0; + +out_dec: + pd->refcnt--; +out: + mutex_unlock(&ctl_mutex); + mutex_unlock(&pktcdvd_mutex); + return ret; +} + +static void pkt_close(struct gendisk *disk, fmode_t mode) +{ + struct pktcdvd_device *pd = disk->private_data; + + mutex_lock(&pktcdvd_mutex); + mutex_lock(&ctl_mutex); + pd->refcnt--; + BUG_ON(pd->refcnt < 0); + if (pd->refcnt == 0) { + int flush = test_bit(PACKET_WRITABLE, &pd->flags); + pkt_release_dev(pd, flush); + } + mutex_unlock(&ctl_mutex); + mutex_unlock(&pktcdvd_mutex); +} + + +static void pkt_end_io_read_cloned(struct bio *bio, int err) +{ + struct packet_stacked_data *psd = bio->bi_private; + struct pktcdvd_device *pd = psd->pd; + + bio_put(bio); + bio_endio(psd->bio, err); + mempool_free(psd, psd_pool); + pkt_bio_finished(pd); +} + +static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) +{ + struct bio *cloned_bio = bio_clone(bio, GFP_NOIO); + struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO); + + psd->pd = pd; + psd->bio = bio; + cloned_bio->bi_bdev = pd->bdev; + cloned_bio->bi_private = psd; + cloned_bio->bi_end_io = pkt_end_io_read_cloned; + pd->stats.secs_r += bio_sectors(bio); + pkt_queue_bio(pd, cloned_bio); +} + +static void pkt_make_request_write(struct request_queue *q, struct bio *bio) +{ + struct pktcdvd_device *pd = q->queuedata; + sector_t zone; + struct packet_data *pkt; + int was_empty, blocked_bio; + struct pkt_rb_node *node; + + zone = get_zone(bio->bi_iter.bi_sector, pd); + + /* + * If we find a matching packet in state WAITING or READ_WAIT, we can + * just append this bio to that packet. + */ + spin_lock(&pd->cdrw.active_list_lock); + blocked_bio = 0; + list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) { + if (pkt->sector == zone) { + spin_lock(&pkt->lock); + if ((pkt->state == PACKET_WAITING_STATE) || + (pkt->state == PACKET_READ_WAIT_STATE)) { + bio_list_add(&pkt->orig_bios, bio); + pkt->write_size += + bio->bi_iter.bi_size / CD_FRAMESIZE; + if ((pkt->write_size >= pkt->frames) && + (pkt->state == PACKET_WAITING_STATE)) { + atomic_inc(&pkt->run_sm); + wake_up(&pd->wqueue); + } + spin_unlock(&pkt->lock); + spin_unlock(&pd->cdrw.active_list_lock); + return; + } else { + blocked_bio = 1; + } + spin_unlock(&pkt->lock); + } + } + spin_unlock(&pd->cdrw.active_list_lock); + + /* + * Test if there is enough room left in the bio work queue + * (queue size >= congestion on mark). + * If not, wait till the work queue size is below the congestion off mark. + */ + spin_lock(&pd->lock); + if (pd->write_congestion_on > 0 + && pd->bio_queue_size >= pd->write_congestion_on) { + set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC); + do { + spin_unlock(&pd->lock); + congestion_wait(BLK_RW_ASYNC, HZ); + spin_lock(&pd->lock); + } while(pd->bio_queue_size > pd->write_congestion_off); + } + spin_unlock(&pd->lock); + + /* + * No matching packet found. Store the bio in the work queue. + */ + node = mempool_alloc(pd->rb_pool, GFP_NOIO); + node->bio = bio; + spin_lock(&pd->lock); + BUG_ON(pd->bio_queue_size < 0); + was_empty = (pd->bio_queue_size == 0); + pkt_rbtree_insert(pd, node); + spin_unlock(&pd->lock); + + /* + * Wake up the worker thread. + */ + atomic_set(&pd->scan_queue, 1); + if (was_empty) { + /* This wake_up is required for correct operation */ + wake_up(&pd->wqueue); + } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) { + /* + * This wake up is not required for correct operation, + * but improves performance in some cases. + */ + wake_up(&pd->wqueue); + } +} + +static void pkt_make_request(struct request_queue *q, struct bio *bio) +{ + struct pktcdvd_device *pd; + char b[BDEVNAME_SIZE]; + struct bio *split; + + pd = q->queuedata; + if (!pd) { + pr_err("%s incorrect request queue\n", + bdevname(bio->bi_bdev, b)); + goto end_io; + } + + pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", + (unsigned long long)bio->bi_iter.bi_sector, + (unsigned long long)bio_end_sector(bio)); + + /* + * Clone READ bios so we can have our own bi_end_io callback. + */ + if (bio_data_dir(bio) == READ) { + pkt_make_request_read(pd, bio); + return; + } + + if (!test_bit(PACKET_WRITABLE, &pd->flags)) { + pkt_notice(pd, "WRITE for ro device (%llu)\n", + (unsigned long long)bio->bi_iter.bi_sector); + goto end_io; + } + + if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) { + pkt_err(pd, "wrong bio size\n"); + goto end_io; + } + + blk_queue_bounce(q, &bio); + + do { + sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); + sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); + + if (last_zone != zone) { + BUG_ON(last_zone != zone + pd->settings.size); + + split = bio_split(bio, last_zone - + bio->bi_iter.bi_sector, + GFP_NOIO, fs_bio_set); + bio_chain(split, bio); + } else { + split = bio; + } + + pkt_make_request_write(q, split); + } while (split != bio); + + return; +end_io: + bio_io_error(bio); +} + + + +static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) +{ + struct pktcdvd_device *pd = q->queuedata; + sector_t zone = get_zone(bmd->bi_sector, pd); + int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; + int remaining = (pd->settings.size << 9) - used; + int remaining2; + + /* + * A bio <= PAGE_SIZE must be allowed. If it crosses a packet + * boundary, pkt_make_request() will split the bio. + */ + remaining2 = PAGE_SIZE - bmd->bi_size; + remaining = max(remaining, remaining2); + + BUG_ON(remaining < 0); + return remaining; +} + +static void pkt_init_queue(struct pktcdvd_device *pd) +{ + struct request_queue *q = pd->disk->queue; + + blk_queue_make_request(q, pkt_make_request); + blk_queue_logical_block_size(q, CD_FRAMESIZE); + blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); + blk_queue_merge_bvec(q, pkt_merge_bvec); + q->queuedata = pd; +} + +static int pkt_seq_show(struct seq_file *m, void *p) +{ + struct pktcdvd_device *pd = m->private; + char *msg; + char bdev_buf[BDEVNAME_SIZE]; + int states[PACKET_NUM_STATES]; + + seq_printf(m, "Writer %s mapped to %s:\n", pd->name, + bdevname(pd->bdev, bdev_buf)); + + seq_printf(m, "\nSettings:\n"); + seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); + + if (pd->settings.write_type == 0) + msg = "Packet"; + else + msg = "Unknown"; + seq_printf(m, "\twrite type:\t\t%s\n", msg); + + seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable"); + seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss); + + seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode); + + if (pd->settings.block_mode == PACKET_BLOCK_MODE1) + msg = "Mode 1"; + else if (pd->settings.block_mode == PACKET_BLOCK_MODE2) + msg = "Mode 2"; + else + msg = "Unknown"; + seq_printf(m, "\tblock mode:\t\t%s\n", msg); + + seq_printf(m, "\nStatistics:\n"); + seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started); + seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended); + seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1); + seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1); + seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1); + + seq_printf(m, "\nMisc:\n"); + seq_printf(m, "\treference count:\t%d\n", pd->refcnt); + seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags); + seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed); + seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed); + seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset); + seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset); + + seq_printf(m, "\nQueue state:\n"); + seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size); + seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios)); + seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector); + + pkt_count_states(pd, states); + seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], states[3], states[4], states[5]); + + seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n", + pd->write_congestion_off, + pd->write_congestion_on); + return 0; +} + +static int pkt_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, pkt_seq_show, PDE_DATA(inode)); +} + +static const struct file_operations pkt_proc_fops = { + .open = pkt_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) +{ + int i; + int ret = 0; + char b[BDEVNAME_SIZE]; + struct block_device *bdev; + + if (pd->pkt_dev == dev) { + pkt_err(pd, "recursive setup not allowed\n"); + return -EBUSY; + } + for (i = 0; i < MAX_WRITERS; i++) { + struct pktcdvd_device *pd2 = pkt_devs[i]; + if (!pd2) + continue; + if (pd2->bdev->bd_dev == dev) { + pkt_err(pd, "%s already setup\n", + bdevname(pd2->bdev, b)); + return -EBUSY; + } + if (pd2->pkt_dev == dev) { + pkt_err(pd, "can't chain pktcdvd devices\n"); + return -EBUSY; + } + } + + bdev = bdget(dev); + if (!bdev) + return -ENOMEM; + ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); + if (ret) + return ret; + + /* This is safe, since we have a reference from open(). */ + __module_get(THIS_MODULE); + + pd->bdev = bdev; + set_blocksize(bdev, CD_FRAMESIZE); + + pkt_init_queue(pd); + + atomic_set(&pd->cdrw.pending_bios, 0); + pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); + if (IS_ERR(pd->cdrw.thread)) { + pkt_err(pd, "can't start kernel thread\n"); + ret = -ENOMEM; + goto out_mem; + } + + proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); + pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b)); + return 0; + +out_mem: + blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + return ret; +} + +static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) +{ + struct pktcdvd_device *pd = bdev->bd_disk->private_data; + int ret; + + pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", + cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + + mutex_lock(&pktcdvd_mutex); + switch (cmd) { + case CDROMEJECT: + /* + * The door gets locked when the device is opened, so we + * have to unlock it or else the eject command fails. + */ + if (pd->refcnt == 1) + pkt_lock_door(pd, 0); + /* fallthru */ + /* + * forward selected CDROM ioctls to CD-ROM, for UDF + */ + case CDROMMULTISESSION: + case CDROMREADTOCENTRY: + case CDROM_LAST_WRITTEN: + case CDROM_SEND_PACKET: + case SCSI_IOCTL_SEND_COMMAND: + ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg); + break; + + default: + pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); + ret = -ENOTTY; + } + mutex_unlock(&pktcdvd_mutex); + + return ret; +} + +static unsigned int pkt_check_events(struct gendisk *disk, + unsigned int clearing) +{ + struct pktcdvd_device *pd = disk->private_data; + struct gendisk *attached_disk; + + if (!pd) + return 0; + if (!pd->bdev) + return 0; + attached_disk = pd->bdev->bd_disk; + if (!attached_disk || !attached_disk->fops->check_events) + return 0; + return attached_disk->fops->check_events(attached_disk, clearing); +} + +static const struct block_device_operations pktcdvd_ops = { + .owner = THIS_MODULE, + .open = pkt_open, + .release = pkt_close, + .ioctl = pkt_ioctl, + .check_events = pkt_check_events, +}; + +static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name); +} + +/* + * Set up mapping from pktcdvd device to CD-ROM device. + */ +static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) +{ + int idx; + int ret = -ENOMEM; + struct pktcdvd_device *pd; + struct gendisk *disk; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + for (idx = 0; idx < MAX_WRITERS; idx++) + if (!pkt_devs[idx]) + break; + if (idx == MAX_WRITERS) { + pr_err("max %d writers supported\n", MAX_WRITERS); + ret = -EBUSY; + goto out_mutex; + } + + pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL); + if (!pd) + goto out_mutex; + + pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE, + sizeof(struct pkt_rb_node)); + if (!pd->rb_pool) + goto out_mem; + + INIT_LIST_HEAD(&pd->cdrw.pkt_free_list); + INIT_LIST_HEAD(&pd->cdrw.pkt_active_list); + spin_lock_init(&pd->cdrw.active_list_lock); + + spin_lock_init(&pd->lock); + spin_lock_init(&pd->iosched.lock); + bio_list_init(&pd->iosched.read_queue); + bio_list_init(&pd->iosched.write_queue); + sprintf(pd->name, DRIVER_NAME"%d", idx); + init_waitqueue_head(&pd->wqueue); + pd->bio_queue = RB_ROOT; + + pd->write_congestion_on = write_congestion_on; + pd->write_congestion_off = write_congestion_off; + + disk = alloc_disk(1); + if (!disk) + goto out_mem; + pd->disk = disk; + disk->major = pktdev_major; + disk->first_minor = idx; + disk->fops = &pktcdvd_ops; + disk->flags = GENHD_FL_REMOVABLE; + strcpy(disk->disk_name, pd->name); + disk->devnode = pktcdvd_devnode; + disk->private_data = pd; + disk->queue = blk_alloc_queue(GFP_KERNEL); + if (!disk->queue) + goto out_mem2; + + pd->pkt_dev = MKDEV(pktdev_major, idx); + ret = pkt_new_dev(pd, dev); + if (ret) + goto out_new_dev; + + /* inherit events of the host device */ + disk->events = pd->bdev->bd_disk->events; + disk->async_events = pd->bdev->bd_disk->async_events; + + add_disk(disk); + + pkt_sysfs_dev_new(pd); + pkt_debugfs_dev_new(pd); + + pkt_devs[idx] = pd; + if (pkt_dev) + *pkt_dev = pd->pkt_dev; + + mutex_unlock(&ctl_mutex); + return 0; + +out_new_dev: + blk_cleanup_queue(disk->queue); +out_mem2: + put_disk(disk); +out_mem: + if (pd->rb_pool) + mempool_destroy(pd->rb_pool); + kfree(pd); +out_mutex: + mutex_unlock(&ctl_mutex); + pr_err("setup of pktcdvd device failed\n"); + return ret; +} + +/* + * Tear down mapping from pktcdvd device to CD-ROM device. + */ +static int pkt_remove_dev(dev_t pkt_dev) +{ + struct pktcdvd_device *pd; + int idx; + int ret = 0; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + for (idx = 0; idx < MAX_WRITERS; idx++) { + pd = pkt_devs[idx]; + if (pd && (pd->pkt_dev == pkt_dev)) + break; + } + if (idx == MAX_WRITERS) { + pr_debug("dev not setup\n"); + ret = -ENXIO; + goto out; + } + + if (pd->refcnt > 0) { + ret = -EBUSY; + goto out; + } + if (!IS_ERR(pd->cdrw.thread)) + kthread_stop(pd->cdrw.thread); + + pkt_devs[idx] = NULL; + + pkt_debugfs_dev_remove(pd); + pkt_sysfs_dev_remove(pd); + + blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); + + remove_proc_entry(pd->name, pkt_proc); + pkt_dbg(1, pd, "writer unmapped\n"); + + del_gendisk(pd->disk); + blk_cleanup_queue(pd->disk->queue); + put_disk(pd->disk); + + mempool_destroy(pd->rb_pool); + kfree(pd); + + /* This is safe: open() is still holding a reference. */ + module_put(THIS_MODULE); + +out: + mutex_unlock(&ctl_mutex); + return ret; +} + +static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) +{ + struct pktcdvd_device *pd; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + + pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index); + if (pd) { + ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev); + ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev); + } else { + ctrl_cmd->dev = 0; + ctrl_cmd->pkt_dev = 0; + } + ctrl_cmd->num_devices = MAX_WRITERS; + + mutex_unlock(&ctl_mutex); +} + +static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct pkt_ctrl_command ctrl_cmd; + int ret = 0; + dev_t pkt_dev = 0; + + if (cmd != PACKET_CTRL_CMD) + return -ENOTTY; + + if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + + switch (ctrl_cmd.command) { + case PKT_CTRL_CMD_SETUP: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev); + ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev); + break; + case PKT_CTRL_CMD_TEARDOWN: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev)); + break; + case PKT_CTRL_CMD_STATUS: + pkt_get_status(&ctrl_cmd); + break; + default: + return -ENOTTY; + } + + if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command))) + return -EFAULT; + return ret; +} + +#ifdef CONFIG_COMPAT +static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +static const struct file_operations pkt_ctl_fops = { + .open = nonseekable_open, + .unlocked_ioctl = pkt_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pkt_ctl_compat_ioctl, +#endif + .owner = THIS_MODULE, + .llseek = no_llseek, +}; + +static struct miscdevice pkt_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = DRIVER_NAME, + .nodename = "pktcdvd/control", + .fops = &pkt_ctl_fops +}; + +static int __init pkt_init(void) +{ + int ret; + + mutex_init(&ctl_mutex); + + psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE, + sizeof(struct packet_stacked_data)); + if (!psd_pool) + return -ENOMEM; + + ret = register_blkdev(pktdev_major, DRIVER_NAME); + if (ret < 0) { + pr_err("unable to register block device\n"); + goto out2; + } + if (!pktdev_major) + pktdev_major = ret; + + ret = pkt_sysfs_init(); + if (ret) + goto out; + + pkt_debugfs_init(); + + ret = misc_register(&pkt_misc); + if (ret) { + pr_err("unable to register misc device\n"); + goto out_misc; + } + + pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL); + + return 0; + +out_misc: + pkt_debugfs_cleanup(); + pkt_sysfs_cleanup(); +out: + unregister_blkdev(pktdev_major, DRIVER_NAME); +out2: + mempool_destroy(psd_pool); + return ret; +} + +static void __exit pkt_exit(void) +{ + remove_proc_entry("driver/"DRIVER_NAME, NULL); + misc_deregister(&pkt_misc); + + pkt_debugfs_cleanup(); + pkt_sysfs_cleanup(); + + unregister_blkdev(pktdev_major, DRIVER_NAME); + mempool_destroy(psd_pool); +} + +MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives"); +MODULE_AUTHOR("Jens Axboe "); +MODULE_LICENSE("GPL"); + +module_init(pkt_init); +module_exit(pkt_exit); diff --git a/kernel/drivers/block/pmem.c b/kernel/drivers/block/pmem.c new file mode 100644 index 000000000..eabf4a8d0 --- /dev/null +++ b/kernel/drivers/block/pmem.c @@ -0,0 +1,262 @@ +/* + * Persistent Memory Driver + * + * Copyright (c) 2014, Intel Corporation. + * Copyright (c) 2015, Christoph Hellwig . + * Copyright (c) 2015, Boaz Harrosh . + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PMEM_MINORS 16 + +struct pmem_device { + struct request_queue *pmem_queue; + struct gendisk *pmem_disk; + + /* One contiguous memory region per device */ + phys_addr_t phys_addr; + void *virt_addr; + size_t size; +}; + +static int pmem_major; +static atomic_t pmem_index; + +static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + void *mem = kmap_atomic(page); + size_t pmem_off = sector << 9; + + if (rw == READ) { + memcpy(mem + off, pmem->virt_addr + pmem_off, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + memcpy(pmem->virt_addr + pmem_off, mem + off, len); + } + + kunmap_atomic(mem); +} + +static void pmem_make_request(struct request_queue *q, struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct pmem_device *pmem = bdev->bd_disk->private_data; + int rw; + struct bio_vec bvec; + sector_t sector; + struct bvec_iter iter; + int err = 0; + + if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) { + err = -EIO; + goto out; + } + + BUG_ON(bio->bi_rw & REQ_DISCARD); + + rw = bio_data_dir(bio); + sector = bio->bi_iter.bi_sector; + bio_for_each_segment(bvec, bio, iter) { + pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, + rw, sector); + sector += bvec.bv_len >> 9; + } + +out: + bio_endio(bio, err); +} + +static int pmem_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct pmem_device *pmem = bdev->bd_disk->private_data; + + pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, 0); + + return 0; +} + +static long pmem_direct_access(struct block_device *bdev, sector_t sector, + void **kaddr, unsigned long *pfn, long size) +{ + struct pmem_device *pmem = bdev->bd_disk->private_data; + size_t offset = sector << 9; + + if (!pmem) + return -ENODEV; + + *kaddr = pmem->virt_addr + offset; + *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; + + return pmem->size - offset; +} + +static const struct block_device_operations pmem_fops = { + .owner = THIS_MODULE, + .rw_page = pmem_rw_page, + .direct_access = pmem_direct_access, +}; + +static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) +{ + struct pmem_device *pmem; + struct gendisk *disk; + int idx, err; + + err = -ENOMEM; + pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); + if (!pmem) + goto out; + + pmem->phys_addr = res->start; + pmem->size = resource_size(res); + + err = -EINVAL; + if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { + dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); + goto out_free_dev; + } + + /* + * Map the memory as non-cachable, as we can't write back the contents + * of the CPU caches in case of a crash. + */ + err = -ENOMEM; + pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); + if (!pmem->virt_addr) + goto out_release_region; + + pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); + if (!pmem->pmem_queue) + goto out_unmap; + + blk_queue_make_request(pmem->pmem_queue, pmem_make_request); + blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); + blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); + + disk = alloc_disk(PMEM_MINORS); + if (!disk) + goto out_free_queue; + + idx = atomic_inc_return(&pmem_index) - 1; + + disk->major = pmem_major; + disk->first_minor = PMEM_MINORS * idx; + disk->fops = &pmem_fops; + disk->private_data = pmem; + disk->queue = pmem->pmem_queue; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "pmem%d", idx); + disk->driverfs_dev = dev; + set_capacity(disk, pmem->size >> 9); + pmem->pmem_disk = disk; + + add_disk(disk); + + return pmem; + +out_free_queue: + blk_cleanup_queue(pmem->pmem_queue); +out_unmap: + iounmap(pmem->virt_addr); +out_release_region: + release_mem_region(pmem->phys_addr, pmem->size); +out_free_dev: + kfree(pmem); +out: + return ERR_PTR(err); +} + +static void pmem_free(struct pmem_device *pmem) +{ + del_gendisk(pmem->pmem_disk); + put_disk(pmem->pmem_disk); + blk_cleanup_queue(pmem->pmem_queue); + iounmap(pmem->virt_addr); + release_mem_region(pmem->phys_addr, pmem->size); + kfree(pmem); +} + +static int pmem_probe(struct platform_device *pdev) +{ + struct pmem_device *pmem; + struct resource *res; + + if (WARN_ON(pdev->num_resources > 1)) + return -ENXIO; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!res) + return -ENXIO; + + pmem = pmem_alloc(&pdev->dev, res); + if (IS_ERR(pmem)) + return PTR_ERR(pmem); + + platform_set_drvdata(pdev, pmem); + + return 0; +} + +static int pmem_remove(struct platform_device *pdev) +{ + struct pmem_device *pmem = platform_get_drvdata(pdev); + + pmem_free(pmem); + return 0; +} + +static struct platform_driver pmem_driver = { + .probe = pmem_probe, + .remove = pmem_remove, + .driver = { + .owner = THIS_MODULE, + .name = "pmem", + }, +}; + +static int __init pmem_init(void) +{ + int error; + + pmem_major = register_blkdev(0, "pmem"); + if (pmem_major < 0) + return pmem_major; + + error = platform_driver_register(&pmem_driver); + if (error) + unregister_blkdev(pmem_major, "pmem"); + return error; +} +module_init(pmem_init); + +static void pmem_exit(void) +{ + platform_driver_unregister(&pmem_driver); + unregister_blkdev(pmem_major, "pmem"); +} +module_exit(pmem_exit); + +MODULE_AUTHOR("Ross Zwisler "); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/block/ps3disk.c b/kernel/drivers/block/ps3disk.c new file mode 100644 index 000000000..c120d70d3 --- /dev/null +++ b/kernel/drivers/block/ps3disk.c @@ -0,0 +1,589 @@ +/* + * PS3 Disk Storage Driver + * + * Copyright (C) 2007 Sony Computer Entertainment Inc. + * Copyright 2007 Sony Corp. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published + * by the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include + +#include +#include +#include + + +#define DEVICE_NAME "ps3disk" + +#define BOUNCE_SIZE (64*1024) + +#define PS3DISK_MAX_DISKS 16 +#define PS3DISK_MINORS 16 + + +#define PS3DISK_NAME "ps3d%c" + + +struct ps3disk_private { + spinlock_t lock; /* Request queue spinlock */ + struct request_queue *queue; + struct gendisk *gendisk; + unsigned int blocking_factor; + struct request *req; + u64 raw_capacity; + unsigned char model[ATA_ID_PROD_LEN+1]; +}; + + +#define LV1_STORAGE_SEND_ATA_COMMAND (2) +#define LV1_STORAGE_ATA_HDDOUT (0x23) + +struct lv1_ata_cmnd_block { + u16 features; + u16 sector_count; + u16 LBA_low; + u16 LBA_mid; + u16 LBA_high; + u8 device; + u8 command; + u32 is_ext; + u32 proto; + u32 in_out; + u32 size; + u64 buffer; + u32 arglen; +}; + +enum lv1_ata_proto { + NON_DATA_PROTO = 0, + PIO_DATA_IN_PROTO = 1, + PIO_DATA_OUT_PROTO = 2, + DMA_PROTO = 3 +}; + +enum lv1_ata_in_out { + DIR_WRITE = 0, /* memory -> device */ + DIR_READ = 1 /* device -> memory */ +}; + +static int ps3disk_major; + + +static const struct block_device_operations ps3disk_fops = { + .owner = THIS_MODULE, +}; + + +static void ps3disk_scatter_gather(struct ps3_storage_device *dev, + struct request *req, int gather) +{ + unsigned int offset = 0; + struct req_iterator iter; + struct bio_vec bvec; + unsigned int i = 0; + size_t size; + void *buf; + + rq_for_each_segment(bvec, req, iter) { + unsigned long flags; + dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n", + __func__, __LINE__, i, bio_sectors(iter.bio), + iter.bio->bi_iter.bi_sector); + + size = bvec.bv_len; + buf = bvec_kmap_irq(&bvec, &flags); + if (gather) + memcpy(dev->bounce_buf+offset, buf, size); + else + memcpy(buf, dev->bounce_buf+offset, size); + offset += size; + flush_kernel_dcache_page(bvec.bv_page); + bvec_kunmap_irq(buf, &flags); + i++; + } +} + +static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, + struct request *req) +{ + struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); + int write = rq_data_dir(req), res; + const char *op = write ? "write" : "read"; + u64 start_sector, sectors; + unsigned int region_id = dev->regions[dev->region_idx].id; + +#ifdef DEBUG + unsigned int n = 0; + struct bio_vec bv; + struct req_iterator iter; + + rq_for_each_segment(bv, req, iter) + n++; + dev_dbg(&dev->sbd.core, + "%s:%u: %s req has %u bvecs for %u sectors\n", + __func__, __LINE__, op, n, blk_rq_sectors(req)); +#endif + + start_sector = blk_rq_pos(req) * priv->blocking_factor; + sectors = blk_rq_sectors(req) * priv->blocking_factor; + dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n", + __func__, __LINE__, op, sectors, start_sector); + + if (write) { + ps3disk_scatter_gather(dev, req, 1); + + res = lv1_storage_write(dev->sbd.dev_id, region_id, + start_sector, sectors, 0, + dev->bounce_lpar, &dev->tag); + } else { + res = lv1_storage_read(dev->sbd.dev_id, region_id, + start_sector, sectors, 0, + dev->bounce_lpar, &dev->tag); + } + if (res) { + dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, + __LINE__, op, res); + __blk_end_request_all(req, -EIO); + return 0; + } + + priv->req = req; + return 1; +} + +static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, + struct request *req) +{ + struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); + u64 res; + + dev_dbg(&dev->sbd.core, "%s:%u: flush request\n", __func__, __LINE__); + + res = lv1_storage_send_device_command(dev->sbd.dev_id, + LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, + 0, &dev->tag); + if (res) { + dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", + __func__, __LINE__, res); + __blk_end_request_all(req, -EIO); + return 0; + } + + priv->req = req; + return 1; +} + +static void ps3disk_do_request(struct ps3_storage_device *dev, + struct request_queue *q) +{ + struct request *req; + + dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); + + while ((req = blk_fetch_request(q))) { + if (req->cmd_flags & REQ_FLUSH) { + if (ps3disk_submit_flush_request(dev, req)) + break; + } else if (req->cmd_type == REQ_TYPE_FS) { + if (ps3disk_submit_request_sg(dev, req)) + break; + } else { + blk_dump_rq_flags(req, DEVICE_NAME " bad request"); + __blk_end_request_all(req, -EIO); + continue; + } + } +} + +static void ps3disk_request(struct request_queue *q) +{ + struct ps3_storage_device *dev = q->queuedata; + struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); + + if (priv->req) { + dev_dbg(&dev->sbd.core, "%s:%u busy\n", __func__, __LINE__); + return; + } + + ps3disk_do_request(dev, q); +} + +static irqreturn_t ps3disk_interrupt(int irq, void *data) +{ + struct ps3_storage_device *dev = data; + struct ps3disk_private *priv; + struct request *req; + int res, read, error; + u64 tag, status; + const char *op; + + res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status); + + if (tag != dev->tag) + dev_err(&dev->sbd.core, + "%s:%u: tag mismatch, got %llx, expected %llx\n", + __func__, __LINE__, tag, dev->tag); + + if (res) { + dev_err(&dev->sbd.core, "%s:%u: res=%d status=0x%llx\n", + __func__, __LINE__, res, status); + return IRQ_HANDLED; + } + + priv = ps3_system_bus_get_drvdata(&dev->sbd); + req = priv->req; + if (!req) { + dev_dbg(&dev->sbd.core, + "%s:%u non-block layer request completed\n", __func__, + __LINE__); + dev->lv1_status = status; + complete(&dev->done); + return IRQ_HANDLED; + } + + if (req->cmd_flags & REQ_FLUSH) { + read = 0; + op = "flush"; + } else { + read = !rq_data_dir(req); + op = read ? "read" : "write"; + } + if (status) { + dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__, + __LINE__, op, status); + error = -EIO; + } else { + dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__, + __LINE__, op); + error = 0; + if (read) + ps3disk_scatter_gather(dev, req, 0); + } + + spin_lock(&priv->lock); + __blk_end_request_all(req, error); + priv->req = NULL; + ps3disk_do_request(dev, priv->queue); + spin_unlock(&priv->lock); + + return IRQ_HANDLED; +} + +static int ps3disk_sync_cache(struct ps3_storage_device *dev) +{ + u64 res; + + dev_dbg(&dev->sbd.core, "%s:%u: sync cache\n", __func__, __LINE__); + + res = ps3stor_send_command(dev, LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, 0); + if (res) { + dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", + __func__, __LINE__, res); + return -EIO; + } + return 0; +} + + +/* ATA helpers copied from drivers/ata/libata-core.c */ + +static void swap_buf_le16(u16 *buf, unsigned int buf_words) +{ +#ifdef __BIG_ENDIAN + unsigned int i; + + for (i = 0; i < buf_words; i++) + buf[i] = le16_to_cpu(buf[i]); +#endif /* __BIG_ENDIAN */ +} + +static u64 ata_id_n_sectors(const u16 *id) +{ + if (ata_id_has_lba(id)) { + if (ata_id_has_lba48(id)) + return ata_id_u64(id, 100); + else + return ata_id_u32(id, 60); + } else { + if (ata_id_current_chs_valid(id)) + return ata_id_u32(id, 57); + else + return id[1] * id[3] * id[6]; + } +} + +static void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs, + unsigned int len) +{ + unsigned int c; + + while (len > 0) { + c = id[ofs] >> 8; + *s = c; + s++; + + c = id[ofs] & 0xff; + *s = c; + s++; + + ofs++; + len -= 2; + } +} + +static void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs, + unsigned int len) +{ + unsigned char *p; + + WARN_ON(!(len & 1)); + + ata_id_string(id, s, ofs, len - 1); + + p = s + strnlen(s, len - 1); + while (p > s && p[-1] == ' ') + p--; + *p = '\0'; +} + +static int ps3disk_identify(struct ps3_storage_device *dev) +{ + struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); + struct lv1_ata_cmnd_block ata_cmnd; + u16 *id = dev->bounce_buf; + u64 res; + + dev_dbg(&dev->sbd.core, "%s:%u: identify disk\n", __func__, __LINE__); + + memset(&ata_cmnd, 0, sizeof(struct lv1_ata_cmnd_block)); + ata_cmnd.command = ATA_CMD_ID_ATA; + ata_cmnd.sector_count = 1; + ata_cmnd.size = ata_cmnd.arglen = ATA_ID_WORDS * 2; + ata_cmnd.buffer = dev->bounce_lpar; + ata_cmnd.proto = PIO_DATA_IN_PROTO; + ata_cmnd.in_out = DIR_READ; + + res = ps3stor_send_command(dev, LV1_STORAGE_SEND_ATA_COMMAND, + ps3_mm_phys_to_lpar(__pa(&ata_cmnd)), + sizeof(ata_cmnd), ata_cmnd.buffer, + ata_cmnd.arglen); + if (res) { + dev_err(&dev->sbd.core, "%s:%u: identify disk failed 0x%llx\n", + __func__, __LINE__, res); + return -EIO; + } + + swap_buf_le16(id, ATA_ID_WORDS); + + /* All we're interested in are raw capacity and model name */ + priv->raw_capacity = ata_id_n_sectors(id); + ata_id_c_string(id, priv->model, ATA_ID_PROD, sizeof(priv->model)); + return 0; +} + +static unsigned long ps3disk_mask; + +static DEFINE_MUTEX(ps3disk_mask_mutex); + +static int ps3disk_probe(struct ps3_system_bus_device *_dev) +{ + struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core); + struct ps3disk_private *priv; + int error; + unsigned int devidx; + struct request_queue *queue; + struct gendisk *gendisk; + + if (dev->blk_size < 512) { + dev_err(&dev->sbd.core, + "%s:%u: cannot handle block size %llu\n", __func__, + __LINE__, dev->blk_size); + return -EINVAL; + } + + BUILD_BUG_ON(PS3DISK_MAX_DISKS > BITS_PER_LONG); + mutex_lock(&ps3disk_mask_mutex); + devidx = find_first_zero_bit(&ps3disk_mask, PS3DISK_MAX_DISKS); + if (devidx >= PS3DISK_MAX_DISKS) { + dev_err(&dev->sbd.core, "%s:%u: Too many disks\n", __func__, + __LINE__); + mutex_unlock(&ps3disk_mask_mutex); + return -ENOSPC; + } + __set_bit(devidx, &ps3disk_mask); + mutex_unlock(&ps3disk_mask_mutex); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + error = -ENOMEM; + goto fail; + } + + ps3_system_bus_set_drvdata(_dev, priv); + spin_lock_init(&priv->lock); + + dev->bounce_size = BOUNCE_SIZE; + dev->bounce_buf = kmalloc(BOUNCE_SIZE, GFP_DMA); + if (!dev->bounce_buf) { + error = -ENOMEM; + goto fail_free_priv; + } + + error = ps3stor_setup(dev, ps3disk_interrupt); + if (error) + goto fail_free_bounce; + + ps3disk_identify(dev); + + queue = blk_init_queue(ps3disk_request, &priv->lock); + if (!queue) { + dev_err(&dev->sbd.core, "%s:%u: blk_init_queue failed\n", + __func__, __LINE__); + error = -ENOMEM; + goto fail_teardown; + } + + priv->queue = queue; + queue->queuedata = dev; + + blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH); + + blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); + blk_queue_segment_boundary(queue, -1UL); + blk_queue_dma_alignment(queue, dev->blk_size-1); + blk_queue_logical_block_size(queue, dev->blk_size); + + blk_queue_flush(queue, REQ_FLUSH); + + blk_queue_max_segments(queue, -1); + blk_queue_max_segment_size(queue, dev->bounce_size); + + gendisk = alloc_disk(PS3DISK_MINORS); + if (!gendisk) { + dev_err(&dev->sbd.core, "%s:%u: alloc_disk failed\n", __func__, + __LINE__); + error = -ENOMEM; + goto fail_cleanup_queue; + } + + priv->gendisk = gendisk; + gendisk->major = ps3disk_major; + gendisk->first_minor = devidx * PS3DISK_MINORS; + gendisk->fops = &ps3disk_fops; + gendisk->queue = queue; + gendisk->private_data = dev; + gendisk->driverfs_dev = &dev->sbd.core; + snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, + devidx+'a'); + priv->blocking_factor = dev->blk_size >> 9; + set_capacity(gendisk, + dev->regions[dev->region_idx].size*priv->blocking_factor); + + dev_info(&dev->sbd.core, + "%s is a %s (%llu MiB total, %lu MiB for OtherOS)\n", + gendisk->disk_name, priv->model, priv->raw_capacity >> 11, + get_capacity(gendisk) >> 11); + + add_disk(gendisk); + return 0; + +fail_cleanup_queue: + blk_cleanup_queue(queue); +fail_teardown: + ps3stor_teardown(dev); +fail_free_bounce: + kfree(dev->bounce_buf); +fail_free_priv: + kfree(priv); + ps3_system_bus_set_drvdata(_dev, NULL); +fail: + mutex_lock(&ps3disk_mask_mutex); + __clear_bit(devidx, &ps3disk_mask); + mutex_unlock(&ps3disk_mask_mutex); + return error; +} + +static int ps3disk_remove(struct ps3_system_bus_device *_dev) +{ + struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core); + struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); + + mutex_lock(&ps3disk_mask_mutex); + __clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS, + &ps3disk_mask); + mutex_unlock(&ps3disk_mask_mutex); + del_gendisk(priv->gendisk); + blk_cleanup_queue(priv->queue); + put_disk(priv->gendisk); + dev_notice(&dev->sbd.core, "Synchronizing disk cache\n"); + ps3disk_sync_cache(dev); + ps3stor_teardown(dev); + kfree(dev->bounce_buf); + kfree(priv); + ps3_system_bus_set_drvdata(_dev, NULL); + return 0; +} + +static struct ps3_system_bus_driver ps3disk = { + .match_id = PS3_MATCH_ID_STOR_DISK, + .core.name = DEVICE_NAME, + .core.owner = THIS_MODULE, + .probe = ps3disk_probe, + .remove = ps3disk_remove, + .shutdown = ps3disk_remove, +}; + + +static int __init ps3disk_init(void) +{ + int error; + + if (!firmware_has_feature(FW_FEATURE_PS3_LV1)) + return -ENODEV; + + error = register_blkdev(0, DEVICE_NAME); + if (error <= 0) { + printk(KERN_ERR "%s:%u: register_blkdev failed %d\n", __func__, + __LINE__, error); + return error; + } + ps3disk_major = error; + + pr_info("%s:%u: registered block device major %d\n", __func__, + __LINE__, ps3disk_major); + + error = ps3_system_bus_driver_register(&ps3disk); + if (error) + unregister_blkdev(ps3disk_major, DEVICE_NAME); + + return error; +} + +static void __exit ps3disk_exit(void) +{ + ps3_system_bus_driver_unregister(&ps3disk); + unregister_blkdev(ps3disk_major, DEVICE_NAME); +} + +module_init(ps3disk_init); +module_exit(ps3disk_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("PS3 Disk Storage Driver"); +MODULE_AUTHOR("Sony Corporation"); +MODULE_ALIAS(PS3_MODULE_ALIAS_STOR_DISK); diff --git a/kernel/drivers/block/ps3vram.c b/kernel/drivers/block/ps3vram.c new file mode 100644 index 000000000..ef45cfb98 --- /dev/null +++ b/kernel/drivers/block/ps3vram.c @@ -0,0 +1,878 @@ +/* + * ps3vram - Use extra PS3 video ram as MTD block device. + * + * Copyright 2009 Sony Corporation + * + * Based on the MTD ps3vram driver, which is + * Copyright (c) 2007-2008 Jim Paris + * Added support RSX DMA Vivien Chappelier + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#define DEVICE_NAME "ps3vram" + + +#define XDR_BUF_SIZE (2 * 1024 * 1024) /* XDR buffer (must be 1MiB aligned) */ +#define XDR_IOIF 0x0c000000 + +#define FIFO_BASE XDR_IOIF +#define FIFO_SIZE (64 * 1024) + +#define DMA_PAGE_SIZE (4 * 1024) + +#define CACHE_PAGE_SIZE (256 * 1024) +#define CACHE_PAGE_COUNT ((XDR_BUF_SIZE - FIFO_SIZE) / CACHE_PAGE_SIZE) + +#define CACHE_OFFSET CACHE_PAGE_SIZE +#define FIFO_OFFSET 0 + +#define CTRL_PUT 0x10 +#define CTRL_GET 0x11 +#define CTRL_TOP 0x15 + +#define UPLOAD_SUBCH 1 +#define DOWNLOAD_SUBCH 2 + +#define NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN 0x0000030c +#define NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY 0x00000104 + +#define CACHE_PAGE_PRESENT 1 +#define CACHE_PAGE_DIRTY 2 + +struct ps3vram_tag { + unsigned int address; + unsigned int flags; +}; + +struct ps3vram_cache { + unsigned int page_count; + unsigned int page_size; + struct ps3vram_tag *tags; + unsigned int hit; + unsigned int miss; +}; + +struct ps3vram_priv { + struct request_queue *queue; + struct gendisk *gendisk; + + u64 size; + + u64 memory_handle; + u64 context_handle; + u32 *ctrl; + void *reports; + u8 *xdr_buf; + + u32 *fifo_base; + u32 *fifo_ptr; + + struct ps3vram_cache cache; + + spinlock_t lock; /* protecting list of bios */ + struct bio_list list; +}; + + +static int ps3vram_major; + + +static const struct block_device_operations ps3vram_fops = { + .owner = THIS_MODULE, +}; + + +#define DMA_NOTIFIER_HANDLE_BASE 0x66604200 /* first DMA notifier handle */ +#define DMA_NOTIFIER_OFFSET_BASE 0x1000 /* first DMA notifier offset */ +#define DMA_NOTIFIER_SIZE 0x40 +#define NOTIFIER 7 /* notifier used for completion report */ + +static char *size = "256M"; +module_param(size, charp, 0); +MODULE_PARM_DESC(size, "memory size"); + +static u32 *ps3vram_get_notifier(void *reports, int notifier) +{ + return reports + DMA_NOTIFIER_OFFSET_BASE + + DMA_NOTIFIER_SIZE * notifier; +} + +static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); + int i; + + for (i = 0; i < 4; i++) + notify[i] = 0xffffffff; +} + +static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, + unsigned int timeout_ms) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); + unsigned long timeout; + + for (timeout = 20; timeout; timeout--) { + if (!notify[3]) + return 0; + udelay(10); + } + + timeout = jiffies + msecs_to_jiffies(timeout_ms); + + do { + if (!notify[3]) + return 0; + msleep(1); + } while (time_before(jiffies, timeout)); + + return -ETIMEDOUT; +} + +static void ps3vram_init_ring(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET; + priv->ctrl[CTRL_GET] = FIFO_BASE + FIFO_OFFSET; +} + +static int ps3vram_wait_ring(struct ps3_system_bus_device *dev, + unsigned int timeout_ms) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); + + do { + if (priv->ctrl[CTRL_PUT] == priv->ctrl[CTRL_GET]) + return 0; + msleep(1); + } while (time_before(jiffies, timeout)); + + dev_warn(&dev->core, "FIFO timeout (%08x/%08x/%08x)\n", + priv->ctrl[CTRL_PUT], priv->ctrl[CTRL_GET], + priv->ctrl[CTRL_TOP]); + + return -ETIMEDOUT; +} + +static void ps3vram_out_ring(struct ps3vram_priv *priv, u32 data) +{ + *(priv->fifo_ptr)++ = data; +} + +static void ps3vram_begin_ring(struct ps3vram_priv *priv, u32 chan, u32 tag, + u32 size) +{ + ps3vram_out_ring(priv, (size << 18) | (chan << 13) | tag); +} + +static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + int status; + + ps3vram_out_ring(priv, 0x20000000 | (FIFO_BASE + FIFO_OFFSET)); + + priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET; + + /* asking the HV for a blit will kick the FIFO */ + status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0); + if (status) + dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n", + __func__, status); + + priv->fifo_ptr = priv->fifo_base; +} + +static void ps3vram_fire_ring(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + int status; + + mutex_lock(&ps3_gpu_mutex); + + priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET + + (priv->fifo_ptr - priv->fifo_base) * sizeof(u32); + + /* asking the HV for a blit will kick the FIFO */ + status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0); + if (status) + dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n", + __func__, status); + + if ((priv->fifo_ptr - priv->fifo_base) * sizeof(u32) > + FIFO_SIZE - 1024) { + dev_dbg(&dev->core, "FIFO full, rewinding\n"); + ps3vram_wait_ring(dev, 200); + ps3vram_rewind_ring(dev); + } + + mutex_unlock(&ps3_gpu_mutex); +} + +static void ps3vram_bind(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0, 1); + ps3vram_out_ring(priv, 0x31337303); + ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0x180, 3); + ps3vram_out_ring(priv, DMA_NOTIFIER_HANDLE_BASE + NOTIFIER); + ps3vram_out_ring(priv, 0xfeed0001); /* DMA system RAM instance */ + ps3vram_out_ring(priv, 0xfeed0000); /* DMA video RAM instance */ + + ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0, 1); + ps3vram_out_ring(priv, 0x3137c0de); + ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0x180, 3); + ps3vram_out_ring(priv, DMA_NOTIFIER_HANDLE_BASE + NOTIFIER); + ps3vram_out_ring(priv, 0xfeed0000); /* DMA video RAM instance */ + ps3vram_out_ring(priv, 0xfeed0001); /* DMA system RAM instance */ + + ps3vram_fire_ring(dev); +} + +static int ps3vram_upload(struct ps3_system_bus_device *dev, + unsigned int src_offset, unsigned int dst_offset, + int len, int count) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + ps3vram_begin_ring(priv, UPLOAD_SUBCH, + NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); + ps3vram_out_ring(priv, XDR_IOIF + src_offset); + ps3vram_out_ring(priv, dst_offset); + ps3vram_out_ring(priv, len); + ps3vram_out_ring(priv, len); + ps3vram_out_ring(priv, len); + ps3vram_out_ring(priv, count); + ps3vram_out_ring(priv, (1 << 8) | 1); + ps3vram_out_ring(priv, 0); + + ps3vram_notifier_reset(dev); + ps3vram_begin_ring(priv, UPLOAD_SUBCH, + NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY, 1); + ps3vram_out_ring(priv, 0); + ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0x100, 1); + ps3vram_out_ring(priv, 0); + ps3vram_fire_ring(dev); + if (ps3vram_notifier_wait(dev, 200) < 0) { + dev_warn(&dev->core, "%s: Notifier timeout\n", __func__); + return -1; + } + + return 0; +} + +static int ps3vram_download(struct ps3_system_bus_device *dev, + unsigned int src_offset, unsigned int dst_offset, + int len, int count) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, + NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8); + ps3vram_out_ring(priv, src_offset); + ps3vram_out_ring(priv, XDR_IOIF + dst_offset); + ps3vram_out_ring(priv, len); + ps3vram_out_ring(priv, len); + ps3vram_out_ring(priv, len); + ps3vram_out_ring(priv, count); + ps3vram_out_ring(priv, (1 << 8) | 1); + ps3vram_out_ring(priv, 0); + + ps3vram_notifier_reset(dev); + ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, + NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY, 1); + ps3vram_out_ring(priv, 0); + ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0x100, 1); + ps3vram_out_ring(priv, 0); + ps3vram_fire_ring(dev); + if (ps3vram_notifier_wait(dev, 200) < 0) { + dev_warn(&dev->core, "%s: Notifier timeout\n", __func__); + return -1; + } + + return 0; +} + +static void ps3vram_cache_evict(struct ps3_system_bus_device *dev, int entry) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + struct ps3vram_cache *cache = &priv->cache; + + if (!(cache->tags[entry].flags & CACHE_PAGE_DIRTY)) + return; + + dev_dbg(&dev->core, "Flushing %d: 0x%08x\n", entry, + cache->tags[entry].address); + if (ps3vram_upload(dev, CACHE_OFFSET + entry * cache->page_size, + cache->tags[entry].address, DMA_PAGE_SIZE, + cache->page_size / DMA_PAGE_SIZE) < 0) { + dev_err(&dev->core, + "Failed to upload from 0x%x to " "0x%x size 0x%x\n", + entry * cache->page_size, cache->tags[entry].address, + cache->page_size); + } + cache->tags[entry].flags &= ~CACHE_PAGE_DIRTY; +} + +static void ps3vram_cache_load(struct ps3_system_bus_device *dev, int entry, + unsigned int address) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + struct ps3vram_cache *cache = &priv->cache; + + dev_dbg(&dev->core, "Fetching %d: 0x%08x\n", entry, address); + if (ps3vram_download(dev, address, + CACHE_OFFSET + entry * cache->page_size, + DMA_PAGE_SIZE, + cache->page_size / DMA_PAGE_SIZE) < 0) { + dev_err(&dev->core, + "Failed to download from 0x%x to 0x%x size 0x%x\n", + address, entry * cache->page_size, cache->page_size); + } + + cache->tags[entry].address = address; + cache->tags[entry].flags |= CACHE_PAGE_PRESENT; +} + + +static void ps3vram_cache_flush(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + struct ps3vram_cache *cache = &priv->cache; + int i; + + dev_dbg(&dev->core, "FLUSH\n"); + for (i = 0; i < cache->page_count; i++) { + ps3vram_cache_evict(dev, i); + cache->tags[i].flags = 0; + } +} + +static unsigned int ps3vram_cache_match(struct ps3_system_bus_device *dev, + loff_t address) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + struct ps3vram_cache *cache = &priv->cache; + unsigned int base; + unsigned int offset; + int i; + static int counter; + + offset = (unsigned int) (address & (cache->page_size - 1)); + base = (unsigned int) (address - offset); + + /* fully associative check */ + for (i = 0; i < cache->page_count; i++) { + if ((cache->tags[i].flags & CACHE_PAGE_PRESENT) && + cache->tags[i].address == base) { + cache->hit++; + dev_dbg(&dev->core, "Found entry %d: 0x%08x\n", i, + cache->tags[i].address); + return i; + } + } + + /* choose a random entry */ + i = (jiffies + (counter++)) % cache->page_count; + dev_dbg(&dev->core, "Using entry %d\n", i); + + ps3vram_cache_evict(dev, i); + ps3vram_cache_load(dev, i, base); + + cache->miss++; + return i; +} + +static int ps3vram_cache_init(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + priv->cache.page_count = CACHE_PAGE_COUNT; + priv->cache.page_size = CACHE_PAGE_SIZE; + priv->cache.tags = kzalloc(sizeof(struct ps3vram_tag) * + CACHE_PAGE_COUNT, GFP_KERNEL); + if (priv->cache.tags == NULL) { + dev_err(&dev->core, "Could not allocate cache tags\n"); + return -ENOMEM; + } + + dev_info(&dev->core, "Created ram cache: %d entries, %d KiB each\n", + CACHE_PAGE_COUNT, CACHE_PAGE_SIZE / 1024); + + return 0; +} + +static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + ps3vram_cache_flush(dev); + kfree(priv->cache.tags); +} + +static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from, + size_t len, size_t *retlen, u_char *buf) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + unsigned int cached, count; + + dev_dbg(&dev->core, "%s: from=0x%08x len=0x%zx\n", __func__, + (unsigned int)from, len); + + if (from >= priv->size) + return -EIO; + + if (len > priv->size - from) + len = priv->size - from; + + /* Copy from vram to buf */ + count = len; + while (count) { + unsigned int offset, avail; + unsigned int entry; + + offset = (unsigned int) (from & (priv->cache.page_size - 1)); + avail = priv->cache.page_size - offset; + + entry = ps3vram_cache_match(dev, from); + cached = CACHE_OFFSET + entry * priv->cache.page_size + offset; + + dev_dbg(&dev->core, "%s: from=%08x cached=%08x offset=%08x " + "avail=%08x count=%08x\n", __func__, + (unsigned int)from, cached, offset, avail, count); + + if (avail > count) + avail = count; + memcpy(buf, priv->xdr_buf + cached, avail); + + buf += avail; + count -= avail; + from += avail; + } + + *retlen = len; + return 0; +} + +static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to, + size_t len, size_t *retlen, const u_char *buf) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + unsigned int cached, count; + + if (to >= priv->size) + return -EIO; + + if (len > priv->size - to) + len = priv->size - to; + + /* Copy from buf to vram */ + count = len; + while (count) { + unsigned int offset, avail; + unsigned int entry; + + offset = (unsigned int) (to & (priv->cache.page_size - 1)); + avail = priv->cache.page_size - offset; + + entry = ps3vram_cache_match(dev, to); + cached = CACHE_OFFSET + entry * priv->cache.page_size + offset; + + dev_dbg(&dev->core, "%s: to=%08x cached=%08x offset=%08x " + "avail=%08x count=%08x\n", __func__, (unsigned int)to, + cached, offset, avail, count); + + if (avail > count) + avail = count; + memcpy(priv->xdr_buf + cached, buf, avail); + + priv->cache.tags[entry].flags |= CACHE_PAGE_DIRTY; + + buf += avail; + count -= avail; + to += avail; + } + + *retlen = len; + return 0; +} + +static int ps3vram_proc_show(struct seq_file *m, void *v) +{ + struct ps3vram_priv *priv = m->private; + + seq_printf(m, "hit:%u\nmiss:%u\n", priv->cache.hit, priv->cache.miss); + return 0; +} + +static int ps3vram_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ps3vram_proc_show, PDE_DATA(inode)); +} + +static const struct file_operations ps3vram_proc_fops = { + .owner = THIS_MODULE, + .open = ps3vram_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void ps3vram_proc_init(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + struct proc_dir_entry *pde; + + pde = proc_create_data(DEVICE_NAME, 0444, NULL, &ps3vram_proc_fops, + priv); + if (!pde) + dev_warn(&dev->core, "failed to create /proc entry\n"); +} + +static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev, + struct bio *bio) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + int write = bio_data_dir(bio) == WRITE; + const char *op = write ? "write" : "read"; + loff_t offset = bio->bi_iter.bi_sector << 9; + int error = 0; + struct bio_vec bvec; + struct bvec_iter iter; + struct bio *next; + + bio_for_each_segment(bvec, bio, iter) { + /* PS3 is ppc64, so we don't handle highmem */ + char *ptr = page_address(bvec.bv_page) + bvec.bv_offset; + size_t len = bvec.bv_len, retlen; + + dev_dbg(&dev->core, " %s %zu bytes at offset %llu\n", op, + len, offset); + if (write) + error = ps3vram_write(dev, offset, len, &retlen, ptr); + else + error = ps3vram_read(dev, offset, len, &retlen, ptr); + + if (error) { + dev_err(&dev->core, "%s failed\n", op); + goto out; + } + + if (retlen != len) { + dev_err(&dev->core, "Short %s\n", op); + error = -EIO; + goto out; + } + + offset += len; + } + + dev_dbg(&dev->core, "%s completed\n", op); + +out: + spin_lock_irq(&priv->lock); + bio_list_pop(&priv->list); + next = bio_list_peek(&priv->list); + spin_unlock_irq(&priv->lock); + + bio_endio(bio, error); + return next; +} + +static void ps3vram_make_request(struct request_queue *q, struct bio *bio) +{ + struct ps3_system_bus_device *dev = q->queuedata; + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + int busy; + + dev_dbg(&dev->core, "%s\n", __func__); + + spin_lock_irq(&priv->lock); + busy = !bio_list_empty(&priv->list); + bio_list_add(&priv->list, bio); + spin_unlock_irq(&priv->lock); + + if (busy) + return; + + do { + bio = ps3vram_do_bio(dev, bio); + } while (bio); +} + +static int ps3vram_probe(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv; + int error, status; + struct request_queue *queue; + struct gendisk *gendisk; + u64 ddr_size, ddr_lpar, ctrl_lpar, info_lpar, reports_lpar, + reports_size, xdr_lpar; + char *rest; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) { + error = -ENOMEM; + goto fail; + } + + spin_lock_init(&priv->lock); + bio_list_init(&priv->list); + ps3_system_bus_set_drvdata(dev, priv); + + /* Allocate XDR buffer (1MiB aligned) */ + priv->xdr_buf = (void *)__get_free_pages(GFP_KERNEL, + get_order(XDR_BUF_SIZE)); + if (priv->xdr_buf == NULL) { + dev_err(&dev->core, "Could not allocate XDR buffer\n"); + error = -ENOMEM; + goto fail_free_priv; + } + + /* Put FIFO at begginning of XDR buffer */ + priv->fifo_base = (u32 *) (priv->xdr_buf + FIFO_OFFSET); + priv->fifo_ptr = priv->fifo_base; + + /* XXX: Need to open GPU, in case ps3fb or snd_ps3 aren't loaded */ + if (ps3_open_hv_device(dev)) { + dev_err(&dev->core, "ps3_open_hv_device failed\n"); + error = -EAGAIN; + goto out_free_xdr_buf; + } + + /* Request memory */ + status = -1; + ddr_size = ALIGN(memparse(size, &rest), 1024*1024); + if (!ddr_size) { + dev_err(&dev->core, "Specified size is too small\n"); + error = -EINVAL; + goto out_close_gpu; + } + + while (ddr_size > 0) { + status = lv1_gpu_memory_allocate(ddr_size, 0, 0, 0, 0, + &priv->memory_handle, + &ddr_lpar); + if (!status) + break; + ddr_size -= 1024*1024; + } + if (status) { + dev_err(&dev->core, "lv1_gpu_memory_allocate failed %d\n", + status); + error = -ENOMEM; + goto out_close_gpu; + } + + /* Request context */ + status = lv1_gpu_context_allocate(priv->memory_handle, 0, + &priv->context_handle, &ctrl_lpar, + &info_lpar, &reports_lpar, + &reports_size); + if (status) { + dev_err(&dev->core, "lv1_gpu_context_allocate failed %d\n", + status); + error = -ENOMEM; + goto out_free_memory; + } + + /* Map XDR buffer to RSX */ + xdr_lpar = ps3_mm_phys_to_lpar(__pa(priv->xdr_buf)); + status = lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF, + xdr_lpar, XDR_BUF_SIZE, + CBE_IOPTE_PP_W | CBE_IOPTE_PP_R | + CBE_IOPTE_M); + if (status) { + dev_err(&dev->core, "lv1_gpu_context_iomap failed %d\n", + status); + error = -ENOMEM; + goto out_free_context; + } + + priv->ctrl = ioremap(ctrl_lpar, 64 * 1024); + if (!priv->ctrl) { + dev_err(&dev->core, "ioremap CTRL failed\n"); + error = -ENOMEM; + goto out_unmap_context; + } + + priv->reports = ioremap(reports_lpar, reports_size); + if (!priv->reports) { + dev_err(&dev->core, "ioremap REPORTS failed\n"); + error = -ENOMEM; + goto out_unmap_ctrl; + } + + mutex_lock(&ps3_gpu_mutex); + ps3vram_init_ring(dev); + mutex_unlock(&ps3_gpu_mutex); + + priv->size = ddr_size; + + ps3vram_bind(dev); + + mutex_lock(&ps3_gpu_mutex); + error = ps3vram_wait_ring(dev, 100); + mutex_unlock(&ps3_gpu_mutex); + if (error < 0) { + dev_err(&dev->core, "Failed to initialize channels\n"); + error = -ETIMEDOUT; + goto out_unmap_reports; + } + + ps3vram_cache_init(dev); + ps3vram_proc_init(dev); + + queue = blk_alloc_queue(GFP_KERNEL); + if (!queue) { + dev_err(&dev->core, "blk_alloc_queue failed\n"); + error = -ENOMEM; + goto out_cache_cleanup; + } + + priv->queue = queue; + queue->queuedata = dev; + blk_queue_make_request(queue, ps3vram_make_request); + blk_queue_max_segments(queue, BLK_MAX_SEGMENTS); + blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); + blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); + + gendisk = alloc_disk(1); + if (!gendisk) { + dev_err(&dev->core, "alloc_disk failed\n"); + error = -ENOMEM; + goto fail_cleanup_queue; + } + + priv->gendisk = gendisk; + gendisk->major = ps3vram_major; + gendisk->first_minor = 0; + gendisk->fops = &ps3vram_fops; + gendisk->queue = queue; + gendisk->private_data = dev; + gendisk->driverfs_dev = &dev->core; + strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); + set_capacity(gendisk, priv->size >> 9); + + dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", + gendisk->disk_name, get_capacity(gendisk) >> 11); + + add_disk(gendisk); + return 0; + +fail_cleanup_queue: + blk_cleanup_queue(queue); +out_cache_cleanup: + remove_proc_entry(DEVICE_NAME, NULL); + ps3vram_cache_cleanup(dev); +out_unmap_reports: + iounmap(priv->reports); +out_unmap_ctrl: + iounmap(priv->ctrl); +out_unmap_context: + lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF, xdr_lpar, + XDR_BUF_SIZE, CBE_IOPTE_M); +out_free_context: + lv1_gpu_context_free(priv->context_handle); +out_free_memory: + lv1_gpu_memory_free(priv->memory_handle); +out_close_gpu: + ps3_close_hv_device(dev); +out_free_xdr_buf: + free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE)); +fail_free_priv: + kfree(priv); + ps3_system_bus_set_drvdata(dev, NULL); +fail: + return error; +} + +static int ps3vram_remove(struct ps3_system_bus_device *dev) +{ + struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); + + del_gendisk(priv->gendisk); + put_disk(priv->gendisk); + blk_cleanup_queue(priv->queue); + remove_proc_entry(DEVICE_NAME, NULL); + ps3vram_cache_cleanup(dev); + iounmap(priv->reports); + iounmap(priv->ctrl); + lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF, + ps3_mm_phys_to_lpar(__pa(priv->xdr_buf)), + XDR_BUF_SIZE, CBE_IOPTE_M); + lv1_gpu_context_free(priv->context_handle); + lv1_gpu_memory_free(priv->memory_handle); + ps3_close_hv_device(dev); + free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE)); + kfree(priv); + ps3_system_bus_set_drvdata(dev, NULL); + return 0; +} + +static struct ps3_system_bus_driver ps3vram = { + .match_id = PS3_MATCH_ID_GPU, + .match_sub_id = PS3_MATCH_SUB_ID_GPU_RAMDISK, + .core.name = DEVICE_NAME, + .core.owner = THIS_MODULE, + .probe = ps3vram_probe, + .remove = ps3vram_remove, + .shutdown = ps3vram_remove, +}; + + +static int __init ps3vram_init(void) +{ + int error; + + if (!firmware_has_feature(FW_FEATURE_PS3_LV1)) + return -ENODEV; + + error = register_blkdev(0, DEVICE_NAME); + if (error <= 0) { + pr_err("%s: register_blkdev failed %d\n", DEVICE_NAME, error); + return error; + } + ps3vram_major = error; + + pr_info("%s: registered block device major %d\n", DEVICE_NAME, + ps3vram_major); + + error = ps3_system_bus_driver_register(&ps3vram); + if (error) + unregister_blkdev(ps3vram_major, DEVICE_NAME); + + return error; +} + +static void __exit ps3vram_exit(void) +{ + ps3_system_bus_driver_unregister(&ps3vram); + unregister_blkdev(ps3vram_major, DEVICE_NAME); +} + +module_init(ps3vram_init); +module_exit(ps3vram_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("PS3 Video RAM Storage Driver"); +MODULE_AUTHOR("Sony Corporation"); +MODULE_ALIAS(PS3_MODULE_ALIAS_GPU_RAMDISK); diff --git a/kernel/drivers/block/rbd.c b/kernel/drivers/block/rbd.c new file mode 100644 index 000000000..ec6c5c6e1 --- /dev/null +++ b/kernel/drivers/block/rbd.c @@ -0,0 +1,5752 @@ + +/* + rbd.c -- Export ceph rados objects as a Linux block device + + + based on drivers/block/osdblk.c: + + Copyright 2009 Red Hat, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + + + For usage instructions, please refer to: + + Documentation/ABI/testing/sysfs-bus-rbd + + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rbd_types.h" + +#define RBD_DEBUG /* Activate rbd_assert() calls */ + +/* + * The basic unit of block I/O is a sector. It is interpreted in a + * number of contexts in Linux (blk, bio, genhd), but the default is + * universally 512 bytes. These symbols are just slightly more + * meaningful than the bare numbers they represent. + */ +#define SECTOR_SHIFT 9 +#define SECTOR_SIZE (1ULL << SECTOR_SHIFT) + +/* + * Increment the given counter and return its updated value. + * If the counter is already 0 it will not be incremented. + * If the counter is already at its maximum value returns + * -EINVAL without updating it. + */ +static int atomic_inc_return_safe(atomic_t *v) +{ + unsigned int counter; + + counter = (unsigned int)__atomic_add_unless(v, 1, 0); + if (counter <= (unsigned int)INT_MAX) + return (int)counter; + + atomic_dec(v); + + return -EINVAL; +} + +/* Decrement the counter. Return the resulting value, or -EINVAL */ +static int atomic_dec_return_safe(atomic_t *v) +{ + int counter; + + counter = atomic_dec_return(v); + if (counter >= 0) + return counter; + + atomic_inc(v); + + return -EINVAL; +} + +#define RBD_DRV_NAME "rbd" + +#define RBD_MINORS_PER_MAJOR 256 +#define RBD_SINGLE_MAJOR_PART_SHIFT 4 + +#define RBD_SNAP_DEV_NAME_PREFIX "snap_" +#define RBD_MAX_SNAP_NAME_LEN \ + (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) + +#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */ + +#define RBD_SNAP_HEAD_NAME "-" + +#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */ + +/* This allows a single page to hold an image name sent by OSD */ +#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1) +#define RBD_IMAGE_ID_LEN_MAX 64 + +#define RBD_OBJ_PREFIX_LEN_MAX 64 + +/* Feature bits */ + +#define RBD_FEATURE_LAYERING (1<<0) +#define RBD_FEATURE_STRIPINGV2 (1<<1) +#define RBD_FEATURES_ALL \ + (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2) + +/* Features supported by this (client software) implementation. */ + +#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL) + +/* + * An RBD device name will be "rbd#", where the "rbd" comes from + * RBD_DRV_NAME above, and # is a unique integer identifier. + * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big + * enough to hold all possible device names. + */ +#define DEV_NAME_LEN 32 +#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1) + +/* + * block device image metadata (in-memory version) + */ +struct rbd_image_header { + /* These six fields never change for a given rbd image */ + char *object_prefix; + __u8 obj_order; + __u8 crypt_type; + __u8 comp_type; + u64 stripe_unit; + u64 stripe_count; + u64 features; /* Might be changeable someday? */ + + /* The remaining fields need to be updated occasionally */ + u64 image_size; + struct ceph_snap_context *snapc; + char *snap_names; /* format 1 only */ + u64 *snap_sizes; /* format 1 only */ +}; + +/* + * An rbd image specification. + * + * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely + * identify an image. Each rbd_dev structure includes a pointer to + * an rbd_spec structure that encapsulates this identity. + * + * Each of the id's in an rbd_spec has an associated name. For a + * user-mapped image, the names are supplied and the id's associated + * with them are looked up. For a layered image, a parent image is + * defined by the tuple, and the names are looked up. + * + * An rbd_dev structure contains a parent_spec pointer which is + * non-null if the image it represents is a child in a layered + * image. This pointer will refer to the rbd_spec structure used + * by the parent rbd_dev for its own identity (i.e., the structure + * is shared between the parent and child). + * + * Since these structures are populated once, during the discovery + * phase of image construction, they are effectively immutable so + * we make no effort to synchronize access to them. + * + * Note that code herein does not assume the image name is known (it + * could be a null pointer). + */ +struct rbd_spec { + u64 pool_id; + const char *pool_name; + + const char *image_id; + const char *image_name; + + u64 snap_id; + const char *snap_name; + + struct kref kref; +}; + +/* + * an instance of the client. multiple devices may share an rbd client. + */ +struct rbd_client { + struct ceph_client *client; + struct kref kref; + struct list_head node; +}; + +struct rbd_img_request; +typedef void (*rbd_img_callback_t)(struct rbd_img_request *); + +#define BAD_WHICH U32_MAX /* Good which or bad which, which? */ + +struct rbd_obj_request; +typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); + +enum obj_request_type { + OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES +}; + +enum obj_operation_type { + OBJ_OP_WRITE, + OBJ_OP_READ, + OBJ_OP_DISCARD, +}; + +enum obj_req_flags { + OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ + OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ + OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ + OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ +}; + +struct rbd_obj_request { + const char *object_name; + u64 offset; /* object start byte */ + u64 length; /* bytes from offset */ + unsigned long flags; + + /* + * An object request associated with an image will have its + * img_data flag set; a standalone object request will not. + * + * A standalone object request will have which == BAD_WHICH + * and a null obj_request pointer. + * + * An object request initiated in support of a layered image + * object (to check for its existence before a write) will + * have which == BAD_WHICH and a non-null obj_request pointer. + * + * Finally, an object request for rbd image data will have + * which != BAD_WHICH, and will have a non-null img_request + * pointer. The value of which will be in the range + * 0..(img_request->obj_request_count-1). + */ + union { + struct rbd_obj_request *obj_request; /* STAT op */ + struct { + struct rbd_img_request *img_request; + u64 img_offset; + /* links for img_request->obj_requests list */ + struct list_head links; + }; + }; + u32 which; /* posn image request list */ + + enum obj_request_type type; + union { + struct bio *bio_list; + struct { + struct page **pages; + u32 page_count; + }; + }; + struct page **copyup_pages; + u32 copyup_page_count; + + struct ceph_osd_request *osd_req; + + u64 xferred; /* bytes transferred */ + int result; + + rbd_obj_callback_t callback; + struct completion completion; + + struct kref kref; +}; + +enum img_req_flags { + IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ + IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ + IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ + IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ +}; + +struct rbd_img_request { + struct rbd_device *rbd_dev; + u64 offset; /* starting image byte offset */ + u64 length; /* byte count from offset */ + unsigned long flags; + union { + u64 snap_id; /* for reads */ + struct ceph_snap_context *snapc; /* for writes */ + }; + union { + struct request *rq; /* block request */ + struct rbd_obj_request *obj_request; /* obj req initiator */ + }; + struct page **copyup_pages; + u32 copyup_page_count; + spinlock_t completion_lock;/* protects next_completion */ + u32 next_completion; + rbd_img_callback_t callback; + u64 xferred;/* aggregate bytes transferred */ + int result; /* first nonzero obj_request result */ + + u32 obj_request_count; + struct list_head obj_requests; /* rbd_obj_request structs */ + + struct kref kref; +}; + +#define for_each_obj_request(ireq, oreq) \ + list_for_each_entry(oreq, &(ireq)->obj_requests, links) +#define for_each_obj_request_from(ireq, oreq) \ + list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) +#define for_each_obj_request_safe(ireq, oreq, n) \ + list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) + +struct rbd_mapping { + u64 size; + u64 features; + bool read_only; +}; + +/* + * a single device + */ +struct rbd_device { + int dev_id; /* blkdev unique id */ + + int major; /* blkdev assigned major */ + int minor; + struct gendisk *disk; /* blkdev's gendisk and rq */ + + u32 image_format; /* Either 1 or 2 */ + struct rbd_client *rbd_client; + + char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */ + + spinlock_t lock; /* queue, flags, open_count */ + + struct rbd_image_header header; + unsigned long flags; /* possibly lock protected */ + struct rbd_spec *spec; + + char *header_name; + + struct ceph_file_layout layout; + + struct ceph_osd_event *watch_event; + struct rbd_obj_request *watch_request; + + struct rbd_spec *parent_spec; + u64 parent_overlap; + atomic_t parent_ref; + struct rbd_device *parent; + + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + + /* protects updating the header */ + struct rw_semaphore header_rwsem; + + struct rbd_mapping mapping; + + struct list_head node; + + /* sysfs related */ + struct device dev; + unsigned long open_count; /* protected by lock */ +}; + +/* + * Flag bits for rbd_dev->flags. If atomicity is required, + * rbd_dev->lock is used to protect access. + * + * Currently, only the "removing" flag (which is coupled with the + * "open_count" field) requires atomic access. + */ +enum rbd_dev_flags { + RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */ + RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */ +}; + +static DEFINE_MUTEX(client_mutex); /* Serialize client creation */ + +static LIST_HEAD(rbd_dev_list); /* devices */ +static DEFINE_SPINLOCK(rbd_dev_list_lock); + +static LIST_HEAD(rbd_client_list); /* clients */ +static DEFINE_SPINLOCK(rbd_client_list_lock); + +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *rbd_img_request_cache; +static struct kmem_cache *rbd_obj_request_cache; +static struct kmem_cache *rbd_segment_name_cache; + +static int rbd_major; +static DEFINE_IDA(rbd_dev_id_ida); + +static struct workqueue_struct *rbd_wq; + +/* + * Default to false for now, as single-major requires >= 0.75 version of + * userspace rbd utility. + */ +static bool single_major = false; +module_param(single_major, bool, S_IRUGO); +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); + +static int rbd_img_request_submit(struct rbd_img_request *img_request); + +static void rbd_dev_device_release(struct device *dev); + +static ssize_t rbd_add(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, + size_t count); +static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, + size_t count); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); +static void rbd_spec_put(struct rbd_spec *spec); + +static int rbd_dev_id_to_minor(int dev_id) +{ + return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT; +} + +static int minor_to_rbd_dev_id(int minor) +{ + return minor >> RBD_SINGLE_MAJOR_PART_SHIFT; +} + +static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); +static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); +static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); +static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); + +static struct attribute *rbd_bus_attrs[] = { + &bus_attr_add.attr, + &bus_attr_remove.attr, + &bus_attr_add_single_major.attr, + &bus_attr_remove_single_major.attr, + NULL, +}; + +static umode_t rbd_bus_is_visible(struct kobject *kobj, + struct attribute *attr, int index) +{ + if (!single_major && + (attr == &bus_attr_add_single_major.attr || + attr == &bus_attr_remove_single_major.attr)) + return 0; + + return attr->mode; +} + +static const struct attribute_group rbd_bus_group = { + .attrs = rbd_bus_attrs, + .is_visible = rbd_bus_is_visible, +}; +__ATTRIBUTE_GROUPS(rbd_bus); + +static struct bus_type rbd_bus_type = { + .name = "rbd", + .bus_groups = rbd_bus_groups, +}; + +static void rbd_root_dev_release(struct device *dev) +{ +} + +static struct device rbd_root_dev = { + .init_name = "rbd", + .release = rbd_root_dev_release, +}; + +static __printf(2, 3) +void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (!rbd_dev) + printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf); + else if (rbd_dev->disk) + printk(KERN_WARNING "%s: %s: %pV\n", + RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_name) + printk(KERN_WARNING "%s: image %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_id) + printk(KERN_WARNING "%s: id %s: %pV\n", + RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf); + else /* punt */ + printk(KERN_WARNING "%s: rbd_dev %p: %pV\n", + RBD_DRV_NAME, rbd_dev, &vaf); + va_end(args); +} + +#ifdef RBD_DEBUG +#define rbd_assert(expr) \ + if (unlikely(!(expr))) { \ + printk(KERN_ERR "\nAssertion failure in %s() " \ + "at line %d:\n\n" \ + "\trbd_assert(%s);\n\n", \ + __func__, __LINE__, #expr); \ + BUG(); \ + } +#else /* !RBD_DEBUG */ +# define rbd_assert(expr) ((void) 0) +#endif /* !RBD_DEBUG */ + +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); +static void rbd_img_parent_read(struct rbd_obj_request *obj_request); +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); + +static int rbd_dev_refresh(struct rbd_device *rbd_dev); +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev); +static int rbd_dev_header_info(struct rbd_device *rbd_dev); +static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev); +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id); +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u8 *order, u64 *snap_size); +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features); +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name); + +static int rbd_open(struct block_device *bdev, fmode_t mode) +{ + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + bool removing = false; + + if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) + return -EROFS; + + spin_lock_irq(&rbd_dev->lock); + if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) + removing = true; + else + rbd_dev->open_count++; + spin_unlock_irq(&rbd_dev->lock); + if (removing) + return -ENOENT; + + (void) get_device(&rbd_dev->dev); + + return 0; +} + +static void rbd_release(struct gendisk *disk, fmode_t mode) +{ + struct rbd_device *rbd_dev = disk->private_data; + unsigned long open_count_before; + + spin_lock_irq(&rbd_dev->lock); + open_count_before = rbd_dev->open_count--; + spin_unlock_irq(&rbd_dev->lock); + rbd_assert(open_count_before > 0); + + put_device(&rbd_dev->dev); +} + +static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) +{ + int ret = 0; + int val; + bool ro; + bool ro_changed = false; + + /* get_user() may sleep, so call it before taking rbd_dev->lock */ + if (get_user(val, (int __user *)(arg))) + return -EFAULT; + + ro = val ? true : false; + /* Snapshot doesn't allow to write*/ + if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) + return -EROFS; + + spin_lock_irq(&rbd_dev->lock); + /* prevent others open this device */ + if (rbd_dev->open_count > 1) { + ret = -EBUSY; + goto out; + } + + if (rbd_dev->mapping.read_only != ro) { + rbd_dev->mapping.read_only = ro; + ro_changed = true; + } + +out: + spin_unlock_irq(&rbd_dev->lock); + /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ + if (ret == 0 && ro_changed) + set_disk_ro(rbd_dev->disk, ro ? 1 : 0); + + return ret; +} + +static int rbd_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; + int ret = 0; + + switch (cmd) { + case BLKROSET: + ret = rbd_ioctl_set_ro(rbd_dev, arg); + break; + default: + ret = -ENOTTY; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return rbd_ioctl(bdev, mode, cmd, arg); +} +#endif /* CONFIG_COMPAT */ + +static const struct block_device_operations rbd_bd_ops = { + .owner = THIS_MODULE, + .open = rbd_open, + .release = rbd_release, + .ioctl = rbd_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = rbd_compat_ioctl, +#endif +}; + +/* + * Initialize an rbd client instance. Success or not, this function + * consumes ceph_opts. Caller holds client_mutex. + */ +static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts) +{ + struct rbd_client *rbdc; + int ret = -ENOMEM; + + dout("%s:\n", __func__); + rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL); + if (!rbdc) + goto out_opt; + + kref_init(&rbdc->kref); + INIT_LIST_HEAD(&rbdc->node); + + rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); + if (IS_ERR(rbdc->client)) + goto out_rbdc; + ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ + + ret = ceph_open_session(rbdc->client); + if (ret < 0) + goto out_client; + + spin_lock(&rbd_client_list_lock); + list_add_tail(&rbdc->node, &rbd_client_list); + spin_unlock(&rbd_client_list_lock); + + dout("%s: rbdc %p\n", __func__, rbdc); + + return rbdc; +out_client: + ceph_destroy_client(rbdc->client); +out_rbdc: + kfree(rbdc); +out_opt: + if (ceph_opts) + ceph_destroy_options(ceph_opts); + dout("%s: error %d\n", __func__, ret); + + return ERR_PTR(ret); +} + +static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc) +{ + kref_get(&rbdc->kref); + + return rbdc; +} + +/* + * Find a ceph client with specific addr and configuration. If + * found, bump its reference count. + */ +static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) +{ + struct rbd_client *client_node; + bool found = false; + + if (ceph_opts->flags & CEPH_OPT_NOSHARE) + return NULL; + + spin_lock(&rbd_client_list_lock); + list_for_each_entry(client_node, &rbd_client_list, node) { + if (!ceph_compare_options(ceph_opts, client_node->client)) { + __rbd_get_client(client_node); + + found = true; + break; + } + } + spin_unlock(&rbd_client_list_lock); + + return found ? client_node : NULL; +} + +/* + * mount options + */ +enum { + Opt_last_int, + /* int args above */ + Opt_last_string, + /* string args above */ + Opt_read_only, + Opt_read_write, + /* Boolean args above */ + Opt_last_bool, +}; + +static match_table_t rbd_opts_tokens = { + /* int args above */ + /* string args above */ + {Opt_read_only, "read_only"}, + {Opt_read_only, "ro"}, /* Alternate spelling */ + {Opt_read_write, "read_write"}, + {Opt_read_write, "rw"}, /* Alternate spelling */ + /* Boolean args above */ + {-1, NULL} +}; + +struct rbd_options { + bool read_only; +}; + +#define RBD_READ_ONLY_DEFAULT false + +static int parse_rbd_opts_token(char *c, void *private) +{ + struct rbd_options *rbd_opts = private; + substring_t argstr[MAX_OPT_ARGS]; + int token, intval, ret; + + token = match_token(c, rbd_opts_tokens, argstr); + if (token < 0) + return -EINVAL; + + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + return ret; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else if (token > Opt_last_string && token < Opt_last_bool) { + dout("got Boolean token %d\n", token); + } else { + dout("got token %d\n", token); + } + + switch (token) { + case Opt_read_only: + rbd_opts->read_only = true; + break; + case Opt_read_write: + rbd_opts->read_only = false; + break; + default: + rbd_assert(false); + break; + } + return 0; +} + +static char* obj_op_name(enum obj_operation_type op_type) +{ + switch (op_type) { + case OBJ_OP_READ: + return "read"; + case OBJ_OP_WRITE: + return "write"; + case OBJ_OP_DISCARD: + return "discard"; + default: + return "???"; + } +} + +/* + * Get a ceph client with specific addr and configuration, if one does + * not exist create it. Either way, ceph_opts is consumed by this + * function. + */ +static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) +{ + struct rbd_client *rbdc; + + mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); + rbdc = rbd_client_find(ceph_opts); + if (rbdc) /* using an existing client */ + ceph_destroy_options(ceph_opts); + else + rbdc = rbd_client_create(ceph_opts); + mutex_unlock(&client_mutex); + + return rbdc; +} + +/* + * Destroy ceph client + * + * Caller must hold rbd_client_list_lock. + */ +static void rbd_client_release(struct kref *kref) +{ + struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref); + + dout("%s: rbdc %p\n", __func__, rbdc); + spin_lock(&rbd_client_list_lock); + list_del(&rbdc->node); + spin_unlock(&rbd_client_list_lock); + + ceph_destroy_client(rbdc->client); + kfree(rbdc); +} + +/* + * Drop reference to ceph client node. If it's not referenced anymore, release + * it. + */ +static void rbd_put_client(struct rbd_client *rbdc) +{ + if (rbdc) + kref_put(&rbdc->kref, rbd_client_release); +} + +static bool rbd_image_format_valid(u32 image_format) +{ + return image_format == 1 || image_format == 2; +} + +static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) +{ + size_t size; + u32 snap_count; + + /* The header has to start with the magic rbd header text */ + if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT))) + return false; + + /* The bio layer requires at least sector-sized I/O */ + + if (ondisk->options.order < SECTOR_SHIFT) + return false; + + /* If we use u64 in a few spots we may be able to loosen this */ + + if (ondisk->options.order > 8 * sizeof (int) - 1) + return false; + + /* + * The size of a snapshot header has to fit in a size_t, and + * that limits the number of snapshots. + */ + snap_count = le32_to_cpu(ondisk->snap_count); + size = SIZE_MAX - sizeof (struct ceph_snap_context); + if (snap_count > size / sizeof (__le64)) + return false; + + /* + * Not only that, but the size of the entire the snapshot + * header must also be representable in a size_t. + */ + size -= snap_count * sizeof (__le64); + if ((u64) size < le64_to_cpu(ondisk->snap_names_len)) + return false; + + return true; +} + +/* + * Fill an rbd image header with information from the given format 1 + * on-disk header. + */ +static int rbd_header_from_disk(struct rbd_device *rbd_dev, + struct rbd_image_header_ondisk *ondisk) +{ + struct rbd_image_header *header = &rbd_dev->header; + bool first_time = header->object_prefix == NULL; + struct ceph_snap_context *snapc; + char *object_prefix = NULL; + char *snap_names = NULL; + u64 *snap_sizes = NULL; + u32 snap_count; + size_t size; + int ret = -ENOMEM; + u32 i; + + /* Allocate this now to avoid having to handle failure below */ + + if (first_time) { + size_t len; + + len = strnlen(ondisk->object_prefix, + sizeof (ondisk->object_prefix)); + object_prefix = kmalloc(len + 1, GFP_KERNEL); + if (!object_prefix) + return -ENOMEM; + memcpy(object_prefix, ondisk->object_prefix, len); + object_prefix[len] = '\0'; + } + + /* Allocate the snapshot context and fill it in */ + + snap_count = le32_to_cpu(ondisk->snap_count); + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); + if (!snapc) + goto out_err; + snapc->seq = le64_to_cpu(ondisk->snap_seq); + if (snap_count) { + struct rbd_image_snap_ondisk *snaps; + u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len); + + /* We'll keep a copy of the snapshot names... */ + + if (snap_names_len > (u64)SIZE_MAX) + goto out_2big; + snap_names = kmalloc(snap_names_len, GFP_KERNEL); + if (!snap_names) + goto out_err; + + /* ...as well as the array of their sizes. */ + + size = snap_count * sizeof (*header->snap_sizes); + snap_sizes = kmalloc(size, GFP_KERNEL); + if (!snap_sizes) + goto out_err; + + /* + * Copy the names, and fill in each snapshot's id + * and size. + * + * Note that rbd_dev_v1_header_info() guarantees the + * ondisk buffer we're working with has + * snap_names_len bytes beyond the end of the + * snapshot id array, this memcpy() is safe. + */ + memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len); + snaps = ondisk->snaps; + for (i = 0; i < snap_count; i++) { + snapc->snaps[i] = le64_to_cpu(snaps[i].id); + snap_sizes[i] = le64_to_cpu(snaps[i].image_size); + } + } + + /* We won't fail any more, fill in the header */ + + if (first_time) { + header->object_prefix = object_prefix; + header->obj_order = ondisk->options.order; + header->crypt_type = ondisk->options.crypt_type; + header->comp_type = ondisk->options.comp_type; + /* The rest aren't used for format 1 images */ + header->stripe_unit = 0; + header->stripe_count = 0; + header->features = 0; + } else { + ceph_put_snap_context(header->snapc); + kfree(header->snap_names); + kfree(header->snap_sizes); + } + + /* The remaining fields always get updated (when we refresh) */ + + header->image_size = le64_to_cpu(ondisk->image_size); + header->snapc = snapc; + header->snap_names = snap_names; + header->snap_sizes = snap_sizes; + + return 0; +out_2big: + ret = -EIO; +out_err: + kfree(snap_sizes); + kfree(snap_names); + ceph_put_snap_context(snapc); + kfree(object_prefix); + + return ret; +} + +static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which) +{ + const char *snap_name; + + rbd_assert(which < rbd_dev->header.snapc->num_snaps); + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which--) + snap_name += strlen(snap_name) + 1; + + return kstrdup(snap_name, GFP_KERNEL); +} + +/* + * Snapshot id comparison function for use with qsort()/bsearch(). + * Note that result is for snapshots in *descending* order. + */ +static int snapid_compare_reverse(const void *s1, const void *s2) +{ + u64 snap_id1 = *(u64 *)s1; + u64 snap_id2 = *(u64 *)s2; + + if (snap_id1 < snap_id2) + return 1; + return snap_id1 == snap_id2 ? 0 : -1; +} + +/* + * Search a snapshot context to see if the given snapshot id is + * present. + * + * Returns the position of the snapshot id in the array if it's found, + * or BAD_SNAP_INDEX otherwise. + * + * Note: The snapshot array is in kept sorted (by the osd) in + * reverse order, highest snapshot id first. + */ +static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + u64 *found; + + found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps, + sizeof (snap_id), snapid_compare_reverse); + + return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX; +} + +static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, + u64 snap_id) +{ + u32 which; + const char *snap_name; + + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + return ERR_PTR(-ENOENT); + + snap_name = _rbd_dev_v1_snap_name(rbd_dev, which); + return snap_name ? snap_name : ERR_PTR(-ENOMEM); +} + +static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id) +{ + if (snap_id == CEPH_NOSNAP) + return RBD_SNAP_HEAD_NAME; + + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (rbd_dev->image_format == 1) + return rbd_dev_v1_snap_name(rbd_dev, snap_id); + + return rbd_dev_v2_snap_name(rbd_dev, snap_id); +} + +static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_size) +{ + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (snap_id == CEPH_NOSNAP) { + *snap_size = rbd_dev->header.image_size; + } else if (rbd_dev->image_format == 1) { + u32 which; + + which = rbd_dev_snap_index(rbd_dev, snap_id); + if (which == BAD_SNAP_INDEX) + return -ENOENT; + + *snap_size = rbd_dev->header.snap_sizes[which]; + } else { + u64 size = 0; + int ret; + + ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size); + if (ret) + return ret; + + *snap_size = size; + } + return 0; +} + +static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features) +{ + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + if (snap_id == CEPH_NOSNAP) { + *snap_features = rbd_dev->header.features; + } else if (rbd_dev->image_format == 1) { + *snap_features = 0; /* No features for format 1 */ + } else { + u64 features = 0; + int ret; + + ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features); + if (ret) + return ret; + + *snap_features = features; + } + return 0; +} + +static int rbd_dev_mapping_set(struct rbd_device *rbd_dev) +{ + u64 snap_id = rbd_dev->spec->snap_id; + u64 size = 0; + u64 features = 0; + int ret; + + ret = rbd_snap_size(rbd_dev, snap_id, &size); + if (ret) + return ret; + ret = rbd_snap_features(rbd_dev, snap_id, &features); + if (ret) + return ret; + + rbd_dev->mapping.size = size; + rbd_dev->mapping.features = features; + + return 0; +} + +static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev) +{ + rbd_dev->mapping.size = 0; + rbd_dev->mapping.features = 0; +} + +static void rbd_segment_name_free(const char *name) +{ + /* The explicit cast here is needed to drop the const qualifier */ + + kmem_cache_free(rbd_segment_name_cache, (void *)name); +} + +static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset) +{ + char *name; + u64 segment; + int ret; + char *name_format; + + name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO); + if (!name) + return NULL; + segment = offset >> rbd_dev->header.obj_order; + name_format = "%s.%012llx"; + if (rbd_dev->image_format == 2) + name_format = "%s.%016llx"; + ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format, + rbd_dev->header.object_prefix, segment); + if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) { + pr_err("error formatting segment name for #%llu (%d)\n", + segment, ret); + rbd_segment_name_free(name); + name = NULL; + } + + return name; +} + +static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) +{ + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; + + return offset & (segment_size - 1); +} + +static u64 rbd_segment_length(struct rbd_device *rbd_dev, + u64 offset, u64 length) +{ + u64 segment_size = (u64) 1 << rbd_dev->header.obj_order; + + offset &= segment_size - 1; + + rbd_assert(length <= U64_MAX - offset); + if (offset + length > segment_size) + length = segment_size - offset; + + return length; +} + +/* + * returns the size of an object in the image + */ +static u64 rbd_obj_bytes(struct rbd_image_header *header) +{ + return 1 << header->obj_order; +} + +/* + * bio helpers + */ + +static void bio_chain_put(struct bio *chain) +{ + struct bio *tmp; + + while (chain) { + tmp = chain; + chain = chain->bi_next; + bio_put(tmp); + } +} + +/* + * zeros a bio chain, starting at specific offset + */ +static void zero_bio_chain(struct bio *chain, int start_ofs) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned long flags; + void *buf; + int pos = 0; + + while (chain) { + bio_for_each_segment(bv, chain, iter) { + if (pos + bv.bv_len > start_ofs) { + int remainder = max(start_ofs - pos, 0); + buf = bvec_kmap_irq(&bv, &flags); + memset(buf + remainder, 0, + bv.bv_len - remainder); + flush_dcache_page(bv.bv_page); + bvec_kunmap_irq(buf, &flags); + } + pos += bv.bv_len; + } + + chain = chain->bi_next; + } +} + +/* + * similar to zero_bio_chain(), zeros data defined by a page array, + * starting at the given byte offset from the start of the array and + * continuing up to the given end offset. The pages array is + * assumed to be big enough to hold all bytes up to the end. + */ +static void zero_pages(struct page **pages, u64 offset, u64 end) +{ + struct page **page = &pages[offset >> PAGE_SHIFT]; + + rbd_assert(end > offset); + rbd_assert(end - offset <= (u64)SIZE_MAX); + while (offset < end) { + size_t page_offset; + size_t length; + unsigned long flags; + void *kaddr; + + page_offset = offset & ~PAGE_MASK; + length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); + local_irq_save(flags); + kaddr = kmap_atomic(*page); + memset(kaddr + page_offset, 0, length); + flush_dcache_page(*page); + kunmap_atomic(kaddr); + local_irq_restore(flags); + + offset += length; + page++; + } +} + +/* + * Clone a portion of a bio, starting at the given byte offset + * and continuing for the number of bytes indicated. + */ +static struct bio *bio_clone_range(struct bio *bio_src, + unsigned int offset, + unsigned int len, + gfp_t gfpmask) +{ + struct bio *bio; + + bio = bio_clone(bio_src, gfpmask); + if (!bio) + return NULL; /* ENOMEM */ + + bio_advance(bio, offset); + bio->bi_iter.bi_size = len; + + return bio; +} + +/* + * Clone a portion of a bio chain, starting at the given byte offset + * into the first bio in the source chain and continuing for the + * number of bytes indicated. The result is another bio chain of + * exactly the given length, or a null pointer on error. + * + * The bio_src and offset parameters are both in-out. On entry they + * refer to the first source bio and the offset into that bio where + * the start of data to be cloned is located. + * + * On return, bio_src is updated to refer to the bio in the source + * chain that contains first un-cloned byte, and *offset will + * contain the offset of that byte within that bio. + */ +static struct bio *bio_chain_clone_range(struct bio **bio_src, + unsigned int *offset, + unsigned int len, + gfp_t gfpmask) +{ + struct bio *bi = *bio_src; + unsigned int off = *offset; + struct bio *chain = NULL; + struct bio **end; + + /* Build up a chain of clone bios up to the limit */ + + if (!bi || off >= bi->bi_iter.bi_size || !len) + return NULL; /* Nothing to clone */ + + end = &chain; + while (len) { + unsigned int bi_size; + struct bio *bio; + + if (!bi) { + rbd_warn(NULL, "bio_chain exhausted with %u left", len); + goto out_err; /* EINVAL; ran out of bio's */ + } + bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); + bio = bio_clone_range(bi, off, bi_size, gfpmask); + if (!bio) + goto out_err; /* ENOMEM */ + + *end = bio; + end = &bio->bi_next; + + off += bi_size; + if (off == bi->bi_iter.bi_size) { + bi = bi->bi_next; + off = 0; + } + len -= bi_size; + } + *bio_src = bi; + *offset = off; + + return chain; +out_err: + bio_chain_put(chain); + + return NULL; +} + +/* + * The default/initial value for all object request flags is 0. For + * each flag, once its value is set to 1 it is never reset to 0 + * again. + */ +static void obj_request_img_data_set(struct rbd_obj_request *obj_request) +{ + if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { + struct rbd_device *rbd_dev; + + rbd_dev = obj_request->img_request->rbd_dev; + rbd_warn(rbd_dev, "obj_request %p already marked img_data", + obj_request); + } +} + +static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; +} + +static void obj_request_done_set(struct rbd_obj_request *obj_request) +{ + if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { + struct rbd_device *rbd_dev = NULL; + + if (obj_request_img_data_test(obj_request)) + rbd_dev = obj_request->img_request->rbd_dev; + rbd_warn(rbd_dev, "obj_request %p already marked done", + obj_request); + } +} + +static bool obj_request_done_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; +} + +/* + * This sets the KNOWN flag after (possibly) setting the EXISTS + * flag. The latter is set based on the "exists" value provided. + * + * Note that for our purposes once an object exists it never goes + * away again. It's possible that the response from two existence + * checks are separated by the creation of the target object, and + * the first ("doesn't exist") response arrives *after* the second + * ("does exist"). In that case we ignore the second one. + */ +static void obj_request_existence_set(struct rbd_obj_request *obj_request, + bool exists) +{ + if (exists) + set_bit(OBJ_REQ_EXISTS, &obj_request->flags); + set_bit(OBJ_REQ_KNOWN, &obj_request->flags); + smp_mb(); +} + +static bool obj_request_known_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; +} + +static bool obj_request_exists_test(struct rbd_obj_request *obj_request) +{ + smp_mb(); + return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; +} + +static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) +{ + struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; + + return obj_request->img_offset < + round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); +} + +static void rbd_obj_request_get(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); + kref_get(&obj_request->kref); +} + +static void rbd_obj_request_destroy(struct kref *kref); +static void rbd_obj_request_put(struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request != NULL); + dout("%s: obj %p (was %d)\n", __func__, obj_request, + atomic_read(&obj_request->kref.refcount)); + kref_put(&obj_request->kref, rbd_obj_request_destroy); +} + +static void rbd_img_request_get(struct rbd_img_request *img_request) +{ + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + kref_get(&img_request->kref); +} + +static bool img_request_child_test(struct rbd_img_request *img_request); +static void rbd_parent_request_destroy(struct kref *kref); +static void rbd_img_request_destroy(struct kref *kref); +static void rbd_img_request_put(struct rbd_img_request *img_request) +{ + rbd_assert(img_request != NULL); + dout("%s: img %p (was %d)\n", __func__, img_request, + atomic_read(&img_request->kref.refcount)); + if (img_request_child_test(img_request)) + kref_put(&img_request->kref, rbd_parent_request_destroy); + else + kref_put(&img_request->kref, rbd_img_request_destroy); +} + +static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->img_request == NULL); + + /* Image request now owns object's original reference */ + obj_request->img_request = img_request; + obj_request->which = img_request->obj_request_count; + rbd_assert(!obj_request_img_data_test(obj_request)); + obj_request_img_data_set(obj_request); + rbd_assert(obj_request->which != BAD_WHICH); + img_request->obj_request_count++; + list_add_tail(&obj_request->links, &img_request->obj_requests); + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); +} + +static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, + struct rbd_obj_request *obj_request) +{ + rbd_assert(obj_request->which != BAD_WHICH); + + dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, + obj_request->which); + list_del(&obj_request->links); + rbd_assert(img_request->obj_request_count > 0); + img_request->obj_request_count--; + rbd_assert(obj_request->which == img_request->obj_request_count); + obj_request->which = BAD_WHICH; + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request->img_request == img_request); + obj_request->img_request = NULL; + obj_request->callback = NULL; + rbd_obj_request_put(obj_request); +} + +static bool obj_request_type_valid(enum obj_request_type type) +{ + switch (type) { + case OBJ_REQUEST_NODATA: + case OBJ_REQUEST_BIO: + case OBJ_REQUEST_PAGES: + return true; + default: + return false; + } +} + +static int rbd_obj_request_submit(struct ceph_osd_client *osdc, + struct rbd_obj_request *obj_request) +{ + dout("%s %p\n", __func__, obj_request); + return ceph_osdc_start_request(osdc, obj_request->osd_req, false); +} + +static void rbd_obj_request_end(struct rbd_obj_request *obj_request) +{ + dout("%s %p\n", __func__, obj_request); + ceph_osdc_cancel_request(obj_request->osd_req); +} + +/* + * Wait for an object request to complete. If interrupted, cancel the + * underlying osd request. + */ +static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) +{ + int ret; + + dout("%s %p\n", __func__, obj_request); + + ret = wait_for_completion_interruptible(&obj_request->completion); + if (ret < 0) { + dout("%s %p interrupted\n", __func__, obj_request); + rbd_obj_request_end(obj_request); + return ret; + } + + dout("%s %p done\n", __func__, obj_request); + return 0; +} + +static void rbd_img_request_complete(struct rbd_img_request *img_request) +{ + + dout("%s: img %p\n", __func__, img_request); + + /* + * If no error occurred, compute the aggregate transfer + * count for the image request. We could instead use + * atomic64_cmpxchg() to update it as each object request + * completes; not clear which way is better off hand. + */ + if (!img_request->result) { + struct rbd_obj_request *obj_request; + u64 xferred = 0; + + for_each_obj_request(img_request, obj_request) + xferred += obj_request->xferred; + img_request->xferred = xferred; + } + + if (img_request->callback) + img_request->callback(img_request); + else + rbd_img_request_put(img_request); +} + +/* + * The default/initial value for all image request flags is 0. Each + * is conditionally set to 1 at image request initialization time + * and currently never change thereafter. + */ +static void img_request_write_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_WRITE, &img_request->flags); + smp_mb(); +} + +static bool img_request_write_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; +} + +/* + * Set the discard flag when the img_request is an discard request + */ +static void img_request_discard_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_DISCARD, &img_request->flags); + smp_mb(); +} + +static bool img_request_discard_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; +} + +static void img_request_child_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} + +static void img_request_child_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_CHILD, &img_request->flags); + smp_mb(); +} + +static bool img_request_child_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; +} + +static void img_request_layered_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + +static void img_request_layered_clear(struct rbd_img_request *img_request) +{ + clear_bit(IMG_REQ_LAYERED, &img_request->flags); + smp_mb(); +} + +static bool img_request_layered_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; +} + +static enum obj_operation_type +rbd_img_request_op_type(struct rbd_img_request *img_request) +{ + if (img_request_write_test(img_request)) + return OBJ_OP_WRITE; + else if (img_request_discard_test(img_request)) + return OBJ_OP_DISCARD; + else + return OBJ_OP_READ; +} + +static void +rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) +{ + u64 xferred = obj_request->xferred; + u64 length = obj_request->length; + + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, + obj_request, obj_request->img_request, obj_request->result, + xferred, length); + /* + * ENOENT means a hole in the image. We zero-fill the entire + * length of the request. A short read also implies zero-fill + * to the end of the request. An error requires the whole + * length of the request to be reported finished with an error + * to the block layer. In each case we update the xferred + * count to indicate the whole request was satisfied. + */ + rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); + if (obj_request->result == -ENOENT) { + if (obj_request->type == OBJ_REQUEST_BIO) + zero_bio_chain(obj_request->bio_list, 0); + else + zero_pages(obj_request->pages, 0, length); + obj_request->result = 0; + } else if (xferred < length && !obj_request->result) { + if (obj_request->type == OBJ_REQUEST_BIO) + zero_bio_chain(obj_request->bio_list, xferred); + else + zero_pages(obj_request->pages, xferred, length); + } + obj_request->xferred = length; + obj_request_done_set(obj_request); +} + +static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p cb %p\n", __func__, obj_request, + obj_request->callback); + if (obj_request->callback) + obj_request->callback(obj_request); + else + complete_all(&obj_request->completion); +} + +static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} + +static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = NULL; + struct rbd_device *rbd_dev = NULL; + bool layered = false; + + if (obj_request_img_data_test(obj_request)) { + img_request = obj_request->img_request; + layered = img_request && img_request_layered_test(img_request); + rbd_dev = img_request->rbd_dev; + } + + dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, + obj_request, img_request, obj_request->result, + obj_request->xferred, obj_request->length); + if (layered && obj_request->result == -ENOENT && + obj_request->img_offset < rbd_dev->parent_overlap) + rbd_img_parent_read(obj_request); + else if (img_request) + rbd_img_obj_request_read_callback(obj_request); + else + obj_request_done_set(obj_request); +} + +static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short write. Set + * it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; + obj_request_done_set(obj_request); +} + +static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, obj_request->length); + /* + * There is no such thing as a successful short discard. Set + * it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; + /* discarding a non-existent object is not a problem */ + if (obj_request->result == -ENOENT) + obj_request->result = 0; + obj_request_done_set(obj_request); +} + +/* + * For a simple stat call there's nothing to do. We'll do more if + * this is part of a write sequence for a layered image. + */ +static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p\n", __func__, obj_request); + obj_request_done_set(obj_request); +} + +static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, + struct ceph_msg *msg) +{ + struct rbd_obj_request *obj_request = osd_req->r_priv; + u16 opcode; + + dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg); + rbd_assert(osd_req == obj_request->osd_req); + if (obj_request_img_data_test(obj_request)) { + rbd_assert(obj_request->img_request); + rbd_assert(obj_request->which != BAD_WHICH); + } else { + rbd_assert(obj_request->which == BAD_WHICH); + } + + if (osd_req->r_result < 0) + obj_request->result = osd_req->r_result; + + rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); + + /* + * We support a 64-bit length, but ultimately it has to be + * passed to the block layer, which just supports a 32-bit + * length field. + */ + obj_request->xferred = osd_req->r_reply_op_len[0]; + rbd_assert(obj_request->xferred < (u64)UINT_MAX); + + opcode = osd_req->r_ops[0].op; + switch (opcode) { + case CEPH_OSD_OP_READ: + rbd_osd_read_callback(obj_request); + break; + case CEPH_OSD_OP_SETALLOCHINT: + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); + /* fall through */ + case CEPH_OSD_OP_WRITE: + rbd_osd_write_callback(obj_request); + break; + case CEPH_OSD_OP_STAT: + rbd_osd_stat_callback(obj_request); + break; + case CEPH_OSD_OP_DELETE: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + rbd_osd_discard_callback(obj_request); + break; + case CEPH_OSD_OP_CALL: + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + rbd_osd_trivial_callback(obj_request); + break; + default: + rbd_warn(NULL, "%s: unsupported op %hu", + obj_request->object_name, (unsigned short) opcode); + break; + } + + if (obj_request_done_test(obj_request)) + rbd_obj_request_complete(obj_request); +} + +static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + u64 snap_id; + + rbd_assert(osd_req != NULL); + + snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP; + ceph_osdc_build_request(osd_req, obj_request->offset, + NULL, snap_id, NULL); +} + +static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct ceph_osd_request *osd_req = obj_request->osd_req; + struct ceph_snap_context *snapc; + struct timespec mtime = CURRENT_TIME; + + rbd_assert(osd_req != NULL); + + snapc = img_request ? img_request->snapc : NULL; + ceph_osdc_build_request(osd_req, obj_request->offset, + snapc, CEPH_NOSNAP, &mtime); +} + +/* + * Create an osd request. A read request has one osd op (read). + * A write request has either one (watch) or two (hint+write) osd ops. + * (All rbd data writes are prefixed with an allocation hint op, but + * technically osd watch is a write request, hence this distinction.) + */ +static struct ceph_osd_request *rbd_osd_req_create( + struct rbd_device *rbd_dev, + enum obj_operation_type op_type, + unsigned int num_ops, + struct rbd_obj_request *obj_request) +{ + struct ceph_snap_context *snapc = NULL; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + + if (obj_request_img_data_test(obj_request) && + (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { + struct rbd_img_request *img_request = obj_request->img_request; + if (op_type == OBJ_OP_WRITE) { + rbd_assert(img_request_write_test(img_request)); + } else { + rbd_assert(img_request_discard_test(img_request)); + } + snapc = img_request->snapc; + } + + rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); + + /* Allocate and initialize the request, for the num_ops ops */ + + osdc = &rbd_dev->rbd_client->client->osdc; + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, + GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + else + osd_req->r_flags = CEPH_OSD_FLAG_READ; + + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; + + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + + return osd_req; +} + +/* + * Create a copyup osd request based on the information in the object + * request supplied. A copyup request has two or three osd ops, a + * copyup method call, potentially a hint op, and a write or truncate + * or zero op. + */ +static struct ceph_osd_request * +rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc; + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct ceph_osd_request *osd_req; + int num_osd_ops = 3; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + rbd_assert(img_request); + rbd_assert(img_request_write_test(img_request) || + img_request_discard_test(img_request)); + + if (img_request_discard_test(img_request)) + num_osd_ops = 2; + + /* Allocate and initialize the request, for all the ops */ + + snapc = img_request->snapc; + rbd_dev = img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, + false, GFP_ATOMIC); + if (!osd_req) + return NULL; /* ENOMEM */ + + osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + osd_req->r_callback = rbd_osd_req_callback; + osd_req->r_priv = obj_request; + + osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout); + ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name); + + return osd_req; +} + + +static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) +{ + ceph_osdc_put_request(osd_req); +} + +/* object_name is assumed to be a non-null pointer and NUL-terminated */ + +static struct rbd_obj_request *rbd_obj_request_create(const char *object_name, + u64 offset, u64 length, + enum obj_request_type type) +{ + struct rbd_obj_request *obj_request; + size_t size; + char *name; + + rbd_assert(obj_request_type_valid(type)); + + size = strlen(object_name) + 1; + name = kmalloc(size, GFP_KERNEL); + if (!name) + return NULL; + + obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL); + if (!obj_request) { + kfree(name); + return NULL; + } + + obj_request->object_name = memcpy(name, object_name, size); + obj_request->offset = offset; + obj_request->length = length; + obj_request->flags = 0; + obj_request->which = BAD_WHICH; + obj_request->type = type; + INIT_LIST_HEAD(&obj_request->links); + init_completion(&obj_request->completion); + kref_init(&obj_request->kref); + + dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name, + offset, length, (int)type, obj_request); + + return obj_request; +} + +static void rbd_obj_request_destroy(struct kref *kref) +{ + struct rbd_obj_request *obj_request; + + obj_request = container_of(kref, struct rbd_obj_request, kref); + + dout("%s: obj %p\n", __func__, obj_request); + + rbd_assert(obj_request->img_request == NULL); + rbd_assert(obj_request->which == BAD_WHICH); + + if (obj_request->osd_req) + rbd_osd_req_destroy(obj_request->osd_req); + + rbd_assert(obj_request_type_valid(obj_request->type)); + switch (obj_request->type) { + case OBJ_REQUEST_NODATA: + break; /* Nothing to do */ + case OBJ_REQUEST_BIO: + if (obj_request->bio_list) + bio_chain_put(obj_request->bio_list); + break; + case OBJ_REQUEST_PAGES: + if (obj_request->pages) + ceph_release_page_vector(obj_request->pages, + obj_request->page_count); + break; + } + + kfree(obj_request->object_name); + obj_request->object_name = NULL; + kmem_cache_free(rbd_obj_request_cache, obj_request); +} + +/* It's OK to call this for a device with no parent */ + +static void rbd_spec_put(struct rbd_spec *spec); +static void rbd_dev_unparent(struct rbd_device *rbd_dev) +{ + rbd_dev_remove_parent(rbd_dev); + rbd_spec_put(rbd_dev->parent_spec); + rbd_dev->parent_spec = NULL; + rbd_dev->parent_overlap = 0; +} + +/* + * Parent image reference counting is used to determine when an + * image's parent fields can be safely torn down--after there are no + * more in-flight requests to the parent image. When the last + * reference is dropped, cleaning them up is safe. + */ +static void rbd_dev_parent_put(struct rbd_device *rbd_dev) +{ + int counter; + + if (!rbd_dev->parent_spec) + return; + + counter = atomic_dec_return_safe(&rbd_dev->parent_ref); + if (counter > 0) + return; + + /* Last reference; clean up parent data structures */ + + if (!counter) + rbd_dev_unparent(rbd_dev); + else + rbd_warn(rbd_dev, "parent reference underflow"); +} + +/* + * If an image has a non-zero parent overlap, get a reference to its + * parent. + * + * Returns true if the rbd device has a parent with a non-zero + * overlap and a reference for it was successfully taken, or + * false otherwise. + */ +static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) +{ + int counter = 0; + + if (!rbd_dev->parent_spec) + return false; + + down_read(&rbd_dev->header_rwsem); + if (rbd_dev->parent_overlap) + counter = atomic_inc_return_safe(&rbd_dev->parent_ref); + up_read(&rbd_dev->header_rwsem); + + if (counter < 0) + rbd_warn(rbd_dev, "parent reference overflow"); + + return counter > 0; +} + +/* + * Caller is responsible for filling in the list of object requests + * that comprises the image request, and the Linux request pointer + * (if there is one). + */ +static struct rbd_img_request *rbd_img_request_create( + struct rbd_device *rbd_dev, + u64 offset, u64 length, + enum obj_operation_type op_type, + struct ceph_snap_context *snapc) +{ + struct rbd_img_request *img_request; + + img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO); + if (!img_request) + return NULL; + + img_request->rq = NULL; + img_request->rbd_dev = rbd_dev; + img_request->offset = offset; + img_request->length = length; + img_request->flags = 0; + if (op_type == OBJ_OP_DISCARD) { + img_request_discard_set(img_request); + img_request->snapc = snapc; + } else if (op_type == OBJ_OP_WRITE) { + img_request_write_set(img_request); + img_request->snapc = snapc; + } else { + img_request->snap_id = rbd_dev->spec->snap_id; + } + if (rbd_dev_parent_get(rbd_dev)) + img_request_layered_set(img_request); + spin_lock_init(&img_request->completion_lock); + img_request->next_completion = 0; + img_request->callback = NULL; + img_request->result = 0; + img_request->obj_request_count = 0; + INIT_LIST_HEAD(&img_request->obj_requests); + kref_init(&img_request->kref); + + dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, + obj_op_name(op_type), offset, length, img_request); + + return img_request; +} + +static void rbd_img_request_destroy(struct kref *kref) +{ + struct rbd_img_request *img_request; + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; + + img_request = container_of(kref, struct rbd_img_request, kref); + + dout("%s: img %p\n", __func__, img_request); + + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_img_obj_request_del(img_request, obj_request); + rbd_assert(img_request->obj_request_count == 0); + + if (img_request_layered_test(img_request)) { + img_request_layered_clear(img_request); + rbd_dev_parent_put(img_request->rbd_dev); + } + + if (img_request_write_test(img_request) || + img_request_discard_test(img_request)) + ceph_put_snap_context(img_request->snapc); + + kmem_cache_free(rbd_img_request_cache, img_request); +} + +static struct rbd_img_request *rbd_parent_request_create( + struct rbd_obj_request *obj_request, + u64 img_offset, u64 length) +{ + struct rbd_img_request *parent_request; + struct rbd_device *rbd_dev; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + + parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, + length, OBJ_OP_READ, NULL); + if (!parent_request) + return NULL; + + img_request_child_set(parent_request); + rbd_obj_request_get(obj_request); + parent_request->obj_request = obj_request; + + return parent_request; +} + +static void rbd_parent_request_destroy(struct kref *kref) +{ + struct rbd_img_request *parent_request; + struct rbd_obj_request *orig_request; + + parent_request = container_of(kref, struct rbd_img_request, kref); + orig_request = parent_request->obj_request; + + parent_request->obj_request = NULL; + rbd_obj_request_put(orig_request); + img_request_child_clear(parent_request); + + rbd_img_request_destroy(kref); +} + +static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + unsigned int xferred; + int result; + bool more; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + + rbd_assert(obj_request->xferred <= (u64)UINT_MAX); + xferred = (unsigned int)obj_request->xferred; + result = obj_request->result; + if (result) { + struct rbd_device *rbd_dev = img_request->rbd_dev; + enum obj_operation_type op_type; + + if (img_request_discard_test(img_request)) + op_type = OBJ_OP_DISCARD; + else if (img_request_write_test(img_request)) + op_type = OBJ_OP_WRITE; + else + op_type = OBJ_OP_READ; + + rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", + obj_op_name(op_type), obj_request->length, + obj_request->img_offset, obj_request->offset); + rbd_warn(rbd_dev, " result %d xferred %x", + result, xferred); + if (!img_request->result) + img_request->result = result; + /* + * Need to end I/O on the entire obj_request worth of + * bytes in case of error. + */ + xferred = obj_request->length; + } + + /* Image object requests don't own their page array */ + + if (obj_request->type == OBJ_REQUEST_PAGES) { + obj_request->pages = NULL; + obj_request->page_count = 0; + } + + if (img_request_child_test(img_request)) { + rbd_assert(img_request->obj_request != NULL); + more = obj_request->which < img_request->obj_request_count - 1; + } else { + rbd_assert(img_request->rq != NULL); + + more = blk_update_request(img_request->rq, result, xferred); + if (!more) + __blk_mq_end_request(img_request->rq, result); + } + + return more; +} + +static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + u32 which = obj_request->which; + bool more = true; + + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); + rbd_assert(img_request != NULL); + rbd_assert(img_request->obj_request_count > 0); + rbd_assert(which != BAD_WHICH); + rbd_assert(which < img_request->obj_request_count); + + spin_lock_irq(&img_request->completion_lock); + if (which != img_request->next_completion) + goto out; + + for_each_obj_request_from(img_request, obj_request) { + rbd_assert(more); + rbd_assert(which < img_request->obj_request_count); + + if (!obj_request_done_test(obj_request)) + break; + more = rbd_img_obj_end_request(obj_request); + which++; + } + + rbd_assert(more ^ (which == img_request->obj_request_count)); + img_request->next_completion = which; +out: + spin_unlock_irq(&img_request->completion_lock); + rbd_img_request_put(img_request); + + if (!more) + rbd_img_request_complete(img_request); +} + +/* + * Add individual osd ops to the given ceph_osd_request and prepare + * them for submission. num_ops is the current number of + * osd operations already to the object request. + */ +static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, + struct ceph_osd_request *osd_request, + enum obj_operation_type op_type, + unsigned int num_ops) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev = img_request->rbd_dev; + u64 object_size = rbd_obj_bytes(&rbd_dev->header); + u64 offset = obj_request->offset; + u64 length = obj_request->length; + u64 img_end; + u16 opcode; + + if (op_type == OBJ_OP_DISCARD) { + if (!offset && length == object_size && + (!img_request_layered_test(img_request) || + !obj_request_overlaps_parent(obj_request))) { + opcode = CEPH_OSD_OP_DELETE; + } else if ((offset + length == object_size)) { + opcode = CEPH_OSD_OP_TRUNCATE; + } else { + down_read(&rbd_dev->header_rwsem); + img_end = rbd_dev->header.image_size; + up_read(&rbd_dev->header_rwsem); + + if (obj_request->img_offset + length == img_end) + opcode = CEPH_OSD_OP_TRUNCATE; + else + opcode = CEPH_OSD_OP_ZERO; + } + } else if (op_type == OBJ_OP_WRITE) { + opcode = CEPH_OSD_OP_WRITE; + osd_req_op_alloc_hint_init(osd_request, num_ops, + object_size, object_size); + num_ops++; + } else { + opcode = CEPH_OSD_OP_READ; + } + + if (opcode == CEPH_OSD_OP_DELETE) + osd_req_op_init(osd_request, num_ops, opcode); + else + osd_req_op_extent_init(osd_request, num_ops, opcode, + offset, length, 0, 0); + + if (obj_request->type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_request, num_ops, + obj_request->bio_list, length); + else if (obj_request->type == OBJ_REQUEST_PAGES) + osd_req_op_extent_osd_data_pages(osd_request, num_ops, + obj_request->pages, length, + offset & ~PAGE_MASK, false, false); + + /* Discards are also writes */ + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) + rbd_osd_req_format_write(obj_request); + else + rbd_osd_req_format_read(obj_request); +} + +/* + * Split up an image request into one or more object requests, each + * to a different object. The "type" parameter indicates whether + * "data_desc" is the pointer to the head of a list of bio + * structures, or the base of a page array. In either case this + * function assumes data_desc describes memory sufficient to hold + * all data described by the image request. + */ +static int rbd_img_request_fill(struct rbd_img_request *img_request, + enum obj_request_type type, + void *data_desc) +{ + struct rbd_device *rbd_dev = img_request->rbd_dev; + struct rbd_obj_request *obj_request = NULL; + struct rbd_obj_request *next_obj_request; + struct bio *bio_list = NULL; + unsigned int bio_offset = 0; + struct page **pages = NULL; + enum obj_operation_type op_type; + u64 img_offset; + u64 resid; + + dout("%s: img %p type %d data_desc %p\n", __func__, img_request, + (int)type, data_desc); + + img_offset = img_request->offset; + resid = img_request->length; + rbd_assert(resid > 0); + op_type = rbd_img_request_op_type(img_request); + + if (type == OBJ_REQUEST_BIO) { + bio_list = data_desc; + rbd_assert(img_offset == + bio_list->bi_iter.bi_sector << SECTOR_SHIFT); + } else if (type == OBJ_REQUEST_PAGES) { + pages = data_desc; + } + + while (resid) { + struct ceph_osd_request *osd_req; + const char *object_name; + u64 offset; + u64 length; + + object_name = rbd_segment_name(rbd_dev, img_offset); + if (!object_name) + goto out_unwind; + offset = rbd_segment_offset(rbd_dev, img_offset); + length = rbd_segment_length(rbd_dev, img_offset, resid); + obj_request = rbd_obj_request_create(object_name, + offset, length, type); + /* object request has its own copy of the object name */ + rbd_segment_name_free(object_name); + if (!obj_request) + goto out_unwind; + + /* + * set obj_request->img_request before creating the + * osd_request so that it gets the right snapc + */ + rbd_img_obj_request_add(img_request, obj_request); + + if (type == OBJ_REQUEST_BIO) { + unsigned int clone_size; + + rbd_assert(length <= (u64)UINT_MAX); + clone_size = (unsigned int)length; + obj_request->bio_list = + bio_chain_clone_range(&bio_list, + &bio_offset, + clone_size, + GFP_ATOMIC); + if (!obj_request->bio_list) + goto out_unwind; + } else if (type == OBJ_REQUEST_PAGES) { + unsigned int page_count; + + obj_request->pages = pages; + page_count = (u32)calc_pages_for(offset, length); + obj_request->page_count = page_count; + if ((offset + length) & ~PAGE_MASK) + page_count--; /* more on last page */ + pages += page_count; + } + + osd_req = rbd_osd_req_create(rbd_dev, op_type, + (op_type == OBJ_OP_WRITE) ? 2 : 1, + obj_request); + if (!osd_req) + goto out_unwind; + + obj_request->osd_req = osd_req; + obj_request->callback = rbd_img_obj_callback; + obj_request->img_offset = img_offset; + + rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); + + rbd_img_request_get(img_request); + + img_offset += length; + resid -= length; + } + + return 0; + +out_unwind: + for_each_obj_request_safe(img_request, obj_request, next_obj_request) + rbd_img_obj_request_del(img_request, obj_request); + + return -ENOMEM; +} + +static void +rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct rbd_device *rbd_dev; + struct page **pages; + u32 page_count; + + rbd_assert(obj_request->type == OBJ_REQUEST_BIO || + obj_request->type == OBJ_REQUEST_NODATA); + rbd_assert(obj_request_img_data_test(obj_request)); + img_request = obj_request->img_request; + rbd_assert(img_request); + + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev); + + pages = obj_request->copyup_pages; + rbd_assert(pages != NULL); + obj_request->copyup_pages = NULL; + page_count = obj_request->copyup_page_count; + rbd_assert(page_count); + obj_request->copyup_page_count = 0; + ceph_release_page_vector(pages, page_count); + + /* + * We want the transfer count to reflect the size of the + * original write request. There is no such thing as a + * successful short write, so if the request was successful + * we can just set it to the originally-requested length. + */ + if (!obj_request->result) + obj_request->xferred = obj_request->length; + + /* Finish up with the normal image object callback */ + + rbd_img_obj_callback(obj_request); +} + +static void +rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) +{ + struct rbd_obj_request *orig_request; + struct ceph_osd_request *osd_req; + struct ceph_osd_client *osdc; + struct rbd_device *rbd_dev; + struct page **pages; + enum obj_operation_type op_type; + u32 page_count; + int img_result; + u64 parent_length; + + rbd_assert(img_request_child_test(img_request)); + + /* First get what we need from the image request */ + + pages = img_request->copyup_pages; + rbd_assert(pages != NULL); + img_request->copyup_pages = NULL; + page_count = img_request->copyup_page_count; + rbd_assert(page_count); + img_request->copyup_page_count = 0; + + orig_request = img_request->obj_request; + rbd_assert(orig_request != NULL); + rbd_assert(obj_request_type_valid(orig_request->type)); + img_result = img_request->result; + parent_length = img_request->length; + rbd_assert(parent_length == img_request->xferred); + rbd_img_request_put(img_request); + + rbd_assert(orig_request->img_request); + rbd_dev = orig_request->img_request->rbd_dev; + rbd_assert(rbd_dev); + + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to free the pages + * and re-submit the original write request. + */ + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + ceph_release_page_vector(pages, page_count); + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, orig_request); + if (!img_result) + return; + } + + if (img_result) + goto out_err; + + /* + * The original osd request is of no use to use any more. + * We need a new one that can hold the three ops in a copyup + * request. Allocate the new copyup osd request for the + * original request, and release the old one. + */ + img_result = -ENOMEM; + osd_req = rbd_osd_req_create_copyup(orig_request); + if (!osd_req) + goto out_err; + rbd_osd_req_destroy(orig_request->osd_req); + orig_request->osd_req = osd_req; + orig_request->copyup_pages = pages; + orig_request->copyup_page_count = page_count; + + /* Initialize the copyup op */ + + osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); + osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, + false, false); + + /* Add the other op(s) */ + + op_type = rbd_img_request_op_type(orig_request->img_request); + rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); + + /* All set, send it off. */ + + orig_request->callback = rbd_img_obj_copyup_callback; + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, orig_request); + if (!img_result) + return; +out_err: + /* Record the error code and complete the request */ + + orig_request->result = img_result; + orig_request->xferred = 0; + obj_request_done_set(orig_request); + rbd_obj_request_complete(orig_request); +} + +/* + * Read from the parent image the range of data that covers the + * entire target of the given object request. This is used for + * satisfying a layered image write request when the target of an + * object request from the image request does not exist. + * + * A page array big enough to hold the returned data is allocated + * and supplied to rbd_img_request_fill() as the "data descriptor." + * When the read completes, this page array will be transferred to + * the original object request for the copyup operation. + * + * If an error occurs, record it as the result of the original + * object request and mark it done so it gets completed. + */ +static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request = NULL; + struct rbd_img_request *parent_request = NULL; + struct rbd_device *rbd_dev; + u64 img_offset; + u64 length; + struct page **pages = NULL; + u32 page_count; + int result; + + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request_type_valid(obj_request->type)); + + img_request = obj_request->img_request; + rbd_assert(img_request != NULL); + rbd_dev = img_request->rbd_dev; + rbd_assert(rbd_dev->parent != NULL); + + /* + * Determine the byte range covered by the object in the + * child image to which the original request was to be sent. + */ + img_offset = obj_request->img_offset - obj_request->offset; + length = (u64)1 << rbd_dev->header.obj_order; + + /* + * There is no defined parent data beyond the parent + * overlap, so limit what we read at that boundary if + * necessary. + */ + if (img_offset + length > rbd_dev->parent_overlap) { + rbd_assert(img_offset < rbd_dev->parent_overlap); + length = rbd_dev->parent_overlap - img_offset; + } + + /* + * Allocate a page array big enough to receive the data read + * from the parent. + */ + page_count = (u32)calc_pages_for(0, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) { + result = PTR_ERR(pages); + pages = NULL; + goto out_err; + } + + result = -ENOMEM; + parent_request = rbd_parent_request_create(obj_request, + img_offset, length); + if (!parent_request) + goto out_err; + + result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); + if (result) + goto out_err; + parent_request->copyup_pages = pages; + parent_request->copyup_page_count = page_count; + + parent_request->callback = rbd_img_obj_parent_read_full_callback; + result = rbd_img_request_submit(parent_request); + if (!result) + return 0; + + parent_request->copyup_pages = NULL; + parent_request->copyup_page_count = 0; + parent_request->obj_request = NULL; + rbd_obj_request_put(obj_request); +out_err: + if (pages) + ceph_release_page_vector(pages, page_count); + if (parent_request) + rbd_img_request_put(parent_request); + obj_request->result = result; + obj_request->xferred = 0; + obj_request_done_set(obj_request); + + return result; +} + +static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) +{ + struct rbd_obj_request *orig_request; + struct rbd_device *rbd_dev; + int result; + + rbd_assert(!obj_request_img_data_test(obj_request)); + + /* + * All we need from the object request is the original + * request and the result of the STAT op. Grab those, then + * we're done with the request. + */ + orig_request = obj_request->obj_request; + obj_request->obj_request = NULL; + rbd_obj_request_put(orig_request); + rbd_assert(orig_request); + rbd_assert(orig_request->img_request); + + result = obj_request->result; + obj_request->result = 0; + + dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, + obj_request, orig_request, result, + obj_request->xferred, obj_request->length); + rbd_obj_request_put(obj_request); + + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to free the pages + * and re-submit the original write request. + */ + rbd_dev = orig_request->img_request->rbd_dev; + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + osdc = &rbd_dev->rbd_client->client->osdc; + result = rbd_obj_request_submit(osdc, orig_request); + if (!result) + return; + } + + /* + * Our only purpose here is to determine whether the object + * exists, and we don't want to treat the non-existence as + * an error. If something else comes back, transfer the + * error to the original request and complete it now. + */ + if (!result) { + obj_request_existence_set(orig_request, true); + } else if (result == -ENOENT) { + obj_request_existence_set(orig_request, false); + } else if (result) { + orig_request->result = result; + goto out; + } + + /* + * Resubmit the original request now that we have recorded + * whether the target object exists. + */ + orig_request->result = rbd_img_obj_request_submit(orig_request); +out: + if (orig_request->result) + rbd_obj_request_complete(orig_request); +} + +static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) +{ + struct rbd_obj_request *stat_request; + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + struct page **pages = NULL; + u32 page_count; + size_t size; + int ret; + + /* + * The response data for a STAT call consists of: + * le64 length; + * struct { + * le32 tv_sec; + * le32 tv_nsec; + * } mtime; + */ + size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); + page_count = (u32)calc_pages_for(0, size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = -ENOMEM; + stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0, + OBJ_REQUEST_PAGES); + if (!stat_request) + goto out; + + rbd_obj_request_get(obj_request); + stat_request->obj_request = obj_request; + stat_request->pages = pages; + stat_request->page_count = page_count; + + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, + stat_request); + if (!stat_request->osd_req) + goto out; + stat_request->callback = rbd_img_obj_exists_callback; + + osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); + osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, + false, false); + rbd_osd_req_format_read(stat_request); + + osdc = &rbd_dev->rbd_client->client->osdc; + ret = rbd_obj_request_submit(osdc, stat_request); +out: + if (ret) + rbd_obj_request_put(obj_request); + + return ret; +} + +static bool img_obj_request_simple(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + struct rbd_device *rbd_dev; + + rbd_assert(obj_request_img_data_test(obj_request)); + + img_request = obj_request->img_request; + rbd_assert(img_request); + rbd_dev = img_request->rbd_dev; + + /* Reads */ + if (!img_request_write_test(img_request) && + !img_request_discard_test(img_request)) + return true; + + /* Non-layered writes */ + if (!img_request_layered_test(img_request)) + return true; + + /* + * Layered writes outside of the parent overlap range don't + * share any data with the parent. + */ + if (!obj_request_overlaps_parent(obj_request)) + return true; + + /* + * Entire-object layered writes - we will overwrite whatever + * parent data there is anyway. + */ + if (!obj_request->offset && + obj_request->length == rbd_obj_bytes(&rbd_dev->header)) + return true; + + /* + * If the object is known to already exist, its parent data has + * already been copied. + */ + if (obj_request_known_test(obj_request) && + obj_request_exists_test(obj_request)) + return true; + + return false; +} + +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +{ + if (img_obj_request_simple(obj_request)) { + struct rbd_device *rbd_dev; + struct ceph_osd_client *osdc; + + rbd_dev = obj_request->img_request->rbd_dev; + osdc = &rbd_dev->rbd_client->client->osdc; + + return rbd_obj_request_submit(osdc, obj_request); + } + + /* + * It's a layered write. The target object might exist but + * we may not know that yet. If we know it doesn't exist, + * start by reading the data for the full target object from + * the parent so we can use it for a copyup to the target. + */ + if (obj_request_known_test(obj_request)) + return rbd_img_obj_parent_read_full(obj_request); + + /* We don't know whether the target exists. Go find out. */ + + return rbd_img_obj_exists_submit(obj_request); +} + +static int rbd_img_request_submit(struct rbd_img_request *img_request) +{ + struct rbd_obj_request *obj_request; + struct rbd_obj_request *next_obj_request; + + dout("%s: img %p\n", __func__, img_request); + for_each_obj_request_safe(img_request, obj_request, next_obj_request) { + int ret; + + ret = rbd_img_obj_request_submit(obj_request); + if (ret) + return ret; + } + + return 0; +} + +static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) +{ + struct rbd_obj_request *obj_request; + struct rbd_device *rbd_dev; + u64 obj_end; + u64 img_xferred; + int img_result; + + rbd_assert(img_request_child_test(img_request)); + + /* First get what we need from the image request and release it */ + + obj_request = img_request->obj_request; + img_xferred = img_request->xferred; + img_result = img_request->result; + rbd_img_request_put(img_request); + + /* + * If the overlap has become 0 (most likely because the + * image has been flattened) we need to re-submit the + * original request. + */ + rbd_assert(obj_request); + rbd_assert(obj_request->img_request); + rbd_dev = obj_request->img_request->rbd_dev; + if (!rbd_dev->parent_overlap) { + struct ceph_osd_client *osdc; + + osdc = &rbd_dev->rbd_client->client->osdc; + img_result = rbd_obj_request_submit(osdc, obj_request); + if (!img_result) + return; + } + + obj_request->result = img_result; + if (obj_request->result) + goto out; + + /* + * We need to zero anything beyond the parent overlap + * boundary. Since rbd_img_obj_request_read_callback() + * will zero anything beyond the end of a short read, an + * easy way to do this is to pretend the data from the + * parent came up short--ending at the overlap boundary. + */ + rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); + obj_end = obj_request->img_offset + obj_request->length; + if (obj_end > rbd_dev->parent_overlap) { + u64 xferred = 0; + + if (obj_request->img_offset < rbd_dev->parent_overlap) + xferred = rbd_dev->parent_overlap - + obj_request->img_offset; + + obj_request->xferred = min(img_xferred, xferred); + } else { + obj_request->xferred = img_xferred; + } +out: + rbd_img_obj_request_read_callback(obj_request); + rbd_obj_request_complete(obj_request); +} + +static void rbd_img_parent_read(struct rbd_obj_request *obj_request) +{ + struct rbd_img_request *img_request; + int result; + + rbd_assert(obj_request_img_data_test(obj_request)); + rbd_assert(obj_request->img_request != NULL); + rbd_assert(obj_request->result == (s32) -ENOENT); + rbd_assert(obj_request_type_valid(obj_request->type)); + + /* rbd_read_finish(obj_request, obj_request->length); */ + img_request = rbd_parent_request_create(obj_request, + obj_request->img_offset, + obj_request->length); + result = -ENOMEM; + if (!img_request) + goto out_err; + + if (obj_request->type == OBJ_REQUEST_BIO) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + obj_request->bio_list); + else + result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, + obj_request->pages); + if (result) + goto out_err; + + img_request->callback = rbd_img_parent_read_callback; + result = rbd_img_request_submit(img_request); + if (result) + goto out_err; + + return; +out_err: + if (img_request) + rbd_img_request_put(img_request); + obj_request->result = result; + obj_request->xferred = 0; + obj_request_done_set(obj_request); +} + +static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) +{ + struct rbd_obj_request *obj_request; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; + + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) + return -ENOMEM; + + ret = -ENOMEM; + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, + obj_request); + if (!obj_request->osd_req) + goto out; + + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK, + notify_id, 0, 0); + rbd_osd_req_format_read(obj_request); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); +out: + rbd_obj_request_put(obj_request); + + return ret; +} + +static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) +{ + struct rbd_device *rbd_dev = (struct rbd_device *)data; + int ret; + + if (!rbd_dev) + return; + + dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__, + rbd_dev->header_name, (unsigned long long)notify_id, + (unsigned int)opcode); + + /* + * Until adequate refresh error handling is in place, there is + * not much we can do here, except warn. + * + * See http://tracker.ceph.com/issues/5040 + */ + ret = rbd_dev_refresh(rbd_dev); + if (ret) + rbd_warn(rbd_dev, "refresh failed: %d", ret); + + ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id); + if (ret) + rbd_warn(rbd_dev, "notify_ack ret %d", ret); +} + +/* + * Send a (un)watch request and wait for the ack. Return a request + * with a ref held on success or error. + */ +static struct rbd_obj_request *rbd_obj_watch_request_helper( + struct rbd_device *rbd_dev, + bool watch) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + int ret; + + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, + OBJ_REQUEST_NODATA); + if (!obj_request) + return ERR_PTR(-ENOMEM); + + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, + obj_request); + if (!obj_request->osd_req) { + ret = -ENOMEM; + goto out; + } + + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, 0, watch); + rbd_osd_req_format_write(obj_request); + + if (watch) + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret) { + if (watch) + rbd_obj_request_end(obj_request); + goto out; + } + + return obj_request; + +out: + rbd_obj_request_put(obj_request); + return ERR_PTR(ret); +} + +/* + * Initiate a watch request, synchronously. + */ +static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + int ret; + + rbd_assert(!rbd_dev->watch_event); + rbd_assert(!rbd_dev->watch_request); + + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + return ret; + + obj_request = rbd_obj_watch_request_helper(rbd_dev, true); + if (IS_ERR(obj_request)) { + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + return PTR_ERR(obj_request); + } + + /* + * A watch request is set to linger, so the underlying osd + * request won't go away until we unregister it. We retain + * a pointer to the object request during that time (in + * rbd_dev->watch_request), so we'll keep a reference to it. + * We'll drop that reference after we've unregistered it in + * rbd_dev_header_unwatch_sync(). + */ + rbd_dev->watch_request = obj_request; + + return 0; +} + +/* + * Tear down a watch request, synchronously. + */ +static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) +{ + struct rbd_obj_request *obj_request; + + rbd_assert(rbd_dev->watch_event); + rbd_assert(rbd_dev->watch_request); + + rbd_obj_request_end(rbd_dev->watch_request); + rbd_obj_request_put(rbd_dev->watch_request); + rbd_dev->watch_request = NULL; + + obj_request = rbd_obj_watch_request_helper(rbd_dev, false); + if (!IS_ERR(obj_request)) + rbd_obj_request_put(obj_request); + else + rbd_warn(rbd_dev, "unable to tear down watch request (%ld)", + PTR_ERR(obj_request)); + + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; +} + +/* + * Synchronous osd object method call. Returns the number of bytes + * returned in the outbound buffer, or a negative error code. + */ +static int rbd_obj_method_sync(struct rbd_device *rbd_dev, + const char *object_name, + const char *class_name, + const char *method_name, + const void *outbound, + size_t outbound_size, + void *inbound, + size_t inbound_size) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + struct page **pages; + u32 page_count; + int ret; + + /* + * Method calls are ultimately read operations. The result + * should placed into the inbound buffer provided. They + * also supply outbound data--parameters for the object + * method. Currently if this is present it will be a + * snapshot id. + */ + page_count = (u32)calc_pages_for(0, inbound_size); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, 0, inbound_size, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; + + obj_request->pages = pages; + obj_request->page_count = page_count; + + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, + obj_request); + if (!obj_request->osd_req) + goto out; + + osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL, + class_name, method_name); + if (outbound_size) { + struct ceph_pagelist *pagelist; + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + if (!pagelist) + goto out; + + ceph_pagelist_init(pagelist); + ceph_pagelist_append(pagelist, outbound, outbound_size); + osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0, + pagelist); + } + osd_req_op_cls_response_data_pages(obj_request->osd_req, 0, + obj_request->pages, inbound_size, + 0, false, false); + rbd_osd_req_format_read(obj_request); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret < 0) + goto out; + + rbd_assert(obj_request->xferred < (u64)INT_MAX); + ret = (int)obj_request->xferred; + ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred); +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); + + return ret; +} + +static void rbd_queue_workfn(struct work_struct *work) +{ + struct request *rq = blk_mq_rq_from_pdu(work); + struct rbd_device *rbd_dev = rq->q->queuedata; + struct rbd_img_request *img_request; + struct ceph_snap_context *snapc = NULL; + u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; + u64 length = blk_rq_bytes(rq); + enum obj_operation_type op_type; + u64 mapping_size; + int result; + + if (rq->cmd_type != REQ_TYPE_FS) { + dout("%s: non-fs request type %d\n", __func__, + (int) rq->cmd_type); + result = -EIO; + goto err; + } + + if (rq->cmd_flags & REQ_DISCARD) + op_type = OBJ_OP_DISCARD; + else if (rq->cmd_flags & REQ_WRITE) + op_type = OBJ_OP_WRITE; + else + op_type = OBJ_OP_READ; + + /* Ignore/skip any zero-length requests */ + + if (!length) { + dout("%s: zero-length request\n", __func__); + result = 0; + goto err_rq; + } + + /* Only reads are allowed to a read-only device */ + + if (op_type != OBJ_OP_READ) { + if (rbd_dev->mapping.read_only) { + result = -EROFS; + goto err_rq; + } + rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); + } + + /* + * Quit early if the mapped snapshot no longer exists. It's + * still possible the snapshot will have disappeared by the + * time our request arrives at the osd, but there's no sense in + * sending it if we already know. + */ + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) { + dout("request for non-existent snapshot"); + rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP); + result = -ENXIO; + goto err_rq; + } + + if (offset && length > U64_MAX - offset + 1) { + rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset, + length); + result = -EINVAL; + goto err_rq; /* Shouldn't happen */ + } + + blk_mq_start_request(rq); + + down_read(&rbd_dev->header_rwsem); + mapping_size = rbd_dev->mapping.size; + if (op_type != OBJ_OP_READ) { + snapc = rbd_dev->header.snapc; + ceph_get_snap_context(snapc); + } + up_read(&rbd_dev->header_rwsem); + + if (offset + length > mapping_size) { + rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, + length, mapping_size); + result = -EIO; + goto err_rq; + } + + img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, + snapc); + if (!img_request) { + result = -ENOMEM; + goto err_rq; + } + img_request->rq = rq; + + if (op_type == OBJ_OP_DISCARD) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, + NULL); + else + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + rq->bio); + if (result) + goto err_img_request; + + result = rbd_img_request_submit(img_request); + if (result) + goto err_img_request; + + return; + +err_img_request: + rbd_img_request_put(img_request); +err_rq: + if (result) + rbd_warn(rbd_dev, "%s %llx at %llx result %d", + obj_op_name(op_type), length, offset, result); + ceph_put_snap_context(snapc); +err: + blk_mq_end_request(rq, result); +} + +static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct work_struct *work = blk_mq_rq_to_pdu(rq); + + queue_work(rbd_wq, work); + return BLK_MQ_RQ_QUEUE_OK; +} + +/* + * a queue callback. Makes sure that we don't create a bio that spans across + * multiple osd objects. One exception would be with a single page bios, + * which we handle later at bio_chain_clone_range() + */ +static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, + struct bio_vec *bvec) +{ + struct rbd_device *rbd_dev = q->queuedata; + sector_t sector_offset; + sector_t sectors_per_obj; + sector_t obj_sector_offset; + int ret; + + /* + * Find how far into its rbd object the partition-relative + * bio start sector is to offset relative to the enclosing + * device. + */ + sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; + sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); + obj_sector_offset = sector_offset & (sectors_per_obj - 1); + + /* + * Compute the number of bytes from that offset to the end + * of the object. Account for what's already used by the bio. + */ + ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; + if (ret > bmd->bi_size) + ret -= bmd->bi_size; + else + ret = 0; + + /* + * Don't send back more than was asked for. And if the bio + * was empty, let the whole thing through because: "Note + * that a block device *must* allow a single page to be + * added to an empty bio." + */ + rbd_assert(bvec->bv_len <= PAGE_SIZE); + if (ret > (int) bvec->bv_len || !bmd->bi_size) + ret = (int) bvec->bv_len; + + return ret; +} + +static void rbd_free_disk(struct rbd_device *rbd_dev) +{ + struct gendisk *disk = rbd_dev->disk; + + if (!disk) + return; + + rbd_dev->disk = NULL; + if (disk->flags & GENHD_FL_UP) { + del_gendisk(disk); + if (disk->queue) + blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&rbd_dev->tag_set); + } + put_disk(disk); +} + +static int rbd_obj_read_sync(struct rbd_device *rbd_dev, + const char *object_name, + u64 offset, u64 length, void *buf) + +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_obj_request *obj_request; + struct page **pages = NULL; + u32 page_count; + size_t size; + int ret; + + page_count = (u32) calc_pages_for(offset, length); + pages = ceph_alloc_page_vector(page_count, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = -ENOMEM; + obj_request = rbd_obj_request_create(object_name, offset, length, + OBJ_REQUEST_PAGES); + if (!obj_request) + goto out; + + obj_request->pages = pages; + obj_request->page_count = page_count; + + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, + obj_request); + if (!obj_request->osd_req) + goto out; + + osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ, + offset, length, 0, 0); + osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, + obj_request->pages, + obj_request->length, + obj_request->offset & ~PAGE_MASK, + false, false); + rbd_osd_req_format_read(obj_request); + + ret = rbd_obj_request_submit(osdc, obj_request); + if (ret) + goto out; + ret = rbd_obj_request_wait(obj_request); + if (ret) + goto out; + + ret = obj_request->result; + if (ret < 0) + goto out; + + rbd_assert(obj_request->xferred <= (u64) SIZE_MAX); + size = (size_t) obj_request->xferred; + ceph_copy_from_page_vector(pages, buf, 0, size); + rbd_assert(size <= (size_t)INT_MAX); + ret = (int)size; +out: + if (obj_request) + rbd_obj_request_put(obj_request); + else + ceph_release_page_vector(pages, page_count); + + return ret; +} + +/* + * Read the complete header for the given rbd device. On successful + * return, the rbd_dev->header field will contain up-to-date + * information about the image. + */ +static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev) +{ + struct rbd_image_header_ondisk *ondisk = NULL; + u32 snap_count = 0; + u64 names_size = 0; + u32 want_count; + int ret; + + /* + * The complete header will include an array of its 64-bit + * snapshot ids, followed by the names of those snapshots as + * a contiguous block of NUL-terminated strings. Note that + * the number of snapshots could change by the time we read + * it in, in which case we re-read it. + */ + do { + size_t size; + + kfree(ondisk); + + size = sizeof (*ondisk); + size += snap_count * sizeof (struct rbd_image_snap_ondisk); + size += names_size; + ondisk = kmalloc(size, GFP_KERNEL); + if (!ondisk) + return -ENOMEM; + + ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name, + 0, size, ondisk); + if (ret < 0) + goto out; + if ((size_t)ret < size) { + ret = -ENXIO; + rbd_warn(rbd_dev, "short header read (want %zd got %d)", + size, ret); + goto out; + } + if (!rbd_dev_ondisk_valid(ondisk)) { + ret = -ENXIO; + rbd_warn(rbd_dev, "invalid header"); + goto out; + } + + names_size = le64_to_cpu(ondisk->snap_names_len); + want_count = snap_count; + snap_count = le32_to_cpu(ondisk->snap_count); + } while (snap_count != want_count); + + ret = rbd_header_from_disk(rbd_dev, ondisk); +out: + kfree(ondisk); + + return ret; +} + +/* + * Clear the rbd device's EXISTS flag if the snapshot it's mapped to + * has disappeared from the (just updated) snapshot context. + */ +static void rbd_exists_validate(struct rbd_device *rbd_dev) +{ + u64 snap_id; + + if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) + return; + + snap_id = rbd_dev->spec->snap_id; + if (snap_id == CEPH_NOSNAP) + return; + + if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX) + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); +} + +static void rbd_dev_update_size(struct rbd_device *rbd_dev) +{ + sector_t size; + bool removing; + + /* + * Don't hold the lock while doing disk operations, + * or lock ordering will conflict with the bdev mutex via: + * rbd_add() -> blkdev_get() -> rbd_open() + */ + spin_lock_irq(&rbd_dev->lock); + removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); + /* + * If the device is being removed, rbd_dev->disk has + * been destroyed, so don't try to update its size + */ + if (!removing) { + size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE; + dout("setting size to %llu sectors", (unsigned long long)size); + set_capacity(rbd_dev->disk, size); + revalidate_disk(rbd_dev->disk); + } +} + +static int rbd_dev_refresh(struct rbd_device *rbd_dev) +{ + u64 mapping_size; + int ret; + + down_write(&rbd_dev->header_rwsem); + mapping_size = rbd_dev->mapping.size; + + ret = rbd_dev_header_info(rbd_dev); + if (ret) + goto out; + + /* + * If there is a parent, see if it has disappeared due to the + * mapped image getting flattened. + */ + if (rbd_dev->parent) { + ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret) + goto out; + } + + if (rbd_dev->spec->snap_id == CEPH_NOSNAP) { + rbd_dev->mapping.size = rbd_dev->header.image_size; + } else { + /* validate mapped snapshot's EXISTS flag */ + rbd_exists_validate(rbd_dev); + } + +out: + up_write(&rbd_dev->header_rwsem); + if (!ret && mapping_size != rbd_dev->mapping.size) + rbd_dev_update_size(rbd_dev); + + return ret; +} + +static int rbd_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct work_struct *work = blk_mq_rq_to_pdu(rq); + + INIT_WORK(work, rbd_queue_workfn); + return 0; +} + +static struct blk_mq_ops rbd_mq_ops = { + .queue_rq = rbd_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = rbd_init_request, +}; + +static int rbd_init_disk(struct rbd_device *rbd_dev) +{ + struct gendisk *disk; + struct request_queue *q; + u64 segment_size; + int err; + + /* create gendisk info */ + disk = alloc_disk(single_major ? + (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : + RBD_MINORS_PER_MAJOR); + if (!disk) + return -ENOMEM; + + snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", + rbd_dev->dev_id); + disk->major = rbd_dev->major; + disk->first_minor = rbd_dev->minor; + if (single_major) + disk->flags |= GENHD_FL_EXT_DEVT; + disk->fops = &rbd_bd_ops; + disk->private_data = rbd_dev; + + memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); + rbd_dev->tag_set.ops = &rbd_mq_ops; + rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; + rbd_dev->tag_set.numa_node = NUMA_NO_NODE; + rbd_dev->tag_set.flags = + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + rbd_dev->tag_set.nr_hw_queues = 1; + rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); + + err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); + if (err) + goto out_disk; + + q = blk_mq_init_queue(&rbd_dev->tag_set); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto out_tag_set; + } + + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ + + /* set io sizes to object size */ + segment_size = rbd_obj_bytes(&rbd_dev->header); + blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); + blk_queue_max_segment_size(q, segment_size); + blk_queue_io_min(q, segment_size); + blk_queue_io_opt(q, segment_size); + + /* enable the discard support */ + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + q->limits.discard_granularity = segment_size; + q->limits.discard_alignment = segment_size; + q->limits.max_discard_sectors = segment_size / SECTOR_SIZE; + q->limits.discard_zeroes_data = 1; + + blk_queue_merge_bvec(q, rbd_merge_bvec); + disk->queue = q; + + q->queuedata = rbd_dev; + + rbd_dev->disk = disk; + + return 0; +out_tag_set: + blk_mq_free_tag_set(&rbd_dev->tag_set); +out_disk: + put_disk(disk); + return err; +} + +/* + sysfs +*/ + +static struct rbd_device *dev_to_rbd_dev(struct device *dev) +{ + return container_of(dev, struct rbd_device, dev); +} + +static ssize_t rbd_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)rbd_dev->mapping.size); +} + +/* + * Note this shows the features for whatever's mapped, which is not + * necessarily the base image. + */ +static ssize_t rbd_features_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "0x%016llx\n", + (unsigned long long)rbd_dev->mapping.features); +} + +static ssize_t rbd_major_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + if (rbd_dev->major) + return sprintf(buf, "%d\n", rbd_dev->major); + + return sprintf(buf, "(none)\n"); +} + +static ssize_t rbd_minor_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%d\n", rbd_dev->minor); +} + +static ssize_t rbd_client_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "client%lld\n", + ceph_client_id(rbd_dev->rbd_client->client)); +} + +static ssize_t rbd_pool_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%s\n", rbd_dev->spec->pool_name); +} + +static ssize_t rbd_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long) rbd_dev->spec->pool_id); +} + +static ssize_t rbd_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + if (rbd_dev->spec->image_name) + return sprintf(buf, "%s\n", rbd_dev->spec->image_name); + + return sprintf(buf, "(unknown)\n"); +} + +static ssize_t rbd_image_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%s\n", rbd_dev->spec->image_id); +} + +/* + * Shows the name of the currently-mapped snapshot (or + * RBD_SNAP_HEAD_NAME for the base image). + */ +static ssize_t rbd_snap_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%s\n", rbd_dev->spec->snap_name); +} + +/* + * For a v2 image, shows the chain of parent images, separated by empty + * lines. For v1 images or if there is no parent, shows "(no parent + * image)". + */ +static ssize_t rbd_parent_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + ssize_t count = 0; + + if (!rbd_dev->parent) + return sprintf(buf, "(no parent image)\n"); + + for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) { + struct rbd_spec *spec = rbd_dev->parent_spec; + + count += sprintf(&buf[count], "%s" + "pool_id %llu\npool_name %s\n" + "image_id %s\nimage_name %s\n" + "snap_id %llu\nsnap_name %s\n" + "overlap %llu\n", + !count ? "" : "\n", /* first? */ + spec->pool_id, spec->pool_name, + spec->image_id, spec->image_name ?: "(unknown)", + spec->snap_id, spec->snap_name, + rbd_dev->parent_overlap); + } + + return count; +} + +static ssize_t rbd_image_refresh(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t size) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + int ret; + + ret = rbd_dev_refresh(rbd_dev); + if (ret) + return ret; + + return size; +} + +static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); +static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL); +static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); +static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL); +static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); +static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); +static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); +static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); +static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL); +static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); +static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); +static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL); + +static struct attribute *rbd_attrs[] = { + &dev_attr_size.attr, + &dev_attr_features.attr, + &dev_attr_major.attr, + &dev_attr_minor.attr, + &dev_attr_client_id.attr, + &dev_attr_pool.attr, + &dev_attr_pool_id.attr, + &dev_attr_name.attr, + &dev_attr_image_id.attr, + &dev_attr_current_snap.attr, + &dev_attr_parent.attr, + &dev_attr_refresh.attr, + NULL +}; + +static struct attribute_group rbd_attr_group = { + .attrs = rbd_attrs, +}; + +static const struct attribute_group *rbd_attr_groups[] = { + &rbd_attr_group, + NULL +}; + +static void rbd_sysfs_dev_release(struct device *dev) +{ +} + +static struct device_type rbd_device_type = { + .name = "rbd", + .groups = rbd_attr_groups, + .release = rbd_sysfs_dev_release, +}; + +static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) +{ + kref_get(&spec->kref); + + return spec; +} + +static void rbd_spec_free(struct kref *kref); +static void rbd_spec_put(struct rbd_spec *spec) +{ + if (spec) + kref_put(&spec->kref, rbd_spec_free); +} + +static struct rbd_spec *rbd_spec_alloc(void) +{ + struct rbd_spec *spec; + + spec = kzalloc(sizeof (*spec), GFP_KERNEL); + if (!spec) + return NULL; + + spec->pool_id = CEPH_NOPOOL; + spec->snap_id = CEPH_NOSNAP; + kref_init(&spec->kref); + + return spec; +} + +static void rbd_spec_free(struct kref *kref) +{ + struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); + + kfree(spec->pool_name); + kfree(spec->image_id); + kfree(spec->image_name); + kfree(spec->snap_name); + kfree(spec); +} + +static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, + struct rbd_spec *spec) +{ + struct rbd_device *rbd_dev; + + rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + return NULL; + + spin_lock_init(&rbd_dev->lock); + rbd_dev->flags = 0; + atomic_set(&rbd_dev->parent_ref, 0); + INIT_LIST_HEAD(&rbd_dev->node); + init_rwsem(&rbd_dev->header_rwsem); + + rbd_dev->spec = spec; + rbd_dev->rbd_client = rbdc; + + /* Initialize the layout used for all rbd requests */ + + rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); + rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + + return rbd_dev; +} + +static void rbd_dev_destroy(struct rbd_device *rbd_dev) +{ + rbd_put_client(rbd_dev->rbd_client); + rbd_spec_put(rbd_dev->spec); + kfree(rbd_dev); +} + +/* + * Get the size and object order for an image snapshot, or if + * snap_id is CEPH_NOSNAP, gets this information for the base + * image. + */ +static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id, + u8 *order, u64 *snap_size) +{ + __le64 snapid = cpu_to_le64(snap_id); + int ret; + struct { + u8 order; + __le64 size; + } __attribute__ ((packed)) size_buf = { 0 }; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_size", + &snapid, sizeof (snapid), + &size_buf, sizeof (size_buf)); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + return ret; + if (ret < sizeof (size_buf)) + return -ERANGE; + + if (order) { + *order = size_buf.order; + dout(" order %u", (unsigned int)*order); + } + *snap_size = le64_to_cpu(size_buf.size); + + dout(" snap_id 0x%016llx snap_size = %llu\n", + (unsigned long long)snap_id, + (unsigned long long)*snap_size); + + return 0; +} + +static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) +{ + return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP, + &rbd_dev->header.obj_order, + &rbd_dev->header.image_size); +} + +static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) +{ + void *reply_buf; + int ret; + void *p; + + reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); + if (!reply_buf) + return -ENOMEM; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_object_prefix", NULL, 0, + reply_buf, RBD_OBJ_PREFIX_LEN_MAX); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + goto out; + + p = reply_buf; + rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p, + p + ret, NULL, GFP_NOIO); + ret = 0; + + if (IS_ERR(rbd_dev->header.object_prefix)) { + ret = PTR_ERR(rbd_dev->header.object_prefix); + rbd_dev->header.object_prefix = NULL; + } else { + dout(" object_prefix = %s\n", rbd_dev->header.object_prefix); + } +out: + kfree(reply_buf); + + return ret; +} + +static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id, + u64 *snap_features) +{ + __le64 snapid = cpu_to_le64(snap_id); + struct { + __le64 features; + __le64 incompat; + } __attribute__ ((packed)) features_buf = { 0 }; + u64 incompat; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_features", + &snapid, sizeof (snapid), + &features_buf, sizeof (features_buf)); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + return ret; + if (ret < sizeof (features_buf)) + return -ERANGE; + + incompat = le64_to_cpu(features_buf.incompat); + if (incompat & ~RBD_FEATURES_SUPPORTED) + return -ENXIO; + + *snap_features = le64_to_cpu(features_buf.features); + + dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n", + (unsigned long long)snap_id, + (unsigned long long)*snap_features, + (unsigned long long)le64_to_cpu(features_buf.incompat)); + + return 0; +} + +static int rbd_dev_v2_features(struct rbd_device *rbd_dev) +{ + return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP, + &rbd_dev->header.features); +} + +static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) +{ + struct rbd_spec *parent_spec; + size_t size; + void *reply_buf = NULL; + __le64 snapid; + void *p; + void *end; + u64 pool_id; + char *image_id; + u64 snap_id; + u64 overlap; + int ret; + + parent_spec = rbd_spec_alloc(); + if (!parent_spec) + return -ENOMEM; + + size = sizeof (__le64) + /* pool_id */ + sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */ + sizeof (__le64) + /* snap_id */ + sizeof (__le64); /* overlap */ + reply_buf = kmalloc(size, GFP_KERNEL); + if (!reply_buf) { + ret = -ENOMEM; + goto out_err; + } + + snapid = cpu_to_le64(rbd_dev->spec->snap_id); + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_parent", + &snapid, sizeof (snapid), + reply_buf, size); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + goto out_err; + + p = reply_buf; + end = reply_buf + ret; + ret = -ERANGE; + ceph_decode_64_safe(&p, end, pool_id, out_err); + if (pool_id == CEPH_NOPOOL) { + /* + * Either the parent never existed, or we have + * record of it but the image got flattened so it no + * longer has a parent. When the parent of a + * layered image disappears we immediately set the + * overlap to 0. The effect of this is that all new + * requests will be treated as if the image had no + * parent. + */ + if (rbd_dev->parent_overlap) { + rbd_dev->parent_overlap = 0; + rbd_dev_parent_put(rbd_dev); + pr_info("%s: clone image has been flattened\n", + rbd_dev->disk->disk_name); + } + + goto out; /* No parent? No problem. */ + } + + /* The ceph file layout needs to fit pool id in 32 bits */ + + ret = -EIO; + if (pool_id > (u64)U32_MAX) { + rbd_warn(NULL, "parent pool id too large (%llu > %u)", + (unsigned long long)pool_id, U32_MAX); + goto out_err; + } + + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); + if (IS_ERR(image_id)) { + ret = PTR_ERR(image_id); + goto out_err; + } + ceph_decode_64_safe(&p, end, snap_id, out_err); + ceph_decode_64_safe(&p, end, overlap, out_err); + + /* + * The parent won't change (except when the clone is + * flattened, already handled that). So we only need to + * record the parent spec we have not already done so. + */ + if (!rbd_dev->parent_spec) { + parent_spec->pool_id = pool_id; + parent_spec->image_id = image_id; + parent_spec->snap_id = snap_id; + rbd_dev->parent_spec = parent_spec; + parent_spec = NULL; /* rbd_dev now owns this */ + } else { + kfree(image_id); + } + + /* + * We always update the parent overlap. If it's zero we issue + * a warning, as we will proceed as if there was no parent. + */ + if (!overlap) { + if (parent_spec) { + /* refresh, careful to warn just once */ + if (rbd_dev->parent_overlap) + rbd_warn(rbd_dev, + "clone now standalone (overlap became 0)"); + } else { + /* initial probe */ + rbd_warn(rbd_dev, "clone is standalone (overlap 0)"); + } + } + rbd_dev->parent_overlap = overlap; + +out: + ret = 0; +out_err: + kfree(reply_buf); + rbd_spec_put(parent_spec); + + return ret; +} + +static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev) +{ + struct { + __le64 stripe_unit; + __le64 stripe_count; + } __attribute__ ((packed)) striping_info_buf = { 0 }; + size_t size = sizeof (striping_info_buf); + void *p; + u64 obj_size; + u64 stripe_unit; + u64 stripe_count; + int ret; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_stripe_unit_count", NULL, 0, + (char *)&striping_info_buf, size); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + return ret; + if (ret < size) + return -ERANGE; + + /* + * We don't actually support the "fancy striping" feature + * (STRIPINGV2) yet, but if the striping sizes are the + * defaults the behavior is the same as before. So find + * out, and only fail if the image has non-default values. + */ + ret = -EINVAL; + obj_size = (u64)1 << rbd_dev->header.obj_order; + p = &striping_info_buf; + stripe_unit = ceph_decode_64(&p); + if (stripe_unit != obj_size) { + rbd_warn(rbd_dev, "unsupported stripe unit " + "(got %llu want %llu)", + stripe_unit, obj_size); + return -EINVAL; + } + stripe_count = ceph_decode_64(&p); + if (stripe_count != 1) { + rbd_warn(rbd_dev, "unsupported stripe count " + "(got %llu want 1)", stripe_count); + return -EINVAL; + } + rbd_dev->header.stripe_unit = stripe_unit; + rbd_dev->header.stripe_count = stripe_count; + + return 0; +} + +static char *rbd_dev_image_name(struct rbd_device *rbd_dev) +{ + size_t image_id_size; + char *image_id; + void *p; + void *end; + size_t size; + void *reply_buf = NULL; + size_t len = 0; + char *image_name = NULL; + int ret; + + rbd_assert(!rbd_dev->spec->image_name); + + len = strlen(rbd_dev->spec->image_id); + image_id_size = sizeof (__le32) + len; + image_id = kmalloc(image_id_size, GFP_KERNEL); + if (!image_id) + return NULL; + + p = image_id; + end = image_id + image_id_size; + ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len); + + size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX; + reply_buf = kmalloc(size, GFP_KERNEL); + if (!reply_buf) + goto out; + + ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY, + "rbd", "dir_get_name", + image_id, image_id_size, + reply_buf, size); + if (ret < 0) + goto out; + p = reply_buf; + end = reply_buf + ret; + + image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL); + if (IS_ERR(image_name)) + image_name = NULL; + else + dout("%s: name is %s len is %zd\n", __func__, image_name, len); +out: + kfree(reply_buf); + kfree(image_id); + + return image_name; +} + +static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + const char *snap_name; + u32 which = 0; + + /* Skip over names until we find the one we are looking for */ + + snap_name = rbd_dev->header.snap_names; + while (which < snapc->num_snaps) { + if (!strcmp(name, snap_name)) + return snapc->snaps[which]; + snap_name += strlen(snap_name) + 1; + which++; + } + return CEPH_NOSNAP; +} + +static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + struct ceph_snap_context *snapc = rbd_dev->header.snapc; + u32 which; + bool found = false; + u64 snap_id; + + for (which = 0; !found && which < snapc->num_snaps; which++) { + const char *snap_name; + + snap_id = snapc->snaps[which]; + snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id); + if (IS_ERR(snap_name)) { + /* ignore no-longer existing snapshots */ + if (PTR_ERR(snap_name) == -ENOENT) + continue; + else + break; + } + found = !strcmp(name, snap_name); + kfree(snap_name); + } + return found ? snap_id : CEPH_NOSNAP; +} + +/* + * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if + * no snapshot by that name is found, or if an error occurs. + */ +static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name) +{ + if (rbd_dev->image_format == 1) + return rbd_v1_snap_id_by_name(rbd_dev, name); + + return rbd_v2_snap_id_by_name(rbd_dev, name); +} + +/* + * An image being mapped will have everything but the snap id. + */ +static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev) +{ + struct rbd_spec *spec = rbd_dev->spec; + + rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name); + rbd_assert(spec->image_id && spec->image_name); + rbd_assert(spec->snap_name); + + if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) { + u64 snap_id; + + snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name); + if (snap_id == CEPH_NOSNAP) + return -ENOENT; + + spec->snap_id = snap_id; + } else { + spec->snap_id = CEPH_NOSNAP; + } + + return 0; +} + +/* + * A parent image will have all ids but none of the names. + * + * All names in an rbd spec are dynamically allocated. It's OK if we + * can't figure out the name for an image id. + */ +static int rbd_spec_fill_names(struct rbd_device *rbd_dev) +{ + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct rbd_spec *spec = rbd_dev->spec; + const char *pool_name; + const char *image_name; + const char *snap_name; + int ret; + + rbd_assert(spec->pool_id != CEPH_NOPOOL); + rbd_assert(spec->image_id); + rbd_assert(spec->snap_id != CEPH_NOSNAP); + + /* Get the pool name; we have to make our own copy of this */ + + pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id); + if (!pool_name) { + rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id); + return -EIO; + } + pool_name = kstrdup(pool_name, GFP_KERNEL); + if (!pool_name) + return -ENOMEM; + + /* Fetch the image name; tolerate failure here */ + + image_name = rbd_dev_image_name(rbd_dev); + if (!image_name) + rbd_warn(rbd_dev, "unable to get image name"); + + /* Fetch the snapshot name */ + + snap_name = rbd_snap_name(rbd_dev, spec->snap_id); + if (IS_ERR(snap_name)) { + ret = PTR_ERR(snap_name); + goto out_err; + } + + spec->pool_name = pool_name; + spec->image_name = image_name; + spec->snap_name = snap_name; + + return 0; + +out_err: + kfree(image_name); + kfree(pool_name); + return ret; +} + +static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev) +{ + size_t size; + int ret; + void *reply_buf; + void *p; + void *end; + u64 seq; + u32 snap_count; + struct ceph_snap_context *snapc; + u32 i; + + /* + * We'll need room for the seq value (maximum snapshot id), + * snapshot count, and array of that many snapshot ids. + * For now we have a fixed upper limit on the number we're + * prepared to receive. + */ + size = sizeof (__le64) + sizeof (__le32) + + RBD_MAX_SNAP_COUNT * sizeof (__le64); + reply_buf = kzalloc(size, GFP_KERNEL); + if (!reply_buf) + return -ENOMEM; + + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_snapcontext", NULL, 0, + reply_buf, size); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) + goto out; + + p = reply_buf; + end = reply_buf + ret; + ret = -ERANGE; + ceph_decode_64_safe(&p, end, seq, out); + ceph_decode_32_safe(&p, end, snap_count, out); + + /* + * Make sure the reported number of snapshot ids wouldn't go + * beyond the end of our buffer. But before checking that, + * make sure the computed size of the snapshot context we + * allocate is representable in a size_t. + */ + if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context)) + / sizeof (u64)) { + ret = -EINVAL; + goto out; + } + if (!ceph_has_room(&p, end, snap_count * sizeof (__le64))) + goto out; + ret = 0; + + snapc = ceph_create_snap_context(snap_count, GFP_KERNEL); + if (!snapc) { + ret = -ENOMEM; + goto out; + } + snapc->seq = seq; + for (i = 0; i < snap_count; i++) + snapc->snaps[i] = ceph_decode_64(&p); + + ceph_put_snap_context(rbd_dev->header.snapc); + rbd_dev->header.snapc = snapc; + + dout(" snap context seq = %llu, snap_count = %u\n", + (unsigned long long)seq, (unsigned int)snap_count); +out: + kfree(reply_buf); + + return ret; +} + +static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, + u64 snap_id) +{ + size_t size; + void *reply_buf; + __le64 snapid; + int ret; + void *p; + void *end; + char *snap_name; + + size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN; + reply_buf = kmalloc(size, GFP_KERNEL); + if (!reply_buf) + return ERR_PTR(-ENOMEM); + + snapid = cpu_to_le64(snap_id); + ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name, + "rbd", "get_snapshot_name", + &snapid, sizeof (snapid), + reply_buf, size); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret < 0) { + snap_name = ERR_PTR(ret); + goto out; + } + + p = reply_buf; + end = reply_buf + ret; + snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); + if (IS_ERR(snap_name)) + goto out; + + dout(" snap_id 0x%016llx snap_name = %s\n", + (unsigned long long)snap_id, snap_name); +out: + kfree(reply_buf); + + return snap_name; +} + +static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) +{ + bool first_time = rbd_dev->header.object_prefix == NULL; + int ret; + + ret = rbd_dev_v2_image_size(rbd_dev); + if (ret) + return ret; + + if (first_time) { + ret = rbd_dev_v2_header_onetime(rbd_dev); + if (ret) + return ret; + } + + ret = rbd_dev_v2_snap_context(rbd_dev); + dout("rbd_dev_v2_snap_context returned %d\n", ret); + + return ret; +} + +static int rbd_dev_header_info(struct rbd_device *rbd_dev) +{ + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + + if (rbd_dev->image_format == 1) + return rbd_dev_v1_header_info(rbd_dev); + + return rbd_dev_v2_header_info(rbd_dev); +} + +static int rbd_bus_add_dev(struct rbd_device *rbd_dev) +{ + struct device *dev; + int ret; + + dev = &rbd_dev->dev; + dev->bus = &rbd_bus_type; + dev->type = &rbd_device_type; + dev->parent = &rbd_root_dev; + dev->release = rbd_dev_device_release; + dev_set_name(dev, "%d", rbd_dev->dev_id); + ret = device_register(dev); + + return ret; +} + +static void rbd_bus_del_dev(struct rbd_device *rbd_dev) +{ + device_unregister(&rbd_dev->dev); +} + +/* + * Get a unique rbd identifier for the given new rbd_dev, and add + * the rbd_dev to the global list. + */ +static int rbd_dev_id_get(struct rbd_device *rbd_dev) +{ + int new_dev_id; + + new_dev_id = ida_simple_get(&rbd_dev_id_ida, + 0, minor_to_rbd_dev_id(1 << MINORBITS), + GFP_KERNEL); + if (new_dev_id < 0) + return new_dev_id; + + rbd_dev->dev_id = new_dev_id; + + spin_lock(&rbd_dev_list_lock); + list_add_tail(&rbd_dev->node, &rbd_dev_list); + spin_unlock(&rbd_dev_list_lock); + + dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id); + + return 0; +} + +/* + * Remove an rbd_dev from the global list, and record that its + * identifier is no longer in use. + */ +static void rbd_dev_id_put(struct rbd_device *rbd_dev) +{ + spin_lock(&rbd_dev_list_lock); + list_del_init(&rbd_dev->node); + spin_unlock(&rbd_dev_list_lock); + + ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id); + + dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id); +} + +/* + * Skips over white space at *buf, and updates *buf to point to the + * first found non-space character (if any). Returns the length of + * the token (string of non-white space characters) found. Note + * that *buf must be terminated with '\0'. + */ +static inline size_t next_token(const char **buf) +{ + /* + * These are the characters that produce nonzero for + * isspace() in the "C" and "POSIX" locales. + */ + const char *spaces = " \f\n\r\t\v"; + + *buf += strspn(*buf, spaces); /* Find start of token */ + + return strcspn(*buf, spaces); /* Return token length */ +} + +/* + * Finds the next token in *buf, dynamically allocates a buffer big + * enough to hold a copy of it, and copies the token into the new + * buffer. The copy is guaranteed to be terminated with '\0'. Note + * that a duplicate buffer is created even for a zero-length token. + * + * Returns a pointer to the newly-allocated duplicate, or a null + * pointer if memory for the duplicate was not available. If + * the lenp argument is a non-null pointer, the length of the token + * (not including the '\0') is returned in *lenp. + * + * If successful, the *buf pointer will be updated to point beyond + * the end of the found token. + * + * Note: uses GFP_KERNEL for allocation. + */ +static inline char *dup_token(const char **buf, size_t *lenp) +{ + char *dup; + size_t len; + + len = next_token(buf); + dup = kmemdup(*buf, len + 1, GFP_KERNEL); + if (!dup) + return NULL; + *(dup + len) = '\0'; + *buf += len; + + if (lenp) + *lenp = len; + + return dup; +} + +/* + * Parse the options provided for an "rbd add" (i.e., rbd image + * mapping) request. These arrive via a write to /sys/bus/rbd/add, + * and the data written is passed here via a NUL-terminated buffer. + * Returns 0 if successful or an error code otherwise. + * + * The information extracted from these options is recorded in + * the other parameters which return dynamically-allocated + * structures: + * ceph_opts + * The address of a pointer that will refer to a ceph options + * structure. Caller must release the returned pointer using + * ceph_destroy_options() when it is no longer needed. + * rbd_opts + * Address of an rbd options pointer. Fully initialized by + * this function; caller must release with kfree(). + * spec + * Address of an rbd image specification pointer. Fully + * initialized by this function based on parsed options. + * Caller must release with rbd_spec_put(). + * + * The options passed take this form: + * [] + * where: + * + * A comma-separated list of one or more monitor addresses. + * A monitor address is an ip address, optionally followed + * by a port number (separated by a colon). + * I.e.: ip1[:port1][,ip2[:port2]...] + * + * A comma-separated list of ceph and/or rbd options. + * + * The name of the rados pool containing the rbd image. + * + * The name of the image in that pool to map. + * + * An optional snapshot id. If provided, the mapping will + * present data from the image at the time that snapshot was + * created. The image head is used if no snapshot id is + * provided. Snapshot mappings are always read-only. + */ +static int rbd_add_parse_args(const char *buf, + struct ceph_options **ceph_opts, + struct rbd_options **opts, + struct rbd_spec **rbd_spec) +{ + size_t len; + char *options; + const char *mon_addrs; + char *snap_name; + size_t mon_addrs_size; + struct rbd_spec *spec = NULL; + struct rbd_options *rbd_opts = NULL; + struct ceph_options *copts; + int ret; + + /* The first four tokens are required */ + + len = next_token(&buf); + if (!len) { + rbd_warn(NULL, "no monitor address(es) provided"); + return -EINVAL; + } + mon_addrs = buf; + mon_addrs_size = len + 1; + buf += len; + + ret = -EINVAL; + options = dup_token(&buf, NULL); + if (!options) + return -ENOMEM; + if (!*options) { + rbd_warn(NULL, "no options provided"); + goto out_err; + } + + spec = rbd_spec_alloc(); + if (!spec) + goto out_mem; + + spec->pool_name = dup_token(&buf, NULL); + if (!spec->pool_name) + goto out_mem; + if (!*spec->pool_name) { + rbd_warn(NULL, "no pool name provided"); + goto out_err; + } + + spec->image_name = dup_token(&buf, NULL); + if (!spec->image_name) + goto out_mem; + if (!*spec->image_name) { + rbd_warn(NULL, "no image name provided"); + goto out_err; + } + + /* + * Snapshot name is optional; default is to use "-" + * (indicating the head/no snapshot). + */ + len = next_token(&buf); + if (!len) { + buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */ + len = sizeof (RBD_SNAP_HEAD_NAME) - 1; + } else if (len > RBD_MAX_SNAP_NAME_LEN) { + ret = -ENAMETOOLONG; + goto out_err; + } + snap_name = kmemdup(buf, len + 1, GFP_KERNEL); + if (!snap_name) + goto out_mem; + *(snap_name + len) = '\0'; + spec->snap_name = snap_name; + + /* Initialize all rbd options to the defaults */ + + rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); + if (!rbd_opts) + goto out_mem; + + rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; + + copts = ceph_parse_options(options, mon_addrs, + mon_addrs + mon_addrs_size - 1, + parse_rbd_opts_token, rbd_opts); + if (IS_ERR(copts)) { + ret = PTR_ERR(copts); + goto out_err; + } + kfree(options); + + *ceph_opts = copts; + *opts = rbd_opts; + *rbd_spec = spec; + + return 0; +out_mem: + ret = -ENOMEM; +out_err: + kfree(rbd_opts); + rbd_spec_put(spec); + kfree(options); + + return ret; +} + +/* + * Return pool id (>= 0) or a negative error code. + */ +static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) +{ + u64 newest_epoch; + unsigned long timeout = rbdc->client->options->mount_timeout * HZ; + int tries = 0; + int ret; + +again: + ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); + if (ret == -ENOENT && tries++ < 1) { + ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", + &newest_epoch); + if (ret < 0) + return ret; + + if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { + ceph_monc_request_next_osdmap(&rbdc->client->monc); + (void) ceph_monc_wait_osdmap(&rbdc->client->monc, + newest_epoch, timeout); + goto again; + } else { + /* the osdmap we have is new enough */ + return -ENOENT; + } + } + + return ret; +} + +/* + * An rbd format 2 image has a unique identifier, distinct from the + * name given to it by the user. Internally, that identifier is + * what's used to specify the names of objects related to the image. + * + * A special "rbd id" object is used to map an rbd image name to its + * id. If that object doesn't exist, then there is no v2 rbd image + * with the supplied name. + * + * This function will record the given rbd_dev's image_id field if + * it can be determined, and in that case will return 0. If any + * errors occur a negative errno will be returned and the rbd_dev's + * image_id field will be unchanged (and should be NULL). + */ +static int rbd_dev_image_id(struct rbd_device *rbd_dev) +{ + int ret; + size_t size; + char *object_name; + void *response; + char *image_id; + + /* + * When probing a parent image, the image id is already + * known (and the image name likely is not). There's no + * need to fetch the image id again in this case. We + * do still need to set the image format though. + */ + if (rbd_dev->spec->image_id) { + rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1; + + return 0; + } + + /* + * First, see if the format 2 image id file exists, and if + * so, get the image's persistent id from it. + */ + size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name); + object_name = kmalloc(size, GFP_NOIO); + if (!object_name) + return -ENOMEM; + sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name); + dout("rbd id object name is %s\n", object_name); + + /* Response will be an encoded string, which includes a length */ + + size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; + response = kzalloc(size, GFP_NOIO); + if (!response) { + ret = -ENOMEM; + goto out; + } + + /* If it doesn't exist we'll assume it's a format 1 image */ + + ret = rbd_obj_method_sync(rbd_dev, object_name, + "rbd", "get_id", NULL, 0, + response, RBD_IMAGE_ID_LEN_MAX); + dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); + if (ret == -ENOENT) { + image_id = kstrdup("", GFP_KERNEL); + ret = image_id ? 0 : -ENOMEM; + if (!ret) + rbd_dev->image_format = 1; + } else if (ret >= 0) { + void *p = response; + + image_id = ceph_extract_encoded_string(&p, p + ret, + NULL, GFP_NOIO); + ret = PTR_ERR_OR_ZERO(image_id); + if (!ret) + rbd_dev->image_format = 2; + } + + if (!ret) { + rbd_dev->spec->image_id = image_id; + dout("image_id is %s\n", image_id); + } +out: + kfree(response); + kfree(object_name); + + return ret; +} + +/* + * Undo whatever state changes are made by v1 or v2 header info + * call. + */ +static void rbd_dev_unprobe(struct rbd_device *rbd_dev) +{ + struct rbd_image_header *header; + + rbd_dev_parent_put(rbd_dev); + + /* Free dynamic fields from the header, then zero it out */ + + header = &rbd_dev->header; + ceph_put_snap_context(header->snapc); + kfree(header->snap_sizes); + kfree(header->snap_names); + kfree(header->object_prefix); + memset(header, 0, sizeof (*header)); +} + +static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev) +{ + int ret; + + ret = rbd_dev_v2_object_prefix(rbd_dev); + if (ret) + goto out_err; + + /* + * Get the and check features for the image. Currently the + * features are assumed to never change. + */ + ret = rbd_dev_v2_features(rbd_dev); + if (ret) + goto out_err; + + /* If the image supports fancy striping, get its parameters */ + + if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) { + ret = rbd_dev_v2_striping_info(rbd_dev); + if (ret < 0) + goto out_err; + } + /* No support for crypto and compression type format 2 images */ + + return 0; +out_err: + rbd_dev->header.features = 0; + kfree(rbd_dev->header.object_prefix); + rbd_dev->header.object_prefix = NULL; + + return ret; +} + +static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) +{ + struct rbd_device *parent = NULL; + struct rbd_spec *parent_spec; + struct rbd_client *rbdc; + int ret; + + if (!rbd_dev->parent_spec) + return 0; + /* + * We need to pass a reference to the client and the parent + * spec when creating the parent rbd_dev. Images related by + * parent/child relationships always share both. + */ + parent_spec = rbd_spec_get(rbd_dev->parent_spec); + rbdc = __rbd_get_client(rbd_dev->rbd_client); + + ret = -ENOMEM; + parent = rbd_dev_create(rbdc, parent_spec); + if (!parent) + goto out_err; + + ret = rbd_dev_image_probe(parent, false); + if (ret < 0) + goto out_err; + rbd_dev->parent = parent; + atomic_set(&rbd_dev->parent_ref, 1); + + return 0; +out_err: + if (parent) { + rbd_dev_unparent(rbd_dev); + kfree(rbd_dev->header_name); + rbd_dev_destroy(parent); + } else { + rbd_put_client(rbdc); + rbd_spec_put(parent_spec); + } + + return ret; +} + +static int rbd_dev_device_setup(struct rbd_device *rbd_dev) +{ + int ret; + + /* Get an id and fill in device name. */ + + ret = rbd_dev_id_get(rbd_dev); + if (ret) + return ret; + + BUILD_BUG_ON(DEV_NAME_LEN + < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); + + /* Record our major and minor device numbers. */ + + if (!single_major) { + ret = register_blkdev(0, rbd_dev->name); + if (ret < 0) + goto err_out_id; + + rbd_dev->major = ret; + rbd_dev->minor = 0; + } else { + rbd_dev->major = rbd_major; + rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id); + } + + /* Set up the blkdev mapping. */ + + ret = rbd_init_disk(rbd_dev); + if (ret) + goto err_out_blkdev; + + ret = rbd_dev_mapping_set(rbd_dev); + if (ret) + goto err_out_disk; + + set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); + set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); + + ret = rbd_bus_add_dev(rbd_dev); + if (ret) + goto err_out_mapping; + + /* Everything's ready. Announce the disk to the world. */ + + set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + add_disk(rbd_dev->disk); + + pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name, + (unsigned long long) rbd_dev->mapping.size); + + return ret; + +err_out_mapping: + rbd_dev_mapping_clear(rbd_dev); +err_out_disk: + rbd_free_disk(rbd_dev); +err_out_blkdev: + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); +err_out_id: + rbd_dev_id_put(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); + + return ret; +} + +static int rbd_dev_header_name(struct rbd_device *rbd_dev) +{ + struct rbd_spec *spec = rbd_dev->spec; + size_t size; + + /* Record the header object name for this rbd image. */ + + rbd_assert(rbd_image_format_valid(rbd_dev->image_format)); + + if (rbd_dev->image_format == 1) + size = strlen(spec->image_name) + sizeof (RBD_SUFFIX); + else + size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id); + + rbd_dev->header_name = kmalloc(size, GFP_KERNEL); + if (!rbd_dev->header_name) + return -ENOMEM; + + if (rbd_dev->image_format == 1) + sprintf(rbd_dev->header_name, "%s%s", + spec->image_name, RBD_SUFFIX); + else + sprintf(rbd_dev->header_name, "%s%s", + RBD_HEADER_PREFIX, spec->image_id); + return 0; +} + +static void rbd_dev_image_release(struct rbd_device *rbd_dev) +{ + rbd_dev_unprobe(rbd_dev); + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; + rbd_dev->image_format = 0; + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; + + rbd_dev_destroy(rbd_dev); +} + +/* + * Probe for the existence of the header object for the given rbd + * device. If this image is the one being mapped (i.e., not a + * parent), initiate a watch on its header object before using that + * object to get detailed information about the rbd image. + */ +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) +{ + int ret; + + /* + * Get the id from the image id object. Unless there's an + * error, rbd_dev->spec->image_id will be filled in with + * a dynamically-allocated string, and rbd_dev->image_format + * will be set to either 1 or 2. + */ + ret = rbd_dev_image_id(rbd_dev); + if (ret) + return ret; + + ret = rbd_dev_header_name(rbd_dev); + if (ret) + goto err_out_format; + + if (mapping) { + ret = rbd_dev_header_watch_sync(rbd_dev); + if (ret) { + if (ret == -ENOENT) + pr_info("image %s/%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->image_name); + goto out_header_name; + } + } + + ret = rbd_dev_header_info(rbd_dev); + if (ret) + goto err_out_watch; + + /* + * If this image is the one being mapped, we have pool name and + * id, image name and id, and snap name - need to fill snap id. + * Otherwise this is a parent image, identified by pool, image + * and snap ids - need to fill in names for those ids. + */ + if (mapping) + ret = rbd_spec_fill_snap_id(rbd_dev); + else + ret = rbd_spec_fill_names(rbd_dev); + if (ret) { + if (ret == -ENOENT) + pr_info("snap %s/%s@%s does not exist\n", + rbd_dev->spec->pool_name, + rbd_dev->spec->image_name, + rbd_dev->spec->snap_name); + goto err_out_probe; + } + + if (rbd_dev->header.features & RBD_FEATURE_LAYERING) { + ret = rbd_dev_v2_parent_info(rbd_dev); + if (ret) + goto err_out_probe; + + /* + * Need to warn users if this image is the one being + * mapped and has a parent. + */ + if (mapping && rbd_dev->parent_spec) + rbd_warn(rbd_dev, + "WARNING: kernel layering is EXPERIMENTAL!"); + } + + ret = rbd_dev_probe_parent(rbd_dev); + if (ret) + goto err_out_probe; + + dout("discovered format %u image, header name is %s\n", + rbd_dev->image_format, rbd_dev->header_name); + return 0; + +err_out_probe: + rbd_dev_unprobe(rbd_dev); +err_out_watch: + if (mapping) + rbd_dev_header_unwatch_sync(rbd_dev); +out_header_name: + kfree(rbd_dev->header_name); + rbd_dev->header_name = NULL; +err_out_format: + rbd_dev->image_format = 0; + kfree(rbd_dev->spec->image_id); + rbd_dev->spec->image_id = NULL; + return ret; +} + +static ssize_t do_rbd_add(struct bus_type *bus, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + struct ceph_options *ceph_opts = NULL; + struct rbd_options *rbd_opts = NULL; + struct rbd_spec *spec = NULL; + struct rbd_client *rbdc; + bool read_only; + int rc = -ENOMEM; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + /* parse add command */ + rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); + if (rc < 0) + goto err_out_module; + read_only = rbd_opts->read_only; + kfree(rbd_opts); + rbd_opts = NULL; /* done with this */ + + rbdc = rbd_get_client(ceph_opts); + if (IS_ERR(rbdc)) { + rc = PTR_ERR(rbdc); + goto err_out_args; + } + + /* pick the pool */ + rc = rbd_add_get_pool_id(rbdc, spec->pool_name); + if (rc < 0) { + if (rc == -ENOENT) + pr_info("pool %s does not exist\n", spec->pool_name); + goto err_out_client; + } + spec->pool_id = (u64)rc; + + /* The ceph file layout needs to fit pool id in 32 bits */ + + if (spec->pool_id > (u64)U32_MAX) { + rbd_warn(NULL, "pool id too large (%llu > %u)", + (unsigned long long)spec->pool_id, U32_MAX); + rc = -EIO; + goto err_out_client; + } + + rbd_dev = rbd_dev_create(rbdc, spec); + if (!rbd_dev) + goto err_out_client; + rbdc = NULL; /* rbd_dev now owns this */ + spec = NULL; /* rbd_dev now owns this */ + + rc = rbd_dev_image_probe(rbd_dev, true); + if (rc < 0) + goto err_out_rbd_dev; + + /* If we are mapping a snapshot it must be marked read-only */ + + if (rbd_dev->spec->snap_id != CEPH_NOSNAP) + read_only = true; + rbd_dev->mapping.read_only = read_only; + + rc = rbd_dev_device_setup(rbd_dev); + if (rc) { + /* + * rbd_dev_header_unwatch_sync() can't be moved into + * rbd_dev_image_release() without refactoring, see + * commit 1f3ef78861ac. + */ + rbd_dev_header_unwatch_sync(rbd_dev); + rbd_dev_image_release(rbd_dev); + goto err_out_module; + } + + return count; + +err_out_rbd_dev: + rbd_dev_destroy(rbd_dev); +err_out_client: + rbd_put_client(rbdc); +err_out_args: + rbd_spec_put(spec); +err_out_module: + module_put(THIS_MODULE); + + dout("Error adding device %s\n", buf); + + return (ssize_t)rc; +} + +static ssize_t rbd_add(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_add(bus, buf, count); +} + +static ssize_t rbd_add_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_add(bus, buf, count); +} + +static void rbd_dev_device_release(struct device *dev) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + rbd_free_disk(rbd_dev); + clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + rbd_dev_mapping_clear(rbd_dev); + if (!single_major) + unregister_blkdev(rbd_dev->major, rbd_dev->name); + rbd_dev_id_put(rbd_dev); + rbd_dev_mapping_clear(rbd_dev); +} + +static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) +{ + while (rbd_dev->parent) { + struct rbd_device *first = rbd_dev; + struct rbd_device *second = first->parent; + struct rbd_device *third; + + /* + * Follow to the parent with no grandparent and + * remove it. + */ + while (second && (third = second->parent)) { + first = second; + second = third; + } + rbd_assert(second); + rbd_dev_image_release(second); + first->parent = NULL; + first->parent_overlap = 0; + + rbd_assert(first->parent_spec); + rbd_spec_put(first->parent_spec); + first->parent_spec = NULL; + } +} + +static ssize_t do_rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) +{ + struct rbd_device *rbd_dev = NULL; + struct list_head *tmp; + int dev_id; + unsigned long ul; + bool already = false; + int ret; + + ret = kstrtoul(buf, 10, &ul); + if (ret) + return ret; + + /* convert to int; abort if we lost anything in the conversion */ + dev_id = (int)ul; + if (dev_id != ul) + return -EINVAL; + + ret = -ENOENT; + spin_lock(&rbd_dev_list_lock); + list_for_each(tmp, &rbd_dev_list) { + rbd_dev = list_entry(tmp, struct rbd_device, node); + if (rbd_dev->dev_id == dev_id) { + ret = 0; + break; + } + } + if (!ret) { + spin_lock_irq(&rbd_dev->lock); + if (rbd_dev->open_count) + ret = -EBUSY; + else + already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, + &rbd_dev->flags); + spin_unlock_irq(&rbd_dev->lock); + } + spin_unlock(&rbd_dev_list_lock); + if (ret < 0 || already) + return ret; + + rbd_dev_header_unwatch_sync(rbd_dev); + /* + * flush remaining watch callbacks - these must be complete + * before the osd_client is shutdown + */ + dout("%s: flushing notifies", __func__); + ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc); + + /* + * Don't free anything from rbd_dev->disk until after all + * notifies are completely processed. Otherwise + * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting + * in a potential use after free of rbd_dev->disk or rbd_dev. + */ + rbd_bus_del_dev(rbd_dev); + rbd_dev_image_release(rbd_dev); + module_put(THIS_MODULE); + + return count; +} + +static ssize_t rbd_remove(struct bus_type *bus, + const char *buf, + size_t count) +{ + if (single_major) + return -EINVAL; + + return do_rbd_remove(bus, buf, count); +} + +static ssize_t rbd_remove_single_major(struct bus_type *bus, + const char *buf, + size_t count) +{ + return do_rbd_remove(bus, buf, count); +} + +/* + * create control files in sysfs + * /sys/bus/rbd/... + */ +static int rbd_sysfs_init(void) +{ + int ret; + + ret = device_register(&rbd_root_dev); + if (ret < 0) + return ret; + + ret = bus_register(&rbd_bus_type); + if (ret < 0) + device_unregister(&rbd_root_dev); + + return ret; +} + +static void rbd_sysfs_cleanup(void) +{ + bus_unregister(&rbd_bus_type); + device_unregister(&rbd_root_dev); +} + +static int rbd_slab_init(void) +{ + rbd_assert(!rbd_img_request_cache); + rbd_img_request_cache = kmem_cache_create("rbd_img_request", + sizeof (struct rbd_img_request), + __alignof__(struct rbd_img_request), + 0, NULL); + if (!rbd_img_request_cache) + return -ENOMEM; + + rbd_assert(!rbd_obj_request_cache); + rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", + sizeof (struct rbd_obj_request), + __alignof__(struct rbd_obj_request), + 0, NULL); + if (!rbd_obj_request_cache) + goto out_err; + + rbd_assert(!rbd_segment_name_cache); + rbd_segment_name_cache = kmem_cache_create("rbd_segment_name", + CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL); + if (rbd_segment_name_cache) + return 0; +out_err: + if (rbd_obj_request_cache) { + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; + } + + kmem_cache_destroy(rbd_img_request_cache); + rbd_img_request_cache = NULL; + + return -ENOMEM; +} + +static void rbd_slab_exit(void) +{ + rbd_assert(rbd_segment_name_cache); + kmem_cache_destroy(rbd_segment_name_cache); + rbd_segment_name_cache = NULL; + + rbd_assert(rbd_obj_request_cache); + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; + + rbd_assert(rbd_img_request_cache); + kmem_cache_destroy(rbd_img_request_cache); + rbd_img_request_cache = NULL; +} + +static int __init rbd_init(void) +{ + int rc; + + if (!libceph_compatible(NULL)) { + rbd_warn(NULL, "libceph incompatibility (quitting)"); + return -EINVAL; + } + + rc = rbd_slab_init(); + if (rc) + return rc; + + /* + * The number of active work items is limited by the number of + * rbd devices * queue depth, so leave @max_active at default. + */ + rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0); + if (!rbd_wq) { + rc = -ENOMEM; + goto err_out_slab; + } + + if (single_major) { + rbd_major = register_blkdev(0, RBD_DRV_NAME); + if (rbd_major < 0) { + rc = rbd_major; + goto err_out_wq; + } + } + + rc = rbd_sysfs_init(); + if (rc) + goto err_out_blkdev; + + if (single_major) + pr_info("loaded (major %d)\n", rbd_major); + else + pr_info("loaded\n"); + + return 0; + +err_out_blkdev: + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); +err_out_wq: + destroy_workqueue(rbd_wq); +err_out_slab: + rbd_slab_exit(); + return rc; +} + +static void __exit rbd_exit(void) +{ + ida_destroy(&rbd_dev_id_ida); + rbd_sysfs_cleanup(); + if (single_major) + unregister_blkdev(rbd_major, RBD_DRV_NAME); + destroy_workqueue(rbd_wq); + rbd_slab_exit(); +} + +module_init(rbd_init); +module_exit(rbd_exit); + +MODULE_AUTHOR("Alex Elder "); +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +/* following authorship retained from original osdblk.c */ +MODULE_AUTHOR("Jeff Garzik "); + +MODULE_DESCRIPTION("RADOS Block Device (RBD) driver"); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/rbd_types.h b/kernel/drivers/block/rbd_types.h new file mode 100644 index 000000000..49d77cbcf --- /dev/null +++ b/kernel/drivers/block/rbd_types.h @@ -0,0 +1,81 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2010 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef CEPH_RBD_TYPES_H +#define CEPH_RBD_TYPES_H + +#include + +/* For format version 2, rbd image 'foo' consists of objects + * rbd_id.foo - id of image + * rbd_header. - image metadata + * rbd_data..0000000000000000 + * rbd_data..0000000000000001 + * ... - data + * Clients do not access header data directly in rbd format 2. + */ + +#define RBD_HEADER_PREFIX "rbd_header." +#define RBD_DATA_PREFIX "rbd_data." +#define RBD_ID_PREFIX "rbd_id." + +/* + * For format version 1, rbd image 'foo' consists of objects + * foo.rbd - image metadata + * rb...00000000 + * rb...00000001 + * ... - data + * There is no notion of a persistent image id in rbd format 1. + */ + +#define RBD_SUFFIX ".rbd" + +#define RBD_DIRECTORY "rbd_directory" +#define RBD_INFO "rbd_info" + +#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */ +#define RBD_MIN_OBJ_ORDER 16 +#define RBD_MAX_OBJ_ORDER 30 + +#define RBD_COMP_NONE 0 +#define RBD_CRYPT_NONE 0 + +#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n" +#define RBD_HEADER_SIGNATURE "RBD" +#define RBD_HEADER_VERSION "001.005" + +struct rbd_image_snap_ondisk { + __le64 id; + __le64 image_size; +} __attribute__((packed)); + +struct rbd_image_header_ondisk { + char text[40]; + char object_prefix[24]; + char signature[4]; + char version[8]; + struct { + __u8 order; + __u8 crypt_type; + __u8 comp_type; + __u8 unused; + } __attribute__((packed)) options; + __le64 image_size; + __le64 snap_seq; + __le32 snap_count; + __le32 reserved; + __le64 snap_names_len; + struct rbd_image_snap_ondisk snaps[0]; +} __attribute__((packed)); + + +#endif diff --git a/kernel/drivers/block/rsxx/Makefile b/kernel/drivers/block/rsxx/Makefile new file mode 100644 index 000000000..b1c53c0aa --- /dev/null +++ b/kernel/drivers/block/rsxx/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o +rsxx-objs := config.o core.o cregs.o dev.o dma.o diff --git a/kernel/drivers/block/rsxx/config.c b/kernel/drivers/block/rsxx/config.c new file mode 100644 index 000000000..10cd530d3 --- /dev/null +++ b/kernel/drivers/block/rsxx/config.c @@ -0,0 +1,211 @@ +/* +* Filename: config.c +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include + +#include "rsxx_priv.h" +#include "rsxx_cfg.h" + +static void initialize_config(struct rsxx_card_cfg *cfg) +{ + cfg->hdr.version = RSXX_CFG_VERSION; + + cfg->data.block_size = RSXX_HW_BLK_SIZE; + cfg->data.stripe_size = RSXX_HW_BLK_SIZE; + cfg->data.vendor_id = RSXX_VENDOR_ID_IBM; + cfg->data.cache_order = (-1); + cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED; + cfg->data.intr_coal.count = 0; + cfg->data.intr_coal.latency = 0; +} + +static u32 config_data_crc32(struct rsxx_card_cfg *cfg) +{ + /* + * Return the compliment of the CRC to ensure compatibility + * (i.e. this is how early rsxx drivers did it.) + */ + + return ~crc32(~0, &cfg->data, sizeof(cfg->data)); +} + + +/*----------------- Config Byte Swap Functions -------------------*/ +static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr) +{ + hdr->version = be32_to_cpu((__force __be32) hdr->version); + hdr->crc = be32_to_cpu((__force __be32) hdr->crc); +} + +static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr) +{ + hdr->version = (__force u32) cpu_to_be32(hdr->version); + hdr->crc = (__force u32) cpu_to_be32(hdr->crc); +} + +static void config_data_swab(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = swab32(data[i]); +} + +static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = le32_to_cpu((__force __le32) data[i]); +} + +static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg) +{ + u32 *data = (u32 *) &cfg->data; + int i; + + for (i = 0; i < (sizeof(cfg->data) / 4); i++) + data[i] = (__force u32) cpu_to_le32(data[i]); +} + + +/*----------------- Config Operations ------------------*/ +static int rsxx_save_config(struct rsxx_cardinfo *card) +{ + struct rsxx_card_cfg cfg; + int st; + + memcpy(&cfg, &card->config, sizeof(cfg)); + + if (unlikely(cfg.hdr.version != RSXX_CFG_VERSION)) { + dev_err(CARD_TO_DEV(card), + "Cannot save config with invalid version %d\n", + cfg.hdr.version); + return -EINVAL; + } + + /* Convert data to little endian for the CRC calculation. */ + config_data_cpu_to_le(&cfg); + + cfg.hdr.crc = config_data_crc32(&cfg); + + /* + * Swap the data from little endian to big endian so it can be + * stored. + */ + config_data_swab(&cfg); + config_hdr_cpu_to_be(&cfg.hdr); + + st = rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(cfg), &cfg, 1); + if (st) + return st; + + return 0; +} + +int rsxx_load_config(struct rsxx_cardinfo *card) +{ + int st; + u32 crc; + + st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config), + &card->config, 1); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed reading card config.\n"); + return st; + } + + config_hdr_be_to_cpu(&card->config.hdr); + + if (card->config.hdr.version == RSXX_CFG_VERSION) { + /* + * We calculate the CRC with the data in little endian, because + * early drivers did not take big endian CPUs into account. + * The data is always stored in big endian, so we need to byte + * swap it before calculating the CRC. + */ + + config_data_swab(&card->config); + + /* Check the CRC */ + crc = config_data_crc32(&card->config); + if (crc != card->config.hdr.crc) { + dev_err(CARD_TO_DEV(card), + "Config corruption detected!\n"); + dev_info(CARD_TO_DEV(card), + "CRC (sb x%08x is x%08x)\n", + card->config.hdr.crc, crc); + return -EIO; + } + + /* Convert the data to CPU byteorder */ + config_data_le_to_cpu(&card->config); + + } else if (card->config.hdr.version != 0) { + dev_err(CARD_TO_DEV(card), + "Invalid config version %d.\n", + card->config.hdr.version); + /* + * Config version changes require special handling from the + * user + */ + return -EINVAL; + } else { + dev_info(CARD_TO_DEV(card), + "Initializing card configuration.\n"); + initialize_config(&card->config); + st = rsxx_save_config(card); + if (st) + return st; + } + + card->config_valid = 1; + + dev_dbg(CARD_TO_DEV(card), "version: x%08x\n", + card->config.hdr.version); + dev_dbg(CARD_TO_DEV(card), "crc: x%08x\n", + card->config.hdr.crc); + dev_dbg(CARD_TO_DEV(card), "block_size: x%08x\n", + card->config.data.block_size); + dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n", + card->config.data.stripe_size); + dev_dbg(CARD_TO_DEV(card), "vendor_id: x%08x\n", + card->config.data.vendor_id); + dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n", + card->config.data.cache_order); + dev_dbg(CARD_TO_DEV(card), "mode: x%08x\n", + card->config.data.intr_coal.mode); + dev_dbg(CARD_TO_DEV(card), "count: x%08x\n", + card->config.data.intr_coal.count); + dev_dbg(CARD_TO_DEV(card), "latency: x%08x\n", + card->config.data.intr_coal.latency); + + return 0; +} + diff --git a/kernel/drivers/block/rsxx/core.c b/kernel/drivers/block/rsxx/core.c new file mode 100644 index 000000000..d8b2488aa --- /dev/null +++ b/kernel/drivers/block/rsxx/core.c @@ -0,0 +1,1144 @@ +/* +* Filename: core.c +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "rsxx_priv.h" +#include "rsxx_cfg.h" + +#define NO_LEGACY 0 +#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */ + +MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver"); +MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRIVER_VERSION); + +static unsigned int force_legacy = NO_LEGACY; +module_param(force_legacy, uint, 0444); +MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts"); + +static unsigned int sync_start = 1; +module_param(sync_start, uint, 0444); +MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete " + "until the card startup has completed."); + +static DEFINE_IDA(rsxx_disk_ida); +static DEFINE_SPINLOCK(rsxx_ida_lock); + +/* --------------------Debugfs Setup ------------------- */ + +static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p) +{ + struct rsxx_cardinfo *card = m->private; + + seq_printf(m, "HWID 0x%08x\n", + ioread32(card->regmap + HWID)); + seq_printf(m, "SCRATCH 0x%08x\n", + ioread32(card->regmap + SCRATCH)); + seq_printf(m, "IER 0x%08x\n", + ioread32(card->regmap + IER)); + seq_printf(m, "IPR 0x%08x\n", + ioread32(card->regmap + IPR)); + seq_printf(m, "CREG_CMD 0x%08x\n", + ioread32(card->regmap + CREG_CMD)); + seq_printf(m, "CREG_ADD 0x%08x\n", + ioread32(card->regmap + CREG_ADD)); + seq_printf(m, "CREG_CNT 0x%08x\n", + ioread32(card->regmap + CREG_CNT)); + seq_printf(m, "CREG_STAT 0x%08x\n", + ioread32(card->regmap + CREG_STAT)); + seq_printf(m, "CREG_DATA0 0x%08x\n", + ioread32(card->regmap + CREG_DATA0)); + seq_printf(m, "CREG_DATA1 0x%08x\n", + ioread32(card->regmap + CREG_DATA1)); + seq_printf(m, "CREG_DATA2 0x%08x\n", + ioread32(card->regmap + CREG_DATA2)); + seq_printf(m, "CREG_DATA3 0x%08x\n", + ioread32(card->regmap + CREG_DATA3)); + seq_printf(m, "CREG_DATA4 0x%08x\n", + ioread32(card->regmap + CREG_DATA4)); + seq_printf(m, "CREG_DATA5 0x%08x\n", + ioread32(card->regmap + CREG_DATA5)); + seq_printf(m, "CREG_DATA6 0x%08x\n", + ioread32(card->regmap + CREG_DATA6)); + seq_printf(m, "CREG_DATA7 0x%08x\n", + ioread32(card->regmap + CREG_DATA7)); + seq_printf(m, "INTR_COAL 0x%08x\n", + ioread32(card->regmap + INTR_COAL)); + seq_printf(m, "HW_ERROR 0x%08x\n", + ioread32(card->regmap + HW_ERROR)); + seq_printf(m, "DEBUG0 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG0)); + seq_printf(m, "DEBUG1 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG1)); + seq_printf(m, "DEBUG2 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG2)); + seq_printf(m, "DEBUG3 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG3)); + seq_printf(m, "DEBUG4 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG4)); + seq_printf(m, "DEBUG5 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG5)); + seq_printf(m, "DEBUG6 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG6)); + seq_printf(m, "DEBUG7 0x%08x\n", + ioread32(card->regmap + PCI_DEBUG7)); + seq_printf(m, "RECONFIG 0x%08x\n", + ioread32(card->regmap + PCI_RECONFIG)); + + return 0; +} + +static int rsxx_attr_stats_show(struct seq_file *m, void *p) +{ + struct rsxx_cardinfo *card = m->private; + int i; + + for (i = 0; i < card->n_targets; i++) { + seq_printf(m, "Ctrl %d CRC Errors = %d\n", + i, card->ctrl[i].stats.crc_errors); + seq_printf(m, "Ctrl %d Hard Errors = %d\n", + i, card->ctrl[i].stats.hard_errors); + seq_printf(m, "Ctrl %d Soft Errors = %d\n", + i, card->ctrl[i].stats.soft_errors); + seq_printf(m, "Ctrl %d Writes Issued = %d\n", + i, card->ctrl[i].stats.writes_issued); + seq_printf(m, "Ctrl %d Writes Failed = %d\n", + i, card->ctrl[i].stats.writes_failed); + seq_printf(m, "Ctrl %d Reads Issued = %d\n", + i, card->ctrl[i].stats.reads_issued); + seq_printf(m, "Ctrl %d Reads Failed = %d\n", + i, card->ctrl[i].stats.reads_failed); + seq_printf(m, "Ctrl %d Reads Retried = %d\n", + i, card->ctrl[i].stats.reads_retried); + seq_printf(m, "Ctrl %d Discards Issued = %d\n", + i, card->ctrl[i].stats.discards_issued); + seq_printf(m, "Ctrl %d Discards Failed = %d\n", + i, card->ctrl[i].stats.discards_failed); + seq_printf(m, "Ctrl %d DMA SW Errors = %d\n", + i, card->ctrl[i].stats.dma_sw_err); + seq_printf(m, "Ctrl %d DMA HW Faults = %d\n", + i, card->ctrl[i].stats.dma_hw_fault); + seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n", + i, card->ctrl[i].stats.dma_cancelled); + seq_printf(m, "Ctrl %d SW Queue Depth = %d\n", + i, card->ctrl[i].stats.sw_q_depth); + seq_printf(m, "Ctrl %d HW Queue Depth = %d\n", + i, atomic_read(&card->ctrl[i].stats.hw_q_depth)); + } + + return 0; +} + +static int rsxx_attr_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rsxx_attr_stats_show, inode->i_private); +} + +static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file) +{ + return single_open(file, rsxx_attr_pci_regs_show, inode->i_private); +} + +static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct rsxx_cardinfo *card = file_inode(fp)->i_private; + char *buf; + ssize_t st; + + buf = kzalloc(cnt, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + st = rsxx_creg_read(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1); + if (!st) + st = copy_to_user(ubuf, buf, cnt); + kfree(buf); + if (st) + return st; + *ppos += cnt; + return cnt; +} + +static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct rsxx_cardinfo *card = file_inode(fp)->i_private; + char *buf; + ssize_t st; + + buf = kzalloc(cnt, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + st = copy_from_user(buf, ubuf, cnt); + if (!st) + st = rsxx_creg_write(card, CREG_ADD_CRAM + (u32)*ppos, cnt, + buf, 1); + kfree(buf); + if (st) + return st; + *ppos += cnt; + return cnt; +} + +static const struct file_operations debugfs_cram_fops = { + .owner = THIS_MODULE, + .read = rsxx_cram_read, + .write = rsxx_cram_write, +}; + +static const struct file_operations debugfs_stats_fops = { + .owner = THIS_MODULE, + .open = rsxx_attr_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations debugfs_pci_regs_fops = { + .owner = THIS_MODULE, + .open = rsxx_attr_pci_regs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card) +{ + struct dentry *debugfs_stats; + struct dentry *debugfs_pci_regs; + struct dentry *debugfs_cram; + + card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL); + if (IS_ERR_OR_NULL(card->debugfs_dir)) + goto failed_debugfs_dir; + + debugfs_stats = debugfs_create_file("stats", S_IRUGO, + card->debugfs_dir, card, + &debugfs_stats_fops); + if (IS_ERR_OR_NULL(debugfs_stats)) + goto failed_debugfs_stats; + + debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO, + card->debugfs_dir, card, + &debugfs_pci_regs_fops); + if (IS_ERR_OR_NULL(debugfs_pci_regs)) + goto failed_debugfs_pci_regs; + + debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR, + card->debugfs_dir, card, + &debugfs_cram_fops); + if (IS_ERR_OR_NULL(debugfs_cram)) + goto failed_debugfs_cram; + + return; +failed_debugfs_cram: + debugfs_remove(debugfs_pci_regs); +failed_debugfs_pci_regs: + debugfs_remove(debugfs_stats); +failed_debugfs_stats: + debugfs_remove(card->debugfs_dir); +failed_debugfs_dir: + card->debugfs_dir = NULL; +} + +/*----------------- Interrupt Control & Handling -------------------*/ + +static void rsxx_mask_interrupts(struct rsxx_cardinfo *card) +{ + card->isr_mask = 0; + card->ier_mask = 0; +} + +static void __enable_intr(unsigned int *mask, unsigned int intr) +{ + *mask |= intr; +} + +static void __disable_intr(unsigned int *mask, unsigned int intr) +{ + *mask &= ~intr; +} + +/* + * NOTE: Disabling the IER will disable the hardware interrupt. + * Disabling the ISR will disable the software handling of the ISR bit. + * + * Enable/Disable interrupt functions assume the card->irq_lock + * is held by the caller. + */ +void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr) +{ + if (unlikely(card->halt) || + unlikely(card->eeh_state)) + return; + + __enable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr) +{ + if (unlikely(card->eeh_state)) + return; + + __disable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr) +{ + if (unlikely(card->halt) || + unlikely(card->eeh_state)) + return; + + __enable_intr(&card->isr_mask, intr); + __enable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} +void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr) +{ + if (unlikely(card->eeh_state)) + return; + + __disable_intr(&card->isr_mask, intr); + __disable_intr(&card->ier_mask, intr); + iowrite32(card->ier_mask, card->regmap + IER); +} + +static irqreturn_t rsxx_isr(int irq, void *pdata) +{ + struct rsxx_cardinfo *card = pdata; + unsigned int isr; + int handled = 0; + int reread_isr; + int i; + + spin_lock(&card->irq_lock); + + do { + reread_isr = 0; + + if (unlikely(card->eeh_state)) + break; + + isr = ioread32(card->regmap + ISR); + if (isr == 0xffffffff) { + /* + * A few systems seem to have an intermittent issue + * where PCI reads return all Fs, but retrying the read + * a little later will return as expected. + */ + dev_info(CARD_TO_DEV(card), + "ISR = 0xFFFFFFFF, retrying later\n"); + break; + } + + isr &= card->isr_mask; + if (!isr) + break; + + for (i = 0; i < card->n_targets; i++) { + if (isr & CR_INTR_DMA(i)) { + if (card->ier_mask & CR_INTR_DMA(i)) { + rsxx_disable_ier(card, CR_INTR_DMA(i)); + reread_isr = 1; + } + queue_work(card->ctrl[i].done_wq, + &card->ctrl[i].dma_done_work); + handled++; + } + } + + if (isr & CR_INTR_CREG) { + queue_work(card->creg_ctrl.creg_wq, + &card->creg_ctrl.done_work); + handled++; + } + + if (isr & CR_INTR_EVENT) { + queue_work(card->event_wq, &card->event_work); + rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); + handled++; + } + } while (reread_isr); + + spin_unlock(&card->irq_lock); + + return handled ? IRQ_HANDLED : IRQ_NONE; +} + +/*----------------- Card Event Handler -------------------*/ +static const char * const rsxx_card_state_to_str(unsigned int state) +{ + static const char * const state_strings[] = { + "Unknown", "Shutdown", "Starting", "Formatting", + "Uninitialized", "Good", "Shutting Down", + "Fault", "Read Only Fault", "dStroying" + }; + + return state_strings[ffs(state)]; +} + +static void card_state_change(struct rsxx_cardinfo *card, + unsigned int new_state) +{ + int st; + + dev_info(CARD_TO_DEV(card), + "card state change detected.(%s -> %s)\n", + rsxx_card_state_to_str(card->state), + rsxx_card_state_to_str(new_state)); + + card->state = new_state; + + /* Don't attach DMA interfaces if the card has an invalid config */ + if (!card->config_valid) + return; + + switch (new_state) { + case CARD_STATE_RD_ONLY_FAULT: + dev_crit(CARD_TO_DEV(card), + "Hardware has entered read-only mode!\n"); + /* + * Fall through so the DMA devices can be attached and + * the user can attempt to pull off their data. + */ + case CARD_STATE_GOOD: + st = rsxx_get_card_size8(card, &card->size8); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed attaching DMA devices\n"); + + if (card->config_valid) + set_capacity(card->gendisk, card->size8 >> 9); + break; + + case CARD_STATE_FAULT: + dev_crit(CARD_TO_DEV(card), + "Hardware Fault reported!\n"); + /* Fall through. */ + + /* Everything else, detach DMA interface if it's attached. */ + case CARD_STATE_SHUTDOWN: + case CARD_STATE_STARTING: + case CARD_STATE_FORMATTING: + case CARD_STATE_UNINITIALIZED: + case CARD_STATE_SHUTTING_DOWN: + /* + * dStroy is a term coined by marketing to represent the low level + * secure erase. + */ + case CARD_STATE_DSTROYING: + set_capacity(card->gendisk, 0); + break; + } +} + +static void card_event_handler(struct work_struct *work) +{ + struct rsxx_cardinfo *card; + unsigned int state; + unsigned long flags; + int st; + + card = container_of(work, struct rsxx_cardinfo, event_work); + + if (unlikely(card->halt)) + return; + + /* + * Enable the interrupt now to avoid any weird race conditions where a + * state change might occur while rsxx_get_card_state() is + * processing a returned creg cmd. + */ + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + st = rsxx_get_card_state(card, &state); + if (st) { + dev_info(CARD_TO_DEV(card), + "Failed reading state after event.\n"); + return; + } + + if (card->state != state) + card_state_change(card, state); + + if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING) + rsxx_read_hw_log(card); +} + +/*----------------- Card Operations -------------------*/ +static int card_shutdown(struct rsxx_cardinfo *card) +{ + unsigned int state; + signed long start; + const int timeout = msecs_to_jiffies(120000); + int st; + + /* We can't issue a shutdown if the card is in a transition state */ + start = jiffies; + do { + st = rsxx_get_card_state(card, &state); + if (st) + return st; + } while (state == CARD_STATE_STARTING && + (jiffies - start < timeout)); + + if (state == CARD_STATE_STARTING) + return -ETIMEDOUT; + + /* Only issue a shutdown if we need to */ + if ((state != CARD_STATE_SHUTTING_DOWN) && + (state != CARD_STATE_SHUTDOWN)) { + st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN); + if (st) + return st; + } + + start = jiffies; + do { + st = rsxx_get_card_state(card, &state); + if (st) + return st; + } while (state != CARD_STATE_SHUTDOWN && + (jiffies - start < timeout)); + + if (state != CARD_STATE_SHUTDOWN) + return -ETIMEDOUT; + + return 0; +} + +static int rsxx_eeh_frozen(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + int i; + int st; + + dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n"); + + card->eeh_state = 1; + rsxx_mask_interrupts(card); + + /* + * We need to guarantee that the write for eeh_state and masking + * interrupts does not become reordered. This will prevent a possible + * race condition with the EEH code. + */ + wmb(); + + pci_disable_device(dev); + + st = rsxx_eeh_save_issued_dmas(card); + if (st) + return st; + + rsxx_eeh_save_issued_creg(card); + + for (i = 0; i < card->n_targets; i++) { + if (card->ctrl[i].status.buf) + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); + if (card->ctrl[i].cmd.buf) + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); + } + + return 0; +} + +static void rsxx_eeh_failure(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + int i; + int cnt = 0; + + dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n"); + + card->eeh_state = 1; + card->halt = 1; + + for (i = 0; i < card->n_targets; i++) { + spin_lock_bh(&card->ctrl[i].queue_lock); + cnt = rsxx_cleanup_dma_queue(&card->ctrl[i], + &card->ctrl[i].queue, + COMPLETE_DMA); + spin_unlock_bh(&card->ctrl[i].queue_lock); + + cnt += rsxx_dma_cancel(&card->ctrl[i]); + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Freed %d queued DMAs on channel %d\n", + cnt, card->ctrl[i].id); + } +} + +static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card) +{ + unsigned int status; + int iter = 0; + + /* We need to wait for the hardware to reset */ + while (iter++ < 10) { + status = ioread32(card->regmap + PCI_RECONFIG); + + if (status & RSXX_FLUSH_BUSY) { + ssleep(1); + continue; + } + + if (status & RSXX_FLUSH_TIMEOUT) + dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n"); + return 0; + } + + /* Hardware failed resetting itself. */ + return -1; +} + +static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev, + enum pci_channel_state error) +{ + int st; + + if (dev->revision < RSXX_EEH_SUPPORT) + return PCI_ERS_RESULT_NONE; + + if (error == pci_channel_io_perm_failure) { + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + } + + st = rsxx_eeh_frozen(dev); + if (st) { + dev_err(&dev->dev, "Slot reset setup failed\n"); + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + } + + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int i; + int st; + + dev_warn(&dev->dev, + "IBM Flash Adapter PCI: recovering from slot reset.\n"); + + st = pci_enable_device(dev); + if (st) + goto failed_hw_setup; + + pci_set_master(dev); + + st = rsxx_eeh_fifo_flush_poll(card); + if (st) + goto failed_hw_setup; + + rsxx_dma_queue_reset(card); + + for (i = 0; i < card->n_targets; i++) { + st = rsxx_hw_buffers_init(dev, &card->ctrl[i]); + if (st) + goto failed_hw_buffers_init; + } + + if (card->config_valid) + rsxx_dma_configure(card); + + /* Clears the ISR register from spurious interrupts */ + st = ioread32(card->regmap + ISR); + + card->eeh_state = 0; + + spin_lock_irqsave(&card->irq_lock, flags); + if (card->n_targets & RSXX_MAX_TARGETS) + rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G); + else + rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C); + spin_unlock_irqrestore(&card->irq_lock, flags); + + rsxx_kick_creg_queue(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock(&card->ctrl[i].queue_lock); + if (list_empty(&card->ctrl[i].queue)) { + spin_unlock(&card->ctrl[i].queue_lock); + continue; + } + spin_unlock(&card->ctrl[i].queue_lock); + + queue_work(card->ctrl[i].issue_wq, + &card->ctrl[i].issue_dma_work); + } + + dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n"); + + return PCI_ERS_RESULT_RECOVERED; + +failed_hw_buffers_init: + for (i = 0; i < card->n_targets; i++) { + if (card->ctrl[i].status.buf) + pci_free_consistent(card->dev, + STATUS_BUFFER_SIZE8, + card->ctrl[i].status.buf, + card->ctrl[i].status.dma_addr); + if (card->ctrl[i].cmd.buf) + pci_free_consistent(card->dev, + COMMAND_BUFFER_SIZE8, + card->ctrl[i].cmd.buf, + card->ctrl[i].cmd.dma_addr); + } +failed_hw_setup: + rsxx_eeh_failure(dev); + return PCI_ERS_RESULT_DISCONNECT; + +} + +/*----------------- Driver Initialization & Setup -------------------*/ +/* Returns: 0 if the driver is compatible with the device + -1 if the driver is NOT compatible with the device */ +static int rsxx_compatibility_check(struct rsxx_cardinfo *card) +{ + unsigned char pci_rev; + + pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); + + if (pci_rev > RS70_PCI_REV_SUPPORTED) + return -1; + return 0; +} + +static int rsxx_pci_probe(struct pci_dev *dev, + const struct pci_device_id *id) +{ + struct rsxx_cardinfo *card; + int st; + unsigned int sync_timeout; + + dev_info(&dev->dev, "PCI-Flash SSD discovered\n"); + + card = kzalloc(sizeof(*card), GFP_KERNEL); + if (!card) + return -ENOMEM; + + card->dev = dev; + pci_set_drvdata(dev, card); + + do { + if (!ida_pre_get(&rsxx_disk_ida, GFP_KERNEL)) { + st = -ENOMEM; + goto failed_ida_get; + } + + spin_lock(&rsxx_ida_lock); + st = ida_get_new(&rsxx_disk_ida, &card->disk_id); + spin_unlock(&rsxx_ida_lock); + } while (st == -EAGAIN); + + if (st) + goto failed_ida_get; + + st = pci_enable_device(dev); + if (st) + goto failed_enable; + + pci_set_master(dev); + pci_set_dma_max_seg_size(dev, RSXX_HW_BLK_SIZE); + + st = pci_set_dma_mask(dev, DMA_BIT_MASK(64)); + if (st) { + dev_err(CARD_TO_DEV(card), + "No usable DMA configuration,aborting\n"); + goto failed_dma_mask; + } + + st = pci_request_regions(dev, DRIVER_NAME); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed to request memory region\n"); + goto failed_request_regions; + } + + if (pci_resource_len(dev, 0) == 0) { + dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n"); + st = -ENOMEM; + goto failed_iomap; + } + + card->regmap = pci_iomap(dev, 0, 0); + if (!card->regmap) { + dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n"); + st = -ENOMEM; + goto failed_iomap; + } + + spin_lock_init(&card->irq_lock); + card->halt = 0; + card->eeh_state = 0; + + spin_lock_irq(&card->irq_lock); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irq(&card->irq_lock); + + if (!force_legacy) { + st = pci_enable_msi(dev); + if (st) + dev_warn(CARD_TO_DEV(card), + "Failed to enable MSI\n"); + } + + st = request_irq(dev->irq, rsxx_isr, IRQF_SHARED, + DRIVER_NAME, card); + if (st) { + dev_err(CARD_TO_DEV(card), + "Failed requesting IRQ%d\n", dev->irq); + goto failed_irq; + } + + /************* Setup Processor Command Interface *************/ + st = rsxx_creg_setup(card); + if (st) { + dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n"); + goto failed_creg_setup; + } + + spin_lock_irq(&card->irq_lock); + rsxx_enable_ier_and_isr(card, CR_INTR_CREG); + spin_unlock_irq(&card->irq_lock); + + st = rsxx_compatibility_check(card); + if (st) { + dev_warn(CARD_TO_DEV(card), + "Incompatible driver detected. Please update the driver.\n"); + st = -EINVAL; + goto failed_compatiblity_check; + } + + /************* Load Card Config *************/ + st = rsxx_load_config(card); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed loading card config\n"); + + /************* Setup DMA Engine *************/ + st = rsxx_get_num_targets(card, &card->n_targets); + if (st) + dev_info(CARD_TO_DEV(card), + "Failed reading the number of DMA targets\n"); + + card->ctrl = kzalloc(card->n_targets * sizeof(*card->ctrl), GFP_KERNEL); + if (!card->ctrl) { + st = -ENOMEM; + goto failed_dma_setup; + } + + st = rsxx_dma_setup(card); + if (st) { + dev_info(CARD_TO_DEV(card), + "Failed to setup DMA engine\n"); + goto failed_dma_setup; + } + + /************* Setup Card Event Handler *************/ + card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event"); + if (!card->event_wq) { + dev_err(CARD_TO_DEV(card), "Failed card event setup.\n"); + goto failed_event_handler; + } + + INIT_WORK(&card->event_work, card_event_handler); + + st = rsxx_setup_dev(card); + if (st) + goto failed_create_dev; + + rsxx_get_card_state(card, &card->state); + + dev_info(CARD_TO_DEV(card), + "card state: %s\n", + rsxx_card_state_to_str(card->state)); + + /* + * Now that the DMA Engine and devices have been setup, + * we can enable the event interrupt(it kicks off actions in + * those layers so we couldn't enable it right away.) + */ + spin_lock_irq(&card->irq_lock); + rsxx_enable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irq(&card->irq_lock); + + if (card->state == CARD_STATE_SHUTDOWN) { + st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP); + if (st) + dev_crit(CARD_TO_DEV(card), + "Failed issuing card startup\n"); + if (sync_start) { + sync_timeout = SYNC_START_TIMEOUT; + + dev_info(CARD_TO_DEV(card), + "Waiting for card to startup\n"); + + do { + ssleep(1); + sync_timeout--; + + rsxx_get_card_state(card, &card->state); + } while (sync_timeout && + (card->state == CARD_STATE_STARTING)); + + if (card->state == CARD_STATE_STARTING) { + dev_warn(CARD_TO_DEV(card), + "Card startup timed out\n"); + card->size8 = 0; + } else { + dev_info(CARD_TO_DEV(card), + "card state: %s\n", + rsxx_card_state_to_str(card->state)); + st = rsxx_get_card_size8(card, &card->size8); + if (st) + card->size8 = 0; + } + } + } else if (card->state == CARD_STATE_GOOD || + card->state == CARD_STATE_RD_ONLY_FAULT) { + st = rsxx_get_card_size8(card, &card->size8); + if (st) + card->size8 = 0; + } + + rsxx_attach_dev(card); + + /************* Setup Debugfs *************/ + rsxx_debugfs_dev_new(card); + + return 0; + +failed_create_dev: + destroy_workqueue(card->event_wq); + card->event_wq = NULL; +failed_event_handler: + rsxx_dma_destroy(card); +failed_dma_setup: +failed_compatiblity_check: + destroy_workqueue(card->creg_ctrl.creg_wq); + card->creg_ctrl.creg_wq = NULL; +failed_creg_setup: + spin_lock_irq(&card->irq_lock); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irq(&card->irq_lock); + free_irq(dev->irq, card); + if (!force_legacy) + pci_disable_msi(dev); +failed_irq: + pci_iounmap(dev, card->regmap); +failed_iomap: + pci_release_regions(dev); +failed_request_regions: +failed_dma_mask: + pci_disable_device(dev); +failed_enable: + spin_lock(&rsxx_ida_lock); + ida_remove(&rsxx_disk_ida, card->disk_id); + spin_unlock(&rsxx_ida_lock); +failed_ida_get: + kfree(card); + + return st; +} + +static void rsxx_pci_remove(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int st; + int i; + + if (!card) + return; + + dev_info(CARD_TO_DEV(card), + "Removing PCI-Flash SSD.\n"); + + rsxx_detach_dev(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + st = card_shutdown(card); + if (st) + dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n"); + + /* Sync outstanding event handlers. */ + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + cancel_work_sync(&card->event_work); + + rsxx_destroy_dev(card); + rsxx_dma_destroy(card); + + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_ALL); + spin_unlock_irqrestore(&card->irq_lock, flags); + + /* Prevent work_structs from re-queuing themselves. */ + card->halt = 1; + + debugfs_remove_recursive(card->debugfs_dir); + + free_irq(dev->irq, card); + + if (!force_legacy) + pci_disable_msi(dev); + + rsxx_creg_destroy(card); + + pci_iounmap(dev, card->regmap); + + pci_disable_device(dev); + pci_release_regions(dev); + + kfree(card); +} + +static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state) +{ + /* We don't support suspend at this time. */ + return -ENOSYS; +} + +static void rsxx_pci_shutdown(struct pci_dev *dev) +{ + struct rsxx_cardinfo *card = pci_get_drvdata(dev); + unsigned long flags; + int i; + + if (!card) + return; + + dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n"); + + rsxx_detach_dev(card); + + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + card_shutdown(card); +} + +static const struct pci_error_handlers rsxx_err_handler = { + .error_detected = rsxx_error_detected, + .slot_reset = rsxx_slot_reset, +}; + +static const struct pci_device_id rsxx_pci_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)}, + {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)}, + {0,}, +}; + +MODULE_DEVICE_TABLE(pci, rsxx_pci_ids); + +static struct pci_driver rsxx_pci_driver = { + .name = DRIVER_NAME, + .id_table = rsxx_pci_ids, + .probe = rsxx_pci_probe, + .remove = rsxx_pci_remove, + .suspend = rsxx_pci_suspend, + .shutdown = rsxx_pci_shutdown, + .err_handler = &rsxx_err_handler, +}; + +static int __init rsxx_core_init(void) +{ + int st; + + st = rsxx_dev_init(); + if (st) + return st; + + st = rsxx_dma_init(); + if (st) + goto dma_init_failed; + + st = rsxx_creg_init(); + if (st) + goto creg_init_failed; + + return pci_register_driver(&rsxx_pci_driver); + +creg_init_failed: + rsxx_dma_cleanup(); +dma_init_failed: + rsxx_dev_cleanup(); + + return st; +} + +static void __exit rsxx_core_cleanup(void) +{ + pci_unregister_driver(&rsxx_pci_driver); + rsxx_creg_cleanup(); + rsxx_dma_cleanup(); + rsxx_dev_cleanup(); +} + +module_init(rsxx_core_init); +module_exit(rsxx_core_cleanup); diff --git a/kernel/drivers/block/rsxx/cregs.c b/kernel/drivers/block/rsxx/cregs.c new file mode 100644 index 000000000..926dce9c4 --- /dev/null +++ b/kernel/drivers/block/rsxx/cregs.c @@ -0,0 +1,804 @@ +/* +* Filename: cregs.c +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include + +#include "rsxx_priv.h" + +#define CREG_TIMEOUT_MSEC 10000 + +typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st); + +struct creg_cmd { + struct list_head list; + creg_cmd_cb cb; + void *cb_private; + unsigned int op; + unsigned int addr; + int cnt8; + void *buf; + unsigned int stream; + unsigned int status; +}; + +static struct kmem_cache *creg_cmd_pool; + + +/*------------ Private Functions --------------*/ + +#if defined(__LITTLE_ENDIAN) +#define LITTLE_ENDIAN 1 +#elif defined(__BIG_ENDIAN) +#define LITTLE_ENDIAN 0 +#else +#error Unknown endianess!!! Aborting... +#endif + +static int copy_to_creg_data(struct rsxx_cardinfo *card, + int cnt8, + void *buf, + unsigned int stream) +{ + int i = 0; + u32 *data = buf; + + if (unlikely(card->eeh_state)) + return -EIO; + + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { + /* + * Firmware implementation makes it necessary to byte swap on + * little endian processors. + */ + if (LITTLE_ENDIAN && stream) + iowrite32be(data[i], card->regmap + CREG_DATA(i)); + else + iowrite32(data[i], card->regmap + CREG_DATA(i)); + } + + return 0; +} + + +static int copy_from_creg_data(struct rsxx_cardinfo *card, + int cnt8, + void *buf, + unsigned int stream) +{ + int i = 0; + u32 *data = buf; + + if (unlikely(card->eeh_state)) + return -EIO; + + for (i = 0; cnt8 > 0; i++, cnt8 -= 4) { + /* + * Firmware implementation makes it necessary to byte swap on + * little endian processors. + */ + if (LITTLE_ENDIAN && stream) + data[i] = ioread32be(card->regmap + CREG_DATA(i)); + else + data[i] = ioread32(card->regmap + CREG_DATA(i)); + } + + return 0; +} + +static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd) +{ + int st; + + if (unlikely(card->eeh_state)) + return; + + iowrite32(cmd->addr, card->regmap + CREG_ADD); + iowrite32(cmd->cnt8, card->regmap + CREG_CNT); + + if (cmd->op == CREG_OP_WRITE) { + if (cmd->buf) { + st = copy_to_creg_data(card, cmd->cnt8, + cmd->buf, cmd->stream); + if (st) + return; + } + } + + if (unlikely(card->eeh_state)) + return; + + /* Setting the valid bit will kick off the command. */ + iowrite32(cmd->op, card->regmap + CREG_CMD); +} + +static void creg_kick_queue(struct rsxx_cardinfo *card) +{ + if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue)) + return; + + card->creg_ctrl.active = 1; + card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue, + struct creg_cmd, list); + list_del(&card->creg_ctrl.active_cmd->list); + card->creg_ctrl.q_depth--; + + /* + * We have to set the timer before we push the new command. Otherwise, + * we could create a race condition that would occur if the timer + * was not canceled, and expired after the new command was pushed, + * but before the command was issued to hardware. + */ + mod_timer(&card->creg_ctrl.cmd_timer, + jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC)); + + creg_issue_cmd(card, card->creg_ctrl.active_cmd); +} + +static int creg_queue_cmd(struct rsxx_cardinfo *card, + unsigned int op, + unsigned int addr, + unsigned int cnt8, + void *buf, + int stream, + creg_cmd_cb callback, + void *cb_private) +{ + struct creg_cmd *cmd; + + /* Don't queue stuff up if we're halted. */ + if (unlikely(card->halt)) + return -EINVAL; + + if (card->creg_ctrl.reset) + return -EAGAIN; + + if (cnt8 > MAX_CREG_DATA8) + return -EINVAL; + + cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + INIT_LIST_HEAD(&cmd->list); + + cmd->op = op; + cmd->addr = addr; + cmd->cnt8 = cnt8; + cmd->buf = buf; + cmd->stream = stream; + cmd->cb = callback; + cmd->cb_private = cb_private; + cmd->status = 0; + + spin_lock_bh(&card->creg_ctrl.lock); + list_add_tail(&cmd->list, &card->creg_ctrl.queue); + card->creg_ctrl.q_depth++; + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); + + return 0; +} + +static void creg_cmd_timed_out(unsigned long data) +{ + struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data; + struct creg_cmd *cmd; + + spin_lock(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock(&card->creg_ctrl.lock); + + if (cmd == NULL) { + card->creg_ctrl.creg_stats.creg_timeout++; + dev_warn(CARD_TO_DEV(card), + "No active command associated with timeout!\n"); + return; + } + + if (cmd->cb) + cmd->cb(card, cmd, -ETIMEDOUT); + + kmem_cache_free(creg_cmd_pool, cmd); + + + spin_lock(&card->creg_ctrl.lock); + card->creg_ctrl.active = 0; + creg_kick_queue(card); + spin_unlock(&card->creg_ctrl.lock); +} + + +static void creg_cmd_done(struct work_struct *work) +{ + struct rsxx_cardinfo *card; + struct creg_cmd *cmd; + int st = 0; + + card = container_of(work, struct rsxx_cardinfo, + creg_ctrl.done_work); + + /* + * The timer could not be cancelled for some reason, + * race to pop the active command. + */ + if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0) + card->creg_ctrl.creg_stats.failed_cancel_timer++; + + spin_lock_bh(&card->creg_ctrl.lock); + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + spin_unlock_bh(&card->creg_ctrl.lock); + + if (cmd == NULL) { + dev_err(CARD_TO_DEV(card), + "Spurious creg interrupt!\n"); + return; + } + + card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT); + cmd->status = card->creg_ctrl.creg_stats.stat; + if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) { + dev_err(CARD_TO_DEV(card), + "Invalid status on creg command\n"); + /* + * At this point we're probably reading garbage from HW. Don't + * do anything else that could mess up the system and let + * the sync function return an error. + */ + st = -EIO; + goto creg_done; + } else if (cmd->status & CREG_STAT_ERROR) { + st = -EIO; + } + + if ((cmd->op == CREG_OP_READ)) { + unsigned int cnt8 = ioread32(card->regmap + CREG_CNT); + + /* Paranoid Sanity Checks */ + if (!cmd->buf) { + dev_err(CARD_TO_DEV(card), + "Buffer not given for read.\n"); + st = -EIO; + goto creg_done; + } + if (cnt8 != cmd->cnt8) { + dev_err(CARD_TO_DEV(card), + "count mismatch\n"); + st = -EIO; + goto creg_done; + } + + st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream); + } + +creg_done: + if (cmd->cb) + cmd->cb(card, cmd, st); + + kmem_cache_free(creg_cmd_pool, cmd); + + spin_lock_bh(&card->creg_ctrl.lock); + card->creg_ctrl.active = 0; + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); +} + +static void creg_reset(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd = NULL; + struct creg_cmd *tmp; + unsigned long flags; + + /* + * mutex_trylock is used here because if reset_lock is taken then a + * reset is already happening. So, we can just go ahead and return. + */ + if (!mutex_trylock(&card->creg_ctrl.reset_lock)) + return; + + card->creg_ctrl.reset = 1; + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + dev_warn(CARD_TO_DEV(card), + "Resetting creg interface for recovery\n"); + + /* Cancel outstanding commands */ + spin_lock_bh(&card->creg_ctrl.lock); + list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { + list_del(&cmd->list); + card->creg_ctrl.q_depth--; + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + } + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + if (cmd) { + if (timer_pending(&card->creg_ctrl.cmd_timer)) + del_timer_sync(&card->creg_ctrl.cmd_timer); + + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + + card->creg_ctrl.active = 0; + } + spin_unlock_bh(&card->creg_ctrl.lock); + + card->creg_ctrl.reset = 0; + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT); + spin_unlock_irqrestore(&card->irq_lock, flags); + + mutex_unlock(&card->creg_ctrl.reset_lock); +} + +/* Used for synchronous accesses */ +struct creg_completion { + struct completion *cmd_done; + int st; + u32 creg_status; +}; + +static void creg_cmd_done_cb(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st) +{ + struct creg_completion *cmd_completion; + + cmd_completion = cmd->cb_private; + BUG_ON(!cmd_completion); + + cmd_completion->st = st; + cmd_completion->creg_status = cmd->status; + complete(cmd_completion->cmd_done); +} + +static int __issue_creg_rw(struct rsxx_cardinfo *card, + unsigned int op, + unsigned int addr, + unsigned int cnt8, + void *buf, + int stream, + unsigned int *hw_stat) +{ + DECLARE_COMPLETION_ONSTACK(cmd_done); + struct creg_completion completion; + unsigned long timeout; + int st; + + completion.cmd_done = &cmd_done; + completion.st = 0; + completion.creg_status = 0; + + st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb, + &completion); + if (st) + return st; + + /* + * This timeout is necessary for unresponsive hardware. The additional + * 20 seconds to used to guarantee that each cregs requests has time to + * complete. + */ + timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC * + card->creg_ctrl.q_depth + 20000); + + /* + * The creg interface is guaranteed to complete. It has a timeout + * mechanism that will kick in if hardware does not respond. + */ + st = wait_for_completion_timeout(completion.cmd_done, timeout); + if (st == 0) { + /* + * This is really bad, because the kernel timer did not + * expire and notify us of a timeout! + */ + dev_crit(CARD_TO_DEV(card), + "cregs timer failed\n"); + creg_reset(card); + return -EIO; + } + + *hw_stat = completion.creg_status; + + if (completion.st) { + /* + * This read is needed to verify that there has not been any + * extreme errors that might have occurred, i.e. EEH. The + * function iowrite32 will not detect EEH errors, so it is + * necessary that we recover if such an error is the reason + * for the timeout. This is a dummy read. + */ + ioread32(card->regmap + SCRATCH); + + dev_warn(CARD_TO_DEV(card), + "creg command failed(%d x%08x)\n", + completion.st, addr); + return completion.st; + } + + return 0; +} + +static int issue_creg_rw(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int stream, + int read) +{ + unsigned int hw_stat; + unsigned int xfer; + unsigned int op; + int st; + + op = read ? CREG_OP_READ : CREG_OP_WRITE; + + do { + xfer = min_t(unsigned int, size8, MAX_CREG_DATA8); + + st = __issue_creg_rw(card, op, addr, xfer, + data, stream, &hw_stat); + if (st) + return st; + + data = (char *)data + xfer; + addr += xfer; + size8 -= xfer; + } while (size8); + + return 0; +} + +/* ---------------------------- Public API ---------------------------------- */ +int rsxx_creg_write(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream) +{ + return issue_creg_rw(card, addr, size8, data, byte_stream, 0); +} + +int rsxx_creg_read(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream) +{ + return issue_creg_rw(card, addr, size8, data, byte_stream, 1); +} + +int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state) +{ + return rsxx_creg_read(card, CREG_ADD_CARD_STATE, + sizeof(*state), state, 0); +} + +int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8) +{ + unsigned int size; + int st; + + st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE, + sizeof(size), &size, 0); + if (st) + return st; + + *size8 = (u64)size * RSXX_HW_BLK_SIZE; + return 0; +} + +int rsxx_get_num_targets(struct rsxx_cardinfo *card, + unsigned int *n_targets) +{ + return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS, + sizeof(*n_targets), n_targets, 0); +} + +int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, + u32 *capabilities) +{ + return rsxx_creg_read(card, CREG_ADD_CAPABILITIES, + sizeof(*capabilities), capabilities, 0); +} + +int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd) +{ + return rsxx_creg_write(card, CREG_ADD_CARD_CMD, + sizeof(cmd), &cmd, 0); +} + + +/*----------------- HW Log Functions -------------------*/ +static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len) +{ + static char level; + + /* + * New messages start with "<#>", where # is the log level. Messages + * that extend past the log buffer will use the previous level + */ + if ((len > 3) && (str[0] == '<') && (str[2] == '>')) { + level = str[1]; + str += 3; /* Skip past the log level. */ + len -= 3; + } + + switch (level) { + case '0': + dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '1': + dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '2': + dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '3': + dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '4': + dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '5': + dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '6': + dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + case '7': + dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + default: + dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str); + break; + } +} + +/* + * The substrncpy function copies the src string (which includes the + * terminating '\0' character), up to the count into the dest pointer. + * Returns the number of bytes copied to dest. + */ +static int substrncpy(char *dest, const char *src, int count) +{ + int max_cnt = count; + + while (count) { + count--; + *dest = *src; + if (*dest == '\0') + break; + src++; + dest++; + } + return max_cnt - count; +} + + +static void read_hw_log_done(struct rsxx_cardinfo *card, + struct creg_cmd *cmd, + int st) +{ + char *buf; + char *log_str; + int cnt; + int len; + int off; + + buf = cmd->buf; + off = 0; + + /* Failed getting the log message */ + if (st) + return; + + while (off < cmd->cnt8) { + log_str = &card->log.buf[card->log.buf_len]; + cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len); + len = substrncpy(log_str, &buf[off], cnt); + + off += len; + card->log.buf_len += len; + + /* + * Flush the log if we've hit the end of a message or if we've + * run out of buffer space. + */ + if ((log_str[len - 1] == '\0') || + (card->log.buf_len == LOG_BUF_SIZE8)) { + if (card->log.buf_len != 1) /* Don't log blank lines. */ + hw_log_msg(card, card->log.buf, + card->log.buf_len); + card->log.buf_len = 0; + } + + } + + if (cmd->status & CREG_STAT_LOG_PENDING) + rsxx_read_hw_log(card); +} + +int rsxx_read_hw_log(struct rsxx_cardinfo *card) +{ + int st; + + st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG, + sizeof(card->log.tmp), card->log.tmp, + 1, read_hw_log_done, NULL); + if (st) + dev_err(CARD_TO_DEV(card), + "Failed getting log text\n"); + + return st; +} + +/*-------------- IOCTL REG Access ------------------*/ +static int issue_reg_cmd(struct rsxx_cardinfo *card, + struct rsxx_reg_access *cmd, + int read) +{ + unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE; + + return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data, + cmd->stream, &cmd->stat); +} + +int rsxx_reg_access(struct rsxx_cardinfo *card, + struct rsxx_reg_access __user *ucmd, + int read) +{ + struct rsxx_reg_access cmd; + int st; + + st = copy_from_user(&cmd, ucmd, sizeof(cmd)); + if (st) + return -EFAULT; + + if (cmd.cnt > RSXX_MAX_REG_CNT) + return -EFAULT; + + st = issue_reg_cmd(card, &cmd, read); + if (st) + return st; + + st = put_user(cmd.stat, &ucmd->stat); + if (st) + return -EFAULT; + + if (read) { + st = copy_to_user(ucmd->data, cmd.data, cmd.cnt); + if (st) + return -EFAULT; + } + + return 0; +} + +void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd = NULL; + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + + if (cmd) { + del_timer_sync(&card->creg_ctrl.cmd_timer); + + spin_lock_bh(&card->creg_ctrl.lock); + list_add(&cmd->list, &card->creg_ctrl.queue); + card->creg_ctrl.q_depth++; + card->creg_ctrl.active = 0; + spin_unlock_bh(&card->creg_ctrl.lock); + } +} + +void rsxx_kick_creg_queue(struct rsxx_cardinfo *card) +{ + spin_lock_bh(&card->creg_ctrl.lock); + if (!list_empty(&card->creg_ctrl.queue)) + creg_kick_queue(card); + spin_unlock_bh(&card->creg_ctrl.lock); +} + +/*------------ Initialization & Setup --------------*/ +int rsxx_creg_setup(struct rsxx_cardinfo *card) +{ + card->creg_ctrl.active_cmd = NULL; + + card->creg_ctrl.creg_wq = + create_singlethread_workqueue(DRIVER_NAME"_creg"); + if (!card->creg_ctrl.creg_wq) + return -ENOMEM; + + INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done); + mutex_init(&card->creg_ctrl.reset_lock); + INIT_LIST_HEAD(&card->creg_ctrl.queue); + spin_lock_init(&card->creg_ctrl.lock); + setup_timer(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out, + (unsigned long) card); + + return 0; +} + +void rsxx_creg_destroy(struct rsxx_cardinfo *card) +{ + struct creg_cmd *cmd; + struct creg_cmd *tmp; + int cnt = 0; + + /* Cancel outstanding commands */ + spin_lock_bh(&card->creg_ctrl.lock); + list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) { + list_del(&cmd->list); + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + kmem_cache_free(creg_cmd_pool, cmd); + cnt++; + } + + if (cnt) + dev_info(CARD_TO_DEV(card), + "Canceled %d queue creg commands\n", cnt); + + cmd = card->creg_ctrl.active_cmd; + card->creg_ctrl.active_cmd = NULL; + if (cmd) { + if (timer_pending(&card->creg_ctrl.cmd_timer)) + del_timer_sync(&card->creg_ctrl.cmd_timer); + + if (cmd->cb) + cmd->cb(card, cmd, -ECANCELED); + dev_info(CARD_TO_DEV(card), + "Canceled active creg command\n"); + kmem_cache_free(creg_cmd_pool, cmd); + } + spin_unlock_bh(&card->creg_ctrl.lock); + + cancel_work_sync(&card->creg_ctrl.done_work); +} + + +int rsxx_creg_init(void) +{ + creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN); + if (!creg_cmd_pool) + return -ENOMEM; + + return 0; +} + +void rsxx_creg_cleanup(void) +{ + kmem_cache_destroy(creg_cmd_pool); +} diff --git a/kernel/drivers/block/rsxx/dev.c b/kernel/drivers/block/rsxx/dev.c new file mode 100644 index 000000000..ac8c62cb4 --- /dev/null +++ b/kernel/drivers/block/rsxx/dev.c @@ -0,0 +1,340 @@ +/* +* Filename: dev.c +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "rsxx_priv.h" + +static unsigned int blkdev_minors = 64; +module_param(blkdev_minors, uint, 0444); +MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)"); + +/* + * For now I'm making this tweakable in case any applications hit this limit. + * If you see a "bio too big" error in the log you will need to raise this + * value. + */ +static unsigned int blkdev_max_hw_sectors = 1024; +module_param(blkdev_max_hw_sectors, uint, 0444); +MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO"); + +static unsigned int enable_blkdev = 1; +module_param(enable_blkdev , uint, 0444); +MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces"); + + +struct rsxx_bio_meta { + struct bio *bio; + atomic_t pending_dmas; + atomic_t error; + unsigned long start_time; +}; + +static struct kmem_cache *bio_meta_pool; + +/*----------------- Block Device Operations -----------------*/ +static int rsxx_blkdev_ioctl(struct block_device *bdev, + fmode_t mode, + unsigned int cmd, + unsigned long arg) +{ + struct rsxx_cardinfo *card = bdev->bd_disk->private_data; + + switch (cmd) { + case RSXX_GETREG: + return rsxx_reg_access(card, (void __user *)arg, 1); + case RSXX_SETREG: + return rsxx_reg_access(card, (void __user *)arg, 0); + } + + return -ENOTTY; +} + +static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct rsxx_cardinfo *card = bdev->bd_disk->private_data; + u64 blocks = card->size8 >> 9; + + /* + * get geometry: Fake it. I haven't found any drivers that set + * geo->start, so we won't either. + */ + if (card->size8) { + geo->heads = 64; + geo->sectors = 16; + do_div(blocks, (geo->heads * geo->sectors)); + geo->cylinders = blocks; + } else { + geo->heads = 0; + geo->sectors = 0; + geo->cylinders = 0; + } + return 0; +} + +static const struct block_device_operations rsxx_fops = { + .owner = THIS_MODULE, + .getgeo = rsxx_getgeo, + .ioctl = rsxx_blkdev_ioctl, +}; + +static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) +{ + generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio), + &card->gendisk->part0); +} + +static void disk_stats_complete(struct rsxx_cardinfo *card, + struct bio *bio, + unsigned long start_time) +{ + generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0, + start_time); +} + +static void bio_dma_done_cb(struct rsxx_cardinfo *card, + void *cb_data, + unsigned int error) +{ + struct rsxx_bio_meta *meta = cb_data; + + if (error) + atomic_set(&meta->error, 1); + + if (atomic_dec_and_test(&meta->pending_dmas)) { + if (!card->eeh_state && card->gendisk) + disk_stats_complete(card, meta->bio, meta->start_time); + + bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); + kmem_cache_free(bio_meta_pool, meta); + } +} + +static void rsxx_make_request(struct request_queue *q, struct bio *bio) +{ + struct rsxx_cardinfo *card = q->queuedata; + struct rsxx_bio_meta *bio_meta; + int st = -EINVAL; + + might_sleep(); + + if (!card) + goto req_err; + + if (bio_end_sector(bio) > get_capacity(card->gendisk)) + goto req_err; + + if (unlikely(card->halt)) { + st = -EFAULT; + goto req_err; + } + + if (unlikely(card->dma_fault)) { + st = (-EFAULT); + goto req_err; + } + + if (bio->bi_iter.bi_size == 0) { + dev_err(CARD_TO_DEV(card), "size zero BIO!\n"); + goto req_err; + } + + bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL); + if (!bio_meta) { + st = -ENOMEM; + goto req_err; + } + + bio_meta->bio = bio; + atomic_set(&bio_meta->error, 0); + atomic_set(&bio_meta->pending_dmas, 0); + bio_meta->start_time = jiffies; + + if (!unlikely(card->halt)) + disk_stats_start(card, bio); + + dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n", + bio_data_dir(bio) ? 'W' : 'R', bio_meta, + (u64)bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size); + + st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas, + bio_dma_done_cb, bio_meta); + if (st) + goto queue_err; + + return; + +queue_err: + kmem_cache_free(bio_meta_pool, bio_meta); +req_err: + bio_endio(bio, st); +} + +/*----------------- Device Setup -------------------*/ +static bool rsxx_discard_supported(struct rsxx_cardinfo *card) +{ + unsigned char pci_rev; + + pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev); + + return (pci_rev >= RSXX_DISCARD_SUPPORT); +} + +int rsxx_attach_dev(struct rsxx_cardinfo *card) +{ + mutex_lock(&card->dev_lock); + + /* The block device requires the stripe size from the config. */ + if (enable_blkdev) { + if (card->config_valid) + set_capacity(card->gendisk, card->size8 >> 9); + else + set_capacity(card->gendisk, 0); + add_disk(card->gendisk); + + card->bdev_attached = 1; + } + + mutex_unlock(&card->dev_lock); + + return 0; +} + +void rsxx_detach_dev(struct rsxx_cardinfo *card) +{ + mutex_lock(&card->dev_lock); + + if (card->bdev_attached) { + del_gendisk(card->gendisk); + card->bdev_attached = 0; + } + + mutex_unlock(&card->dev_lock); +} + +int rsxx_setup_dev(struct rsxx_cardinfo *card) +{ + unsigned short blk_size; + + mutex_init(&card->dev_lock); + + if (!enable_blkdev) + return 0; + + card->major = register_blkdev(0, DRIVER_NAME); + if (card->major < 0) { + dev_err(CARD_TO_DEV(card), "Failed to get major number\n"); + return -ENOMEM; + } + + card->queue = blk_alloc_queue(GFP_KERNEL); + if (!card->queue) { + dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); + unregister_blkdev(card->major, DRIVER_NAME); + return -ENOMEM; + } + + card->gendisk = alloc_disk(blkdev_minors); + if (!card->gendisk) { + dev_err(CARD_TO_DEV(card), "Failed disk alloc\n"); + blk_cleanup_queue(card->queue); + unregister_blkdev(card->major, DRIVER_NAME); + return -ENOMEM; + } + + if (card->config_valid) { + blk_size = card->config.data.block_size; + blk_queue_dma_alignment(card->queue, blk_size - 1); + blk_queue_logical_block_size(card->queue, blk_size); + } + + blk_queue_make_request(card->queue, rsxx_make_request); + blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY); + blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); + blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); + + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, card->queue); + if (rsxx_discard_supported(card)) { + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue); + blk_queue_max_discard_sectors(card->queue, + RSXX_HW_BLK_SIZE >> 9); + card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE; + card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE; + card->queue->limits.discard_zeroes_data = 1; + } + + card->queue->queuedata = card; + + snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), + "rsxx%d", card->disk_id); + card->gendisk->driverfs_dev = &card->dev->dev; + card->gendisk->major = card->major; + card->gendisk->first_minor = 0; + card->gendisk->fops = &rsxx_fops; + card->gendisk->private_data = card; + card->gendisk->queue = card->queue; + + return 0; +} + +void rsxx_destroy_dev(struct rsxx_cardinfo *card) +{ + if (!enable_blkdev) + return; + + put_disk(card->gendisk); + card->gendisk = NULL; + + blk_cleanup_queue(card->queue); + card->queue->queuedata = NULL; + unregister_blkdev(card->major, DRIVER_NAME); +} + +int rsxx_dev_init(void) +{ + bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN); + if (!bio_meta_pool) + return -ENOMEM; + + return 0; +} + +void rsxx_dev_cleanup(void) +{ + kmem_cache_destroy(bio_meta_pool); +} + + diff --git a/kernel/drivers/block/rsxx/dma.c b/kernel/drivers/block/rsxx/dma.c new file mode 100644 index 000000000..cf8cd293a --- /dev/null +++ b/kernel/drivers/block/rsxx/dma.c @@ -0,0 +1,1104 @@ +/* +* Filename: dma.c +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include "rsxx_priv.h" + +struct rsxx_dma { + struct list_head list; + u8 cmd; + unsigned int laddr; /* Logical address */ + struct { + u32 off; + u32 cnt; + } sub_page; + dma_addr_t dma_addr; + struct page *page; + unsigned int pg_off; /* Page Offset */ + rsxx_dma_cb cb; + void *cb_data; +}; + +/* This timeout is used to detect a stalled DMA channel */ +#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000) + +struct hw_status { + u8 status; + u8 tag; + __le16 count; + __le32 _rsvd2; + __le64 _rsvd3; +} __packed; + +enum rsxx_dma_status { + DMA_SW_ERR = 0x1, + DMA_HW_FAULT = 0x2, + DMA_CANCELLED = 0x4, +}; + +struct hw_cmd { + u8 command; + u8 tag; + u8 _rsvd; + u8 sub_page; /* Bit[0:2]: 512byte offset */ + /* Bit[4:6]: 512byte count */ + __le32 device_addr; + __le64 host_addr; +} __packed; + +enum rsxx_hw_cmd { + HW_CMD_BLK_DISCARD = 0x70, + HW_CMD_BLK_WRITE = 0x80, + HW_CMD_BLK_READ = 0xC0, + HW_CMD_BLK_RECON_READ = 0xE0, +}; + +enum rsxx_hw_status { + HW_STATUS_CRC = 0x01, + HW_STATUS_HARD_ERR = 0x02, + HW_STATUS_SOFT_ERR = 0x04, + HW_STATUS_FAULT = 0x08, +}; + +static struct kmem_cache *rsxx_dma_pool; + +struct dma_tracker { + int next_tag; + struct rsxx_dma *dma; +}; + +#define DMA_TRACKER_LIST_SIZE8 (sizeof(struct dma_tracker_list) + \ + (sizeof(struct dma_tracker) * RSXX_MAX_OUTSTANDING_CMDS)) + +struct dma_tracker_list { + spinlock_t lock; + int head; + struct dma_tracker list[0]; +}; + + +/*----------------- Misc Utility Functions -------------------*/ +static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card) +{ + unsigned long long tgt_addr8; + + tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) & + card->_stripe.upper_mask) | + ((addr8) & card->_stripe.lower_mask); + do_div(tgt_addr8, RSXX_HW_BLK_SIZE); + return tgt_addr8; +} + +static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8) +{ + unsigned int tgt; + + tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask; + + return tgt; +} + +void rsxx_dma_queue_reset(struct rsxx_cardinfo *card) +{ + /* Reset all DMA Command/Status Queues */ + iowrite32(DMA_QUEUE_RESET, card->regmap + RESET); +} + +static unsigned int get_dma_size(struct rsxx_dma *dma) +{ + if (dma->sub_page.cnt) + return dma->sub_page.cnt << 9; + else + return RSXX_HW_BLK_SIZE; +} + + +/*----------------- DMA Tracker -------------------*/ +static void set_tracker_dma(struct dma_tracker_list *trackers, + int tag, + struct rsxx_dma *dma) +{ + trackers->list[tag].dma = dma; +} + +static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers, + int tag) +{ + return trackers->list[tag].dma; +} + +static int pop_tracker(struct dma_tracker_list *trackers) +{ + int tag; + + spin_lock(&trackers->lock); + tag = trackers->head; + if (tag != -1) { + trackers->head = trackers->list[tag].next_tag; + trackers->list[tag].next_tag = -1; + } + spin_unlock(&trackers->lock); + + return tag; +} + +static void push_tracker(struct dma_tracker_list *trackers, int tag) +{ + spin_lock(&trackers->lock); + trackers->list[tag].next_tag = trackers->head; + trackers->head = tag; + trackers->list[tag].dma = NULL; + spin_unlock(&trackers->lock); +} + + +/*----------------- Interrupt Coalescing -------------*/ +/* + * Interrupt Coalescing Register Format: + * Interrupt Timer (64ns units) [15:0] + * Interrupt Count [24:16] + * Reserved [31:25] +*/ +#define INTR_COAL_LATENCY_MASK (0x0000ffff) + +#define INTR_COAL_COUNT_SHIFT 16 +#define INTR_COAL_COUNT_BITS 9 +#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \ + INTR_COAL_COUNT_SHIFT) +#define INTR_COAL_LATENCY_UNITS_NS 64 + + +static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency) +{ + u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS; + + if (mode == RSXX_INTR_COAL_DISABLED) + return 0; + + return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) | + (latency_units & INTR_COAL_LATENCY_MASK); + +} + +static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card) +{ + int i; + u32 q_depth = 0; + u32 intr_coal; + + if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE || + unlikely(card->eeh_state)) + return; + + for (i = 0; i < card->n_targets; i++) + q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth); + + intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, + q_depth / 2, + card->config.data.intr_coal.latency); + iowrite32(intr_coal, card->regmap + INTR_COAL); +} + +/*----------------- RSXX DMA Handling -------------------*/ +static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma) +{ + if (dma->cmd != HW_CMD_BLK_DISCARD) { + if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) { + pci_unmap_page(ctrl->card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + } + } + + kmem_cache_free(rsxx_dma_pool, dma); +} + +static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma, + unsigned int status) +{ + if (status & DMA_SW_ERR) + ctrl->stats.dma_sw_err++; + if (status & DMA_HW_FAULT) + ctrl->stats.dma_hw_fault++; + if (status & DMA_CANCELLED) + ctrl->stats.dma_cancelled++; + + if (dma->cb) + dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0); + + rsxx_free_dma(ctrl, dma); +} + +int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, + struct list_head *q, unsigned int done) +{ + struct rsxx_dma *dma; + struct rsxx_dma *tmp; + int cnt = 0; + + list_for_each_entry_safe(dma, tmp, q, list) { + list_del(&dma->list); + if (done & COMPLETE_DMA) + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + else + rsxx_free_dma(ctrl, dma); + cnt++; + } + + return cnt; +} + +static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma) +{ + /* + * Requeued DMAs go to the front of the queue so they are issued + * first. + */ + spin_lock_bh(&ctrl->queue_lock); + ctrl->stats.sw_q_depth++; + list_add(&dma->list, &ctrl->queue); + spin_unlock_bh(&ctrl->queue_lock); +} + +static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl, + struct rsxx_dma *dma, + u8 hw_st) +{ + unsigned int status = 0; + int requeue_cmd = 0; + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n", + dma->cmd, dma->laddr, hw_st); + + if (hw_st & HW_STATUS_CRC) + ctrl->stats.crc_errors++; + if (hw_st & HW_STATUS_HARD_ERR) + ctrl->stats.hard_errors++; + if (hw_st & HW_STATUS_SOFT_ERR) + ctrl->stats.soft_errors++; + + switch (dma->cmd) { + case HW_CMD_BLK_READ: + if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { + if (ctrl->card->scrub_hard) { + dma->cmd = HW_CMD_BLK_RECON_READ; + requeue_cmd = 1; + ctrl->stats.reads_retried++; + } else { + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + } else if (hw_st & HW_STATUS_FAULT) { + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + + break; + case HW_CMD_BLK_RECON_READ: + if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) { + /* Data could not be reconstructed. */ + status |= DMA_HW_FAULT; + ctrl->stats.reads_failed++; + } + + break; + case HW_CMD_BLK_WRITE: + status |= DMA_HW_FAULT; + ctrl->stats.writes_failed++; + + break; + case HW_CMD_BLK_DISCARD: + status |= DMA_HW_FAULT; + ctrl->stats.discards_failed++; + + break; + default: + dev_err(CARD_TO_DEV(ctrl->card), + "Unknown command in DMA!(cmd: x%02x " + "laddr x%08x st: x%02x\n", + dma->cmd, dma->laddr, hw_st); + status |= DMA_SW_ERR; + + break; + } + + if (requeue_cmd) + rsxx_requeue_dma(ctrl, dma); + else + rsxx_complete_dma(ctrl, dma, status); +} + +static void dma_engine_stalled(unsigned long data) +{ + struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data; + int cnt; + + if (atomic_read(&ctrl->stats.hw_q_depth) == 0 || + unlikely(ctrl->card->eeh_state)) + return; + + if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) { + /* + * The dma engine was stalled because the SW_CMD_IDX write + * was lost. Issue it again to recover. + */ + dev_warn(CARD_TO_DEV(ctrl->card), + "SW_CMD_IDX write was lost, re-writing...\n"); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + } else { + dev_warn(CARD_TO_DEV(ctrl->card), + "DMA channel %d has stalled, faulting interface.\n", + ctrl->id); + ctrl->card->dma_fault = 1; + + /* Clean up the DMA queue */ + spin_lock(&ctrl->queue_lock); + cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); + spin_unlock(&ctrl->queue_lock); + + cnt += rsxx_dma_cancel(ctrl); + + if (cnt) + dev_info(CARD_TO_DEV(ctrl->card), + "Freed %d queued DMAs on channel %d\n", + cnt, ctrl->id); + } +} + +static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl) +{ + struct rsxx_dma *dma; + int tag; + int cmds_pending = 0; + struct hw_cmd *hw_cmd_buf; + int dir; + + hw_cmd_buf = ctrl->cmd.buf; + + if (unlikely(ctrl->card->halt) || + unlikely(ctrl->card->eeh_state)) + return; + + while (1) { + spin_lock_bh(&ctrl->queue_lock); + if (list_empty(&ctrl->queue)) { + spin_unlock_bh(&ctrl->queue_lock); + break; + } + spin_unlock_bh(&ctrl->queue_lock); + + tag = pop_tracker(ctrl->trackers); + if (tag == -1) + break; + + spin_lock_bh(&ctrl->queue_lock); + dma = list_entry(ctrl->queue.next, struct rsxx_dma, list); + list_del(&dma->list); + ctrl->stats.sw_q_depth--; + spin_unlock_bh(&ctrl->queue_lock); + + /* + * This will catch any DMAs that slipped in right before the + * fault, but was queued after all the other DMAs were + * cancelled. + */ + if (unlikely(ctrl->card->dma_fault)) { + push_tracker(ctrl->trackers, tag); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + continue; + } + + if (dma->cmd != HW_CMD_BLK_DISCARD) { + if (dma->cmd == HW_CMD_BLK_WRITE) + dir = PCI_DMA_TODEVICE; + else + dir = PCI_DMA_FROMDEVICE; + + /* + * The function pci_map_page is placed here because we + * can only, by design, issue up to 255 commands to the + * hardware at one time per DMA channel. So the maximum + * amount of mapped memory would be 255 * 4 channels * + * 4096 Bytes which is less than 2GB, the limit of a x8 + * Non-HWWD PCIe slot. This way the pci_map_page + * function should never fail because of a lack of + * mappable memory. + */ + dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page, + dma->pg_off, dma->sub_page.cnt << 9, dir); + if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) { + push_tracker(ctrl->trackers, tag); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + continue; + } + } + + set_tracker_dma(ctrl->trackers, tag, dma); + hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd; + hw_cmd_buf[ctrl->cmd.idx].tag = tag; + hw_cmd_buf[ctrl->cmd.idx]._rsvd = 0; + hw_cmd_buf[ctrl->cmd.idx].sub_page = + ((dma->sub_page.cnt & 0x7) << 4) | + (dma->sub_page.off & 0x7); + + hw_cmd_buf[ctrl->cmd.idx].device_addr = + cpu_to_le32(dma->laddr); + + hw_cmd_buf[ctrl->cmd.idx].host_addr = + cpu_to_le64(dma->dma_addr); + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Issue DMA%d(laddr %d tag %d) to idx %d\n", + ctrl->id, dma->laddr, tag, ctrl->cmd.idx); + + ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK; + cmds_pending++; + + if (dma->cmd == HW_CMD_BLK_WRITE) + ctrl->stats.writes_issued++; + else if (dma->cmd == HW_CMD_BLK_DISCARD) + ctrl->stats.discards_issued++; + else + ctrl->stats.reads_issued++; + } + + /* Let HW know we've queued commands. */ + if (cmds_pending) { + atomic_add(cmds_pending, &ctrl->stats.hw_q_depth); + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + + if (unlikely(ctrl->card->eeh_state)) { + del_timer_sync(&ctrl->activity_timer); + return; + } + + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + } +} + +static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl) +{ + struct rsxx_dma *dma; + unsigned long flags; + u16 count; + u8 status; + u8 tag; + struct hw_status *hw_st_buf; + + hw_st_buf = ctrl->status.buf; + + if (unlikely(ctrl->card->halt) || + unlikely(ctrl->card->dma_fault) || + unlikely(ctrl->card->eeh_state)) + return; + + count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); + + while (count == ctrl->e_cnt) { + /* + * The read memory-barrier is necessary to keep aggressive + * processors/optimizers (such as the PPC Apple G5) from + * reordering the following status-buffer tag & status read + * *before* the count read on subsequent iterations of the + * loop! + */ + rmb(); + + status = hw_st_buf[ctrl->status.idx].status; + tag = hw_st_buf[ctrl->status.idx].tag; + + dma = get_tracker_dma(ctrl->trackers, tag); + if (dma == NULL) { + spin_lock_irqsave(&ctrl->card->irq_lock, flags); + rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL); + spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); + + dev_err(CARD_TO_DEV(ctrl->card), + "No tracker for tag %d " + "(idx %d id %d)\n", + tag, ctrl->status.idx, ctrl->id); + return; + } + + dev_dbg(CARD_TO_DEV(ctrl->card), + "Completing DMA%d" + "(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n", + ctrl->id, dma->laddr, tag, status, count, + ctrl->status.idx); + + atomic_dec(&ctrl->stats.hw_q_depth); + + mod_timer(&ctrl->activity_timer, + jiffies + DMA_ACTIVITY_TIMEOUT); + + if (status) + rsxx_handle_dma_error(ctrl, dma, status); + else + rsxx_complete_dma(ctrl, dma, 0); + + push_tracker(ctrl->trackers, tag); + + ctrl->status.idx = (ctrl->status.idx + 1) & + RSXX_CS_IDX_MASK; + ctrl->e_cnt++; + + count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count); + } + + dma_intr_coal_auto_tune(ctrl->card); + + if (atomic_read(&ctrl->stats.hw_q_depth) == 0) + del_timer_sync(&ctrl->activity_timer); + + spin_lock_irqsave(&ctrl->card->irq_lock, flags); + rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id)); + spin_unlock_irqrestore(&ctrl->card->irq_lock, flags); + + spin_lock_bh(&ctrl->queue_lock); + if (ctrl->stats.sw_q_depth) + queue_work(ctrl->issue_wq, &ctrl->issue_dma_work); + spin_unlock_bh(&ctrl->queue_lock); +} + +static void rsxx_schedule_issue(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + + ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work); + + mutex_lock(&ctrl->work_lock); + rsxx_issue_dmas(ctrl); + mutex_unlock(&ctrl->work_lock); +} + +static void rsxx_schedule_done(struct work_struct *work) +{ + struct rsxx_dma_ctrl *ctrl; + + ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work); + + mutex_lock(&ctrl->work_lock); + rsxx_dma_done(ctrl); + mutex_unlock(&ctrl->work_lock); +} + +static int rsxx_queue_discard(struct rsxx_cardinfo *card, + struct list_head *q, + unsigned int laddr, + rsxx_dma_cb cb, + void *cb_data) +{ + struct rsxx_dma *dma; + + dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); + if (!dma) + return -ENOMEM; + + dma->cmd = HW_CMD_BLK_DISCARD; + dma->laddr = laddr; + dma->dma_addr = 0; + dma->sub_page.off = 0; + dma->sub_page.cnt = 0; + dma->page = NULL; + dma->pg_off = 0; + dma->cb = cb; + dma->cb_data = cb_data; + + dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr); + + list_add_tail(&dma->list, q); + + return 0; +} + +static int rsxx_queue_dma(struct rsxx_cardinfo *card, + struct list_head *q, + int dir, + unsigned int dma_off, + unsigned int dma_len, + unsigned int laddr, + struct page *page, + unsigned int pg_off, + rsxx_dma_cb cb, + void *cb_data) +{ + struct rsxx_dma *dma; + + dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL); + if (!dma) + return -ENOMEM; + + dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ; + dma->laddr = laddr; + dma->sub_page.off = (dma_off >> 9); + dma->sub_page.cnt = (dma_len >> 9); + dma->page = page; + dma->pg_off = pg_off; + dma->cb = cb; + dma->cb_data = cb_data; + + dev_dbg(CARD_TO_DEV(card), + "Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n", + dir ? 'W' : 'R', dma->laddr, dma->sub_page.off, + dma->sub_page.cnt, dma->page, dma->pg_off); + + /* Queue the DMA */ + list_add_tail(&dma->list, q); + + return 0; +} + +int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, + struct bio *bio, + atomic_t *n_dmas, + rsxx_dma_cb cb, + void *cb_data) +{ + struct list_head dma_list[RSXX_MAX_TARGETS]; + struct bio_vec bvec; + struct bvec_iter iter; + unsigned long long addr8; + unsigned int laddr; + unsigned int bv_len; + unsigned int bv_off; + unsigned int dma_off; + unsigned int dma_len; + int dma_cnt[RSXX_MAX_TARGETS]; + int tgt; + int st; + int i; + + addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */ + atomic_set(n_dmas, 0); + + for (i = 0; i < card->n_targets; i++) { + INIT_LIST_HEAD(&dma_list[i]); + dma_cnt[i] = 0; + } + + if (bio->bi_rw & REQ_DISCARD) { + bv_len = bio->bi_iter.bi_size; + + while (bv_len > 0) { + tgt = rsxx_get_dma_tgt(card, addr8); + laddr = rsxx_addr8_to_laddr(addr8, card); + + st = rsxx_queue_discard(card, &dma_list[tgt], laddr, + cb, cb_data); + if (st) + goto bvec_err; + + dma_cnt[tgt]++; + atomic_inc(n_dmas); + addr8 += RSXX_HW_BLK_SIZE; + bv_len -= RSXX_HW_BLK_SIZE; + } + } else { + bio_for_each_segment(bvec, bio, iter) { + bv_len = bvec.bv_len; + bv_off = bvec.bv_offset; + + while (bv_len > 0) { + tgt = rsxx_get_dma_tgt(card, addr8); + laddr = rsxx_addr8_to_laddr(addr8, card); + dma_off = addr8 & RSXX_HW_BLK_MASK; + dma_len = min(bv_len, + RSXX_HW_BLK_SIZE - dma_off); + + st = rsxx_queue_dma(card, &dma_list[tgt], + bio_data_dir(bio), + dma_off, dma_len, + laddr, bvec.bv_page, + bv_off, cb, cb_data); + if (st) + goto bvec_err; + + dma_cnt[tgt]++; + atomic_inc(n_dmas); + addr8 += dma_len; + bv_off += dma_len; + bv_len -= dma_len; + } + } + } + + for (i = 0; i < card->n_targets; i++) { + if (!list_empty(&dma_list[i])) { + spin_lock_bh(&card->ctrl[i].queue_lock); + card->ctrl[i].stats.sw_q_depth += dma_cnt[i]; + list_splice_tail(&dma_list[i], &card->ctrl[i].queue); + spin_unlock_bh(&card->ctrl[i].queue_lock); + + queue_work(card->ctrl[i].issue_wq, + &card->ctrl[i].issue_dma_work); + } + } + + return 0; + +bvec_err: + for (i = 0; i < card->n_targets; i++) + rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i], + FREE_DMA); + + return st; +} + + +/*----------------- DMA Engine Initialization & Setup -------------------*/ +int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl) +{ + ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8, + &ctrl->status.dma_addr); + ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8, + &ctrl->cmd.dma_addr); + if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL) + return -ENOMEM; + + memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_LO); + iowrite32(upper_32_bits(ctrl->status.dma_addr), + ctrl->regmap + SB_ADD_HI); + + memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8); + iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO); + iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI); + + ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT); + if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading status cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT); + iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT); + + ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX); + if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) { + dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n", + ctrl->status.idx); + return -EINVAL; + } + iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX); + iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX); + + return 0; +} + +static int rsxx_dma_ctrl_init(struct pci_dev *dev, + struct rsxx_dma_ctrl *ctrl) +{ + int i; + int st; + + memset(&ctrl->stats, 0, sizeof(ctrl->stats)); + + ctrl->trackers = vmalloc(DMA_TRACKER_LIST_SIZE8); + if (!ctrl->trackers) + return -ENOMEM; + + ctrl->trackers->head = 0; + for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { + ctrl->trackers->list[i].next_tag = i + 1; + ctrl->trackers->list[i].dma = NULL; + } + ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1; + spin_lock_init(&ctrl->trackers->lock); + + spin_lock_init(&ctrl->queue_lock); + mutex_init(&ctrl->work_lock); + INIT_LIST_HEAD(&ctrl->queue); + + setup_timer(&ctrl->activity_timer, dma_engine_stalled, + (unsigned long)ctrl); + + ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0); + if (!ctrl->issue_wq) + return -ENOMEM; + + ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0); + if (!ctrl->done_wq) + return -ENOMEM; + + INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue); + INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done); + + st = rsxx_hw_buffers_init(dev, ctrl); + if (st) + return st; + + return 0; +} + +static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card, + unsigned int stripe_size8) +{ + if (!is_power_of_2(stripe_size8)) { + dev_err(CARD_TO_DEV(card), + "stripe_size is NOT a power of 2!\n"); + return -EINVAL; + } + + card->_stripe.lower_mask = stripe_size8 - 1; + + card->_stripe.upper_mask = ~(card->_stripe.lower_mask); + card->_stripe.upper_shift = ffs(card->n_targets) - 1; + + card->_stripe.target_mask = card->n_targets - 1; + card->_stripe.target_shift = ffs(stripe_size8) - 1; + + dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n", + card->_stripe.lower_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n", + card->_stripe.upper_shift); + dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n", + card->_stripe.upper_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n", + card->_stripe.target_mask); + dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n", + card->_stripe.target_shift); + + return 0; +} + +int rsxx_dma_configure(struct rsxx_cardinfo *card) +{ + u32 intr_coal; + + intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode, + card->config.data.intr_coal.count, + card->config.data.intr_coal.latency); + iowrite32(intr_coal, card->regmap + INTR_COAL); + + return rsxx_dma_stripe_setup(card, card->config.data.stripe_size); +} + +int rsxx_dma_setup(struct rsxx_cardinfo *card) +{ + unsigned long flags; + int st; + int i; + + dev_info(CARD_TO_DEV(card), + "Initializing %d DMA targets\n", + card->n_targets); + + /* Regmap is divided up into 4K chunks. One for each DMA channel */ + for (i = 0; i < card->n_targets; i++) + card->ctrl[i].regmap = card->regmap + (i * 4096); + + card->dma_fault = 0; + + /* Reset the DMA queues */ + rsxx_dma_queue_reset(card); + + /************* Setup DMA Control *************/ + for (i = 0; i < card->n_targets; i++) { + st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]); + if (st) + goto failed_dma_setup; + + card->ctrl[i].card = card; + card->ctrl[i].id = i; + } + + card->scrub_hard = 1; + + if (card->config_valid) + rsxx_dma_configure(card); + + /* Enable the interrupts after all setup has completed. */ + for (i = 0; i < card->n_targets; i++) { + spin_lock_irqsave(&card->irq_lock, flags); + rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i)); + spin_unlock_irqrestore(&card->irq_lock, flags); + } + + return 0; + +failed_dma_setup: + for (i = 0; i < card->n_targets; i++) { + struct rsxx_dma_ctrl *ctrl = &card->ctrl[i]; + + if (ctrl->issue_wq) { + destroy_workqueue(ctrl->issue_wq); + ctrl->issue_wq = NULL; + } + + if (ctrl->done_wq) { + destroy_workqueue(ctrl->done_wq); + ctrl->done_wq = NULL; + } + + if (ctrl->trackers) + vfree(ctrl->trackers); + + if (ctrl->status.buf) + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + ctrl->status.buf, + ctrl->status.dma_addr); + if (ctrl->cmd.buf) + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + ctrl->cmd.buf, ctrl->cmd.dma_addr); + } + + return st; +} + +int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl) +{ + struct rsxx_dma *dma; + int i; + int cnt = 0; + + /* Clean up issued DMAs */ + for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) { + dma = get_tracker_dma(ctrl->trackers, i); + if (dma) { + atomic_dec(&ctrl->stats.hw_q_depth); + rsxx_complete_dma(ctrl, dma, DMA_CANCELLED); + push_tracker(ctrl->trackers, i); + cnt++; + } + } + + return cnt; +} + +void rsxx_dma_destroy(struct rsxx_cardinfo *card) +{ + struct rsxx_dma_ctrl *ctrl; + int i; + + for (i = 0; i < card->n_targets; i++) { + ctrl = &card->ctrl[i]; + + if (ctrl->issue_wq) { + destroy_workqueue(ctrl->issue_wq); + ctrl->issue_wq = NULL; + } + + if (ctrl->done_wq) { + destroy_workqueue(ctrl->done_wq); + ctrl->done_wq = NULL; + } + + if (timer_pending(&ctrl->activity_timer)) + del_timer_sync(&ctrl->activity_timer); + + /* Clean up the DMA queue */ + spin_lock_bh(&ctrl->queue_lock); + rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA); + spin_unlock_bh(&ctrl->queue_lock); + + rsxx_dma_cancel(ctrl); + + vfree(ctrl->trackers); + + pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8, + ctrl->status.buf, ctrl->status.dma_addr); + pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8, + ctrl->cmd.buf, ctrl->cmd.dma_addr); + } +} + +int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card) +{ + int i; + int j; + int cnt; + struct rsxx_dma *dma; + struct list_head *issued_dmas; + + issued_dmas = kzalloc(sizeof(*issued_dmas) * card->n_targets, + GFP_KERNEL); + if (!issued_dmas) + return -ENOMEM; + + for (i = 0; i < card->n_targets; i++) { + INIT_LIST_HEAD(&issued_dmas[i]); + cnt = 0; + for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) { + dma = get_tracker_dma(card->ctrl[i].trackers, j); + if (dma == NULL) + continue; + + if (dma->cmd == HW_CMD_BLK_WRITE) + card->ctrl[i].stats.writes_issued--; + else if (dma->cmd == HW_CMD_BLK_DISCARD) + card->ctrl[i].stats.discards_issued--; + else + card->ctrl[i].stats.reads_issued--; + + if (dma->cmd != HW_CMD_BLK_DISCARD) { + pci_unmap_page(card->dev, dma->dma_addr, + get_dma_size(dma), + dma->cmd == HW_CMD_BLK_WRITE ? + PCI_DMA_TODEVICE : + PCI_DMA_FROMDEVICE); + } + + list_add_tail(&dma->list, &issued_dmas[i]); + push_tracker(card->ctrl[i].trackers, j); + cnt++; + } + + spin_lock_bh(&card->ctrl[i].queue_lock); + list_splice(&issued_dmas[i], &card->ctrl[i].queue); + + atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth); + card->ctrl[i].stats.sw_q_depth += cnt; + card->ctrl[i].e_cnt = 0; + spin_unlock_bh(&card->ctrl[i].queue_lock); + } + + kfree(issued_dmas); + + return 0; +} + +int rsxx_dma_init(void) +{ + rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN); + if (!rsxx_dma_pool) + return -ENOMEM; + + return 0; +} + + +void rsxx_dma_cleanup(void) +{ + kmem_cache_destroy(rsxx_dma_pool); +} + diff --git a/kernel/drivers/block/rsxx/rsxx.h b/kernel/drivers/block/rsxx/rsxx.h new file mode 100644 index 000000000..24ba3642b --- /dev/null +++ b/kernel/drivers/block/rsxx/rsxx.h @@ -0,0 +1,47 @@ +/* +* Filename: rsxx.h +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_H__ +#define __RSXX_H__ + +/*----------------- IOCTL Definitions -------------------*/ + +#define RSXX_MAX_DATA 8 + +struct rsxx_reg_access { + __u32 addr; + __u32 cnt; + __u32 stat; + __u32 stream; + __u32 data[RSXX_MAX_DATA]; +}; + +#define RSXX_MAX_REG_CNT (RSXX_MAX_DATA * (sizeof(__u32))) + +#define RSXX_IOC_MAGIC 'r' + +#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access) +#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access) + +#endif /* __RSXX_H_ */ diff --git a/kernel/drivers/block/rsxx/rsxx_cfg.h b/kernel/drivers/block/rsxx/rsxx_cfg.h new file mode 100644 index 000000000..f384c9438 --- /dev/null +++ b/kernel/drivers/block/rsxx/rsxx_cfg.h @@ -0,0 +1,72 @@ +/* +* Filename: rsXX_cfg.h +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_CFG_H__ +#define __RSXX_CFG_H__ + +/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */ +#include + +/* + * The card config version must match the driver's expected version. If it does + * not, the DMA interfaces will not be attached and the user will need to + * initialize/upgrade the card configuration using the card config utility. + */ +#define RSXX_CFG_VERSION 4 + +struct card_cfg_hdr { + __u32 version; + __u32 crc; +}; + +struct card_cfg_data { + __u32 block_size; + __u32 stripe_size; + __u32 vendor_id; + __u32 cache_order; + struct { + __u32 mode; /* Disabled, manual, auto-tune... */ + __u32 count; /* Number of intr to coalesce */ + __u32 latency;/* Max wait time (in ns) */ + } intr_coal; +}; + +struct rsxx_card_cfg { + struct card_cfg_hdr hdr; + struct card_cfg_data data; +}; + +/* Vendor ID Values */ +#define RSXX_VENDOR_ID_IBM 0 +#define RSXX_VENDOR_ID_DSI 1 +#define RSXX_VENDOR_COUNT 2 + +/* Interrupt Coalescing Values */ +#define RSXX_INTR_COAL_DISABLED 0 +#define RSXX_INTR_COAL_EXPLICIT 1 +#define RSXX_INTR_COAL_AUTO_TUNE 2 + + +#endif /* __RSXX_CFG_H__ */ + diff --git a/kernel/drivers/block/rsxx/rsxx_priv.h b/kernel/drivers/block/rsxx/rsxx_priv.h new file mode 100644 index 000000000..6bbc64d0f --- /dev/null +++ b/kernel/drivers/block/rsxx/rsxx_priv.h @@ -0,0 +1,434 @@ +/* +* Filename: rsxx_priv.h +* +* +* Authors: Joshua Morris +* Philip Kelleher +* +* (C) Copyright 2013 IBM Corporation +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License as +* published by the Free Software Foundation; either version 2 of the +* License, or (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, but +* WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software Foundation, +* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#ifndef __RSXX_PRIV_H__ +#define __RSXX_PRIV_H__ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rsxx.h" +#include "rsxx_cfg.h" + +struct proc_cmd; + +#define PCI_DEVICE_ID_FS70_FLASH 0x04A9 +#define PCI_DEVICE_ID_FS80_FLASH 0x04AA + +#define RS70_PCI_REV_SUPPORTED 4 + +#define DRIVER_NAME "rsxx" +#define DRIVER_VERSION "4.0.3.2516" + +/* Block size is 4096 */ +#define RSXX_HW_BLK_SHIFT 12 +#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT) +#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1) + +#define MAX_CREG_DATA8 32 +#define LOG_BUF_SIZE8 128 + +#define RSXX_MAX_OUTSTANDING_CMDS 255 +#define RSXX_CS_IDX_MASK 0xff + +#define STATUS_BUFFER_SIZE8 4096 +#define COMMAND_BUFFER_SIZE8 4096 + +#define RSXX_MAX_TARGETS 8 + +struct dma_tracker_list; + +/* DMA Command/Status Buffer structure */ +struct rsxx_cs_buffer { + dma_addr_t dma_addr; + void *buf; + u32 idx; +}; + +struct rsxx_dma_stats { + u32 crc_errors; + u32 hard_errors; + u32 soft_errors; + u32 writes_issued; + u32 writes_failed; + u32 reads_issued; + u32 reads_failed; + u32 reads_retried; + u32 discards_issued; + u32 discards_failed; + u32 done_rescheduled; + u32 issue_rescheduled; + u32 dma_sw_err; + u32 dma_hw_fault; + u32 dma_cancelled; + u32 sw_q_depth; /* Number of DMAs on the SW queue. */ + atomic_t hw_q_depth; /* Number of DMAs queued to HW. */ +}; + +struct rsxx_dma_ctrl { + struct rsxx_cardinfo *card; + int id; + void __iomem *regmap; + struct rsxx_cs_buffer status; + struct rsxx_cs_buffer cmd; + u16 e_cnt; + spinlock_t queue_lock; + struct list_head queue; + struct workqueue_struct *issue_wq; + struct work_struct issue_dma_work; + struct workqueue_struct *done_wq; + struct work_struct dma_done_work; + struct timer_list activity_timer; + struct dma_tracker_list *trackers; + struct rsxx_dma_stats stats; + struct mutex work_lock; +}; + +struct rsxx_cardinfo { + struct pci_dev *dev; + unsigned int halt; + unsigned int eeh_state; + + void __iomem *regmap; + spinlock_t irq_lock; + unsigned int isr_mask; + unsigned int ier_mask; + + struct rsxx_card_cfg config; + int config_valid; + + /* Embedded CPU Communication */ + struct { + spinlock_t lock; + bool active; + struct creg_cmd *active_cmd; + struct workqueue_struct *creg_wq; + struct work_struct done_work; + struct list_head queue; + unsigned int q_depth; + /* Cache the creg status to prevent ioreads */ + struct { + u32 stat; + u32 failed_cancel_timer; + u32 creg_timeout; + } creg_stats; + struct timer_list cmd_timer; + struct mutex reset_lock; + int reset; + } creg_ctrl; + + struct { + char tmp[MAX_CREG_DATA8]; + char buf[LOG_BUF_SIZE8]; /* terminated */ + int buf_len; + } log; + + struct workqueue_struct *event_wq; + struct work_struct event_work; + unsigned int state; + u64 size8; + + /* Lock the device attach/detach function */ + struct mutex dev_lock; + + /* Block Device Variables */ + bool bdev_attached; + int disk_id; + int major; + struct request_queue *queue; + struct gendisk *gendisk; + struct { + /* Used to convert a byte address to a device address. */ + u64 lower_mask; + u64 upper_shift; + u64 upper_mask; + u64 target_mask; + u64 target_shift; + } _stripe; + unsigned int dma_fault; + + int scrub_hard; + + int n_targets; + struct rsxx_dma_ctrl *ctrl; + + struct dentry *debugfs_dir; +}; + +enum rsxx_pci_regmap { + HWID = 0x00, /* Hardware Identification Register */ + SCRATCH = 0x04, /* Scratch/Debug Register */ + RESET = 0x08, /* Reset Register */ + ISR = 0x10, /* Interrupt Status Register */ + IER = 0x14, /* Interrupt Enable Register */ + IPR = 0x18, /* Interrupt Poll Register */ + CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */ + CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/ + HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */ + SW_CMD_IDX = 0x2C, /* Software Processed Command Index */ + SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */ + SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */ + HW_STATUS_CNT = 0x38, /* Hardware Status Counter */ + SW_STATUS_CNT = 0x3C, /* Deprecated */ + CREG_CMD = 0x40, /* CPU Command Register */ + CREG_ADD = 0x44, /* CPU Address Register */ + CREG_CNT = 0x48, /* CPU Count Register */ + CREG_STAT = 0x4C, /* CPU Status Register */ + CREG_DATA0 = 0x50, /* CPU Data Registers */ + CREG_DATA1 = 0x54, + CREG_DATA2 = 0x58, + CREG_DATA3 = 0x5C, + CREG_DATA4 = 0x60, + CREG_DATA5 = 0x64, + CREG_DATA6 = 0x68, + CREG_DATA7 = 0x6c, + INTR_COAL = 0x70, /* Interrupt Coalescing Register */ + HW_ERROR = 0x74, /* Card Error Register */ + PCI_DEBUG0 = 0x78, /* PCI Debug Registers */ + PCI_DEBUG1 = 0x7C, + PCI_DEBUG2 = 0x80, + PCI_DEBUG3 = 0x84, + PCI_DEBUG4 = 0x88, + PCI_DEBUG5 = 0x8C, + PCI_DEBUG6 = 0x90, + PCI_DEBUG7 = 0x94, + PCI_POWER_THROTTLE = 0x98, + PERF_CTRL = 0x9c, + PERF_TIMER_LO = 0xa0, + PERF_TIMER_HI = 0xa4, + PERF_RD512_LO = 0xa8, + PERF_RD512_HI = 0xac, + PERF_WR512_LO = 0xb0, + PERF_WR512_HI = 0xb4, + PCI_RECONFIG = 0xb8, +}; + +enum rsxx_intr { + CR_INTR_DMA0 = 0x00000001, + CR_INTR_CREG = 0x00000002, + CR_INTR_DMA1 = 0x00000004, + CR_INTR_EVENT = 0x00000008, + CR_INTR_DMA2 = 0x00000010, + CR_INTR_DMA3 = 0x00000020, + CR_INTR_DMA4 = 0x00000040, + CR_INTR_DMA5 = 0x00000080, + CR_INTR_DMA6 = 0x00000100, + CR_INTR_DMA7 = 0x00000200, + CR_INTR_ALL_C = 0x0000003f, + CR_INTR_ALL_G = 0x000003ff, + CR_INTR_DMA_ALL = 0x000003f5, + CR_INTR_ALL = 0xffffffff, +}; + +static inline int CR_INTR_DMA(int N) +{ + static const unsigned int _CR_INTR_DMA[] = { + CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3, + CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7 + }; + return _CR_INTR_DMA[N]; +} +enum rsxx_pci_reset { + DMA_QUEUE_RESET = 0x00000001, +}; + +enum rsxx_hw_fifo_flush { + RSXX_FLUSH_BUSY = 0x00000002, + RSXX_FLUSH_TIMEOUT = 0x00000004, +}; + +enum rsxx_pci_revision { + RSXX_DISCARD_SUPPORT = 2, + RSXX_EEH_SUPPORT = 3, +}; + +enum rsxx_creg_cmd { + CREG_CMD_TAG_MASK = 0x0000FF00, + CREG_OP_WRITE = 0x000000C0, + CREG_OP_READ = 0x000000E0, +}; + +enum rsxx_creg_addr { + CREG_ADD_CARD_CMD = 0x80001000, + CREG_ADD_CARD_STATE = 0x80001004, + CREG_ADD_CARD_SIZE = 0x8000100c, + CREG_ADD_CAPABILITIES = 0x80001050, + CREG_ADD_LOG = 0x80002000, + CREG_ADD_NUM_TARGETS = 0x80003000, + CREG_ADD_CRAM = 0xA0000000, + CREG_ADD_CONFIG = 0xB0000000, +}; + +enum rsxx_creg_card_cmd { + CARD_CMD_STARTUP = 1, + CARD_CMD_SHUTDOWN = 2, + CARD_CMD_LOW_LEVEL_FORMAT = 3, + CARD_CMD_FPGA_RECONFIG_BR = 4, + CARD_CMD_FPGA_RECONFIG_MAIN = 5, + CARD_CMD_BACKUP = 6, + CARD_CMD_RESET = 7, + CARD_CMD_deprecated = 8, + CARD_CMD_UNINITIALIZE = 9, + CARD_CMD_DSTROY_EMERGENCY = 10, + CARD_CMD_DSTROY_NORMAL = 11, + CARD_CMD_DSTROY_EXTENDED = 12, + CARD_CMD_DSTROY_ABORT = 13, +}; + +enum rsxx_card_state { + CARD_STATE_SHUTDOWN = 0x00000001, + CARD_STATE_STARTING = 0x00000002, + CARD_STATE_FORMATTING = 0x00000004, + CARD_STATE_UNINITIALIZED = 0x00000008, + CARD_STATE_GOOD = 0x00000010, + CARD_STATE_SHUTTING_DOWN = 0x00000020, + CARD_STATE_FAULT = 0x00000040, + CARD_STATE_RD_ONLY_FAULT = 0x00000080, + CARD_STATE_DSTROYING = 0x00000100, +}; + +enum rsxx_led { + LED_DEFAULT = 0x0, + LED_IDENTIFY = 0x1, + LED_SOAK = 0x2, +}; + +enum rsxx_creg_flash_lock { + CREG_FLASH_LOCK = 1, + CREG_FLASH_UNLOCK = 2, +}; + +enum rsxx_card_capabilities { + CARD_CAP_SUBPAGE_WRITES = 0x00000080, +}; + +enum rsxx_creg_stat { + CREG_STAT_STATUS_MASK = 0x00000003, + CREG_STAT_SUCCESS = 0x1, + CREG_STAT_ERROR = 0x2, + CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */ + CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */ + CREG_STAT_TAG_MASK = 0x0000ff00, +}; + +enum rsxx_dma_finish { + FREE_DMA = 0x0, + COMPLETE_DMA = 0x1, +}; + +static inline unsigned int CREG_DATA(int N) +{ + return CREG_DATA0 + (N << 2); +} + +/*----------------- Convenient Log Wrappers -------------------*/ +#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev) + +/***** config.c *****/ +int rsxx_load_config(struct rsxx_cardinfo *card); + +/***** core.c *****/ +void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr); +void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr); +void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr); +void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card, + unsigned int intr); + +/***** dev.c *****/ +int rsxx_attach_dev(struct rsxx_cardinfo *card); +void rsxx_detach_dev(struct rsxx_cardinfo *card); +int rsxx_setup_dev(struct rsxx_cardinfo *card); +void rsxx_destroy_dev(struct rsxx_cardinfo *card); +int rsxx_dev_init(void); +void rsxx_dev_cleanup(void); + +/***** dma.c ****/ +typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card, + void *cb_data, + unsigned int status); +int rsxx_dma_setup(struct rsxx_cardinfo *card); +void rsxx_dma_destroy(struct rsxx_cardinfo *card); +int rsxx_dma_init(void); +int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, + struct list_head *q, + unsigned int done); +int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl); +void rsxx_dma_cleanup(void); +void rsxx_dma_queue_reset(struct rsxx_cardinfo *card); +int rsxx_dma_configure(struct rsxx_cardinfo *card); +int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, + struct bio *bio, + atomic_t *n_dmas, + rsxx_dma_cb cb, + void *cb_data); +int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl); +int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card); +int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card); + +/***** cregs.c *****/ +int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr, + unsigned int size8, + void *data, + int byte_stream); +int rsxx_creg_read(struct rsxx_cardinfo *card, + u32 addr, + unsigned int size8, + void *data, + int byte_stream); +int rsxx_read_hw_log(struct rsxx_cardinfo *card); +int rsxx_get_card_state(struct rsxx_cardinfo *card, + unsigned int *state); +int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8); +int rsxx_get_num_targets(struct rsxx_cardinfo *card, + unsigned int *n_targets); +int rsxx_get_card_capabilities(struct rsxx_cardinfo *card, + u32 *capabilities); +int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd); +int rsxx_creg_setup(struct rsxx_cardinfo *card); +void rsxx_creg_destroy(struct rsxx_cardinfo *card); +int rsxx_creg_init(void); +void rsxx_creg_cleanup(void); +int rsxx_reg_access(struct rsxx_cardinfo *card, + struct rsxx_reg_access __user *ucmd, + int read); +void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card); +void rsxx_kick_creg_queue(struct rsxx_cardinfo *card); + + + +#endif /* __DRIVERS_BLOCK_RSXX_H__ */ diff --git a/kernel/drivers/block/skd_main.c b/kernel/drivers/block/skd_main.c new file mode 100644 index 000000000..1e46eb230 --- /dev/null +++ b/kernel/drivers/block/skd_main.c @@ -0,0 +1,5411 @@ +/* Copyright 2012 STEC, Inc. + * + * This file is licensed under the terms of the 3-clause + * BSD License (http://opensource.org/licenses/BSD-3-Clause) + * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), + * at your option. Both licenses are also available in the LICENSE file + * distributed with this project. This file may not be copied, modified, + * or distributed except in accordance with those terms. + * Gordoni Waidhofer + * Initial Driver Design! + * Thomas Swann + * Interrupt handling. + * Ramprasad Chinthekindi + * biomode implementation. + * Akhil Bhansali + * Added support for DISCARD / FLUSH and FUA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "skd_s1120.h" + +static int skd_dbg_level; +static int skd_isr_comp_limit = 4; + +enum { + STEC_LINK_2_5GTS = 0, + STEC_LINK_5GTS = 1, + STEC_LINK_8GTS = 2, + STEC_LINK_UNKNOWN = 0xFF +}; + +enum { + SKD_FLUSH_INITIALIZER, + SKD_FLUSH_ZERO_SIZE_FIRST, + SKD_FLUSH_DATA_SECOND, +}; + +#define SKD_ASSERT(expr) \ + do { \ + if (unlikely(!(expr))) { \ + pr_err("Assertion failed! %s,%s,%s,line=%d\n", \ + # expr, __FILE__, __func__, __LINE__); \ + } \ + } while (0) + +#define DRV_NAME "skd" +#define DRV_VERSION "2.2.1" +#define DRV_BUILD_ID "0260" +#define PFX DRV_NAME ": " +#define DRV_BIN_VERSION 0x100 +#define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID + +MODULE_AUTHOR("bug-reports: support@stec-inc.com"); +MODULE_LICENSE("Dual BSD/GPL"); + +MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")"); +MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID); + +#define PCI_VENDOR_ID_STEC 0x1B39 +#define PCI_DEVICE_ID_S1120 0x0001 + +#define SKD_FUA_NV (1 << 1) +#define SKD_MINORS_PER_DEVICE 16 + +#define SKD_MAX_QUEUE_DEPTH 200u + +#define SKD_PAUSE_TIMEOUT (5 * 1000) + +#define SKD_N_FITMSG_BYTES (512u) + +#define SKD_N_SPECIAL_CONTEXT 32u +#define SKD_N_SPECIAL_FITMSG_BYTES (128u) + +/* SG elements are 32 bytes, so we can make this 4096 and still be under the + * 128KB limit. That allows 4096*4K = 16M xfer size + */ +#define SKD_N_SG_PER_REQ_DEFAULT 256u +#define SKD_N_SG_PER_SPECIAL 256u + +#define SKD_N_COMPLETION_ENTRY 256u +#define SKD_N_READ_CAP_BYTES (8u) + +#define SKD_N_INTERNAL_BYTES (512u) + +/* 5 bits of uniqifier, 0xF800 */ +#define SKD_ID_INCR (0x400) +#define SKD_ID_TABLE_MASK (3u << 8u) +#define SKD_ID_RW_REQUEST (0u << 8u) +#define SKD_ID_INTERNAL (1u << 8u) +#define SKD_ID_SPECIAL_REQUEST (2u << 8u) +#define SKD_ID_FIT_MSG (3u << 8u) +#define SKD_ID_SLOT_MASK 0x00FFu +#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu + +#define SKD_N_TIMEOUT_SLOT 4u +#define SKD_TIMEOUT_SLOT_MASK 3u + +#define SKD_N_MAX_SECTORS 2048u + +#define SKD_MAX_RETRIES 2u + +#define SKD_TIMER_SECONDS(seconds) (seconds) +#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60)) + +#define INQ_STD_NBYTES 36 +#define SKD_DISCARD_CDB_LENGTH 24 + +enum skd_drvr_state { + SKD_DRVR_STATE_LOAD, + SKD_DRVR_STATE_IDLE, + SKD_DRVR_STATE_BUSY, + SKD_DRVR_STATE_STARTING, + SKD_DRVR_STATE_ONLINE, + SKD_DRVR_STATE_PAUSING, + SKD_DRVR_STATE_PAUSED, + SKD_DRVR_STATE_DRAINING_TIMEOUT, + SKD_DRVR_STATE_RESTARTING, + SKD_DRVR_STATE_RESUMING, + SKD_DRVR_STATE_STOPPING, + SKD_DRVR_STATE_FAULT, + SKD_DRVR_STATE_DISAPPEARED, + SKD_DRVR_STATE_PROTOCOL_MISMATCH, + SKD_DRVR_STATE_BUSY_ERASE, + SKD_DRVR_STATE_BUSY_SANITIZE, + SKD_DRVR_STATE_BUSY_IMMINENT, + SKD_DRVR_STATE_WAIT_BOOT, + SKD_DRVR_STATE_SYNCING, +}; + +#define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u) +#define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u) +#define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u) +#define SKD_DRAINING_TIMO SKD_TIMER_SECONDS(6u) +#define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u) +#define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u) +#define SKD_START_WAIT_SECONDS 90u + +enum skd_req_state { + SKD_REQ_STATE_IDLE, + SKD_REQ_STATE_SETUP, + SKD_REQ_STATE_BUSY, + SKD_REQ_STATE_COMPLETED, + SKD_REQ_STATE_TIMEOUT, + SKD_REQ_STATE_ABORTED, +}; + +enum skd_fit_msg_state { + SKD_MSG_STATE_IDLE, + SKD_MSG_STATE_BUSY, +}; + +enum skd_check_status_action { + SKD_CHECK_STATUS_REPORT_GOOD, + SKD_CHECK_STATUS_REPORT_SMART_ALERT, + SKD_CHECK_STATUS_REQUEUE_REQUEST, + SKD_CHECK_STATUS_REPORT_ERROR, + SKD_CHECK_STATUS_BUSY_IMMINENT, +}; + +struct skd_fitmsg_context { + enum skd_fit_msg_state state; + + struct skd_fitmsg_context *next; + + u32 id; + u16 outstanding; + + u32 length; + u32 offset; + + u8 *msg_buf; + dma_addr_t mb_dma_address; +}; + +struct skd_request_context { + enum skd_req_state state; + + struct skd_request_context *next; + + u16 id; + u32 fitmsg_id; + + struct request *req; + u8 flush_cmd; + u8 discard_page; + + u32 timeout_stamp; + u8 sg_data_dir; + struct scatterlist *sg; + u32 n_sg; + u32 sg_byte_count; + + struct fit_sg_descriptor *sksg_list; + dma_addr_t sksg_dma_address; + + struct fit_completion_entry_v1 completion; + + struct fit_comp_error_info err_info; + +}; +#define SKD_DATA_DIR_HOST_TO_CARD 1 +#define SKD_DATA_DIR_CARD_TO_HOST 2 +#define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */ + +struct skd_special_context { + struct skd_request_context req; + + u8 orphaned; + + void *data_buf; + dma_addr_t db_dma_address; + + u8 *msg_buf; + dma_addr_t mb_dma_address; +}; + +struct skd_sg_io { + fmode_t mode; + void __user *argp; + + struct sg_io_hdr sg; + + u8 cdb[16]; + + u32 dxfer_len; + u32 iovcnt; + struct sg_iovec *iov; + struct sg_iovec no_iov_iov; + + struct skd_special_context *skspcl; +}; + +typedef enum skd_irq_type { + SKD_IRQ_LEGACY, + SKD_IRQ_MSI, + SKD_IRQ_MSIX +} skd_irq_type_t; + +#define SKD_MAX_BARS 2 + +struct skd_device { + volatile void __iomem *mem_map[SKD_MAX_BARS]; + resource_size_t mem_phys[SKD_MAX_BARS]; + u32 mem_size[SKD_MAX_BARS]; + + skd_irq_type_t irq_type; + u32 msix_count; + struct skd_msix_entry *msix_entries; + + struct pci_dev *pdev; + int pcie_error_reporting_is_enabled; + + spinlock_t lock; + struct gendisk *disk; + struct request_queue *queue; + struct device *class_dev; + int gendisk_on; + int sync_done; + + atomic_t device_count; + u32 devno; + u32 major; + char name[32]; + char isr_name[30]; + + enum skd_drvr_state state; + u32 drive_state; + + u32 in_flight; + u32 cur_max_queue_depth; + u32 queue_low_water_mark; + u32 dev_max_queue_depth; + + u32 num_fitmsg_context; + u32 num_req_context; + + u32 timeout_slot[SKD_N_TIMEOUT_SLOT]; + u32 timeout_stamp; + struct skd_fitmsg_context *skmsg_free_list; + struct skd_fitmsg_context *skmsg_table; + + struct skd_request_context *skreq_free_list; + struct skd_request_context *skreq_table; + + struct skd_special_context *skspcl_free_list; + struct skd_special_context *skspcl_table; + + struct skd_special_context internal_skspcl; + u32 read_cap_blocksize; + u32 read_cap_last_lba; + int read_cap_is_valid; + int inquiry_is_valid; + u8 inq_serial_num[13]; /*12 chars plus null term */ + u8 id_str[80]; /* holds a composite name (pci + sernum) */ + + u8 skcomp_cycle; + u32 skcomp_ix; + struct fit_completion_entry_v1 *skcomp_table; + struct fit_comp_error_info *skerr_table; + dma_addr_t cq_dma_address; + + wait_queue_head_t waitq; + + struct timer_list timer; + u32 timer_countdown; + u32 timer_substate; + + int n_special; + int sgs_per_request; + u32 last_mtd; + + u32 proto_ver; + + int dbg_level; + u32 connect_time_stamp; + int connect_retries; +#define SKD_MAX_CONNECT_RETRIES 16 + u32 drive_jiffies; + + u32 timo_slot; + + + struct work_struct completion_worker; +}; + +#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF) +#define SKD_READL(DEV, OFF) skd_reg_read32(DEV, OFF) +#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF) + +static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset) +{ + u32 val; + + if (likely(skdev->dbg_level < 2)) + return readl(skdev->mem_map[1] + offset); + else { + barrier(); + val = readl(skdev->mem_map[1] + offset); + barrier(); + pr_debug("%s:%s:%d offset %x = %x\n", + skdev->name, __func__, __LINE__, offset, val); + return val; + } + +} + +static inline void skd_reg_write32(struct skd_device *skdev, u32 val, + u32 offset) +{ + if (likely(skdev->dbg_level < 2)) { + writel(val, skdev->mem_map[1] + offset); + barrier(); + } else { + barrier(); + writel(val, skdev->mem_map[1] + offset); + barrier(); + pr_debug("%s:%s:%d offset %x = %x\n", + skdev->name, __func__, __LINE__, offset, val); + } +} + +static inline void skd_reg_write64(struct skd_device *skdev, u64 val, + u32 offset) +{ + if (likely(skdev->dbg_level < 2)) { + writeq(val, skdev->mem_map[1] + offset); + barrier(); + } else { + barrier(); + writeq(val, skdev->mem_map[1] + offset); + barrier(); + pr_debug("%s:%s:%d offset %x = %016llx\n", + skdev->name, __func__, __LINE__, offset, val); + } +} + + +#define SKD_IRQ_DEFAULT SKD_IRQ_MSI +static int skd_isr_type = SKD_IRQ_DEFAULT; + +module_param(skd_isr_type, int, 0444); +MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability." + " (0==legacy, 1==MSI, 2==MSI-X, default==1)"); + +#define SKD_MAX_REQ_PER_MSG_DEFAULT 1 +static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; + +module_param(skd_max_req_per_msg, int, 0444); +MODULE_PARM_DESC(skd_max_req_per_msg, + "Maximum SCSI requests packed in a single message." + " (1-14, default==1)"); + +#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64 +#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64" +static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; + +module_param(skd_max_queue_depth, int, 0444); +MODULE_PARM_DESC(skd_max_queue_depth, + "Maximum SCSI requests issued to s1120." + " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")"); + +static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; +module_param(skd_sgs_per_request, int, 0444); +MODULE_PARM_DESC(skd_sgs_per_request, + "Maximum SG elements per block request." + " (1-4096, default==256)"); + +static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; +module_param(skd_max_pass_thru, int, 0444); +MODULE_PARM_DESC(skd_max_pass_thru, + "Maximum SCSI pass-thru at a time." " (1-50, default==32)"); + +module_param(skd_dbg_level, int, 0444); +MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)"); + +module_param(skd_isr_comp_limit, int, 0444); +MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4"); + +/* Major device number dynamically assigned. */ +static u32 skd_major; + +static void skd_destruct(struct skd_device *skdev); +static const struct block_device_operations skd_blockdev_ops; +static void skd_send_fitmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg); +static void skd_send_special_fitmsg(struct skd_device *skdev, + struct skd_special_context *skspcl); +static void skd_request_fn(struct request_queue *rq); +static void skd_end_request(struct skd_device *skdev, + struct skd_request_context *skreq, int error); +static int skd_preop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq); +static void skd_postop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq); + +static void skd_restart_device(struct skd_device *skdev); +static int skd_quiesce_dev(struct skd_device *skdev); +static int skd_unquiesce_dev(struct skd_device *skdev); +static void skd_release_special(struct skd_device *skdev, + struct skd_special_context *skspcl); +static void skd_disable_interrupts(struct skd_device *skdev); +static void skd_isr_fwstate(struct skd_device *skdev); +static void skd_recover_requests(struct skd_device *skdev, int requeue); +static void skd_soft_reset(struct skd_device *skdev); + +static const char *skd_name(struct skd_device *skdev); +const char *skd_drive_state_to_str(int state); +const char *skd_skdev_state_to_str(enum skd_drvr_state state); +static void skd_log_skdev(struct skd_device *skdev, const char *event); +static void skd_log_skmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg, const char *event); +static void skd_log_skreq(struct skd_device *skdev, + struct skd_request_context *skreq, const char *event); + +/* + ***************************************************************************** + * READ/WRITE REQUESTS + ***************************************************************************** + */ +static void skd_fail_all_pending(struct skd_device *skdev) +{ + struct request_queue *q = skdev->queue; + struct request *req; + + for (;; ) { + req = blk_peek_request(q); + if (req == NULL) + break; + blk_start_request(req); + __blk_end_request_all(req, -EIO); + } +} + +static void +skd_prep_rw_cdb(struct skd_scsi_request *scsi_req, + int data_dir, unsigned lba, + unsigned count) +{ + if (data_dir == READ) + scsi_req->cdb[0] = 0x28; + else + scsi_req->cdb[0] = 0x2a; + + scsi_req->cdb[1] = 0; + scsi_req->cdb[2] = (lba & 0xff000000) >> 24; + scsi_req->cdb[3] = (lba & 0xff0000) >> 16; + scsi_req->cdb[4] = (lba & 0xff00) >> 8; + scsi_req->cdb[5] = (lba & 0xff); + scsi_req->cdb[6] = 0; + scsi_req->cdb[7] = (count & 0xff00) >> 8; + scsi_req->cdb[8] = count & 0xff; + scsi_req->cdb[9] = 0; +} + +static void +skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req, + struct skd_request_context *skreq) +{ + skreq->flush_cmd = 1; + + scsi_req->cdb[0] = 0x35; + scsi_req->cdb[1] = 0; + scsi_req->cdb[2] = 0; + scsi_req->cdb[3] = 0; + scsi_req->cdb[4] = 0; + scsi_req->cdb[5] = 0; + scsi_req->cdb[6] = 0; + scsi_req->cdb[7] = 0; + scsi_req->cdb[8] = 0; + scsi_req->cdb[9] = 0; +} + +static void +skd_prep_discard_cdb(struct skd_scsi_request *scsi_req, + struct skd_request_context *skreq, + struct page *page, + u32 lba, u32 count) +{ + char *buf; + unsigned long len; + struct request *req; + + buf = page_address(page); + len = SKD_DISCARD_CDB_LENGTH; + + scsi_req->cdb[0] = UNMAP; + scsi_req->cdb[8] = len; + + put_unaligned_be16(6 + 16, &buf[0]); + put_unaligned_be16(16, &buf[2]); + put_unaligned_be64(lba, &buf[8]); + put_unaligned_be32(count, &buf[16]); + + req = skreq->req; + blk_add_request_payload(req, page, len); +} + +static void skd_request_fn_not_online(struct request_queue *q); + +static void skd_request_fn(struct request_queue *q) +{ + struct skd_device *skdev = q->queuedata; + struct skd_fitmsg_context *skmsg = NULL; + struct fit_msg_hdr *fmh = NULL; + struct skd_request_context *skreq; + struct request *req = NULL; + struct skd_scsi_request *scsi_req; + struct page *page; + unsigned long io_flags; + int error; + u32 lba; + u32 count; + int data_dir; + u32 be_lba; + u32 be_count; + u64 be_dmaa; + u64 cmdctxt; + u32 timo_slot; + void *cmd_ptr; + int flush, fua; + + if (skdev->state != SKD_DRVR_STATE_ONLINE) { + skd_request_fn_not_online(q); + return; + } + + if (blk_queue_stopped(skdev->queue)) { + if (skdev->skmsg_free_list == NULL || + skdev->skreq_free_list == NULL || + skdev->in_flight >= skdev->queue_low_water_mark) + /* There is still some kind of shortage */ + return; + + queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue); + } + + /* + * Stop conditions: + * - There are no more native requests + * - There are already the maximum number of requests in progress + * - There are no more skd_request_context entries + * - There are no more FIT msg buffers + */ + for (;; ) { + + flush = fua = 0; + + req = blk_peek_request(q); + + /* Are there any native requests to start? */ + if (req == NULL) + break; + + lba = (u32)blk_rq_pos(req); + count = blk_rq_sectors(req); + data_dir = rq_data_dir(req); + io_flags = req->cmd_flags; + + if (io_flags & REQ_FLUSH) + flush++; + + if (io_flags & REQ_FUA) + fua++; + + pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) " + "count=%u(0x%x) dir=%d\n", + skdev->name, __func__, __LINE__, + req, lba, lba, count, count, data_dir); + + /* At this point we know there is a request */ + + /* Are too many requets already in progress? */ + if (skdev->in_flight >= skdev->cur_max_queue_depth) { + pr_debug("%s:%s:%d qdepth %d, limit %d\n", + skdev->name, __func__, __LINE__, + skdev->in_flight, skdev->cur_max_queue_depth); + break; + } + + /* Is a skd_request_context available? */ + skreq = skdev->skreq_free_list; + if (skreq == NULL) { + pr_debug("%s:%s:%d Out of req=%p\n", + skdev->name, __func__, __LINE__, q); + break; + } + SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE); + SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0); + + /* Now we check to see if we can get a fit msg */ + if (skmsg == NULL) { + if (skdev->skmsg_free_list == NULL) { + pr_debug("%s:%s:%d Out of msg\n", + skdev->name, __func__, __LINE__); + break; + } + } + + skreq->flush_cmd = 0; + skreq->n_sg = 0; + skreq->sg_byte_count = 0; + skreq->discard_page = 0; + + /* + * OK to now dequeue request from q. + * + * At this point we are comitted to either start or reject + * the native request. Note that skd_request_context is + * available but is still at the head of the free list. + */ + blk_start_request(req); + skreq->req = req; + skreq->fitmsg_id = 0; + + /* Either a FIT msg is in progress or we have to start one. */ + if (skmsg == NULL) { + /* Are there any FIT msg buffers available? */ + skmsg = skdev->skmsg_free_list; + if (skmsg == NULL) { + pr_debug("%s:%s:%d Out of msg skdev=%p\n", + skdev->name, __func__, __LINE__, + skdev); + break; + } + SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE); + SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0); + + skdev->skmsg_free_list = skmsg->next; + + skmsg->state = SKD_MSG_STATE_BUSY; + skmsg->id += SKD_ID_INCR; + + /* Initialize the FIT msg header */ + fmh = (struct fit_msg_hdr *)skmsg->msg_buf; + memset(fmh, 0, sizeof(*fmh)); + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + skmsg->length = sizeof(*fmh); + } + + skreq->fitmsg_id = skmsg->id; + + /* + * Note that a FIT msg may have just been started + * but contains no SoFIT requests yet. + */ + + /* + * Transcode the request, checking as we go. The outcome of + * the transcoding is represented by the error variable. + */ + cmd_ptr = &skmsg->msg_buf[skmsg->length]; + memset(cmd_ptr, 0, 32); + + be_lba = cpu_to_be32(lba); + be_count = cpu_to_be32(count); + be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address); + cmdctxt = skreq->id + SKD_ID_INCR; + + scsi_req = cmd_ptr; + scsi_req->hdr.tag = cmdctxt; + scsi_req->hdr.sg_list_dma_address = be_dmaa; + + if (data_dir == READ) + skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST; + else + skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD; + + if (io_flags & REQ_DISCARD) { + page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!page) { + pr_err("request_fn:Page allocation failed.\n"); + skd_end_request(skdev, skreq, -ENOMEM); + break; + } + skreq->discard_page = 1; + req->completion_data = page; + skd_prep_discard_cdb(scsi_req, skreq, page, lba, count); + + } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) { + skd_prep_zerosize_flush_cdb(scsi_req, skreq); + SKD_ASSERT(skreq->flush_cmd == 1); + + } else { + skd_prep_rw_cdb(scsi_req, data_dir, lba, count); + } + + if (fua) + scsi_req->cdb[1] |= SKD_FUA_NV; + + if (!req->bio) + goto skip_sg; + + error = skd_preop_sg_list(skdev, skreq); + + if (error != 0) { + /* + * Complete the native request with error. + * Note that the request context is still at the + * head of the free list, and that the SoFIT request + * was encoded into the FIT msg buffer but the FIT + * msg length has not been updated. In short, the + * only resource that has been allocated but might + * not be used is that the FIT msg could be empty. + */ + pr_debug("%s:%s:%d error Out\n", + skdev->name, __func__, __LINE__); + skd_end_request(skdev, skreq, error); + continue; + } + +skip_sg: + scsi_req->hdr.sg_list_len_bytes = + cpu_to_be32(skreq->sg_byte_count); + + /* Complete resource allocations. */ + skdev->skreq_free_list = skreq->next; + skreq->state = SKD_REQ_STATE_BUSY; + skreq->id += SKD_ID_INCR; + + skmsg->length += sizeof(struct skd_scsi_request); + fmh->num_protocol_cmds_coalesced++; + + /* + * Update the active request counts. + * Capture the timeout timestamp. + */ + skreq->timeout_stamp = skdev->timeout_stamp; + timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + skdev->timeout_slot[timo_slot]++; + skdev->in_flight++; + pr_debug("%s:%s:%d req=0x%x busy=%d\n", + skdev->name, __func__, __LINE__, + skreq->id, skdev->in_flight); + + /* + * If the FIT msg buffer is full send it. + */ + if (skmsg->length >= SKD_N_FITMSG_BYTES || + fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) { + skd_send_fitmsg(skdev, skmsg); + skmsg = NULL; + fmh = NULL; + } + } + + /* + * Is a FIT msg in progress? If it is empty put the buffer back + * on the free list. If it is non-empty send what we got. + * This minimizes latency when there are fewer requests than + * what fits in a FIT msg. + */ + if (skmsg != NULL) { + /* Bigger than just a FIT msg header? */ + if (skmsg->length > sizeof(struct fit_msg_hdr)) { + pr_debug("%s:%s:%d sending msg=%p, len %d\n", + skdev->name, __func__, __LINE__, + skmsg, skmsg->length); + skd_send_fitmsg(skdev, skmsg); + } else { + /* + * The FIT msg is empty. It means we got started + * on the msg, but the requests were rejected. + */ + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->id += SKD_ID_INCR; + skmsg->next = skdev->skmsg_free_list; + skdev->skmsg_free_list = skmsg; + } + skmsg = NULL; + fmh = NULL; + } + + /* + * If req is non-NULL it means there is something to do but + * we are out of a resource. + */ + if (req) + blk_stop_queue(skdev->queue); +} + +static void skd_end_request(struct skd_device *skdev, + struct skd_request_context *skreq, int error) +{ + struct request *req = skreq->req; + unsigned int io_flags = req->cmd_flags; + + if ((io_flags & REQ_DISCARD) && + (skreq->discard_page == 1)) { + pr_debug("%s:%s:%d, free the page!", + skdev->name, __func__, __LINE__); + __free_page(req->completion_data); + } + + if (unlikely(error)) { + struct request *req = skreq->req; + char *cmd = (rq_data_dir(req) == READ) ? "read" : "write"; + u32 lba = (u32)blk_rq_pos(req); + u32 count = blk_rq_sectors(req); + + pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n", + skd_name(skdev), cmd, lba, count, skreq->id); + } else + pr_debug("%s:%s:%d id=0x%x error=%d\n", + skdev->name, __func__, __LINE__, skreq->id, error); + + __blk_end_request_all(skreq->req, error); +} + +static int skd_preop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + struct request *req = skreq->req; + int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; + int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; + struct scatterlist *sg = &skreq->sg[0]; + int n_sg; + int i; + + skreq->sg_byte_count = 0; + + /* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD || + skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */ + + n_sg = blk_rq_map_sg(skdev->queue, req, sg); + if (n_sg <= 0) + return -EINVAL; + + /* + * Map scatterlist to PCI bus addresses. + * Note PCI might change the number of entries. + */ + n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir); + if (n_sg <= 0) + return -EINVAL; + + SKD_ASSERT(n_sg <= skdev->sgs_per_request); + + skreq->n_sg = n_sg; + + for (i = 0; i < n_sg; i++) { + struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; + u32 cnt = sg_dma_len(&sg[i]); + uint64_t dma_addr = sg_dma_address(&sg[i]); + + sgd->control = FIT_SGD_CONTROL_NOT_LAST; + sgd->byte_count = cnt; + skreq->sg_byte_count += cnt; + sgd->host_side_addr = dma_addr; + sgd->dev_side_addr = 0; + } + + skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL; + skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST; + + if (unlikely(skdev->dbg_level > 1)) { + pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", + skdev->name, __func__, __LINE__, + skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + for (i = 0; i < n_sg; i++) { + struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; + pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " + "addr=0x%llx next=0x%llx\n", + skdev->name, __func__, __LINE__, + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); + } + } + + return 0; +} + +static void skd_postop_sg_list(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD; + int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE; + + /* + * restore the next ptr for next IO request so we + * don't have to set it every time. + */ + skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr = + skreq->sksg_dma_address + + ((skreq->n_sg) * sizeof(struct fit_sg_descriptor)); + pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir); +} + +static void skd_request_fn_not_online(struct request_queue *q) +{ + struct skd_device *skdev = q->queuedata; + int error; + + SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE); + + skd_log_skdev(skdev, "req_not_online"); + switch (skdev->state) { + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_WAIT_BOOT: + /* In case of starting, we haven't started the queue, + * so we can't get here... but requests are + * possibly hanging out waiting for us because we + * reported the dev/skd0 already. They'll wait + * forever if connect doesn't complete. + * What to do??? delay dev/skd0 ?? + */ + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + return; + + case SKD_DRVR_STATE_BUSY_SANITIZE: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_DISAPPEARED: + default: + error = -EIO; + break; + } + + /* If we get here, terminate all pending block requeusts + * with EIO and any scsi pass thru with appropriate sense + */ + + skd_fail_all_pending(skdev); +} + +/* + ***************************************************************************** + * TIMER + ***************************************************************************** + */ + +static void skd_timer_tick_not_online(struct skd_device *skdev); + +static void skd_timer_tick(ulong arg) +{ + struct skd_device *skdev = (struct skd_device *)arg; + + u32 timo_slot; + u32 overdue_timestamp; + unsigned long reqflags; + u32 state; + + if (skdev->state == SKD_DRVR_STATE_FAULT) + /* The driver has declared fault, and we want it to + * stay that way until driver is reloaded. + */ + return; + + spin_lock_irqsave(&skdev->lock, reqflags); + + state = SKD_READL(skdev, FIT_STATUS); + state &= FIT_SR_DRIVE_STATE_MASK; + if (state != skdev->drive_state) + skd_isr_fwstate(skdev); + + if (skdev->state != SKD_DRVR_STATE_ONLINE) { + skd_timer_tick_not_online(skdev); + goto timer_func_out; + } + skdev->timeout_stamp++; + timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + + /* + * All requests that happened during the previous use of + * this slot should be done by now. The previous use was + * over 7 seconds ago. + */ + if (skdev->timeout_slot[timo_slot] == 0) + goto timer_func_out; + + /* Something is overdue */ + overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT; + + pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n", + skdev->name, __func__, __LINE__, + skdev->timeout_slot[timo_slot], skdev->in_flight); + pr_err("(%s): Overdue IOs (%d), busy %d\n", + skd_name(skdev), skdev->timeout_slot[timo_slot], + skdev->in_flight); + + skdev->timer_countdown = SKD_DRAINING_TIMO; + skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT; + skdev->timo_slot = timo_slot; + blk_stop_queue(skdev->queue); + +timer_func_out: + mod_timer(&skdev->timer, (jiffies + HZ)); + + spin_unlock_irqrestore(&skdev->lock, reqflags); +} + +static void skd_timer_tick_not_online(struct skd_device *skdev) +{ + switch (skdev->state) { + case SKD_DRVR_STATE_IDLE: + case SKD_DRVR_STATE_LOAD: + break; + case SKD_DRVR_STATE_BUSY_SANITIZE: + pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n", + skdev->name, __func__, __LINE__, + skdev->drive_state, skdev->state); + /* If we've been in sanitize for 3 seconds, we figure we're not + * going to get anymore completions, so recover requests now + */ + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + skd_recover_requests(skdev, 0); + break; + + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + pr_debug("%s:%s:%d busy[%x], countdown=%d\n", + skdev->name, __func__, __LINE__, + skdev->state, skdev->timer_countdown); + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.", + skdev->name, __func__, __LINE__, + skdev->state, skdev->timer_countdown); + skd_restart_device(skdev); + break; + + case SKD_DRVR_STATE_WAIT_BOOT: + case SKD_DRVR_STATE_STARTING: + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + /* For now, we fault the drive. Could attempt resets to + * revcover at some point. */ + skdev->state = SKD_DRVR_STATE_FAULT; + + pr_err("(%s): DriveFault Connect Timeout (%x)\n", + skd_name(skdev), skdev->drive_state); + + /*start the queue so we can respond with error to requests */ + /* wakeup anyone waiting for startup complete */ + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + case SKD_DRVR_STATE_ONLINE: + /* shouldn't get here. */ + break; + + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + break; + + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + pr_debug("%s:%s:%d " + "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n", + skdev->name, __func__, __LINE__, + skdev->timo_slot, + skdev->timer_countdown, + skdev->in_flight, + skdev->timeout_slot[skdev->timo_slot]); + /* if the slot has cleared we can let the I/O continue */ + if (skdev->timeout_slot[skdev->timo_slot] == 0) { + pr_debug("%s:%s:%d Slot drained, starting queue.\n", + skdev->name, __func__, __LINE__); + skdev->state = SKD_DRVR_STATE_ONLINE; + blk_start_queue(skdev->queue); + return; + } + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + skd_restart_device(skdev); + break; + + case SKD_DRVR_STATE_RESTARTING: + if (skdev->timer_countdown > 0) { + skdev->timer_countdown--; + return; + } + /* For now, we fault the drive. Could attempt resets to + * revcover at some point. */ + skdev->state = SKD_DRVR_STATE_FAULT; + pr_err("(%s): DriveFault Reconnect Timeout (%x)\n", + skd_name(skdev), skdev->drive_state); + + /* + * Recovering does two things: + * 1. completes IO with error + * 2. reclaims dma resources + * When is it safe to recover requests? + * - if the drive state is faulted + * - if the state is still soft reset after out timeout + * - if the drive registers are dead (state = FF) + * If it is "unsafe", we still need to recover, so we will + * disable pci bus mastering and disable our interrupts. + */ + + if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) || + (skdev->drive_state == FIT_SR_DRIVE_FAULT) || + (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK)) + /* It never came out of soft reset. Try to + * recover the requests and then let them + * fail. This is to mitigate hung processes. */ + skd_recover_requests(skdev, 0); + else { + pr_err("(%s): Disable BusMaster (%x)\n", + skd_name(skdev), skdev->drive_state); + pci_disable_device(skdev->pdev); + skd_disable_interrupts(skdev); + skd_recover_requests(skdev, 0); + } + + /*start the queue so we can respond with error to requests */ + /* wakeup anyone waiting for startup complete */ + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + case SKD_DRVR_STATE_RESUMING: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_DISAPPEARED: + default: + break; + } +} + +static int skd_start_timer(struct skd_device *skdev) +{ + int rc; + + init_timer(&skdev->timer); + setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev); + + rc = mod_timer(&skdev->timer, (jiffies + HZ)); + if (rc) + pr_err("%s: failed to start timer %d\n", + __func__, rc); + return rc; +} + +static void skd_kill_timer(struct skd_device *skdev) +{ + del_timer_sync(&skdev->timer); +} + +/* + ***************************************************************************** + * IOCTL + ***************************************************************************** + */ +static int skd_ioctl_sg_io(struct skd_device *skdev, + fmode_t mode, void __user *argp); +static int skd_sg_io_get_and_check_args(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_prep_buffering(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_copy_buffer(struct skd_device *skdev, + struct skd_sg_io *sksgio, int dxfer_dir); +static int skd_sg_io_send_fitmsg(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio); +static int skd_sg_io_release_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio); +static int skd_sg_io_put_status(struct skd_device *skdev, + struct skd_sg_io *sksgio); + +static void skd_complete_special(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl); + +static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode, + uint cmd_in, ulong arg) +{ + int rc = 0; + struct gendisk *disk = bdev->bd_disk; + struct skd_device *skdev = disk->private_data; + void __user *p = (void *)arg; + + pr_debug("%s:%s:%d %s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n", + skdev->name, __func__, __LINE__, + disk->disk_name, current->comm, mode, cmd_in, arg); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd_in) { + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_VERSION_NUM: + rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p); + break; + case SG_IO: + rc = skd_ioctl_sg_io(skdev, mode, p); + break; + + default: + rc = -ENOTTY; + break; + } + + pr_debug("%s:%s:%d %s: completion rc %d\n", + skdev->name, __func__, __LINE__, disk->disk_name, rc); + return rc; +} + +static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode, + void __user *argp) +{ + int rc; + struct skd_sg_io sksgio; + + memset(&sksgio, 0, sizeof(sksgio)); + sksgio.mode = mode; + sksgio.argp = argp; + sksgio.iov = &sksgio.no_iov_iov; + + switch (skdev->state) { + case SKD_DRVR_STATE_ONLINE: + case SKD_DRVR_STATE_BUSY_IMMINENT: + break; + + default: + pr_debug("%s:%s:%d drive not online\n", + skdev->name, __func__, __LINE__); + rc = -ENXIO; + goto out; + } + + rc = skd_sg_io_get_and_check_args(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_obtain_skspcl(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_prep_buffering(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV); + if (rc) + goto out; + + rc = skd_sg_io_send_fitmsg(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_await(skdev, &sksgio); + if (rc) + goto out; + + rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV); + if (rc) + goto out; + + rc = skd_sg_io_put_status(skdev, &sksgio); + if (rc) + goto out; + + rc = 0; + +out: + skd_sg_io_release_skspcl(skdev, &sksgio); + + if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov) + kfree(sksgio.iov); + return rc; +} + +static int skd_sg_io_get_and_check_args(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct sg_io_hdr *sgp = &sksgio->sg; + int i, acc; + + if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) { + pr_debug("%s:%s:%d access sg failed %p\n", + skdev->name, __func__, __LINE__, sksgio->argp); + return -EFAULT; + } + + if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) { + pr_debug("%s:%s:%d copy_from_user sg failed %p\n", + skdev->name, __func__, __LINE__, sksgio->argp); + return -EFAULT; + } + + if (sgp->interface_id != SG_INTERFACE_ID_ORIG) { + pr_debug("%s:%s:%d interface_id invalid 0x%x\n", + skdev->name, __func__, __LINE__, sgp->interface_id); + return -EINVAL; + } + + if (sgp->cmd_len > sizeof(sksgio->cdb)) { + pr_debug("%s:%s:%d cmd_len invalid %d\n", + skdev->name, __func__, __LINE__, sgp->cmd_len); + return -EINVAL; + } + + if (sgp->iovec_count > 256) { + pr_debug("%s:%s:%d iovec_count invalid %d\n", + skdev->name, __func__, __LINE__, sgp->iovec_count); + return -EINVAL; + } + + if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) { + pr_debug("%s:%s:%d dxfer_len invalid %d\n", + skdev->name, __func__, __LINE__, sgp->dxfer_len); + return -EINVAL; + } + + switch (sgp->dxfer_direction) { + case SG_DXFER_NONE: + acc = -1; + break; + + case SG_DXFER_TO_DEV: + acc = VERIFY_READ; + break; + + case SG_DXFER_FROM_DEV: + case SG_DXFER_TO_FROM_DEV: + acc = VERIFY_WRITE; + break; + + default: + pr_debug("%s:%s:%d dxfer_dir invalid %d\n", + skdev->name, __func__, __LINE__, sgp->dxfer_direction); + return -EINVAL; + } + + if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) { + pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n", + skdev->name, __func__, __LINE__, sgp->cmdp); + return -EFAULT; + } + + if (sgp->mx_sb_len != 0) { + if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) { + pr_debug("%s:%s:%d access sbp failed %p\n", + skdev->name, __func__, __LINE__, sgp->sbp); + return -EFAULT; + } + } + + if (sgp->iovec_count == 0) { + sksgio->iov[0].iov_base = sgp->dxferp; + sksgio->iov[0].iov_len = sgp->dxfer_len; + sksgio->iovcnt = 1; + sksgio->dxfer_len = sgp->dxfer_len; + } else { + struct sg_iovec *iov; + uint nbytes = sizeof(*iov) * sgp->iovec_count; + size_t iov_data_len; + + iov = kmalloc(nbytes, GFP_KERNEL); + if (iov == NULL) { + pr_debug("%s:%s:%d alloc iovec failed %d\n", + skdev->name, __func__, __LINE__, + sgp->iovec_count); + return -ENOMEM; + } + sksgio->iov = iov; + sksgio->iovcnt = sgp->iovec_count; + + if (copy_from_user(iov, sgp->dxferp, nbytes)) { + pr_debug("%s:%s:%d copy_from_user iovec failed %p\n", + skdev->name, __func__, __LINE__, sgp->dxferp); + return -EFAULT; + } + + /* + * Sum up the vecs, making sure they don't overflow + */ + iov_data_len = 0; + for (i = 0; i < sgp->iovec_count; i++) { + if (iov_data_len + iov[i].iov_len < iov_data_len) + return -EINVAL; + iov_data_len += iov[i].iov_len; + } + + /* SG_IO howto says that the shorter of the two wins */ + if (sgp->dxfer_len < iov_data_len) { + sksgio->iovcnt = iov_shorten((struct iovec *)iov, + sgp->iovec_count, + sgp->dxfer_len); + sksgio->dxfer_len = sgp->dxfer_len; + } else + sksgio->dxfer_len = iov_data_len; + } + + if (sgp->dxfer_direction != SG_DXFER_NONE) { + struct sg_iovec *iov = sksgio->iov; + for (i = 0; i < sksgio->iovcnt; i++, iov++) { + if (!access_ok(acc, iov->iov_base, iov->iov_len)) { + pr_debug("%s:%s:%d access data failed %p/%d\n", + skdev->name, __func__, __LINE__, + iov->iov_base, (int)iov->iov_len); + return -EFAULT; + } + } + } + + return 0; +} + +static int skd_sg_io_obtain_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = NULL; + int rc; + + for (;;) { + ulong flags; + + spin_lock_irqsave(&skdev->lock, flags); + skspcl = skdev->skspcl_free_list; + if (skspcl != NULL) { + skdev->skspcl_free_list = + (struct skd_special_context *)skspcl->req.next; + skspcl->req.id += SKD_ID_INCR; + skspcl->req.state = SKD_REQ_STATE_SETUP; + skspcl->orphaned = 0; + skspcl->req.n_sg = 0; + } + spin_unlock_irqrestore(&skdev->lock, flags); + + if (skspcl != NULL) { + rc = 0; + break; + } + + pr_debug("%s:%s:%d blocking\n", + skdev->name, __func__, __LINE__); + + rc = wait_event_interruptible_timeout( + skdev->waitq, + (skdev->skspcl_free_list != NULL), + msecs_to_jiffies(sksgio->sg.timeout)); + + pr_debug("%s:%s:%d unblocking, rc=%d\n", + skdev->name, __func__, __LINE__, rc); + + if (rc <= 0) { + if (rc == 0) + rc = -ETIMEDOUT; + else + rc = -EINTR; + break; + } + /* + * If we get here rc > 0 meaning the timeout to + * wait_event_interruptible_timeout() had time left, hence the + * sought event -- non-empty free list -- happened. + * Retry the allocation. + */ + } + sksgio->skspcl = skspcl; + + return rc; +} + +static int skd_skreq_prep_buffering(struct skd_device *skdev, + struct skd_request_context *skreq, + u32 dxfer_len) +{ + u32 resid = dxfer_len; + + /* + * The DMA engine must have aligned addresses and byte counts. + */ + resid += (-resid) & 3; + skreq->sg_byte_count = resid; + + skreq->n_sg = 0; + + while (resid > 0) { + u32 nbytes = PAGE_SIZE; + u32 ix = skreq->n_sg; + struct scatterlist *sg = &skreq->sg[ix]; + struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; + struct page *page; + + if (nbytes > resid) + nbytes = resid; + + page = alloc_page(GFP_KERNEL); + if (page == NULL) + return -ENOMEM; + + sg_set_page(sg, page, nbytes, 0); + + /* TODO: This should be going through a pci_???() + * routine to do proper mapping. */ + sksg->control = FIT_SGD_CONTROL_NOT_LAST; + sksg->byte_count = nbytes; + + sksg->host_side_addr = sg_phys(sg); + + sksg->dev_side_addr = 0; + sksg->next_desc_ptr = skreq->sksg_dma_address + + (ix + 1) * sizeof(*sksg); + + skreq->n_sg++; + resid -= nbytes; + } + + if (skreq->n_sg > 0) { + u32 ix = skreq->n_sg - 1; + struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix]; + + sksg->control = FIT_SGD_CONTROL_LAST; + sksg->next_desc_ptr = 0; + } + + if (unlikely(skdev->dbg_level > 1)) { + u32 i; + + pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n", + skdev->name, __func__, __LINE__, + skreq->id, skreq->sksg_list, skreq->sksg_dma_address); + for (i = 0; i < skreq->n_sg; i++) { + struct fit_sg_descriptor *sgd = &skreq->sksg_list[i]; + + pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " + "addr=0x%llx next=0x%llx\n", + skdev->name, __func__, __LINE__, + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); + } + } + + return 0; +} + +static int skd_sg_io_prep_buffering(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + struct skd_request_context *skreq = &skspcl->req; + u32 dxfer_len = sksgio->dxfer_len; + int rc; + + rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len); + /* + * Eventually, errors or not, skd_release_special() is called + * to recover allocations including partial allocations. + */ + return rc; +} + +static int skd_sg_io_copy_buffer(struct skd_device *skdev, + struct skd_sg_io *sksgio, int dxfer_dir) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + u32 iov_ix = 0; + struct sg_iovec curiov; + u32 sksg_ix = 0; + u8 *bufp = NULL; + u32 buf_len = 0; + u32 resid = sksgio->dxfer_len; + int rc; + + curiov.iov_len = 0; + curiov.iov_base = NULL; + + if (dxfer_dir != sksgio->sg.dxfer_direction) { + if (dxfer_dir != SG_DXFER_TO_DEV || + sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV) + return 0; + } + + while (resid > 0) { + u32 nbytes = PAGE_SIZE; + + if (curiov.iov_len == 0) { + curiov = sksgio->iov[iov_ix++]; + continue; + } + + if (buf_len == 0) { + struct page *page; + page = sg_page(&skspcl->req.sg[sksg_ix++]); + bufp = page_address(page); + buf_len = PAGE_SIZE; + } + + nbytes = min_t(u32, nbytes, resid); + nbytes = min_t(u32, nbytes, curiov.iov_len); + nbytes = min_t(u32, nbytes, buf_len); + + if (dxfer_dir == SG_DXFER_TO_DEV) + rc = __copy_from_user(bufp, curiov.iov_base, nbytes); + else + rc = __copy_to_user(curiov.iov_base, bufp, nbytes); + + if (rc) + return -EFAULT; + + resid -= nbytes; + curiov.iov_len -= nbytes; + curiov.iov_base += nbytes; + buf_len -= nbytes; + } + + return 0; +} + +static int skd_sg_io_send_fitmsg(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; + struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; + + memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES); + + /* Initialize the FIT msg header */ + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + fmh->num_protocol_cmds_coalesced = 1; + + /* Initialize the SCSI request */ + if (sksgio->sg.dxfer_direction != SG_DXFER_NONE) + scsi_req->hdr.sg_list_dma_address = + cpu_to_be64(skspcl->req.sksg_dma_address); + scsi_req->hdr.tag = skspcl->req.id; + scsi_req->hdr.sg_list_len_bytes = + cpu_to_be32(skspcl->req.sg_byte_count); + memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb)); + + skspcl->req.state = SKD_REQ_STATE_BUSY; + skd_send_special_fitmsg(skdev, skspcl); + + return 0; +} + +static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio) +{ + unsigned long flags; + int rc; + + rc = wait_event_interruptible_timeout(skdev->waitq, + (sksgio->skspcl->req.state != + SKD_REQ_STATE_BUSY), + msecs_to_jiffies(sksgio->sg. + timeout)); + + spin_lock_irqsave(&skdev->lock, flags); + + if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) { + pr_debug("%s:%s:%d skspcl %p aborted\n", + skdev->name, __func__, __LINE__, sksgio->skspcl); + + /* Build check cond, sense and let command finish. */ + /* For a timeout, we must fabricate completion and sense + * data to complete the command */ + sksgio->skspcl->req.completion.status = + SAM_STAT_CHECK_CONDITION; + + memset(&sksgio->skspcl->req.err_info, 0, + sizeof(sksgio->skspcl->req.err_info)); + sksgio->skspcl->req.err_info.type = 0x70; + sksgio->skspcl->req.err_info.key = ABORTED_COMMAND; + sksgio->skspcl->req.err_info.code = 0x44; + sksgio->skspcl->req.err_info.qual = 0; + rc = 0; + } else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY) + /* No longer on the adapter. We finish. */ + rc = 0; + else { + /* Something's gone wrong. Still busy. Timeout or + * user interrupted (control-C). Mark as an orphan + * so it will be disposed when completed. */ + sksgio->skspcl->orphaned = 1; + sksgio->skspcl = NULL; + if (rc == 0) { + pr_debug("%s:%s:%d timed out %p (%u ms)\n", + skdev->name, __func__, __LINE__, + sksgio, sksgio->sg.timeout); + rc = -ETIMEDOUT; + } else { + pr_debug("%s:%s:%d cntlc %p\n", + skdev->name, __func__, __LINE__, sksgio); + rc = -EINTR; + } + } + + spin_unlock_irqrestore(&skdev->lock, flags); + + return rc; +} + +static int skd_sg_io_put_status(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct sg_io_hdr *sgp = &sksgio->sg; + struct skd_special_context *skspcl = sksgio->skspcl; + int resid = 0; + + u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes); + + sgp->status = skspcl->req.completion.status; + resid = sksgio->dxfer_len - nb; + + sgp->masked_status = sgp->status & STATUS_MASK; + sgp->msg_status = 0; + sgp->host_status = 0; + sgp->driver_status = 0; + sgp->resid = resid; + if (sgp->masked_status || sgp->host_status || sgp->driver_status) + sgp->info |= SG_INFO_CHECK; + + pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n", + skdev->name, __func__, __LINE__, + sgp->status, sgp->masked_status, sgp->resid); + + if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) { + if (sgp->mx_sb_len > 0) { + struct fit_comp_error_info *ei = &skspcl->req.err_info; + u32 nbytes = sizeof(*ei); + + nbytes = min_t(u32, nbytes, sgp->mx_sb_len); + + sgp->sb_len_wr = nbytes; + + if (__copy_to_user(sgp->sbp, ei, nbytes)) { + pr_debug("%s:%s:%d copy_to_user sense failed %p\n", + skdev->name, __func__, __LINE__, + sgp->sbp); + return -EFAULT; + } + } + } + + if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) { + pr_debug("%s:%s:%d copy_to_user sg failed %p\n", + skdev->name, __func__, __LINE__, sksgio->argp); + return -EFAULT; + } + + return 0; +} + +static int skd_sg_io_release_skspcl(struct skd_device *skdev, + struct skd_sg_io *sksgio) +{ + struct skd_special_context *skspcl = sksgio->skspcl; + + if (skspcl != NULL) { + ulong flags; + + sksgio->skspcl = NULL; + + spin_lock_irqsave(&skdev->lock, flags); + skd_release_special(skdev, skspcl); + spin_unlock_irqrestore(&skdev->lock, flags); + } + + return 0; +} + +/* + ***************************************************************************** + * INTERNAL REQUESTS -- generated by driver itself + ***************************************************************************** + */ + +static int skd_format_internal_skspcl(struct skd_device *skdev) +{ + struct skd_special_context *skspcl = &skdev->internal_skspcl; + struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; + struct fit_msg_hdr *fmh; + uint64_t dma_address; + struct skd_scsi_request *scsi; + + fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0]; + fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT; + fmh->num_protocol_cmds_coalesced = 1; + + scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + memset(scsi, 0, sizeof(*scsi)); + dma_address = skspcl->req.sksg_dma_address; + scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address); + sgd->control = FIT_SGD_CONTROL_LAST; + sgd->byte_count = 0; + sgd->host_side_addr = skspcl->db_dma_address; + sgd->dev_side_addr = 0; + sgd->next_desc_ptr = 0LL; + + return 1; +} + +#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES + +static void skd_send_internal_skspcl(struct skd_device *skdev, + struct skd_special_context *skspcl, + u8 opcode) +{ + struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0]; + struct skd_scsi_request *scsi; + unsigned char *buf = skspcl->data_buf; + int i; + + if (skspcl->req.state != SKD_REQ_STATE_IDLE) + /* + * A refresh is already in progress. + * Just wait for it to finish. + */ + return; + + SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0); + skspcl->req.state = SKD_REQ_STATE_BUSY; + skspcl->req.id += SKD_ID_INCR; + + scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64]; + scsi->hdr.tag = skspcl->req.id; + + memset(scsi->cdb, 0, sizeof(scsi->cdb)); + + switch (opcode) { + case TEST_UNIT_READY: + scsi->cdb[0] = TEST_UNIT_READY; + sgd->byte_count = 0; + scsi->hdr.sg_list_len_bytes = 0; + break; + + case READ_CAPACITY: + scsi->cdb[0] = READ_CAPACITY; + sgd->byte_count = SKD_N_READ_CAP_BYTES; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + break; + + case INQUIRY: + scsi->cdb[0] = INQUIRY; + scsi->cdb[1] = 0x01; /* evpd */ + scsi->cdb[2] = 0x80; /* serial number page */ + scsi->cdb[4] = 0x10; + sgd->byte_count = 16; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + break; + + case SYNCHRONIZE_CACHE: + scsi->cdb[0] = SYNCHRONIZE_CACHE; + sgd->byte_count = 0; + scsi->hdr.sg_list_len_bytes = 0; + break; + + case WRITE_BUFFER: + scsi->cdb[0] = WRITE_BUFFER; + scsi->cdb[1] = 0x02; + scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; + scsi->cdb[8] = WR_BUF_SIZE & 0xFF; + sgd->byte_count = WR_BUF_SIZE; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + /* fill incrementing byte pattern */ + for (i = 0; i < sgd->byte_count; i++) + buf[i] = i & 0xFF; + break; + + case READ_BUFFER: + scsi->cdb[0] = READ_BUFFER; + scsi->cdb[1] = 0x02; + scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8; + scsi->cdb[8] = WR_BUF_SIZE & 0xFF; + sgd->byte_count = WR_BUF_SIZE; + scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count); + memset(skspcl->data_buf, 0, sgd->byte_count); + break; + + default: + SKD_ASSERT("Don't know what to send"); + return; + + } + skd_send_special_fitmsg(skdev, skspcl); +} + +static void skd_refresh_device_data(struct skd_device *skdev) +{ + struct skd_special_context *skspcl = &skdev->internal_skspcl; + + skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY); +} + +static int skd_chk_read_buf(struct skd_device *skdev, + struct skd_special_context *skspcl) +{ + unsigned char *buf = skspcl->data_buf; + int i; + + /* check for incrementing byte pattern */ + for (i = 0; i < WR_BUF_SIZE; i++) + if (buf[i] != (i & 0xFF)) + return 1; + + return 0; +} + +static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key, + u8 code, u8 qual, u8 fruc) +{ + /* If the check condition is of special interest, log a message */ + if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02) + && (code == 0x04) && (qual == 0x06)) { + pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/" + "ascq/fruc %02x/%02x/%02x/%02x\n", + skd_name(skdev), key, code, qual, fruc); + } +} + +static void skd_complete_internal(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl) +{ + u8 *buf = skspcl->data_buf; + u8 status; + int i; + struct skd_scsi_request *scsi = + (struct skd_scsi_request *)&skspcl->msg_buf[64]; + + SKD_ASSERT(skspcl == &skdev->internal_skspcl); + + pr_debug("%s:%s:%d complete internal %x\n", + skdev->name, __func__, __LINE__, scsi->cdb[0]); + + skspcl->req.completion = *skcomp; + skspcl->req.state = SKD_REQ_STATE_IDLE; + skspcl->req.id += SKD_ID_INCR; + + status = skspcl->req.completion.status; + + skd_log_check_status(skdev, status, skerr->key, skerr->code, + skerr->qual, skerr->fruc); + + switch (scsi->cdb[0]) { + case TEST_UNIT_READY: + if (status == SAM_STAT_GOOD) + skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); + else if ((status == SAM_STAT_CHECK_CONDITION) && + (skerr->key == MEDIUM_ERROR)) + skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER); + else { + if (skdev->state == SKD_DRVR_STATE_STOPPING) { + pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n", + skdev->name, __func__, __LINE__, + skdev->state); + return; + } + pr_debug("%s:%s:%d **** TUR failed, retry skerr\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, 0x00); + } + break; + + case WRITE_BUFFER: + if (status == SAM_STAT_GOOD) + skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER); + else { + if (skdev->state == SKD_DRVR_STATE_STOPPING) { + pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n", + skdev->name, __func__, __LINE__, + skdev->state); + return; + } + pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, 0x00); + } + break; + + case READ_BUFFER: + if (status == SAM_STAT_GOOD) { + if (skd_chk_read_buf(skdev, skspcl) == 0) + skd_send_internal_skspcl(skdev, skspcl, + READ_CAPACITY); + else { + pr_err( + "(%s):*** W/R Buffer mismatch %d ***\n", + skd_name(skdev), skdev->connect_retries); + if (skdev->connect_retries < + SKD_MAX_CONNECT_RETRIES) { + skdev->connect_retries++; + skd_soft_reset(skdev); + } else { + pr_err( + "(%s): W/R Buffer Connect Error\n", + skd_name(skdev)); + return; + } + } + + } else { + if (skdev->state == SKD_DRVR_STATE_STOPPING) { + pr_debug("%s:%s:%d " + "read buffer failed, don't send anymore state 0x%x\n", + skdev->name, __func__, __LINE__, + skdev->state); + return; + } + pr_debug("%s:%s:%d " + "**** read buffer failed, retry skerr\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, 0x00); + } + break; + + case READ_CAPACITY: + skdev->read_cap_is_valid = 0; + if (status == SAM_STAT_GOOD) { + skdev->read_cap_last_lba = + (buf[0] << 24) | (buf[1] << 16) | + (buf[2] << 8) | buf[3]; + skdev->read_cap_blocksize = + (buf[4] << 24) | (buf[5] << 16) | + (buf[6] << 8) | buf[7]; + + pr_debug("%s:%s:%d last lba %d, bs %d\n", + skdev->name, __func__, __LINE__, + skdev->read_cap_last_lba, + skdev->read_cap_blocksize); + + set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); + + skdev->read_cap_is_valid = 1; + + skd_send_internal_skspcl(skdev, skspcl, INQUIRY); + } else if ((status == SAM_STAT_CHECK_CONDITION) && + (skerr->key == MEDIUM_ERROR)) { + skdev->read_cap_last_lba = ~0; + set_capacity(skdev->disk, skdev->read_cap_last_lba + 1); + pr_debug("%s:%s:%d " + "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, INQUIRY); + } else { + pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n", + skdev->name, __func__, __LINE__); + skd_send_internal_skspcl(skdev, skspcl, + TEST_UNIT_READY); + } + break; + + case INQUIRY: + skdev->inquiry_is_valid = 0; + if (status == SAM_STAT_GOOD) { + skdev->inquiry_is_valid = 1; + + for (i = 0; i < 12; i++) + skdev->inq_serial_num[i] = buf[i + 4]; + skdev->inq_serial_num[12] = 0; + } + + if (skd_unquiesce_dev(skdev) < 0) + pr_debug("%s:%s:%d **** failed, to ONLINE device\n", + skdev->name, __func__, __LINE__); + /* connection is complete */ + skdev->connect_retries = 0; + break; + + case SYNCHRONIZE_CACHE: + if (status == SAM_STAT_GOOD) + skdev->sync_done = 1; + else + skdev->sync_done = -1; + wake_up_interruptible(&skdev->waitq); + break; + + default: + SKD_ASSERT("we didn't send this"); + } +} + +/* + ***************************************************************************** + * FIT MESSAGES + ***************************************************************************** + */ + +static void skd_send_fitmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg) +{ + u64 qcmd; + struct fit_msg_hdr *fmh; + + pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n", + skdev->name, __func__, __LINE__, + skmsg->mb_dma_address, skdev->in_flight); + pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n", + skdev->name, __func__, __LINE__, + skmsg->msg_buf, skmsg->offset); + + qcmd = skmsg->mb_dma_address; + qcmd |= FIT_QCMD_QID_NORMAL; + + fmh = (struct fit_msg_hdr *)skmsg->msg_buf; + skmsg->outstanding = fmh->num_protocol_cmds_coalesced; + + if (unlikely(skdev->dbg_level > 1)) { + u8 *bp = (u8 *)skmsg->msg_buf; + int i; + for (i = 0; i < skmsg->length; i += 8) { + pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x " + "%02x %02x %02x %02x\n", + skdev->name, __func__, __LINE__, + i, bp[i + 0], bp[i + 1], bp[i + 2], + bp[i + 3], bp[i + 4], bp[i + 5], + bp[i + 6], bp[i + 7]); + if (i == 0) + i = 64 - 8; + } + } + + if (skmsg->length > 256) + qcmd |= FIT_QCMD_MSGSIZE_512; + else if (skmsg->length > 128) + qcmd |= FIT_QCMD_MSGSIZE_256; + else if (skmsg->length > 64) + qcmd |= FIT_QCMD_MSGSIZE_128; + else + /* + * This makes no sense because the FIT msg header is + * 64 bytes. If the msg is only 64 bytes long it has + * no payload. + */ + qcmd |= FIT_QCMD_MSGSIZE_64; + + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); + +} + +static void skd_send_special_fitmsg(struct skd_device *skdev, + struct skd_special_context *skspcl) +{ + u64 qcmd; + + if (unlikely(skdev->dbg_level > 1)) { + u8 *bp = (u8 *)skspcl->msg_buf; + int i; + + for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) { + pr_debug("%s:%s:%d spcl[%2d] %02x %02x %02x %02x " + "%02x %02x %02x %02x\n", + skdev->name, __func__, __LINE__, i, + bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3], + bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]); + if (i == 0) + i = 64 - 8; + } + + pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n", + skdev->name, __func__, __LINE__, + skspcl, skspcl->req.id, skspcl->req.sksg_list, + skspcl->req.sksg_dma_address); + for (i = 0; i < skspcl->req.n_sg; i++) { + struct fit_sg_descriptor *sgd = + &skspcl->req.sksg_list[i]; + + pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x " + "addr=0x%llx next=0x%llx\n", + skdev->name, __func__, __LINE__, + i, sgd->byte_count, sgd->control, + sgd->host_side_addr, sgd->next_desc_ptr); + } + } + + /* + * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr + * and one 64-byte SSDI command. + */ + qcmd = skspcl->mb_dma_address; + qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); +} + +/* + ***************************************************************************** + * COMPLETION QUEUE + ***************************************************************************** + */ + +static void skd_complete_other(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr); + +struct sns_info { + u8 type; + u8 stat; + u8 key; + u8 asc; + u8 ascq; + u8 mask; + enum skd_check_status_action action; +}; + +static struct sns_info skd_chkstat_table[] = { + /* Good */ + { 0x70, 0x02, RECOVERED_ERROR, 0, 0, 0x1c, + SKD_CHECK_STATUS_REPORT_GOOD }, + + /* Smart alerts */ + { 0x70, 0x02, NO_SENSE, 0x0B, 0x00, 0x1E, /* warnings */ + SKD_CHECK_STATUS_REPORT_SMART_ALERT }, + { 0x70, 0x02, NO_SENSE, 0x5D, 0x00, 0x1E, /* thresholds */ + SKD_CHECK_STATUS_REPORT_SMART_ALERT }, + { 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F, /* temperature over trigger */ + SKD_CHECK_STATUS_REPORT_SMART_ALERT }, + + /* Retry (with limits) */ + { 0x70, 0x02, 0x0B, 0, 0, 0x1C, /* This one is for DMA ERROR */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + { 0x70, 0x02, 0x06, 0x0B, 0x00, 0x1E, /* warnings */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + { 0x70, 0x02, 0x06, 0x5D, 0x00, 0x1E, /* thresholds */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + { 0x70, 0x02, 0x06, 0x80, 0x30, 0x1F, /* backup power */ + SKD_CHECK_STATUS_REQUEUE_REQUEST }, + + /* Busy (or about to be) */ + { 0x70, 0x02, 0x06, 0x3f, 0x01, 0x1F, /* fw changed */ + SKD_CHECK_STATUS_BUSY_IMMINENT }, +}; + +/* + * Look up status and sense data to decide how to handle the error + * from the device. + * mask says which fields must match e.g., mask=0x18 means check + * type and stat, ignore key, asc, ascq. + */ + +static enum skd_check_status_action +skd_check_status(struct skd_device *skdev, + u8 cmp_status, volatile struct fit_comp_error_info *skerr) +{ + int i, n; + + pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n", + skd_name(skdev), skerr->key, skerr->code, skerr->qual, + skerr->fruc); + + pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n", + skdev->name, __func__, __LINE__, skerr->type, cmp_status, + skerr->key, skerr->code, skerr->qual, skerr->fruc); + + /* Does the info match an entry in the good category? */ + n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]); + for (i = 0; i < n; i++) { + struct sns_info *sns = &skd_chkstat_table[i]; + + if (sns->mask & 0x10) + if (skerr->type != sns->type) + continue; + + if (sns->mask & 0x08) + if (cmp_status != sns->stat) + continue; + + if (sns->mask & 0x04) + if (skerr->key != sns->key) + continue; + + if (sns->mask & 0x02) + if (skerr->code != sns->asc) + continue; + + if (sns->mask & 0x01) + if (skerr->qual != sns->ascq) + continue; + + if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) { + pr_err("(%s): SMART Alert: sense key/asc/ascq " + "%02x/%02x/%02x\n", + skd_name(skdev), skerr->key, + skerr->code, skerr->qual); + } + return sns->action; + } + + /* No other match, so nonzero status means error, + * zero status means good + */ + if (cmp_status) { + pr_debug("%s:%s:%d status check: error\n", + skdev->name, __func__, __LINE__); + return SKD_CHECK_STATUS_REPORT_ERROR; + } + + pr_debug("%s:%s:%d status check good default\n", + skdev->name, __func__, __LINE__); + return SKD_CHECK_STATUS_REPORT_GOOD; +} + +static void skd_resolve_req_exception(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + u8 cmp_status = skreq->completion.status; + + switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) { + case SKD_CHECK_STATUS_REPORT_GOOD: + case SKD_CHECK_STATUS_REPORT_SMART_ALERT: + skd_end_request(skdev, skreq, 0); + break; + + case SKD_CHECK_STATUS_BUSY_IMMINENT: + skd_log_skreq(skdev, skreq, "retry(busy)"); + blk_requeue_request(skdev->queue, skreq->req); + pr_info("(%s) drive BUSY imminent\n", skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT; + skdev->timer_countdown = SKD_TIMER_MINUTES(20); + skd_quiesce_dev(skdev); + break; + + case SKD_CHECK_STATUS_REQUEUE_REQUEST: + if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) { + skd_log_skreq(skdev, skreq, "retry"); + blk_requeue_request(skdev->queue, skreq->req); + break; + } + /* fall through to report error */ + + case SKD_CHECK_STATUS_REPORT_ERROR: + default: + skd_end_request(skdev, skreq, -EIO); + break; + } +} + +/* assume spinlock is already held */ +static void skd_release_skreq(struct skd_device *skdev, + struct skd_request_context *skreq) +{ + u32 msg_slot; + struct skd_fitmsg_context *skmsg; + + u32 timo_slot; + + /* + * Reclaim the FIT msg buffer if this is + * the first of the requests it carried to + * be completed. The FIT msg buffer used to + * send this request cannot be reused until + * we are sure the s1120 card has copied + * it to its memory. The FIT msg might have + * contained several requests. As soon as + * any of them are completed we know that + * the entire FIT msg was transferred. + * Only the first completed request will + * match the FIT msg buffer id. The FIT + * msg buffer id is immediately updated. + * When subsequent requests complete the FIT + * msg buffer id won't match, so we know + * quite cheaply that it is already done. + */ + msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK; + SKD_ASSERT(msg_slot < skdev->num_fitmsg_context); + + skmsg = &skdev->skmsg_table[msg_slot]; + if (skmsg->id == skreq->fitmsg_id) { + SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY); + SKD_ASSERT(skmsg->outstanding > 0); + skmsg->outstanding--; + if (skmsg->outstanding == 0) { + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->id += SKD_ID_INCR; + skmsg->next = skdev->skmsg_free_list; + skdev->skmsg_free_list = skmsg; + } + } + + /* + * Decrease the number of active requests. + * Also decrements the count in the timeout slot. + */ + SKD_ASSERT(skdev->in_flight > 0); + skdev->in_flight -= 1; + + timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK; + SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0); + skdev->timeout_slot[timo_slot] -= 1; + + /* + * Reset backpointer + */ + skreq->req = NULL; + + /* + * Reclaim the skd_request_context + */ + skreq->state = SKD_REQ_STATE_IDLE; + skreq->id += SKD_ID_INCR; + skreq->next = skdev->skreq_free_list; + skdev->skreq_free_list = skreq; +} + +#define DRIVER_INQ_EVPD_PAGE_CODE 0xDA + +static void skd_do_inq_page_00(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr, + uint8_t *cdb, uint8_t *buf) +{ + uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size; + + /* Caller requested "supported pages". The driver needs to insert + * its page. + */ + pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n", + skdev->name, __func__, __LINE__); + + /* If the device rejected the request because the CDB was + * improperly formed, then just leave. + */ + if (skcomp->status == SAM_STAT_CHECK_CONDITION && + skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24) + return; + + /* Get the amount of space the caller allocated */ + max_bytes = (cdb[3] << 8) | cdb[4]; + + /* Get the number of pages actually returned by the device */ + drive_pages = (buf[2] << 8) | buf[3]; + drive_bytes = drive_pages + 4; + new_size = drive_pages + 1; + + /* Supported pages must be in numerical order, so find where + * the driver page needs to be inserted into the list of + * pages returned by the device. + */ + for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) { + if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE) + return; /* Device using this page code. abort */ + else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE) + break; + } + + if (insert_pt < max_bytes) { + uint16_t u; + + /* Shift everything up one byte to make room. */ + for (u = new_size + 3; u > insert_pt; u--) + buf[u] = buf[u - 1]; + buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE; + + /* SCSI byte order increment of num_returned_bytes by 1 */ + skcomp->num_returned_bytes = + be32_to_cpu(skcomp->num_returned_bytes) + 1; + skcomp->num_returned_bytes = + be32_to_cpu(skcomp->num_returned_bytes); + } + + /* update page length field to reflect the driver's page too */ + buf[2] = (uint8_t)((new_size >> 8) & 0xFF); + buf[3] = (uint8_t)((new_size >> 0) & 0xFF); +} + +static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width) +{ + int pcie_reg; + u16 pci_bus_speed; + u8 pci_lanes; + + pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP); + if (pcie_reg) { + u16 linksta; + pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta); + + pci_bus_speed = linksta & 0xF; + pci_lanes = (linksta & 0x3F0) >> 4; + } else { + *speed = STEC_LINK_UNKNOWN; + *width = 0xFF; + return; + } + + switch (pci_bus_speed) { + case 1: + *speed = STEC_LINK_2_5GTS; + break; + case 2: + *speed = STEC_LINK_5GTS; + break; + case 3: + *speed = STEC_LINK_8GTS; + break; + default: + *speed = STEC_LINK_UNKNOWN; + break; + } + + if (pci_lanes <= 0x20) + *width = pci_lanes; + else + *width = 0xFF; +} + +static void skd_do_inq_page_da(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr, + uint8_t *cdb, uint8_t *buf) +{ + struct pci_dev *pdev = skdev->pdev; + unsigned max_bytes; + struct driver_inquiry_data inq; + u16 val; + + pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n", + skdev->name, __func__, __LINE__); + + memset(&inq, 0, sizeof(inq)); + + inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE; + + skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes); + inq.pcie_bus_number = cpu_to_be16(pdev->bus->number); + inq.pcie_device_number = PCI_SLOT(pdev->devfn); + inq.pcie_function_number = PCI_FUNC(pdev->devfn); + + pci_read_config_word(pdev, PCI_VENDOR_ID, &val); + inq.pcie_vendor_id = cpu_to_be16(val); + + pci_read_config_word(pdev, PCI_DEVICE_ID, &val); + inq.pcie_device_id = cpu_to_be16(val); + + pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val); + inq.pcie_subsystem_vendor_id = cpu_to_be16(val); + + pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val); + inq.pcie_subsystem_device_id = cpu_to_be16(val); + + /* Driver version, fixed lenth, padded with spaces on the right */ + inq.driver_version_length = sizeof(inq.driver_version); + memset(&inq.driver_version, ' ', sizeof(inq.driver_version)); + memcpy(inq.driver_version, DRV_VER_COMPL, + min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL))); + + inq.page_length = cpu_to_be16((sizeof(inq) - 4)); + + /* Clear the error set by the device */ + skcomp->status = SAM_STAT_GOOD; + memset((void *)skerr, 0, sizeof(*skerr)); + + /* copy response into output buffer */ + max_bytes = (cdb[3] << 8) | cdb[4]; + memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq))); + + skcomp->num_returned_bytes = + be32_to_cpu(min_t(uint16_t, max_bytes, sizeof(inq))); +} + +static void skd_do_driver_inq(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr, + uint8_t *cdb, uint8_t *buf) +{ + if (!buf) + return; + else if (cdb[0] != INQUIRY) + return; /* Not an INQUIRY */ + else if ((cdb[1] & 1) == 0) + return; /* EVPD not set */ + else if (cdb[2] == 0) + /* Need to add driver's page to supported pages list */ + skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf); + else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE) + /* Caller requested driver's page */ + skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf); +} + +static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg) +{ + if (!sg) + return NULL; + if (!sg_page(sg)) + return NULL; + return sg_virt(sg); +} + +static void skd_process_scsi_inq(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl) +{ + uint8_t *buf; + struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf; + struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1]; + + dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg, + skspcl->req.sg_data_dir); + buf = skd_sg_1st_page_ptr(skspcl->req.sg); + + if (buf) + skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf); +} + + +static int skd_isr_completion_posted(struct skd_device *skdev, + int limit, int *enqueued) +{ + volatile struct fit_completion_entry_v1 *skcmp = NULL; + volatile struct fit_comp_error_info *skerr; + u16 req_id; + u32 req_slot; + struct skd_request_context *skreq; + u16 cmp_cntxt = 0; + u8 cmp_status = 0; + u8 cmp_cycle = 0; + u32 cmp_bytes = 0; + int rc = 0; + int processed = 0; + + for (;; ) { + SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY); + + skcmp = &skdev->skcomp_table[skdev->skcomp_ix]; + cmp_cycle = skcmp->cycle; + cmp_cntxt = skcmp->tag; + cmp_status = skcmp->status; + cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes); + + skerr = &skdev->skerr_table[skdev->skcomp_ix]; + + pr_debug("%s:%s:%d " + "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d " + "busy=%d rbytes=0x%x proto=%d\n", + skdev->name, __func__, __LINE__, skdev->skcomp_cycle, + skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status, + skdev->in_flight, cmp_bytes, skdev->proto_ver); + + if (cmp_cycle != skdev->skcomp_cycle) { + pr_debug("%s:%s:%d end of completions\n", + skdev->name, __func__, __LINE__); + break; + } + /* + * Update the completion queue head index and possibly + * the completion cycle count. 8-bit wrap-around. + */ + skdev->skcomp_ix++; + if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) { + skdev->skcomp_ix = 0; + skdev->skcomp_cycle++; + } + + /* + * The command context is a unique 32-bit ID. The low order + * bits help locate the request. The request is usually a + * r/w request (see skd_start() above) or a special request. + */ + req_id = cmp_cntxt; + req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK; + + /* Is this other than a r/w request? */ + if (req_slot >= skdev->num_req_context) { + /* + * This is not a completion for a r/w request. + */ + skd_complete_other(skdev, skcmp, skerr); + continue; + } + + skreq = &skdev->skreq_table[req_slot]; + + /* + * Make sure the request ID for the slot matches. + */ + if (skreq->id != req_id) { + pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n", + skdev->name, __func__, __LINE__, + req_id, skreq->id); + { + u16 new_id = cmp_cntxt; + pr_err("(%s): Completion mismatch " + "comp_id=0x%04x skreq=0x%04x new=0x%04x\n", + skd_name(skdev), req_id, + skreq->id, new_id); + + continue; + } + } + + SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY); + + if (skreq->state == SKD_REQ_STATE_ABORTED) { + pr_debug("%s:%s:%d reclaim req %p id=%04x\n", + skdev->name, __func__, __LINE__, + skreq, skreq->id); + /* a previously timed out command can + * now be cleaned up */ + skd_release_skreq(skdev, skreq); + continue; + } + + skreq->completion = *skcmp; + if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) { + skreq->err_info = *skerr; + skd_log_check_status(skdev, cmp_status, skerr->key, + skerr->code, skerr->qual, + skerr->fruc); + } + /* Release DMA resources for the request. */ + if (skreq->n_sg > 0) + skd_postop_sg_list(skdev, skreq); + + if (!skreq->req) { + pr_debug("%s:%s:%d NULL backptr skdreq %p, " + "req=0x%x req_id=0x%x\n", + skdev->name, __func__, __LINE__, + skreq, skreq->id, req_id); + } else { + /* + * Capture the outcome and post it back to the + * native request. + */ + if (likely(cmp_status == SAM_STAT_GOOD)) + skd_end_request(skdev, skreq, 0); + else + skd_resolve_req_exception(skdev, skreq); + } + + /* + * Release the skreq, its FIT msg (if one), timeout slot, + * and queue depth. + */ + skd_release_skreq(skdev, skreq); + + /* skd_isr_comp_limit equal zero means no limit */ + if (limit) { + if (++processed >= limit) { + rc = 1; + break; + } + } + } + + if ((skdev->state == SKD_DRVR_STATE_PAUSING) + && (skdev->in_flight) == 0) { + skdev->state = SKD_DRVR_STATE_PAUSED; + wake_up_interruptible(&skdev->waitq); + } + + return rc; +} + +static void skd_complete_other(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 *skcomp, + volatile struct fit_comp_error_info *skerr) +{ + u32 req_id = 0; + u32 req_table; + u32 req_slot; + struct skd_special_context *skspcl; + + req_id = skcomp->tag; + req_table = req_id & SKD_ID_TABLE_MASK; + req_slot = req_id & SKD_ID_SLOT_MASK; + + pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n", + skdev->name, __func__, __LINE__, + req_table, req_id, req_slot); + + /* + * Based on the request id, determine how to dispatch this completion. + * This swich/case is finding the good cases and forwarding the + * completion entry. Errors are reported below the switch. + */ + switch (req_table) { + case SKD_ID_RW_REQUEST: + /* + * The caller, skd_completion_posted_isr() above, + * handles r/w requests. The only way we get here + * is if the req_slot is out of bounds. + */ + break; + + case SKD_ID_SPECIAL_REQUEST: + /* + * Make sure the req_slot is in bounds and that the id + * matches. + */ + if (req_slot < skdev->n_special) { + skspcl = &skdev->skspcl_table[req_slot]; + if (skspcl->req.id == req_id && + skspcl->req.state == SKD_REQ_STATE_BUSY) { + skd_complete_special(skdev, + skcomp, skerr, skspcl); + return; + } + } + break; + + case SKD_ID_INTERNAL: + if (req_slot == 0) { + skspcl = &skdev->internal_skspcl; + if (skspcl->req.id == req_id && + skspcl->req.state == SKD_REQ_STATE_BUSY) { + skd_complete_internal(skdev, + skcomp, skerr, skspcl); + return; + } + } + break; + + case SKD_ID_FIT_MSG: + /* + * These id's should never appear in a completion record. + */ + break; + + default: + /* + * These id's should never appear anywhere; + */ + break; + } + + /* + * If we get here it is a bad or stale id. + */ +} + +static void skd_complete_special(struct skd_device *skdev, + volatile struct fit_completion_entry_v1 + *skcomp, + volatile struct fit_comp_error_info *skerr, + struct skd_special_context *skspcl) +{ + pr_debug("%s:%s:%d completing special request %p\n", + skdev->name, __func__, __LINE__, skspcl); + if (skspcl->orphaned) { + /* Discard orphaned request */ + /* ?: Can this release directly or does it need + * to use a worker? */ + pr_debug("%s:%s:%d release orphaned %p\n", + skdev->name, __func__, __LINE__, skspcl); + skd_release_special(skdev, skspcl); + return; + } + + skd_process_scsi_inq(skdev, skcomp, skerr, skspcl); + + skspcl->req.state = SKD_REQ_STATE_COMPLETED; + skspcl->req.completion = *skcomp; + skspcl->req.err_info = *skerr; + + skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key, + skerr->code, skerr->qual, skerr->fruc); + + wake_up_interruptible(&skdev->waitq); +} + +/* assume spinlock is already held */ +static void skd_release_special(struct skd_device *skdev, + struct skd_special_context *skspcl) +{ + int i, was_depleted; + + for (i = 0; i < skspcl->req.n_sg; i++) { + struct page *page = sg_page(&skspcl->req.sg[i]); + __free_page(page); + } + + was_depleted = (skdev->skspcl_free_list == NULL); + + skspcl->req.state = SKD_REQ_STATE_IDLE; + skspcl->req.id += SKD_ID_INCR; + skspcl->req.next = + (struct skd_request_context *)skdev->skspcl_free_list; + skdev->skspcl_free_list = (struct skd_special_context *)skspcl; + + if (was_depleted) { + pr_debug("%s:%s:%d skspcl was depleted\n", + skdev->name, __func__, __LINE__); + /* Free list was depleted. Their might be waiters. */ + wake_up_interruptible(&skdev->waitq); + } +} + +static void skd_reset_skcomp(struct skd_device *skdev) +{ + u32 nbytes; + struct fit_completion_entry_v1 *skcomp; + + nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; + nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; + + memset(skdev->skcomp_table, 0, nbytes); + + skdev->skcomp_ix = 0; + skdev->skcomp_cycle = 1; +} + +/* + ***************************************************************************** + * INTERRUPTS + ***************************************************************************** + */ +static void skd_completion_worker(struct work_struct *work) +{ + struct skd_device *skdev = + container_of(work, struct skd_device, completion_worker); + unsigned long flags; + int flush_enqueued = 0; + + spin_lock_irqsave(&skdev->lock, flags); + + /* + * pass in limit=0, which means no limit.. + * process everything in compq + */ + skd_isr_completion_posted(skdev, 0, &flush_enqueued); + skd_request_fn(skdev->queue); + + spin_unlock_irqrestore(&skdev->lock, flags); +} + +static void skd_isr_msg_from_dev(struct skd_device *skdev); + +irqreturn_t +static skd_isr(int irq, void *ptr) +{ + struct skd_device *skdev; + u32 intstat; + u32 ack; + int rc = 0; + int deferred = 0; + int flush_enqueued = 0; + + skdev = (struct skd_device *)ptr; + spin_lock(&skdev->lock); + + for (;; ) { + intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST); + + ack = FIT_INT_DEF_MASK; + ack &= intstat; + + pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n", + skdev->name, __func__, __LINE__, intstat, ack); + + /* As long as there is an int pending on device, keep + * running loop. When none, get out, but if we've never + * done any processing, call completion handler? + */ + if (ack == 0) { + /* No interrupts on device, but run the completion + * processor anyway? + */ + if (rc == 0) + if (likely (skdev->state + == SKD_DRVR_STATE_ONLINE)) + deferred = 1; + break; + } + + rc = IRQ_HANDLED; + + SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST); + + if (likely((skdev->state != SKD_DRVR_STATE_LOAD) && + (skdev->state != SKD_DRVR_STATE_STOPPING))) { + if (intstat & FIT_ISH_COMPLETION_POSTED) { + /* + * If we have already deferred completion + * processing, don't bother running it again + */ + if (deferred == 0) + deferred = + skd_isr_completion_posted(skdev, + skd_isr_comp_limit, &flush_enqueued); + } + + if (intstat & FIT_ISH_FW_STATE_CHANGE) { + skd_isr_fwstate(skdev); + if (skdev->state == SKD_DRVR_STATE_FAULT || + skdev->state == + SKD_DRVR_STATE_DISAPPEARED) { + spin_unlock(&skdev->lock); + return rc; + } + } + + if (intstat & FIT_ISH_MSG_FROM_DEV) + skd_isr_msg_from_dev(skdev); + } + } + + if (unlikely(flush_enqueued)) + skd_request_fn(skdev->queue); + + if (deferred) + schedule_work(&skdev->completion_worker); + else if (!flush_enqueued) + skd_request_fn(skdev->queue); + + spin_unlock(&skdev->lock); + + return rc; +} + +static void skd_drive_fault(struct skd_device *skdev) +{ + skdev->state = SKD_DRVR_STATE_FAULT; + pr_err("(%s): Drive FAULT\n", skd_name(skdev)); +} + +static void skd_drive_disappeared(struct skd_device *skdev) +{ + skdev->state = SKD_DRVR_STATE_DISAPPEARED; + pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev)); +} + +static void skd_isr_fwstate(struct skd_device *skdev) +{ + u32 sense; + u32 state; + u32 mtd; + int prev_driver_state = skdev->state; + + sense = SKD_READL(skdev, FIT_STATUS); + state = sense & FIT_SR_DRIVE_STATE_MASK; + + pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n", + skd_name(skdev), + skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, + skd_drive_state_to_str(state), state); + + skdev->drive_state = state; + + switch (skdev->drive_state) { + case FIT_SR_DRIVE_INIT: + if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) { + skd_disable_interrupts(skdev); + break; + } + if (skdev->state == SKD_DRVR_STATE_RESTARTING) + skd_recover_requests(skdev, 0); + if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) { + skdev->timer_countdown = SKD_STARTING_TIMO; + skdev->state = SKD_DRVR_STATE_STARTING; + skd_soft_reset(skdev); + break; + } + mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_SR_DRIVE_ONLINE: + skdev->cur_max_queue_depth = skd_max_queue_depth; + if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth) + skdev->cur_max_queue_depth = skdev->dev_max_queue_depth; + + skdev->queue_low_water_mark = + skdev->cur_max_queue_depth * 2 / 3 + 1; + if (skdev->queue_low_water_mark < 1) + skdev->queue_low_water_mark = 1; + pr_info( + "(%s): Queue depth limit=%d dev=%d lowat=%d\n", + skd_name(skdev), + skdev->cur_max_queue_depth, + skdev->dev_max_queue_depth, skdev->queue_low_water_mark); + + skd_refresh_device_data(skdev); + break; + + case FIT_SR_DRIVE_BUSY: + skdev->state = SKD_DRVR_STATE_BUSY; + skdev->timer_countdown = SKD_BUSY_TIMO; + skd_quiesce_dev(skdev); + break; + case FIT_SR_DRIVE_BUSY_SANITIZE: + /* set timer for 3 seconds, we'll abort any unfinished + * commands after that expires + */ + skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; + skdev->timer_countdown = SKD_TIMER_SECONDS(3); + blk_start_queue(skdev->queue); + break; + case FIT_SR_DRIVE_BUSY_ERASE: + skdev->state = SKD_DRVR_STATE_BUSY_ERASE; + skdev->timer_countdown = SKD_BUSY_TIMO; + break; + case FIT_SR_DRIVE_OFFLINE: + skdev->state = SKD_DRVR_STATE_IDLE; + break; + case FIT_SR_DRIVE_SOFT_RESET: + switch (skdev->state) { + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + /* Expected by a caller of skd_soft_reset() */ + break; + default: + skdev->state = SKD_DRVR_STATE_RESTARTING; + break; + } + break; + case FIT_SR_DRIVE_FW_BOOTING: + pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n", + skdev->name, __func__, __LINE__, skdev->name); + skdev->state = SKD_DRVR_STATE_WAIT_BOOT; + skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; + break; + + case FIT_SR_DRIVE_DEGRADED: + case FIT_SR_PCIE_LINK_DOWN: + case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: + break; + + case FIT_SR_DRIVE_FAULT: + skd_drive_fault(skdev); + skd_recover_requests(skdev, 0); + blk_start_queue(skdev->queue); + break; + + /* PCIe bus returned all Fs? */ + case 0xFF: + pr_info("(%s): state=0x%x sense=0x%x\n", + skd_name(skdev), state, sense); + skd_drive_disappeared(skdev); + skd_recover_requests(skdev, 0); + blk_start_queue(skdev->queue); + break; + default: + /* + * Uknown FW State. Wait for a state we recognize. + */ + break; + } + pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", + skd_name(skdev), + skd_skdev_state_to_str(prev_driver_state), prev_driver_state, + skd_skdev_state_to_str(skdev->state), skdev->state); +} + +static void skd_recover_requests(struct skd_device *skdev, int requeue) +{ + int i; + + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq = &skdev->skreq_table[i]; + + if (skreq->state == SKD_REQ_STATE_BUSY) { + skd_log_skreq(skdev, skreq, "recover"); + + SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0); + SKD_ASSERT(skreq->req != NULL); + + /* Release DMA resources for the request. */ + if (skreq->n_sg > 0) + skd_postop_sg_list(skdev, skreq); + + if (requeue && + (unsigned long) ++skreq->req->special < + SKD_MAX_RETRIES) + blk_requeue_request(skdev->queue, skreq->req); + else + skd_end_request(skdev, skreq, -EIO); + + skreq->req = NULL; + + skreq->state = SKD_REQ_STATE_IDLE; + skreq->id += SKD_ID_INCR; + } + if (i > 0) + skreq[-1].next = skreq; + skreq->next = NULL; + } + skdev->skreq_free_list = skdev->skreq_table; + + for (i = 0; i < skdev->num_fitmsg_context; i++) { + struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i]; + + if (skmsg->state == SKD_MSG_STATE_BUSY) { + skd_log_skmsg(skdev, skmsg, "salvaged"); + SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0); + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->id += SKD_ID_INCR; + } + if (i > 0) + skmsg[-1].next = skmsg; + skmsg->next = NULL; + } + skdev->skmsg_free_list = skdev->skmsg_table; + + for (i = 0; i < skdev->n_special; i++) { + struct skd_special_context *skspcl = &skdev->skspcl_table[i]; + + /* If orphaned, reclaim it because it has already been reported + * to the process as an error (it was just waiting for + * a completion that didn't come, and now it will never come) + * If busy, change to a state that will cause it to error + * out in the wait routine and let it do the normal + * reporting and reclaiming + */ + if (skspcl->req.state == SKD_REQ_STATE_BUSY) { + if (skspcl->orphaned) { + pr_debug("%s:%s:%d orphaned %p\n", + skdev->name, __func__, __LINE__, + skspcl); + skd_release_special(skdev, skspcl); + } else { + pr_debug("%s:%s:%d not orphaned %p\n", + skdev->name, __func__, __LINE__, + skspcl); + skspcl->req.state = SKD_REQ_STATE_ABORTED; + } + } + } + skdev->skspcl_free_list = skdev->skspcl_table; + + for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++) + skdev->timeout_slot[i] = 0; + + skdev->in_flight = 0; +} + +static void skd_isr_msg_from_dev(struct skd_device *skdev) +{ + u32 mfd; + u32 mtd; + u32 data; + + mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); + + pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n", + skdev->name, __func__, __LINE__, mfd, skdev->last_mtd); + + /* ignore any mtd that is an ack for something we didn't send */ + if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd)) + return; + + switch (FIT_MXD_TYPE(mfd)) { + case FIT_MTD_FITFW_INIT: + skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd); + + if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) { + pr_err("(%s): protocol mismatch\n", + skdev->name); + pr_err("(%s): got=%d support=%d\n", + skdev->name, skdev->proto_ver, + FIT_PROTOCOL_VERSION_1); + pr_err("(%s): please upgrade driver\n", + skdev->name); + skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH; + skd_soft_reset(skdev); + break; + } + mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_GET_CMDQ_DEPTH: + skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd); + mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0, + SKD_N_COMPLETION_ENTRY); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_SET_COMPQ_DEPTH: + SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG); + mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_SET_COMPQ_ADDR: + skd_reset_skcomp(skdev); + mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_CMD_LOG_HOST_ID: + skdev->connect_time_stamp = get_seconds(); + data = skdev->connect_time_stamp & 0xFFFF; + mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_CMD_LOG_TIME_STAMP_LO: + skdev->drive_jiffies = FIT_MXD_DATA(mfd); + data = (skdev->connect_time_stamp >> 16) & 0xFFFF; + mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + break; + + case FIT_MTD_CMD_LOG_TIME_STAMP_HI: + skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16); + mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0); + SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); + skdev->last_mtd = mtd; + + pr_err("(%s): Time sync driver=0x%x device=0x%x\n", + skd_name(skdev), + skdev->connect_time_stamp, skdev->drive_jiffies); + break; + + case FIT_MTD_ARM_QUEUE: + skdev->last_mtd = 0; + /* + * State should be, or soon will be, FIT_SR_DRIVE_ONLINE. + */ + break; + + default: + break; + } +} + +static void skd_disable_interrupts(struct skd_device *skdev) +{ + u32 sense; + + sense = SKD_READL(skdev, FIT_CONTROL); + sense &= ~FIT_CR_ENABLE_INTERRUPTS; + SKD_WRITEL(skdev, sense, FIT_CONTROL); + pr_debug("%s:%s:%d sense 0x%x\n", + skdev->name, __func__, __LINE__, sense); + + /* Note that the 1s is written. A 1-bit means + * disable, a 0 means enable. + */ + SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST); +} + +static void skd_enable_interrupts(struct skd_device *skdev) +{ + u32 val; + + /* unmask interrupts first */ + val = FIT_ISH_FW_STATE_CHANGE + + FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV; + + /* Note that the compliment of mask is written. A 1-bit means + * disable, a 0 means enable. */ + SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST); + pr_debug("%s:%s:%d interrupt mask=0x%x\n", + skdev->name, __func__, __LINE__, ~val); + + val = SKD_READL(skdev, FIT_CONTROL); + val |= FIT_CR_ENABLE_INTERRUPTS; + pr_debug("%s:%s:%d control=0x%x\n", + skdev->name, __func__, __LINE__, val); + SKD_WRITEL(skdev, val, FIT_CONTROL); +} + +/* + ***************************************************************************** + * START, STOP, RESTART, QUIESCE, UNQUIESCE + ***************************************************************************** + */ + +static void skd_soft_reset(struct skd_device *skdev) +{ + u32 val; + + val = SKD_READL(skdev, FIT_CONTROL); + val |= (FIT_CR_SOFT_RESET); + pr_debug("%s:%s:%d control=0x%x\n", + skdev->name, __func__, __LINE__, val); + SKD_WRITEL(skdev, val, FIT_CONTROL); +} + +static void skd_start_device(struct skd_device *skdev) +{ + unsigned long flags; + u32 sense; + u32 state; + + spin_lock_irqsave(&skdev->lock, flags); + + /* ack all ghost interrupts */ + SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); + + sense = SKD_READL(skdev, FIT_STATUS); + + pr_debug("%s:%s:%d initial status=0x%x\n", + skdev->name, __func__, __LINE__, sense); + + state = sense & FIT_SR_DRIVE_STATE_MASK; + skdev->drive_state = state; + skdev->last_mtd = 0; + + skdev->state = SKD_DRVR_STATE_STARTING; + skdev->timer_countdown = SKD_STARTING_TIMO; + + skd_enable_interrupts(skdev); + + switch (skdev->drive_state) { + case FIT_SR_DRIVE_OFFLINE: + pr_err("(%s): Drive offline...\n", skd_name(skdev)); + break; + + case FIT_SR_DRIVE_FW_BOOTING: + pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n", + skdev->name, __func__, __LINE__, skdev->name); + skdev->state = SKD_DRVR_STATE_WAIT_BOOT; + skdev->timer_countdown = SKD_WAIT_BOOT_TIMO; + break; + + case FIT_SR_DRIVE_BUSY_SANITIZE: + pr_info("(%s): Start: BUSY_SANITIZE\n", + skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE; + skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; + break; + + case FIT_SR_DRIVE_BUSY_ERASE: + pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY_ERASE; + skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; + break; + + case FIT_SR_DRIVE_INIT: + case FIT_SR_DRIVE_ONLINE: + skd_soft_reset(skdev); + break; + + case FIT_SR_DRIVE_BUSY: + pr_err("(%s): Drive Busy...\n", skd_name(skdev)); + skdev->state = SKD_DRVR_STATE_BUSY; + skdev->timer_countdown = SKD_STARTED_BUSY_TIMO; + break; + + case FIT_SR_DRIVE_SOFT_RESET: + pr_err("(%s) drive soft reset in prog\n", + skd_name(skdev)); + break; + + case FIT_SR_DRIVE_FAULT: + /* Fault state is bad...soft reset won't do it... + * Hard reset, maybe, but does it work on device? + * For now, just fault so the system doesn't hang. + */ + skd_drive_fault(skdev); + /*start the queue so we can respond with error to requests */ + pr_debug("%s:%s:%d starting %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + case 0xFF: + /* Most likely the device isn't there or isn't responding + * to the BAR1 addresses. */ + skd_drive_disappeared(skdev); + /*start the queue so we can respond with error to requests */ + pr_debug("%s:%s:%d starting %s queue to error-out reqs\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_start_queue(skdev->queue); + skdev->gendisk_on = -1; + wake_up_interruptible(&skdev->waitq); + break; + + default: + pr_err("(%s) Start: unknown state %x\n", + skd_name(skdev), skdev->drive_state); + break; + } + + state = SKD_READL(skdev, FIT_CONTROL); + pr_debug("%s:%s:%d FIT Control Status=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_INT_STATUS_HOST); + pr_debug("%s:%s:%d Intr Status=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_INT_MASK_HOST); + pr_debug("%s:%s:%d Intr Mask=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE); + pr_debug("%s:%s:%d Msg from Dev=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state = SKD_READL(skdev, FIT_HW_VERSION); + pr_debug("%s:%s:%d HW version=0x%x\n", + skdev->name, __func__, __LINE__, state); + + spin_unlock_irqrestore(&skdev->lock, flags); +} + +static void skd_stop_device(struct skd_device *skdev) +{ + unsigned long flags; + struct skd_special_context *skspcl = &skdev->internal_skspcl; + u32 dev_state; + int i; + + spin_lock_irqsave(&skdev->lock, flags); + + if (skdev->state != SKD_DRVR_STATE_ONLINE) { + pr_err("(%s): skd_stop_device not online no sync\n", + skd_name(skdev)); + goto stop_out; + } + + if (skspcl->req.state != SKD_REQ_STATE_IDLE) { + pr_err("(%s): skd_stop_device no special\n", + skd_name(skdev)); + goto stop_out; + } + + skdev->state = SKD_DRVR_STATE_SYNCING; + skdev->sync_done = 0; + + skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE); + + spin_unlock_irqrestore(&skdev->lock, flags); + + wait_event_interruptible_timeout(skdev->waitq, + (skdev->sync_done), (10 * HZ)); + + spin_lock_irqsave(&skdev->lock, flags); + + switch (skdev->sync_done) { + case 0: + pr_err("(%s): skd_stop_device no sync\n", + skd_name(skdev)); + break; + case 1: + pr_err("(%s): skd_stop_device sync done\n", + skd_name(skdev)); + break; + default: + pr_err("(%s): skd_stop_device sync error\n", + skd_name(skdev)); + } + +stop_out: + skdev->state = SKD_DRVR_STATE_STOPPING; + spin_unlock_irqrestore(&skdev->lock, flags); + + skd_kill_timer(skdev); + + spin_lock_irqsave(&skdev->lock, flags); + skd_disable_interrupts(skdev); + + /* ensure all ints on device are cleared */ + /* soft reset the device to unload with a clean slate */ + SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); + SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL); + + spin_unlock_irqrestore(&skdev->lock, flags); + + /* poll every 100ms, 1 second timeout */ + for (i = 0; i < 10; i++) { + dev_state = + SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK; + if (dev_state == FIT_SR_DRIVE_INIT) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(100)); + } + + if (dev_state != FIT_SR_DRIVE_INIT) + pr_err("(%s): skd_stop_device state error 0x%02x\n", + skd_name(skdev), dev_state); +} + +/* assume spinlock is held */ +static void skd_restart_device(struct skd_device *skdev) +{ + u32 state; + + /* ack all ghost interrupts */ + SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST); + + state = SKD_READL(skdev, FIT_STATUS); + + pr_debug("%s:%s:%d drive status=0x%x\n", + skdev->name, __func__, __LINE__, state); + + state &= FIT_SR_DRIVE_STATE_MASK; + skdev->drive_state = state; + skdev->last_mtd = 0; + + skdev->state = SKD_DRVR_STATE_RESTARTING; + skdev->timer_countdown = SKD_RESTARTING_TIMO; + + skd_soft_reset(skdev); +} + +/* assume spinlock is held */ +static int skd_quiesce_dev(struct skd_device *skdev) +{ + int rc = 0; + + switch (skdev->state) { + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + pr_debug("%s:%s:%d stopping %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_stop_queue(skdev->queue); + break; + case SKD_DRVR_STATE_ONLINE: + case SKD_DRVR_STATE_STOPPING: + case SKD_DRVR_STATE_SYNCING: + case SKD_DRVR_STATE_PAUSING: + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_RESUMING: + default: + rc = -EINVAL; + pr_debug("%s:%s:%d state [%d] not implemented\n", + skdev->name, __func__, __LINE__, skdev->state); + } + return rc; +} + +/* assume spinlock is held */ +static int skd_unquiesce_dev(struct skd_device *skdev) +{ + int prev_driver_state = skdev->state; + + skd_log_skdev(skdev, "unquiesce"); + if (skdev->state == SKD_DRVR_STATE_ONLINE) { + pr_debug("%s:%s:%d **** device already ONLINE\n", + skdev->name, __func__, __LINE__); + return 0; + } + if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) { + /* + * If there has been an state change to other than + * ONLINE, we will rely on controller state change + * to come back online and restart the queue. + * The BUSY state means that driver is ready to + * continue normal processing but waiting for controller + * to become available. + */ + skdev->state = SKD_DRVR_STATE_BUSY; + pr_debug("%s:%s:%d drive BUSY state\n", + skdev->name, __func__, __LINE__); + return 0; + } + + /* + * Drive has just come online, driver is either in startup, + * paused performing a task, or bust waiting for hardware. + */ + switch (skdev->state) { + case SKD_DRVR_STATE_PAUSED: + case SKD_DRVR_STATE_BUSY: + case SKD_DRVR_STATE_BUSY_IMMINENT: + case SKD_DRVR_STATE_BUSY_ERASE: + case SKD_DRVR_STATE_STARTING: + case SKD_DRVR_STATE_RESTARTING: + case SKD_DRVR_STATE_FAULT: + case SKD_DRVR_STATE_IDLE: + case SKD_DRVR_STATE_LOAD: + skdev->state = SKD_DRVR_STATE_ONLINE; + pr_err("(%s): Driver state %s(%d)=>%s(%d)\n", + skd_name(skdev), + skd_skdev_state_to_str(prev_driver_state), + prev_driver_state, skd_skdev_state_to_str(skdev->state), + skdev->state); + pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n", + skdev->name, __func__, __LINE__); + pr_debug("%s:%s:%d starting %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev)); + blk_start_queue(skdev->queue); + skdev->gendisk_on = 1; + wake_up_interruptible(&skdev->waitq); + break; + + case SKD_DRVR_STATE_DISAPPEARED: + default: + pr_debug("%s:%s:%d **** driver state %d, not implemented \n", + skdev->name, __func__, __LINE__, + skdev->state); + return -EBUSY; + } + return 0; +} + +/* + ***************************************************************************** + * PCIe MSI/MSI-X INTERRUPT HANDLERS + ***************************************************************************** + */ + +static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev), + irq, SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t skd_statec_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST); + skd_isr_fwstate(skdev); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t skd_comp_q(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + int flush_enqueued = 0; + int deferred; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST); + deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit, + &flush_enqueued); + if (flush_enqueued) + skd_request_fn(skdev->queue); + + if (deferred) + schedule_work(&skdev->completion_worker); + else if (!flush_enqueued) + skd_request_fn(skdev->queue); + + spin_unlock_irqrestore(&skdev->lock, flags); + + return IRQ_HANDLED; +} + +static irqreturn_t skd_msg_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST); + skd_isr_msg_from_dev(skdev); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data) +{ + struct skd_device *skdev = skd_host_data; + unsigned long flags; + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d MSIX = 0x%x\n", + skdev->name, __func__, __LINE__, + SKD_READL(skdev, FIT_INT_STATUS_HOST)); + SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST); + spin_unlock_irqrestore(&skdev->lock, flags); + return IRQ_HANDLED; +} + +/* + ***************************************************************************** + * PCIe MSI/MSI-X SETUP + ***************************************************************************** + */ + +struct skd_msix_entry { + int have_irq; + u32 vector; + u32 entry; + struct skd_device *rsp; + char isr_name[30]; +}; + +struct skd_init_msix_entry { + const char *name; + irq_handler_t handler; +}; + +#define SKD_MAX_MSIX_COUNT 13 +#define SKD_MIN_MSIX_COUNT 7 +#define SKD_BASE_MSIX_IRQ 4 + +static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = { + { "(DMA 0)", skd_reserved_isr }, + { "(DMA 1)", skd_reserved_isr }, + { "(DMA 2)", skd_reserved_isr }, + { "(DMA 3)", skd_reserved_isr }, + { "(State Change)", skd_statec_isr }, + { "(COMPL_Q)", skd_comp_q }, + { "(MSG)", skd_msg_isr }, + { "(Reserved)", skd_reserved_isr }, + { "(Reserved)", skd_reserved_isr }, + { "(Queue Full 0)", skd_qfull_isr }, + { "(Queue Full 1)", skd_qfull_isr }, + { "(Queue Full 2)", skd_qfull_isr }, + { "(Queue Full 3)", skd_qfull_isr }, +}; + +static void skd_release_msix(struct skd_device *skdev) +{ + struct skd_msix_entry *qentry; + int i; + + if (skdev->msix_entries) { + for (i = 0; i < skdev->msix_count; i++) { + qentry = &skdev->msix_entries[i]; + skdev = qentry->rsp; + + if (qentry->have_irq) + devm_free_irq(&skdev->pdev->dev, + qentry->vector, qentry->rsp); + } + + kfree(skdev->msix_entries); + } + + if (skdev->msix_count) + pci_disable_msix(skdev->pdev); + + skdev->msix_count = 0; + skdev->msix_entries = NULL; +} + +static int skd_acquire_msix(struct skd_device *skdev) +{ + int i, rc; + struct pci_dev *pdev = skdev->pdev; + struct msix_entry *entries; + struct skd_msix_entry *qentry; + + entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT, + GFP_KERNEL); + if (!entries) + return -ENOMEM; + + for (i = 0; i < SKD_MAX_MSIX_COUNT; i++) + entries[i].entry = i; + + rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT); + if (rc) { + pr_err("(%s): failed to enable MSI-X %d\n", + skd_name(skdev), rc); + goto msix_out; + } + + skdev->msix_count = SKD_MAX_MSIX_COUNT; + skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) * + skdev->msix_count, GFP_KERNEL); + if (!skdev->msix_entries) { + rc = -ENOMEM; + pr_err("(%s): msix table allocation error\n", + skd_name(skdev)); + goto msix_out; + } + + for (i = 0; i < skdev->msix_count; i++) { + qentry = &skdev->msix_entries[i]; + qentry->vector = entries[i].vector; + qentry->entry = entries[i].entry; + qentry->rsp = NULL; + qentry->have_irq = 0; + pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n", + skdev->name, __func__, __LINE__, + pci_name(pdev), skdev->name, + i, qentry->vector, qentry->entry); + } + + /* Enable MSI-X vectors for the base queue */ + for (i = 0; i < skdev->msix_count; i++) { + qentry = &skdev->msix_entries[i]; + snprintf(qentry->isr_name, sizeof(qentry->isr_name), + "%s%d-msix %s", DRV_NAME, skdev->devno, + msix_entries[i].name); + rc = devm_request_irq(&skdev->pdev->dev, qentry->vector, + msix_entries[i].handler, 0, + qentry->isr_name, skdev); + if (rc) { + pr_err("(%s): Unable to register(%d) MSI-X " + "handler %d: %s\n", + skd_name(skdev), rc, i, qentry->isr_name); + goto msix_out; + } else { + qentry->have_irq = 1; + qentry->rsp = skdev; + } + } + pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n", + skdev->name, __func__, __LINE__, + pci_name(pdev), skdev->name, skdev->msix_count); + return 0; + +msix_out: + if (entries) + kfree(entries); + skd_release_msix(skdev); + return rc; +} + +static int skd_acquire_irq(struct skd_device *skdev) +{ + int rc; + struct pci_dev *pdev; + + pdev = skdev->pdev; + skdev->msix_count = 0; + +RETRY_IRQ_TYPE: + switch (skdev->irq_type) { + case SKD_IRQ_MSIX: + rc = skd_acquire_msix(skdev); + if (!rc) + pr_info("(%s): MSI-X %d irqs enabled\n", + skd_name(skdev), skdev->msix_count); + else { + pr_err( + "(%s): failed to enable MSI-X, re-trying with MSI %d\n", + skd_name(skdev), rc); + skdev->irq_type = SKD_IRQ_MSI; + goto RETRY_IRQ_TYPE; + } + break; + case SKD_IRQ_MSI: + snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi", + DRV_NAME, skdev->devno); + rc = pci_enable_msi_range(pdev, 1, 1); + if (rc > 0) { + rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0, + skdev->isr_name, skdev); + if (rc) { + pci_disable_msi(pdev); + pr_err( + "(%s): failed to allocate the MSI interrupt %d\n", + skd_name(skdev), rc); + goto RETRY_IRQ_LEGACY; + } + pr_info("(%s): MSI irq %d enabled\n", + skd_name(skdev), pdev->irq); + } else { +RETRY_IRQ_LEGACY: + pr_err( + "(%s): failed to enable MSI, re-trying with LEGACY %d\n", + skd_name(skdev), rc); + skdev->irq_type = SKD_IRQ_LEGACY; + goto RETRY_IRQ_TYPE; + } + break; + case SKD_IRQ_LEGACY: + snprintf(skdev->isr_name, sizeof(skdev->isr_name), + "%s%d-legacy", DRV_NAME, skdev->devno); + rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, + IRQF_SHARED, skdev->isr_name, skdev); + if (!rc) + pr_info("(%s): LEGACY irq %d enabled\n", + skd_name(skdev), pdev->irq); + else + pr_err("(%s): request LEGACY irq error %d\n", + skd_name(skdev), rc); + break; + default: + pr_info("(%s): irq_type %d invalid, re-set to %d\n", + skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT); + skdev->irq_type = SKD_IRQ_LEGACY; + goto RETRY_IRQ_TYPE; + } + return rc; +} + +static void skd_release_irq(struct skd_device *skdev) +{ + switch (skdev->irq_type) { + case SKD_IRQ_MSIX: + skd_release_msix(skdev); + break; + case SKD_IRQ_MSI: + devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev); + pci_disable_msi(skdev->pdev); + break; + case SKD_IRQ_LEGACY: + devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev); + break; + default: + pr_err("(%s): wrong irq type %d!", + skd_name(skdev), skdev->irq_type); + break; + } +} + +/* + ***************************************************************************** + * CONSTRUCT + ***************************************************************************** + */ + +static int skd_cons_skcomp(struct skd_device *skdev) +{ + int rc = 0; + struct fit_completion_entry_v1 *skcomp; + u32 nbytes; + + nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY; + nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY; + + pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n", + skdev->name, __func__, __LINE__, + nbytes, SKD_N_COMPLETION_ENTRY); + + skcomp = pci_zalloc_consistent(skdev->pdev, nbytes, + &skdev->cq_dma_address); + + if (skcomp == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skdev->skcomp_table = skcomp; + skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp + + sizeof(*skcomp) * + SKD_N_COMPLETION_ENTRY); + +err_out: + return rc; +} + +static int skd_cons_skmsg(struct skd_device *skdev) +{ + int rc = 0; + u32 i; + + pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n", + skdev->name, __func__, __LINE__, + sizeof(struct skd_fitmsg_context), + skdev->num_fitmsg_context, + sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context); + + skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context) + *skdev->num_fitmsg_context, GFP_KERNEL); + if (skdev->skmsg_table == NULL) { + rc = -ENOMEM; + goto err_out; + } + + for (i = 0; i < skdev->num_fitmsg_context; i++) { + struct skd_fitmsg_context *skmsg; + + skmsg = &skdev->skmsg_table[i]; + + skmsg->id = i + SKD_ID_FIT_MSG; + + skmsg->state = SKD_MSG_STATE_IDLE; + skmsg->msg_buf = pci_alloc_consistent(skdev->pdev, + SKD_N_FITMSG_BYTES + 64, + &skmsg->mb_dma_address); + + if (skmsg->msg_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skmsg->offset = (u32)((u64)skmsg->msg_buf & + (~FIT_QCMD_BASE_ADDRESS_MASK)); + skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK; + skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf & + FIT_QCMD_BASE_ADDRESS_MASK); + skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK; + skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK; + memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES); + + skmsg->next = &skmsg[1]; + } + + /* Free list is in order starting with the 0th entry. */ + skdev->skmsg_table[i - 1].next = NULL; + skdev->skmsg_free_list = skdev->skmsg_table; + +err_out: + return rc; +} + +static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev, + u32 n_sg, + dma_addr_t *ret_dma_addr) +{ + struct fit_sg_descriptor *sg_list; + u32 nbytes; + + nbytes = sizeof(*sg_list) * n_sg; + + sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr); + + if (sg_list != NULL) { + uint64_t dma_address = *ret_dma_addr; + u32 i; + + memset(sg_list, 0, nbytes); + + for (i = 0; i < n_sg - 1; i++) { + uint64_t ndp_off; + ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor); + + sg_list[i].next_desc_ptr = dma_address + ndp_off; + } + sg_list[i].next_desc_ptr = 0LL; + } + + return sg_list; +} + +static int skd_cons_skreq(struct skd_device *skdev) +{ + int rc = 0; + u32 i; + + pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n", + skdev->name, __func__, __LINE__, + sizeof(struct skd_request_context), + skdev->num_req_context, + sizeof(struct skd_request_context) * skdev->num_req_context); + + skdev->skreq_table = kzalloc(sizeof(struct skd_request_context) + * skdev->num_req_context, GFP_KERNEL); + if (skdev->skreq_table == NULL) { + rc = -ENOMEM; + goto err_out; + } + + pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n", + skdev->name, __func__, __LINE__, + skdev->sgs_per_request, sizeof(struct scatterlist), + skdev->sgs_per_request * sizeof(struct scatterlist)); + + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq; + + skreq = &skdev->skreq_table[i]; + + skreq->id = i + SKD_ID_RW_REQUEST; + skreq->state = SKD_REQ_STATE_IDLE; + + skreq->sg = kzalloc(sizeof(struct scatterlist) * + skdev->sgs_per_request, GFP_KERNEL); + if (skreq->sg == NULL) { + rc = -ENOMEM; + goto err_out; + } + sg_init_table(skreq->sg, skdev->sgs_per_request); + + skreq->sksg_list = skd_cons_sg_list(skdev, + skdev->sgs_per_request, + &skreq->sksg_dma_address); + + if (skreq->sksg_list == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skreq->next = &skreq[1]; + } + + /* Free list is in order starting with the 0th entry. */ + skdev->skreq_table[i - 1].next = NULL; + skdev->skreq_free_list = skdev->skreq_table; + +err_out: + return rc; +} + +static int skd_cons_skspcl(struct skd_device *skdev) +{ + int rc = 0; + u32 i, nbytes; + + pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n", + skdev->name, __func__, __LINE__, + sizeof(struct skd_special_context), + skdev->n_special, + sizeof(struct skd_special_context) * skdev->n_special); + + skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context) + * skdev->n_special, GFP_KERNEL); + if (skdev->skspcl_table == NULL) { + rc = -ENOMEM; + goto err_out; + } + + for (i = 0; i < skdev->n_special; i++) { + struct skd_special_context *skspcl; + + skspcl = &skdev->skspcl_table[i]; + + skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST; + skspcl->req.state = SKD_REQ_STATE_IDLE; + + skspcl->req.next = &skspcl[1].req; + + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + + skspcl->msg_buf = + pci_zalloc_consistent(skdev->pdev, nbytes, + &skspcl->mb_dma_address); + if (skspcl->msg_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skspcl->req.sg = kzalloc(sizeof(struct scatterlist) * + SKD_N_SG_PER_SPECIAL, GFP_KERNEL); + if (skspcl->req.sg == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skspcl->req.sksg_list = skd_cons_sg_list(skdev, + SKD_N_SG_PER_SPECIAL, + &skspcl->req. + sksg_dma_address); + if (skspcl->req.sksg_list == NULL) { + rc = -ENOMEM; + goto err_out; + } + } + + /* Free list is in order starting with the 0th entry. */ + skdev->skspcl_table[i - 1].req.next = NULL; + skdev->skspcl_free_list = skdev->skspcl_table; + + return rc; + +err_out: + return rc; +} + +static int skd_cons_sksb(struct skd_device *skdev) +{ + int rc = 0; + struct skd_special_context *skspcl; + u32 nbytes; + + skspcl = &skdev->internal_skspcl; + + skspcl->req.id = 0 + SKD_ID_INTERNAL; + skspcl->req.state = SKD_REQ_STATE_IDLE; + + nbytes = SKD_N_INTERNAL_BYTES; + + skspcl->data_buf = pci_zalloc_consistent(skdev->pdev, nbytes, + &skspcl->db_dma_address); + if (skspcl->data_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + skspcl->msg_buf = pci_zalloc_consistent(skdev->pdev, nbytes, + &skspcl->mb_dma_address); + if (skspcl->msg_buf == NULL) { + rc = -ENOMEM; + goto err_out; + } + + skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1, + &skspcl->req.sksg_dma_address); + if (skspcl->req.sksg_list == NULL) { + rc = -ENOMEM; + goto err_out; + } + + if (!skd_format_internal_skspcl(skdev)) { + rc = -EINVAL; + goto err_out; + } + +err_out: + return rc; +} + +static int skd_cons_disk(struct skd_device *skdev) +{ + int rc = 0; + struct gendisk *disk; + struct request_queue *q; + unsigned long flags; + + disk = alloc_disk(SKD_MINORS_PER_DEVICE); + if (!disk) { + rc = -ENOMEM; + goto err_out; + } + + skdev->disk = disk; + sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno); + + disk->major = skdev->major; + disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE; + disk->fops = &skd_blockdev_ops; + disk->private_data = skdev; + + q = blk_init_queue(skd_request_fn, &skdev->lock); + if (!q) { + rc = -ENOMEM; + goto err_out; + } + + skdev->queue = q; + disk->queue = q; + q->queuedata = skdev; + + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); + blk_queue_max_segments(q, skdev->sgs_per_request); + blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS); + + /* set sysfs ptimal_io_size to 8K */ + blk_queue_io_opt(q, 8192); + + /* DISCARD Flag initialization. */ + q->limits.discard_granularity = 8192; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = UINT_MAX >> 9; + q->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); + + spin_lock_irqsave(&skdev->lock, flags); + pr_debug("%s:%s:%d stopping %s queue\n", + skdev->name, __func__, __LINE__, skdev->name); + blk_stop_queue(skdev->queue); + spin_unlock_irqrestore(&skdev->lock, flags); + +err_out: + return rc; +} + +#define SKD_N_DEV_TABLE 16u +static u32 skd_next_devno; + +static struct skd_device *skd_construct(struct pci_dev *pdev) +{ + struct skd_device *skdev; + int blk_major = skd_major; + int rc; + + skdev = kzalloc(sizeof(*skdev), GFP_KERNEL); + + if (!skdev) { + pr_err(PFX "(%s): memory alloc failure\n", + pci_name(pdev)); + return NULL; + } + + skdev->state = SKD_DRVR_STATE_LOAD; + skdev->pdev = pdev; + skdev->devno = skd_next_devno++; + skdev->major = blk_major; + skdev->irq_type = skd_isr_type; + sprintf(skdev->name, DRV_NAME "%d", skdev->devno); + skdev->dev_max_queue_depth = 0; + + skdev->num_req_context = skd_max_queue_depth; + skdev->num_fitmsg_context = skd_max_queue_depth; + skdev->n_special = skd_max_pass_thru; + skdev->cur_max_queue_depth = 1; + skdev->queue_low_water_mark = 1; + skdev->proto_ver = 99; + skdev->sgs_per_request = skd_sgs_per_request; + skdev->dbg_level = skd_dbg_level; + + atomic_set(&skdev->device_count, 0); + + spin_lock_init(&skdev->lock); + + INIT_WORK(&skdev->completion_worker, skd_completion_worker); + + pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skcomp(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skmsg(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skreq(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); + rc = skd_cons_skspcl(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); + rc = skd_cons_sksb(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); + rc = skd_cons_disk(skdev); + if (rc < 0) + goto err_out; + + pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__); + return skdev; + +err_out: + pr_debug("%s:%s:%d construct failed\n", + skdev->name, __func__, __LINE__); + skd_destruct(skdev); + return NULL; +} + +/* + ***************************************************************************** + * DESTRUCT (FREE) + ***************************************************************************** + */ + +static void skd_free_skcomp(struct skd_device *skdev) +{ + if (skdev->skcomp_table != NULL) { + u32 nbytes; + + nbytes = sizeof(skdev->skcomp_table[0]) * + SKD_N_COMPLETION_ENTRY; + pci_free_consistent(skdev->pdev, nbytes, + skdev->skcomp_table, skdev->cq_dma_address); + } + + skdev->skcomp_table = NULL; + skdev->cq_dma_address = 0; +} + +static void skd_free_skmsg(struct skd_device *skdev) +{ + u32 i; + + if (skdev->skmsg_table == NULL) + return; + + for (i = 0; i < skdev->num_fitmsg_context; i++) { + struct skd_fitmsg_context *skmsg; + + skmsg = &skdev->skmsg_table[i]; + + if (skmsg->msg_buf != NULL) { + skmsg->msg_buf += skmsg->offset; + skmsg->mb_dma_address += skmsg->offset; + pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES, + skmsg->msg_buf, + skmsg->mb_dma_address); + } + skmsg->msg_buf = NULL; + skmsg->mb_dma_address = 0; + } + + kfree(skdev->skmsg_table); + skdev->skmsg_table = NULL; +} + +static void skd_free_sg_list(struct skd_device *skdev, + struct fit_sg_descriptor *sg_list, + u32 n_sg, dma_addr_t dma_addr) +{ + if (sg_list != NULL) { + u32 nbytes; + + nbytes = sizeof(*sg_list) * n_sg; + + pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr); + } +} + +static void skd_free_skreq(struct skd_device *skdev) +{ + u32 i; + + if (skdev->skreq_table == NULL) + return; + + for (i = 0; i < skdev->num_req_context; i++) { + struct skd_request_context *skreq; + + skreq = &skdev->skreq_table[i]; + + skd_free_sg_list(skdev, skreq->sksg_list, + skdev->sgs_per_request, + skreq->sksg_dma_address); + + skreq->sksg_list = NULL; + skreq->sksg_dma_address = 0; + + kfree(skreq->sg); + } + + kfree(skdev->skreq_table); + skdev->skreq_table = NULL; +} + +static void skd_free_skspcl(struct skd_device *skdev) +{ + u32 i; + u32 nbytes; + + if (skdev->skspcl_table == NULL) + return; + + for (i = 0; i < skdev->n_special; i++) { + struct skd_special_context *skspcl; + + skspcl = &skdev->skspcl_table[i]; + + if (skspcl->msg_buf != NULL) { + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + pci_free_consistent(skdev->pdev, nbytes, + skspcl->msg_buf, + skspcl->mb_dma_address); + } + + skspcl->msg_buf = NULL; + skspcl->mb_dma_address = 0; + + skd_free_sg_list(skdev, skspcl->req.sksg_list, + SKD_N_SG_PER_SPECIAL, + skspcl->req.sksg_dma_address); + + skspcl->req.sksg_list = NULL; + skspcl->req.sksg_dma_address = 0; + + kfree(skspcl->req.sg); + } + + kfree(skdev->skspcl_table); + skdev->skspcl_table = NULL; +} + +static void skd_free_sksb(struct skd_device *skdev) +{ + struct skd_special_context *skspcl; + u32 nbytes; + + skspcl = &skdev->internal_skspcl; + + if (skspcl->data_buf != NULL) { + nbytes = SKD_N_INTERNAL_BYTES; + + pci_free_consistent(skdev->pdev, nbytes, + skspcl->data_buf, skspcl->db_dma_address); + } + + skspcl->data_buf = NULL; + skspcl->db_dma_address = 0; + + if (skspcl->msg_buf != NULL) { + nbytes = SKD_N_SPECIAL_FITMSG_BYTES; + pci_free_consistent(skdev->pdev, nbytes, + skspcl->msg_buf, skspcl->mb_dma_address); + } + + skspcl->msg_buf = NULL; + skspcl->mb_dma_address = 0; + + skd_free_sg_list(skdev, skspcl->req.sksg_list, 1, + skspcl->req.sksg_dma_address); + + skspcl->req.sksg_list = NULL; + skspcl->req.sksg_dma_address = 0; +} + +static void skd_free_disk(struct skd_device *skdev) +{ + struct gendisk *disk = skdev->disk; + + if (disk != NULL) { + struct request_queue *q = disk->queue; + + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (q) + blk_cleanup_queue(q); + put_disk(disk); + } + skdev->disk = NULL; +} + +static void skd_destruct(struct skd_device *skdev) +{ + if (skdev == NULL) + return; + + + pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__); + skd_free_disk(skdev); + + pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__); + skd_free_sksb(skdev); + + pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__); + skd_free_skspcl(skdev); + + pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__); + skd_free_skreq(skdev); + + pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__); + skd_free_skmsg(skdev); + + pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__); + skd_free_skcomp(skdev); + + pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__); + kfree(skdev); +} + +/* + ***************************************************************************** + * BLOCK DEVICE (BDEV) GLUE + ***************************************************************************** + */ + +static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct skd_device *skdev; + u64 capacity; + + skdev = bdev->bd_disk->private_data; + + pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n", + skdev->name, __func__, __LINE__, + bdev->bd_disk->disk_name, current->comm); + + if (skdev->read_cap_is_valid) { + capacity = get_capacity(skdev->disk); + geo->heads = 64; + geo->sectors = 255; + geo->cylinders = (capacity) / (255 * 64); + + return 0; + } + return -EIO; +} + +static int skd_bdev_attach(struct skd_device *skdev) +{ + pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); + add_disk(skdev->disk); + return 0; +} + +static const struct block_device_operations skd_blockdev_ops = { + .owner = THIS_MODULE, + .ioctl = skd_bdev_ioctl, + .getgeo = skd_bdev_getgeo, +}; + + +/* + ***************************************************************************** + * PCIe DRIVER GLUE + ***************************************************************************** + */ + +static const struct pci_device_id skd_pci_tbl[] = { + { PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, + { 0 } /* terminate list */ +}; + +MODULE_DEVICE_TABLE(pci, skd_pci_tbl); + +static char *skd_pci_info(struct skd_device *skdev, char *str) +{ + int pcie_reg; + + strcpy(str, "PCIe ("); + pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP); + + if (pcie_reg) { + + char lwstr[6]; + uint16_t pcie_lstat, lspeed, lwidth; + + pcie_reg += 0x12; + pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat); + lspeed = pcie_lstat & (0xF); + lwidth = (pcie_lstat & 0x3F0) >> 4; + + if (lspeed == 1) + strcat(str, "2.5GT/s "); + else if (lspeed == 2) + strcat(str, "5.0GT/s "); + else + strcat(str, " "); + snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth); + strcat(str, lwstr); + } + return str; +} + +static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int i; + int rc = 0; + char pci_str[32]; + struct skd_device *skdev; + + pr_info("STEC s1120 Driver(%s) version %s-b%s\n", + DRV_NAME, DRV_VERSION, DRV_BUILD_ID); + pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n", + pci_name(pdev), pdev->vendor, pdev->device); + + rc = pci_enable_device(pdev); + if (rc) + return rc; + rc = pci_request_regions(pdev, DRV_NAME); + if (rc) + goto err_out; + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!rc) { + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { + + pr_err("(%s): consistent DMA mask error %d\n", + pci_name(pdev), rc); + } + } else { + (rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32))); + if (rc) { + + pr_err("(%s): DMA mask error %d\n", + pci_name(pdev), rc); + goto err_out_regions; + } + } + + if (!skd_major) { + rc = register_blkdev(0, DRV_NAME); + if (rc < 0) + goto err_out_regions; + BUG_ON(!rc); + skd_major = rc; + } + + skdev = skd_construct(pdev); + if (skdev == NULL) { + rc = -ENOMEM; + goto err_out_regions; + } + + skd_pci_info(skdev, pci_str); + pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str); + + pci_set_master(pdev); + rc = pci_enable_pcie_error_reporting(pdev); + if (rc) { + pr_err( + "(%s): bad enable of PCIe error reporting rc=%d\n", + skd_name(skdev), rc); + skdev->pcie_error_reporting_is_enabled = 0; + } else + skdev->pcie_error_reporting_is_enabled = 1; + + + pci_set_drvdata(pdev, skdev); + + skdev->disk->driverfs_dev = &pdev->dev; + + for (i = 0; i < SKD_MAX_BARS; i++) { + skdev->mem_phys[i] = pci_resource_start(pdev, i); + skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); + skdev->mem_map[i] = ioremap(skdev->mem_phys[i], + skdev->mem_size[i]); + if (!skdev->mem_map[i]) { + pr_err("(%s): Unable to map adapter memory!\n", + skd_name(skdev)); + rc = -ENODEV; + goto err_out_iounmap; + } + pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", + skdev->name, __func__, __LINE__, + skdev->mem_map[i], + (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); + } + + rc = skd_acquire_irq(skdev); + if (rc) { + pr_err("(%s): interrupt resource error %d\n", + skd_name(skdev), rc); + goto err_out_iounmap; + } + + rc = skd_start_timer(skdev); + if (rc) + goto err_out_timer; + + init_waitqueue_head(&skdev->waitq); + + skd_start_device(skdev); + + rc = wait_event_interruptible_timeout(skdev->waitq, + (skdev->gendisk_on), + (SKD_START_WAIT_SECONDS * HZ)); + if (skdev->gendisk_on > 0) { + /* device came on-line after reset */ + skd_bdev_attach(skdev); + rc = 0; + } else { + /* we timed out, something is wrong with the device, + don't add the disk structure */ + pr_err( + "(%s): error: waiting for s1120 timed out %d!\n", + skd_name(skdev), rc); + /* in case of no error; we timeout with ENXIO */ + if (!rc) + rc = -ENXIO; + goto err_out_timer; + } + + +#ifdef SKD_VMK_POLL_HANDLER + if (skdev->irq_type == SKD_IRQ_MSIX) { + /* MSIX completion handler is being used for coredump */ + vmklnx_scsi_register_poll_handler(skdev->scsi_host, + skdev->msix_entries[5].vector, + skd_comp_q, skdev); + } else { + vmklnx_scsi_register_poll_handler(skdev->scsi_host, + skdev->pdev->irq, skd_isr, + skdev); + } +#endif /* SKD_VMK_POLL_HANDLER */ + + return rc; + +err_out_timer: + skd_stop_device(skdev); + skd_release_irq(skdev); + +err_out_iounmap: + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap(skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + + skd_destruct(skdev); + +err_out_regions: + pci_release_regions(pdev); + +err_out: + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + return rc; +} + +static void skd_pci_remove(struct pci_dev *pdev) +{ + int i; + struct skd_device *skdev; + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return; + } + skd_stop_device(skdev); + skd_release_irq(skdev); + + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap((u32 *)skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + + skd_destruct(skdev); + + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); + + return; +} + +static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state) +{ + int i; + struct skd_device *skdev; + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return -EIO; + } + + skd_stop_device(skdev); + + skd_release_irq(skdev); + + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap((u32 *)skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + + pci_release_regions(pdev); + pci_save_state(pdev); + pci_disable_device(pdev); + pci_set_power_state(pdev, pci_choose_state(pdev, state)); + return 0; +} + +static int skd_pci_resume(struct pci_dev *pdev) +{ + int i; + int rc = 0; + struct skd_device *skdev; + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return -1; + } + + pci_set_power_state(pdev, PCI_D0); + pci_enable_wake(pdev, PCI_D0, 0); + pci_restore_state(pdev); + + rc = pci_enable_device(pdev); + if (rc) + return rc; + rc = pci_request_regions(pdev, DRV_NAME); + if (rc) + goto err_out; + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!rc) { + if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { + + pr_err("(%s): consistent DMA mask error %d\n", + pci_name(pdev), rc); + } + } else { + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) { + + pr_err("(%s): DMA mask error %d\n", + pci_name(pdev), rc); + goto err_out_regions; + } + } + + pci_set_master(pdev); + rc = pci_enable_pcie_error_reporting(pdev); + if (rc) { + pr_err("(%s): bad enable of PCIe error reporting rc=%d\n", + skdev->name, rc); + skdev->pcie_error_reporting_is_enabled = 0; + } else + skdev->pcie_error_reporting_is_enabled = 1; + + for (i = 0; i < SKD_MAX_BARS; i++) { + + skdev->mem_phys[i] = pci_resource_start(pdev, i); + skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); + skdev->mem_map[i] = ioremap(skdev->mem_phys[i], + skdev->mem_size[i]); + if (!skdev->mem_map[i]) { + pr_err("(%s): Unable to map adapter memory!\n", + skd_name(skdev)); + rc = -ENODEV; + goto err_out_iounmap; + } + pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n", + skdev->name, __func__, __LINE__, + skdev->mem_map[i], + (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]); + } + rc = skd_acquire_irq(skdev); + if (rc) { + + pr_err("(%s): interrupt resource error %d\n", + pci_name(pdev), rc); + goto err_out_iounmap; + } + + rc = skd_start_timer(skdev); + if (rc) + goto err_out_timer; + + init_waitqueue_head(&skdev->waitq); + + skd_start_device(skdev); + + return rc; + +err_out_timer: + skd_stop_device(skdev); + skd_release_irq(skdev); + +err_out_iounmap: + for (i = 0; i < SKD_MAX_BARS; i++) + if (skdev->mem_map[i]) + iounmap(skdev->mem_map[i]); + + if (skdev->pcie_error_reporting_is_enabled) + pci_disable_pcie_error_reporting(pdev); + +err_out_regions: + pci_release_regions(pdev); + +err_out: + pci_disable_device(pdev); + return rc; +} + +static void skd_pci_shutdown(struct pci_dev *pdev) +{ + struct skd_device *skdev; + + pr_err("skd_pci_shutdown called\n"); + + skdev = pci_get_drvdata(pdev); + if (!skdev) { + pr_err("%s: no device data for PCI\n", pci_name(pdev)); + return; + } + + pr_err("%s: calling stop\n", skd_name(skdev)); + skd_stop_device(skdev); +} + +static struct pci_driver skd_driver = { + .name = DRV_NAME, + .id_table = skd_pci_tbl, + .probe = skd_pci_probe, + .remove = skd_pci_remove, + .suspend = skd_pci_suspend, + .resume = skd_pci_resume, + .shutdown = skd_pci_shutdown, +}; + +/* + ***************************************************************************** + * LOGGING SUPPORT + ***************************************************************************** + */ + +static const char *skd_name(struct skd_device *skdev) +{ + memset(skdev->id_str, 0, sizeof(skdev->id_str)); + + if (skdev->inquiry_is_valid) + snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]", + skdev->name, skdev->inq_serial_num, + pci_name(skdev->pdev)); + else + snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]", + skdev->name, pci_name(skdev->pdev)); + + return skdev->id_str; +} + +const char *skd_drive_state_to_str(int state) +{ + switch (state) { + case FIT_SR_DRIVE_OFFLINE: + return "OFFLINE"; + case FIT_SR_DRIVE_INIT: + return "INIT"; + case FIT_SR_DRIVE_ONLINE: + return "ONLINE"; + case FIT_SR_DRIVE_BUSY: + return "BUSY"; + case FIT_SR_DRIVE_FAULT: + return "FAULT"; + case FIT_SR_DRIVE_DEGRADED: + return "DEGRADED"; + case FIT_SR_PCIE_LINK_DOWN: + return "INK_DOWN"; + case FIT_SR_DRIVE_SOFT_RESET: + return "SOFT_RESET"; + case FIT_SR_DRIVE_NEED_FW_DOWNLOAD: + return "NEED_FW"; + case FIT_SR_DRIVE_INIT_FAULT: + return "INIT_FAULT"; + case FIT_SR_DRIVE_BUSY_SANITIZE: + return "BUSY_SANITIZE"; + case FIT_SR_DRIVE_BUSY_ERASE: + return "BUSY_ERASE"; + case FIT_SR_DRIVE_FW_BOOTING: + return "FW_BOOTING"; + default: + return "???"; + } +} + +const char *skd_skdev_state_to_str(enum skd_drvr_state state) +{ + switch (state) { + case SKD_DRVR_STATE_LOAD: + return "LOAD"; + case SKD_DRVR_STATE_IDLE: + return "IDLE"; + case SKD_DRVR_STATE_BUSY: + return "BUSY"; + case SKD_DRVR_STATE_STARTING: + return "STARTING"; + case SKD_DRVR_STATE_ONLINE: + return "ONLINE"; + case SKD_DRVR_STATE_PAUSING: + return "PAUSING"; + case SKD_DRVR_STATE_PAUSED: + return "PAUSED"; + case SKD_DRVR_STATE_DRAINING_TIMEOUT: + return "DRAINING_TIMEOUT"; + case SKD_DRVR_STATE_RESTARTING: + return "RESTARTING"; + case SKD_DRVR_STATE_RESUMING: + return "RESUMING"; + case SKD_DRVR_STATE_STOPPING: + return "STOPPING"; + case SKD_DRVR_STATE_SYNCING: + return "SYNCING"; + case SKD_DRVR_STATE_FAULT: + return "FAULT"; + case SKD_DRVR_STATE_DISAPPEARED: + return "DISAPPEARED"; + case SKD_DRVR_STATE_BUSY_ERASE: + return "BUSY_ERASE"; + case SKD_DRVR_STATE_BUSY_SANITIZE: + return "BUSY_SANITIZE"; + case SKD_DRVR_STATE_BUSY_IMMINENT: + return "BUSY_IMMINENT"; + case SKD_DRVR_STATE_WAIT_BOOT: + return "WAIT_BOOT"; + + default: + return "???"; + } +} + +static const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state) +{ + switch (state) { + case SKD_MSG_STATE_IDLE: + return "IDLE"; + case SKD_MSG_STATE_BUSY: + return "BUSY"; + default: + return "???"; + } +} + +static const char *skd_skreq_state_to_str(enum skd_req_state state) +{ + switch (state) { + case SKD_REQ_STATE_IDLE: + return "IDLE"; + case SKD_REQ_STATE_SETUP: + return "SETUP"; + case SKD_REQ_STATE_BUSY: + return "BUSY"; + case SKD_REQ_STATE_COMPLETED: + return "COMPLETED"; + case SKD_REQ_STATE_TIMEOUT: + return "TIMEOUT"; + case SKD_REQ_STATE_ABORTED: + return "ABORTED"; + default: + return "???"; + } +} + +static void skd_log_skdev(struct skd_device *skdev, const char *event) +{ + pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n", + skdev->name, __func__, __LINE__, skdev->name, skdev, event); + pr_debug("%s:%s:%d drive_state=%s(%d) driver_state=%s(%d)\n", + skdev->name, __func__, __LINE__, + skd_drive_state_to_str(skdev->drive_state), skdev->drive_state, + skd_skdev_state_to_str(skdev->state), skdev->state); + pr_debug("%s:%s:%d busy=%d limit=%d dev=%d lowat=%d\n", + skdev->name, __func__, __LINE__, + skdev->in_flight, skdev->cur_max_queue_depth, + skdev->dev_max_queue_depth, skdev->queue_low_water_mark); + pr_debug("%s:%s:%d timestamp=0x%x cycle=%d cycle_ix=%d\n", + skdev->name, __func__, __LINE__, + skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix); +} + +static void skd_log_skmsg(struct skd_device *skdev, + struct skd_fitmsg_context *skmsg, const char *event) +{ + pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n", + skdev->name, __func__, __LINE__, skdev->name, skmsg, event); + pr_debug("%s:%s:%d state=%s(%d) id=0x%04x length=%d\n", + skdev->name, __func__, __LINE__, + skd_skmsg_state_to_str(skmsg->state), skmsg->state, + skmsg->id, skmsg->length); +} + +static void skd_log_skreq(struct skd_device *skdev, + struct skd_request_context *skreq, const char *event) +{ + pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n", + skdev->name, __func__, __LINE__, skdev->name, skreq, event); + pr_debug("%s:%s:%d state=%s(%d) id=0x%04x fitmsg=0x%04x\n", + skdev->name, __func__, __LINE__, + skd_skreq_state_to_str(skreq->state), skreq->state, + skreq->id, skreq->fitmsg_id); + pr_debug("%s:%s:%d timo=0x%x sg_dir=%d n_sg=%d\n", + skdev->name, __func__, __LINE__, + skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg); + + if (skreq->req != NULL) { + struct request *req = skreq->req; + u32 lba = (u32)blk_rq_pos(req); + u32 count = blk_rq_sectors(req); + + pr_debug("%s:%s:%d " + "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n", + skdev->name, __func__, __LINE__, + req, lba, lba, count, count, + (int)rq_data_dir(req)); + } else + pr_debug("%s:%s:%d req=NULL\n", + skdev->name, __func__, __LINE__); +} + +/* + ***************************************************************************** + * MODULE GLUE + ***************************************************************************** + */ + +static int __init skd_init(void) +{ + pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID); + + switch (skd_isr_type) { + case SKD_IRQ_LEGACY: + case SKD_IRQ_MSI: + case SKD_IRQ_MSIX: + break; + default: + pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n", + skd_isr_type, SKD_IRQ_DEFAULT); + skd_isr_type = SKD_IRQ_DEFAULT; + } + + if (skd_max_queue_depth < 1 || + skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) { + pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n", + skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT); + skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT; + } + + if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) { + pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n", + skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT); + skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT; + } + + if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) { + pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n", + skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT); + skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT; + } + + if (skd_dbg_level < 0 || skd_dbg_level > 2) { + pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n", + skd_dbg_level, 0); + skd_dbg_level = 0; + } + + if (skd_isr_comp_limit < 0) { + pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n", + skd_isr_comp_limit, 0); + skd_isr_comp_limit = 0; + } + + if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) { + pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n", + skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT); + skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT; + } + + return pci_register_driver(&skd_driver); +} + +static void __exit skd_exit(void) +{ + pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID); + + pci_unregister_driver(&skd_driver); + + if (skd_major) + unregister_blkdev(skd_major, DRV_NAME); +} + +module_init(skd_init); +module_exit(skd_exit); diff --git a/kernel/drivers/block/skd_s1120.h b/kernel/drivers/block/skd_s1120.h new file mode 100644 index 000000000..61c757ff0 --- /dev/null +++ b/kernel/drivers/block/skd_s1120.h @@ -0,0 +1,330 @@ +/* Copyright 2012 STEC, Inc. + * + * This file is licensed under the terms of the 3-clause + * BSD License (http://opensource.org/licenses/BSD-3-Clause) + * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html), + * at your option. Both licenses are also available in the LICENSE file + * distributed with this project. This file may not be copied, modified, + * or distributed except in accordance with those terms. + */ + + +#ifndef SKD_S1120_H +#define SKD_S1120_H + +#pragma pack(push, s1120_h, 1) + +/* + * Q-channel, 64-bit r/w + */ +#define FIT_Q_COMMAND 0x400u +#define FIT_QCMD_QID_MASK (0x3 << 1) +#define FIT_QCMD_QID0 (0x0 << 1) +#define FIT_QCMD_QID_NORMAL FIT_QCMD_QID0 +#define FIT_QCMD_QID1 (0x1 << 1) +#define FIT_QCMD_QID2 (0x2 << 1) +#define FIT_QCMD_QID3 (0x3 << 1) +#define FIT_QCMD_FLUSH_QUEUE (0ull) /* add QID */ +#define FIT_QCMD_MSGSIZE_MASK (0x3 << 4) +#define FIT_QCMD_MSGSIZE_64 (0x0 << 4) +#define FIT_QCMD_MSGSIZE_128 (0x1 << 4) +#define FIT_QCMD_MSGSIZE_256 (0x2 << 4) +#define FIT_QCMD_MSGSIZE_512 (0x3 << 4) +#define FIT_QCMD_BASE_ADDRESS_MASK (0xFFFFFFFFFFFFFFC0ull) + +/* + * Control, 32-bit r/w + */ +#define FIT_CONTROL 0x500u +#define FIT_CR_HARD_RESET (1u << 0u) +#define FIT_CR_SOFT_RESET (1u << 1u) +#define FIT_CR_DIS_TIMESTAMPS (1u << 6u) +#define FIT_CR_ENABLE_INTERRUPTS (1u << 7u) + +/* + * Status, 32-bit, r/o + */ +#define FIT_STATUS 0x510u +#define FIT_SR_DRIVE_STATE_MASK 0x000000FFu +#define FIT_SR_SIGNATURE (0xFF << 8) +#define FIT_SR_PIO_DMA (1 << 16) +#define FIT_SR_DRIVE_OFFLINE 0x00 +#define FIT_SR_DRIVE_INIT 0x01 +/* #define FIT_SR_DRIVE_READY 0x02 */ +#define FIT_SR_DRIVE_ONLINE 0x03 +#define FIT_SR_DRIVE_BUSY 0x04 +#define FIT_SR_DRIVE_FAULT 0x05 +#define FIT_SR_DRIVE_DEGRADED 0x06 +#define FIT_SR_PCIE_LINK_DOWN 0x07 +#define FIT_SR_DRIVE_SOFT_RESET 0x08 +#define FIT_SR_DRIVE_INIT_FAULT 0x09 +#define FIT_SR_DRIVE_BUSY_SANITIZE 0x0A +#define FIT_SR_DRIVE_BUSY_ERASE 0x0B +#define FIT_SR_DRIVE_FW_BOOTING 0x0C +#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD 0xFE +#define FIT_SR_DEVICE_MISSING 0xFF +#define FIT_SR__RESERVED 0xFFFFFF00u + +/* + * FIT_STATUS - Status register data definition + */ +#define FIT_SR_STATE_MASK (0xFF << 0) +#define FIT_SR_SIGNATURE (0xFF << 8) +#define FIT_SR_PIO_DMA (1 << 16) + +/* + * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear) + */ +#define FIT_INT_STATUS_HOST 0x520u +#define FIT_ISH_FW_STATE_CHANGE (1u << 0u) +#define FIT_ISH_COMPLETION_POSTED (1u << 1u) +#define FIT_ISH_MSG_FROM_DEV (1u << 2u) +#define FIT_ISH_UNDEFINED_3 (1u << 3u) +#define FIT_ISH_UNDEFINED_4 (1u << 4u) +#define FIT_ISH_Q0_FULL (1u << 5u) +#define FIT_ISH_Q1_FULL (1u << 6u) +#define FIT_ISH_Q2_FULL (1u << 7u) +#define FIT_ISH_Q3_FULL (1u << 8u) +#define FIT_ISH_QCMD_FIFO_OVERRUN (1u << 9u) +#define FIT_ISH_BAD_EXP_ROM_READ (1u << 10u) + +#define FIT_INT_DEF_MASK \ + (FIT_ISH_FW_STATE_CHANGE | \ + FIT_ISH_COMPLETION_POSTED | \ + FIT_ISH_MSG_FROM_DEV | \ + FIT_ISH_Q0_FULL | \ + FIT_ISH_Q1_FULL | \ + FIT_ISH_Q2_FULL | \ + FIT_ISH_Q3_FULL | \ + FIT_ISH_QCMD_FIFO_OVERRUN | \ + FIT_ISH_BAD_EXP_ROM_READ) + +#define FIT_INT_QUEUE_FULL \ + (FIT_ISH_Q0_FULL | \ + FIT_ISH_Q1_FULL | \ + FIT_ISH_Q2_FULL | \ + FIT_ISH_Q3_FULL) + +#define MSI_MSG_NWL_ERROR_0 0x00000000 +#define MSI_MSG_NWL_ERROR_1 0x00000001 +#define MSI_MSG_NWL_ERROR_2 0x00000002 +#define MSI_MSG_NWL_ERROR_3 0x00000003 +#define MSI_MSG_STATE_CHANGE 0x00000004 +#define MSI_MSG_COMPLETION_POSTED 0x00000005 +#define MSI_MSG_MSG_FROM_DEV 0x00000006 +#define MSI_MSG_RESERVED_0 0x00000007 +#define MSI_MSG_RESERVED_1 0x00000008 +#define MSI_MSG_QUEUE_0_FULL 0x00000009 +#define MSI_MSG_QUEUE_1_FULL 0x0000000A +#define MSI_MSG_QUEUE_2_FULL 0x0000000B +#define MSI_MSG_QUEUE_3_FULL 0x0000000C + +#define FIT_INT_RESERVED_MASK \ + (FIT_ISH_UNDEFINED_3 | \ + FIT_ISH_UNDEFINED_4) + +/* + * Interrupt mask, 32-bit r/w + * Bit definitions are the same as FIT_INT_STATUS_HOST + */ +#define FIT_INT_MASK_HOST 0x528u + +/* + * Message to device, 32-bit r/w + */ +#define FIT_MSG_TO_DEVICE 0x540u + +/* + * Message from device, 32-bit, r/o + */ +#define FIT_MSG_FROM_DEVICE 0x548u + +/* + * 32-bit messages to/from device, composition/extraction macros + */ +#define FIT_MXD_CONS(TYPE, PARAM, DATA) \ + ((((TYPE) & 0xFFu) << 24u) | \ + (((PARAM) & 0xFFu) << 16u) | \ + (((DATA) & 0xFFFFu) << 0u)) +#define FIT_MXD_TYPE(MXD) (((MXD) >> 24u) & 0xFFu) +#define FIT_MXD_PARAM(MXD) (((MXD) >> 16u) & 0xFFu) +#define FIT_MXD_DATA(MXD) (((MXD) >> 0u) & 0xFFFFu) + +/* + * Types of messages to/from device + */ +#define FIT_MTD_FITFW_INIT 0x01u +#define FIT_MTD_GET_CMDQ_DEPTH 0x02u +#define FIT_MTD_SET_COMPQ_DEPTH 0x03u +#define FIT_MTD_SET_COMPQ_ADDR 0x04u +#define FIT_MTD_ARM_QUEUE 0x05u +#define FIT_MTD_CMD_LOG_HOST_ID 0x07u +#define FIT_MTD_CMD_LOG_TIME_STAMP_LO 0x08u +#define FIT_MTD_CMD_LOG_TIME_STAMP_HI 0x09u +#define FIT_MFD_SMART_EXCEEDED 0x10u +#define FIT_MFD_POWER_DOWN 0x11u +#define FIT_MFD_OFFLINE 0x12u +#define FIT_MFD_ONLINE 0x13u +#define FIT_MFD_FW_RESTARTING 0x14u +#define FIT_MFD_PM_ACTIVE 0x15u +#define FIT_MFD_PM_STANDBY 0x16u +#define FIT_MFD_PM_SLEEP 0x17u +#define FIT_MFD_CMD_PROGRESS 0x18u + +#define FIT_MTD_DEBUG 0xFEu +#define FIT_MFD_DEBUG 0xFFu + +#define FIT_MFD_MASK (0xFFu) +#define FIT_MFD_DATA_MASK (0xFFu) +#define FIT_MFD_MSG(x) (((x) >> 24) & FIT_MFD_MASK) +#define FIT_MFD_DATA(x) ((x) & FIT_MFD_MASK) + +/* + * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w + * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR) + * (was Response buffer in docs) + */ +#define FIT_MSG_TO_DEVICE_ARG 0x580u + +/* + * Hardware (ASIC) version, 32-bit r/o + */ +#define FIT_HW_VERSION 0x588u + +/* + * Scatter/gather list descriptor. + * 32-bytes and must be aligned on a 32-byte boundary. + * All fields are in little endian order. + */ +struct fit_sg_descriptor { + uint32_t control; + uint32_t byte_count; + uint64_t host_side_addr; + uint64_t dev_side_addr; + uint64_t next_desc_ptr; +}; + +#define FIT_SGD_CONTROL_NOT_LAST 0x000u +#define FIT_SGD_CONTROL_LAST 0x40Eu + +/* + * Header at the beginning of a FIT message. The header + * is followed by SSDI requests each 64 bytes. + * A FIT message can be up to 512 bytes long and must start + * on a 64-byte boundary. + */ +struct fit_msg_hdr { + uint8_t protocol_id; + uint8_t num_protocol_cmds_coalesced; + uint8_t _reserved[62]; +}; + +#define FIT_PROTOCOL_ID_FIT 1 +#define FIT_PROTOCOL_ID_SSDI 2 +#define FIT_PROTOCOL_ID_SOFIT 3 + + +#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF) +#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF) + +/* + * Format of a completion entry. The completion queue is circular + * and must have at least as many entries as the maximum number + * of commands that may be issued to the device. + * + * There are no head/tail pointers. The cycle value is used to + * infer the presence of new completion records. + * Initially the cycle in all entries is 0, the index is 0, and + * the cycle value to expect is 1. When completions are added + * their cycle values are set to 1. When the index wraps the + * cycle value to expect is incremented. + * + * Command_context is opaque and taken verbatim from the SSDI command. + * All other fields are big endian. + */ +#define FIT_PROTOCOL_VERSION_0 0 + +/* + * Protocol major version 1 completion entry. + * The major protocol version is found in bits + * 20-23 of the FIT_MTD_FITFW_INIT response. + */ +struct fit_completion_entry_v1 { + uint32_t num_returned_bytes; + uint16_t tag; + uint8_t status; /* SCSI status */ + uint8_t cycle; +}; +#define FIT_PROTOCOL_VERSION_1 1 +#define FIT_PROTOCOL_VERSION_CURRENT FIT_PROTOCOL_VERSION_1 + +struct fit_comp_error_info { + uint8_t type:7; /* 00: Bits0-6 indicates the type of sense data. */ + uint8_t valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */ + uint8_t reserved0; /* 01: Obsolete field */ + uint8_t key:4; /* 02: Bits0-3 indicate the sense key. */ + uint8_t reserved2:1; /* 02: Reserved bit. */ + uint8_t bad_length:1; /* 02: Incorrect Length Indicator */ + uint8_t end_medium:1; /* 02: End of Medium */ + uint8_t file_mark:1; /* 02: Filemark */ + uint8_t info[4]; /* 03: */ + uint8_t reserved1; /* 07: Additional Sense Length */ + uint8_t cmd_spec[4]; /* 08: Command Specific Information */ + uint8_t code; /* 0C: Additional Sense Code */ + uint8_t qual; /* 0D: Additional Sense Code Qualifier */ + uint8_t fruc; /* 0E: Field Replaceable Unit Code */ + uint8_t sks_high:7; /* 0F: Sense Key Specific (MSB) */ + uint8_t sks_valid:1; /* 0F: Sense Key Specific Valid */ + uint16_t sks_low; /* 10: Sense Key Specific (LSW) */ + uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */ + uint16_t uec; /* 14: Additional Sense Bytes */ + uint64_t per; /* 16: Additional Sense Bytes */ + uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */ +}; + + +/* Task management constants */ +#define SOFT_TASK_SIMPLE 0x00 +#define SOFT_TASK_HEAD_OF_QUEUE 0x01 +#define SOFT_TASK_ORDERED 0x02 + +/* Version zero has the last 32 bits reserved, + * Version one has the last 32 bits sg_list_len_bytes; + */ +struct skd_command_header { + uint64_t sg_list_dma_address; + uint16_t tag; + uint8_t attribute; + uint8_t add_cdb_len; /* In 32 bit words */ + uint32_t sg_list_len_bytes; +}; + +struct skd_scsi_request { + struct skd_command_header hdr; + unsigned char cdb[16]; +/* unsigned char _reserved[16]; */ +}; + +struct driver_inquiry_data { + uint8_t peripheral_device_type:5; + uint8_t qualifier:3; + uint8_t page_code; + uint16_t page_length; + uint16_t pcie_bus_number; + uint8_t pcie_device_number; + uint8_t pcie_function_number; + uint8_t pcie_link_speed; + uint8_t pcie_link_lanes; + uint16_t pcie_vendor_id; + uint16_t pcie_device_id; + uint16_t pcie_subsystem_vendor_id; + uint16_t pcie_subsystem_device_id; + uint8_t reserved1[2]; + uint8_t reserved2[3]; + uint8_t driver_version_length; + uint8_t driver_version[0x14]; +}; + +#pragma pack(pop, s1120_h) + +#endif /* SKD_S1120_H */ diff --git a/kernel/drivers/block/smart1,2.h b/kernel/drivers/block/smart1,2.h new file mode 100644 index 000000000..e5565fbae --- /dev/null +++ b/kernel/drivers/block/smart1,2.h @@ -0,0 +1,278 @@ +/* + * Disk Array driver for Compaq SMART2 Controllers + * Copyright 1998 Compaq Computer Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Questions/Comments/Bugfixes to iss_storagedev@hp.com + * + * If you want to make changes, improve or add functionality to this + * driver, you'll probably need the Compaq Array Controller Interface + * Specificiation (Document number ECG086/1198) + */ + +/* + * This file contains the controller communication implementation for + * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge, + * this should support: + * + * PCI: + * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200 + * Integerated SMART Array Controller, SMART-4200, SMART-4250ES + * + * EISA: + * SMART-2/E, SMART, IAES, IDA-2, IDA + */ + +/* + * Memory mapped FIFO interface (SMART 42xx cards) + */ +static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c) +{ + writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET); +} + +/* + * This card is the opposite of the other cards. + * 0 turns interrupts on... + * 0x08 turns them off... + */ +static void smart4_intr_mask(ctlr_info_t *h, unsigned long val) +{ + if (val) + { /* Turn interrupts on */ + writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET); + } else /* Turn them off */ + { + writel( S42XX_INTR_OFF, + h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET); + } +} + +/* + * For older cards FIFO Full = 0. + * On this card 0 means there is room, anything else FIFO Full. + * + */ +static unsigned long smart4_fifo_full(ctlr_info_t *h) +{ + + return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET)); +} + +/* This type of controller returns -1 if the fifo is empty, + * Not 0 like the others. + * And we need to let it know we read a value out + */ +static unsigned long smart4_completed(ctlr_info_t *h) +{ + long register_value + = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET); + + /* Fifo is empty */ + if( register_value == 0xffffffff) + return 0; + + /* Need to let it know we got the reply */ + /* We do this by writing a 0 to the port we just read from */ + writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET); + + return ((unsigned long) register_value); +} + + /* + * This hardware returns interrupt pending at a different place and + * it does not tell us if the fifo is empty, we will have check + * that by getting a 0 back from the command_completed call. + */ +static unsigned long smart4_intr_pending(ctlr_info_t *h) +{ + unsigned long register_value = + readl(h->vaddr + S42XX_INTR_STATUS); + + if( register_value & S42XX_INTR_PENDING) + return FIFO_NOT_EMPTY; + return 0 ; +} + +static struct access_method smart4_access = { + smart4_submit_command, + smart4_intr_mask, + smart4_fifo_full, + smart4_intr_pending, + smart4_completed, +}; + +/* + * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards) + */ +static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c) +{ + writel(c->busaddr, h->vaddr + COMMAND_FIFO); +} + +static void smart2_intr_mask(ctlr_info_t *h, unsigned long val) +{ + writel(val, h->vaddr + INTR_MASK); +} + +static unsigned long smart2_fifo_full(ctlr_info_t *h) +{ + return readl(h->vaddr + COMMAND_FIFO); +} + +static unsigned long smart2_completed(ctlr_info_t *h) +{ + return readl(h->vaddr + COMMAND_COMPLETE_FIFO); +} + +static unsigned long smart2_intr_pending(ctlr_info_t *h) +{ + return readl(h->vaddr + INTR_PENDING); +} + +static struct access_method smart2_access = { + smart2_submit_command, + smart2_intr_mask, + smart2_fifo_full, + smart2_intr_pending, + smart2_completed, +}; + +/* + * IO access for SMART-2/E cards + */ +static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c) +{ + outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO); +} + +static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val) +{ + outl(val, h->io_mem_addr + INTR_MASK); +} + +static unsigned long smart2e_fifo_full(ctlr_info_t *h) +{ + return inl(h->io_mem_addr + COMMAND_FIFO); +} + +static unsigned long smart2e_completed(ctlr_info_t *h) +{ + return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO); +} + +static unsigned long smart2e_intr_pending(ctlr_info_t *h) +{ + return inl(h->io_mem_addr + INTR_PENDING); +} + +static struct access_method smart2e_access = { + smart2e_submit_command, + smart2e_intr_mask, + smart2e_fifo_full, + smart2e_intr_pending, + smart2e_completed, +}; + +/* + * IO access for older SMART-1 type cards + */ +#define SMART1_SYSTEM_MASK 0xC8E +#define SMART1_SYSTEM_DOORBELL 0xC8F +#define SMART1_LOCAL_MASK 0xC8C +#define SMART1_LOCAL_DOORBELL 0xC8D +#define SMART1_INTR_MASK 0xC89 +#define SMART1_LISTADDR 0xC90 +#define SMART1_LISTLEN 0xC94 +#define SMART1_TAG 0xC97 +#define SMART1_COMPLETE_ADDR 0xC98 +#define SMART1_LISTSTATUS 0xC9E + +#define CHANNEL_BUSY 0x01 +#define CHANNEL_CLEAR 0x02 + +static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c) +{ + /* + * This __u16 is actually a bunch of control flags on SMART + * and below. We want them all to be zero. + */ + c->hdr.size = 0; + + outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); + + outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR); + outw(c->size, h->io_mem_addr + SMART1_LISTLEN); + + outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL); +} + +static void smart1_intr_mask(ctlr_info_t *h, unsigned long val) +{ + if (val == 1) { + outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); + outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL); + outb(0x01, h->io_mem_addr + SMART1_INTR_MASK); + outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK); + } else { + outb(0, h->io_mem_addr + 0xC8E); + } +} + +static unsigned long smart1_fifo_full(ctlr_info_t *h) +{ + unsigned char chan; + chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR; + return chan; +} + +static unsigned long smart1_completed(ctlr_info_t *h) +{ + unsigned char status; + unsigned long cmd; + + if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) { + outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL); + + cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR); + status = inb(h->io_mem_addr + SMART1_LISTSTATUS); + + outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL); + + /* + * this is x86 (actually compaq x86) only, so it's ok + */ + if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status; + } else { + cmd = 0; + } + return cmd; +} + +static unsigned long smart1_intr_pending(ctlr_info_t *h) +{ + unsigned char chan; + chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY; + return chan; +} + +static struct access_method smart1_access = { + smart1_submit_command, + smart1_intr_mask, + smart1_fifo_full, + smart1_intr_pending, + smart1_completed, +}; diff --git a/kernel/drivers/block/sunvdc.c b/kernel/drivers/block/sunvdc.c new file mode 100644 index 000000000..4b911ed96 --- /dev/null +++ b/kernel/drivers/block/sunvdc.c @@ -0,0 +1,1129 @@ +/* sunvdc.c: Sun LDOM Virtual Disk Client. + * + * Copyright (C) 2007, 2008 David S. Miller + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DRV_MODULE_NAME "sunvdc" +#define PFX DRV_MODULE_NAME ": " +#define DRV_MODULE_VERSION "1.2" +#define DRV_MODULE_RELDATE "November 24, 2014" + +static char version[] = + DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; +MODULE_AUTHOR("David S. Miller (davem@davemloft.net)"); +MODULE_DESCRIPTION("Sun LDOM virtual disk client driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_MODULE_VERSION); + +#define VDC_TX_RING_SIZE 512 + +#define WAITING_FOR_LINK_UP 0x01 +#define WAITING_FOR_TX_SPACE 0x02 +#define WAITING_FOR_GEN_CMD 0x04 +#define WAITING_FOR_ANY -1 + +static struct workqueue_struct *sunvdc_wq; + +struct vdc_req_entry { + struct request *req; +}; + +struct vdc_port { + struct vio_driver_state vio; + + struct gendisk *disk; + + struct vdc_completion *cmp; + + u64 req_id; + u64 seq; + struct vdc_req_entry rq_arr[VDC_TX_RING_SIZE]; + + unsigned long ring_cookies; + + u64 max_xfer_size; + u32 vdisk_block_size; + + u64 ldc_timeout; + struct timer_list ldc_reset_timer; + struct work_struct ldc_reset_work; + + /* The server fills these in for us in the disk attribute + * ACK packet. + */ + u64 operations; + u32 vdisk_size; + u8 vdisk_type; + u8 vdisk_mtype; + + char disk_name[32]; +}; + +static void vdc_ldc_reset(struct vdc_port *port); +static void vdc_ldc_reset_work(struct work_struct *work); +static void vdc_ldc_reset_timer(unsigned long _arg); + +static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) +{ + return container_of(vio, struct vdc_port, vio); +} + +/* Ordered from largest major to lowest */ +static struct vio_version vdc_versions[] = { + { .major = 1, .minor = 1 }, + { .major = 1, .minor = 0 }, +}; + +static inline int vdc_version_supported(struct vdc_port *port, + u16 major, u16 minor) +{ + return port->vio.ver.major == major && port->vio.ver.minor >= minor; +} + +#define VDCBLK_NAME "vdisk" +static int vdc_major; +#define PARTITION_SHIFT 3 + +static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr) +{ + return vio_dring_avail(dr, VDC_TX_RING_SIZE); +} + +static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t nsect = get_capacity(disk); + sector_t cylinders = nsect; + + geo->heads = 0xff; + geo->sectors = 0x3f; + sector_div(cylinders, geo->heads * geo->sectors); + geo->cylinders = cylinders; + if ((sector_t)(geo->cylinders + 1) * geo->heads * geo->sectors < nsect) + geo->cylinders = 0xffff; + + return 0; +} + +/* Add ioctl/CDROM_GET_CAPABILITY to support cdrom_id in udev + * when vdisk_mtype is VD_MEDIA_TYPE_CD or VD_MEDIA_TYPE_DVD. + * Needed to be able to install inside an ldom from an iso image. + */ +static int vdc_ioctl(struct block_device *bdev, fmode_t mode, + unsigned command, unsigned long argument) +{ + int i; + struct gendisk *disk; + + switch (command) { + case CDROMMULTISESSION: + pr_debug(PFX "Multisession CDs not supported\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case CDROM_GET_CAPABILITY: + disk = bdev->bd_disk; + + if (bdev->bd_disk && (disk->flags & GENHD_FL_CD)) + return 0; + return -EINVAL; + + default: + pr_debug(PFX "ioctl %08x not supported\n", command); + return -EINVAL; + } +} + +static const struct block_device_operations vdc_fops = { + .owner = THIS_MODULE, + .getgeo = vdc_getgeo, + .ioctl = vdc_ioctl, +}; + +static void vdc_blk_queue_start(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + /* restart blk queue when ring is half emptied. also called after + * handshake completes, so check for initial handshake before we've + * allocated a disk. + */ + if (port->disk && blk_queue_stopped(port->disk->queue) && + vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) { + blk_start_queue(port->disk->queue); + } + +} + +static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) +{ + if (vio->cmp && + (waiting_for == -1 || + vio->cmp->waiting_for == waiting_for)) { + vio->cmp->err = err; + complete(&vio->cmp->com); + vio->cmp = NULL; + } +} + +static void vdc_handshake_complete(struct vio_driver_state *vio) +{ + struct vdc_port *port = to_vdc_port(vio); + + del_timer(&port->ldc_reset_timer); + vdc_finish(vio, 0, WAITING_FOR_LINK_UP); + vdc_blk_queue_start(port); +} + +static int vdc_handle_unknown(struct vdc_port *port, void *arg) +{ + struct vio_msg_tag *pkt = arg; + + printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n", + pkt->type, pkt->stype, pkt->stype_env, pkt->sid); + printk(KERN_ERR PFX "Resetting connection.\n"); + + ldc_disconnect(port->vio.lp); + + return -ECONNRESET; +} + +static int vdc_send_attr(struct vio_driver_state *vio) +{ + struct vdc_port *port = to_vdc_port(vio); + struct vio_disk_attr_info pkt; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.tag.type = VIO_TYPE_CTRL; + pkt.tag.stype = VIO_SUBTYPE_INFO; + pkt.tag.stype_env = VIO_ATTR_INFO; + pkt.tag.sid = vio_send_sid(vio); + + pkt.xfer_mode = VIO_DRING_MODE; + pkt.vdisk_block_size = port->vdisk_block_size; + pkt.max_xfer_size = port->max_xfer_size; + + viodbg(HS, "SEND ATTR xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n", + pkt.xfer_mode, pkt.vdisk_block_size, pkt.max_xfer_size); + + return vio_ldc_send(&port->vio, &pkt, sizeof(pkt)); +} + +static int vdc_handle_attr(struct vio_driver_state *vio, void *arg) +{ + struct vdc_port *port = to_vdc_port(vio); + struct vio_disk_attr_info *pkt = arg; + + viodbg(HS, "GOT ATTR stype[0x%x] ops[%llx] disk_size[%llu] disk_type[%x] " + "mtype[0x%x] xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n", + pkt->tag.stype, pkt->operations, + pkt->vdisk_size, pkt->vdisk_type, pkt->vdisk_mtype, + pkt->xfer_mode, pkt->vdisk_block_size, + pkt->max_xfer_size); + + if (pkt->tag.stype == VIO_SUBTYPE_ACK) { + switch (pkt->vdisk_type) { + case VD_DISK_TYPE_DISK: + case VD_DISK_TYPE_SLICE: + break; + + default: + printk(KERN_ERR PFX "%s: Bogus vdisk_type 0x%x\n", + vio->name, pkt->vdisk_type); + return -ECONNRESET; + } + + if (pkt->vdisk_block_size > port->vdisk_block_size) { + printk(KERN_ERR PFX "%s: BLOCK size increased " + "%u --> %u\n", + vio->name, + port->vdisk_block_size, pkt->vdisk_block_size); + return -ECONNRESET; + } + + port->operations = pkt->operations; + port->vdisk_type = pkt->vdisk_type; + if (vdc_version_supported(port, 1, 1)) { + port->vdisk_size = pkt->vdisk_size; + port->vdisk_mtype = pkt->vdisk_mtype; + } + if (pkt->max_xfer_size < port->max_xfer_size) + port->max_xfer_size = pkt->max_xfer_size; + port->vdisk_block_size = pkt->vdisk_block_size; + return 0; + } else { + printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name); + + return -ECONNRESET; + } +} + +static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc) +{ + int err = desc->status; + + vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD); +} + +static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, + unsigned int index) +{ + struct vio_disk_desc *desc = vio_dring_entry(dr, index); + struct vdc_req_entry *rqe = &port->rq_arr[index]; + struct request *req; + + if (unlikely(desc->hdr.state != VIO_DESC_DONE)) + return; + + ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); + desc->hdr.state = VIO_DESC_FREE; + dr->cons = vio_dring_next(dr, index); + + req = rqe->req; + if (req == NULL) { + vdc_end_special(port, desc); + return; + } + + rqe->req = NULL; + + __blk_end_request(req, (desc->status ? -EIO : 0), desc->size); + + vdc_blk_queue_start(port); +} + +static int vdc_ack(struct vdc_port *port, void *msgbuf) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct vio_dring_data *pkt = msgbuf; + + if (unlikely(pkt->dring_ident != dr->ident || + pkt->start_idx != pkt->end_idx || + pkt->start_idx >= VDC_TX_RING_SIZE)) + return 0; + + vdc_end_one(port, dr, pkt->start_idx); + + return 0; +} + +static int vdc_nack(struct vdc_port *port, void *msgbuf) +{ + /* XXX Implement me XXX */ + return 0; +} + +static void vdc_event(void *arg, int event) +{ + struct vdc_port *port = arg; + struct vio_driver_state *vio = &port->vio; + unsigned long flags; + int err; + + spin_lock_irqsave(&vio->lock, flags); + + if (unlikely(event == LDC_EVENT_RESET)) { + vio_link_state_change(vio, event); + queue_work(sunvdc_wq, &port->ldc_reset_work); + goto out; + } + + if (unlikely(event == LDC_EVENT_UP)) { + vio_link_state_change(vio, event); + goto out; + } + + if (unlikely(event != LDC_EVENT_DATA_READY)) { + pr_warn(PFX "Unexpected LDC event %d\n", event); + goto out; + } + + err = 0; + while (1) { + union { + struct vio_msg_tag tag; + u64 raw[8]; + } msgbuf; + + err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf)); + if (unlikely(err < 0)) { + if (err == -ECONNRESET) + vio_conn_reset(vio); + break; + } + if (err == 0) + break; + viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n", + msgbuf.tag.type, + msgbuf.tag.stype, + msgbuf.tag.stype_env, + msgbuf.tag.sid); + err = vio_validate_sid(vio, &msgbuf.tag); + if (err < 0) + break; + + if (likely(msgbuf.tag.type == VIO_TYPE_DATA)) { + if (msgbuf.tag.stype == VIO_SUBTYPE_ACK) + err = vdc_ack(port, &msgbuf); + else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK) + err = vdc_nack(port, &msgbuf); + else + err = vdc_handle_unknown(port, &msgbuf); + } else if (msgbuf.tag.type == VIO_TYPE_CTRL) { + err = vio_control_pkt_engine(vio, &msgbuf); + } else { + err = vdc_handle_unknown(port, &msgbuf); + } + if (err < 0) + break; + } + if (err < 0) + vdc_finish(&port->vio, err, WAITING_FOR_ANY); +out: + spin_unlock_irqrestore(&vio->lock, flags); +} + +static int __vdc_tx_trigger(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct vio_dring_data hdr = { + .tag = { + .type = VIO_TYPE_DATA, + .stype = VIO_SUBTYPE_INFO, + .stype_env = VIO_DRING_DATA, + .sid = vio_send_sid(&port->vio), + }, + .dring_ident = dr->ident, + .start_idx = dr->prod, + .end_idx = dr->prod, + }; + int err, delay; + + hdr.seq = dr->snd_nxt; + delay = 1; + do { + err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr)); + if (err > 0) { + dr->snd_nxt++; + break; + } + udelay(delay); + if ((delay <<= 1) > 128) + delay = 128; + } while (err == -EAGAIN); + + if (err == -ENOTCONN) + vdc_ldc_reset(port); + return err; +} + +static int __send_request(struct request *req) +{ + struct vdc_port *port = req->rq_disk->private_data; + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + struct scatterlist sg[port->ring_cookies]; + struct vdc_req_entry *rqe; + struct vio_disk_desc *desc; + unsigned int map_perm; + int nsg, err, i; + u64 len; + u8 op; + + map_perm = LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO; + + if (rq_data_dir(req) == READ) { + map_perm |= LDC_MAP_W; + op = VD_OP_BREAD; + } else { + map_perm |= LDC_MAP_R; + op = VD_OP_BWRITE; + } + + sg_init_table(sg, port->ring_cookies); + nsg = blk_rq_map_sg(req->q, req, sg); + + len = 0; + for (i = 0; i < nsg; i++) + len += sg[i].length; + + desc = vio_dring_cur(dr); + + err = ldc_map_sg(port->vio.lp, sg, nsg, + desc->cookies, port->ring_cookies, + map_perm); + if (err < 0) { + printk(KERN_ERR PFX "ldc_map_sg() failure, err=%d.\n", err); + return err; + } + + rqe = &port->rq_arr[dr->prod]; + rqe->req = req; + + desc->hdr.ack = VIO_ACK_ENABLE; + desc->req_id = port->req_id; + desc->operation = op; + if (port->vdisk_type == VD_DISK_TYPE_DISK) { + desc->slice = 0xff; + } else { + desc->slice = 0; + } + desc->status = ~0; + desc->offset = (blk_rq_pos(req) << 9) / port->vdisk_block_size; + desc->size = len; + desc->ncookies = err; + + /* This has to be a non-SMP write barrier because we are writing + * to memory which is shared with the peer LDOM. + */ + wmb(); + desc->hdr.state = VIO_DESC_READY; + + err = __vdc_tx_trigger(port); + if (err < 0) { + printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err); + } else { + port->req_id++; + dr->prod = vio_dring_next(dr, dr->prod); + } + + return err; +} + +static void do_vdc_request(struct request_queue *rq) +{ + struct request *req; + + while ((req = blk_peek_request(rq)) != NULL) { + struct vdc_port *port; + struct vio_dring_state *dr; + + port = req->rq_disk->private_data; + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + if (unlikely(vdc_tx_dring_avail(dr) < 1)) + goto wait; + + blk_start_request(req); + + if (__send_request(req) < 0) { + blk_requeue_request(rq, req); +wait: + /* Avoid pointless unplugs. */ + blk_stop_queue(rq); + break; + } + } +} + +static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) +{ + struct vio_dring_state *dr; + struct vio_completion comp; + struct vio_disk_desc *desc; + unsigned int map_perm; + unsigned long flags; + int op_len, err; + void *req_buf; + + if (!(((u64)1 << (u64)op) & port->operations)) + return -EOPNOTSUPP; + + switch (op) { + case VD_OP_BREAD: + case VD_OP_BWRITE: + default: + return -EINVAL; + + case VD_OP_FLUSH: + op_len = 0; + map_perm = 0; + break; + + case VD_OP_GET_WCE: + op_len = sizeof(u32); + map_perm = LDC_MAP_W; + break; + + case VD_OP_SET_WCE: + op_len = sizeof(u32); + map_perm = LDC_MAP_R; + break; + + case VD_OP_GET_VTOC: + op_len = sizeof(struct vio_disk_vtoc); + map_perm = LDC_MAP_W; + break; + + case VD_OP_SET_VTOC: + op_len = sizeof(struct vio_disk_vtoc); + map_perm = LDC_MAP_R; + break; + + case VD_OP_GET_DISKGEOM: + op_len = sizeof(struct vio_disk_geom); + map_perm = LDC_MAP_W; + break; + + case VD_OP_SET_DISKGEOM: + op_len = sizeof(struct vio_disk_geom); + map_perm = LDC_MAP_R; + break; + + case VD_OP_SCSICMD: + op_len = 16; + map_perm = LDC_MAP_RW; + break; + + case VD_OP_GET_DEVID: + op_len = sizeof(struct vio_disk_devid); + map_perm = LDC_MAP_W; + break; + + case VD_OP_GET_EFI: + case VD_OP_SET_EFI: + return -EOPNOTSUPP; + break; + }; + + map_perm |= LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO; + + op_len = (op_len + 7) & ~7; + req_buf = kzalloc(op_len, GFP_KERNEL); + if (!req_buf) + return -ENOMEM; + + if (len > op_len) + len = op_len; + + if (map_perm & LDC_MAP_R) + memcpy(req_buf, buf, len); + + spin_lock_irqsave(&port->vio.lock, flags); + + dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + /* XXX If we want to use this code generically we have to + * XXX handle TX ring exhaustion etc. + */ + desc = vio_dring_cur(dr); + + err = ldc_map_single(port->vio.lp, req_buf, op_len, + desc->cookies, port->ring_cookies, + map_perm); + if (err < 0) { + spin_unlock_irqrestore(&port->vio.lock, flags); + kfree(req_buf); + return err; + } + + init_completion(&comp.com); + comp.waiting_for = WAITING_FOR_GEN_CMD; + port->vio.cmp = ∁ + + desc->hdr.ack = VIO_ACK_ENABLE; + desc->req_id = port->req_id; + desc->operation = op; + desc->slice = 0; + desc->status = ~0; + desc->offset = 0; + desc->size = op_len; + desc->ncookies = err; + + /* This has to be a non-SMP write barrier because we are writing + * to memory which is shared with the peer LDOM. + */ + wmb(); + desc->hdr.state = VIO_DESC_READY; + + err = __vdc_tx_trigger(port); + if (err >= 0) { + port->req_id++; + dr->prod = vio_dring_next(dr, dr->prod); + spin_unlock_irqrestore(&port->vio.lock, flags); + + wait_for_completion(&comp.com); + err = comp.err; + } else { + port->vio.cmp = NULL; + spin_unlock_irqrestore(&port->vio.lock, flags); + } + + if (map_perm & LDC_MAP_W) + memcpy(buf, req_buf, len); + + kfree(req_buf); + + return err; +} + +static int vdc_alloc_tx_ring(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + unsigned long len, entry_size; + int ncookies; + void *dring; + + entry_size = sizeof(struct vio_disk_desc) + + (sizeof(struct ldc_trans_cookie) * port->ring_cookies); + len = (VDC_TX_RING_SIZE * entry_size); + + ncookies = VIO_MAX_RING_COOKIES; + dring = ldc_alloc_exp_dring(port->vio.lp, len, + dr->cookies, &ncookies, + (LDC_MAP_SHADOW | + LDC_MAP_DIRECT | + LDC_MAP_RW)); + if (IS_ERR(dring)) + return PTR_ERR(dring); + + dr->base = dring; + dr->entry_size = entry_size; + dr->num_entries = VDC_TX_RING_SIZE; + dr->prod = dr->cons = 0; + dr->pending = VDC_TX_RING_SIZE; + dr->ncookies = ncookies; + + return 0; +} + +static void vdc_free_tx_ring(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + if (dr->base) { + ldc_free_exp_dring(port->vio.lp, dr->base, + (dr->entry_size * dr->num_entries), + dr->cookies, dr->ncookies); + dr->base = NULL; + dr->entry_size = 0; + dr->num_entries = 0; + dr->pending = 0; + dr->ncookies = 0; + } +} + +static int vdc_port_up(struct vdc_port *port) +{ + struct vio_completion comp; + + init_completion(&comp.com); + comp.err = 0; + comp.waiting_for = WAITING_FOR_LINK_UP; + port->vio.cmp = ∁ + + vio_port_up(&port->vio); + wait_for_completion(&comp.com); + return comp.err; +} + +static void vdc_port_down(struct vdc_port *port) +{ + ldc_disconnect(port->vio.lp); + ldc_unbind(port->vio.lp); + vdc_free_tx_ring(port); + vio_ldc_free(&port->vio); +} + +static int probe_disk(struct vdc_port *port) +{ + struct request_queue *q; + struct gendisk *g; + int err; + + err = vdc_port_up(port); + if (err) + return err; + + if (vdc_version_supported(port, 1, 1)) { + /* vdisk_size should be set during the handshake, if it wasn't + * then the underlying disk is reserved by another system + */ + if (port->vdisk_size == -1) + return -ENODEV; + } else { + struct vio_disk_geom geom; + + err = generic_request(port, VD_OP_GET_DISKGEOM, + &geom, sizeof(geom)); + if (err < 0) { + printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns " + "error %d\n", err); + return err; + } + port->vdisk_size = ((u64)geom.num_cyl * + (u64)geom.num_hd * + (u64)geom.num_sec); + } + + q = blk_init_queue(do_vdc_request, &port->vio.lock); + if (!q) { + printk(KERN_ERR PFX "%s: Could not allocate queue.\n", + port->vio.name); + return -ENOMEM; + } + g = alloc_disk(1 << PARTITION_SHIFT); + if (!g) { + printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", + port->vio.name); + blk_cleanup_queue(q); + return -ENOMEM; + } + + port->disk = g; + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(q, PAGE_SIZE - 1); + blk_queue_max_segment_size(q, PAGE_SIZE); + + blk_queue_max_segments(q, port->ring_cookies); + blk_queue_max_hw_sectors(q, port->max_xfer_size); + g->major = vdc_major; + g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; + strcpy(g->disk_name, port->disk_name); + + g->fops = &vdc_fops; + g->queue = q; + g->private_data = port; + g->driverfs_dev = &port->vio.vdev->dev; + + set_capacity(g, port->vdisk_size); + + if (vdc_version_supported(port, 1, 1)) { + switch (port->vdisk_mtype) { + case VD_MEDIA_TYPE_CD: + pr_info(PFX "Virtual CDROM %s\n", port->disk_name); + g->flags |= GENHD_FL_CD; + g->flags |= GENHD_FL_REMOVABLE; + set_disk_ro(g, 1); + break; + + case VD_MEDIA_TYPE_DVD: + pr_info(PFX "Virtual DVD %s\n", port->disk_name); + g->flags |= GENHD_FL_CD; + g->flags |= GENHD_FL_REMOVABLE; + set_disk_ro(g, 1); + break; + + case VD_MEDIA_TYPE_FIXED: + pr_info(PFX "Virtual Hard disk %s\n", port->disk_name); + break; + } + } + + pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", + g->disk_name, + port->vdisk_size, (port->vdisk_size >> (20 - 9)), + port->vio.ver.major, port->vio.ver.minor); + + add_disk(g); + + return 0; +} + +static struct ldc_channel_config vdc_ldc_cfg = { + .event = vdc_event, + .mtu = 64, + .mode = LDC_MODE_UNRELIABLE, +}; + +static struct vio_driver_ops vdc_vio_ops = { + .send_attr = vdc_send_attr, + .handle_attr = vdc_handle_attr, + .handshake_complete = vdc_handshake_complete, +}; + +static void print_version(void) +{ + static int version_printed; + + if (version_printed++ == 0) + printk(KERN_INFO "%s", version); +} + +static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) +{ + struct mdesc_handle *hp; + struct vdc_port *port; + int err; + const u64 *ldc_timeout; + + print_version(); + + hp = mdesc_grab(); + + err = -ENODEV; + if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) { + printk(KERN_ERR PFX "Port id [%llu] too large.\n", + vdev->dev_no); + goto err_out_release_mdesc; + } + + port = kzalloc(sizeof(*port), GFP_KERNEL); + err = -ENOMEM; + if (!port) { + printk(KERN_ERR PFX "Cannot allocate vdc_port.\n"); + goto err_out_release_mdesc; + } + + if (vdev->dev_no >= 26) + snprintf(port->disk_name, sizeof(port->disk_name), + VDCBLK_NAME "%c%c", + 'a' + ((int)vdev->dev_no / 26) - 1, + 'a' + ((int)vdev->dev_no % 26)); + else + snprintf(port->disk_name, sizeof(port->disk_name), + VDCBLK_NAME "%c", 'a' + ((int)vdev->dev_no % 26)); + port->vdisk_size = -1; + + /* Actual wall time may be double due to do_generic_file_read() doing + * a readahead I/O first, and once that fails it will try to read a + * single page. + */ + ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL); + port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0; + setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer, + (unsigned long)port); + INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work); + + err = vio_driver_init(&port->vio, vdev, VDEV_DISK, + vdc_versions, ARRAY_SIZE(vdc_versions), + &vdc_vio_ops, port->disk_name); + if (err) + goto err_out_free_port; + + port->vdisk_block_size = 512; + port->max_xfer_size = ((128 * 1024) / port->vdisk_block_size); + port->ring_cookies = ((port->max_xfer_size * + port->vdisk_block_size) / PAGE_SIZE) + 2; + + err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port); + if (err) + goto err_out_free_port; + + err = vdc_alloc_tx_ring(port); + if (err) + goto err_out_free_ldc; + + err = probe_disk(port); + if (err) + goto err_out_free_tx_ring; + + dev_set_drvdata(&vdev->dev, port); + + mdesc_release(hp); + + return 0; + +err_out_free_tx_ring: + vdc_free_tx_ring(port); + +err_out_free_ldc: + vio_ldc_free(&port->vio); + +err_out_free_port: + kfree(port); + +err_out_release_mdesc: + mdesc_release(hp); + return err; +} + +static int vdc_port_remove(struct vio_dev *vdev) +{ + struct vdc_port *port = dev_get_drvdata(&vdev->dev); + + if (port) { + unsigned long flags; + + spin_lock_irqsave(&port->vio.lock, flags); + blk_stop_queue(port->disk->queue); + spin_unlock_irqrestore(&port->vio.lock, flags); + + flush_work(&port->ldc_reset_work); + del_timer_sync(&port->ldc_reset_timer); + del_timer_sync(&port->vio.timer); + + del_gendisk(port->disk); + blk_cleanup_queue(port->disk->queue); + put_disk(port->disk); + port->disk = NULL; + + vdc_free_tx_ring(port); + vio_ldc_free(&port->vio); + + dev_set_drvdata(&vdev->dev, NULL); + + kfree(port); + } + return 0; +} + +static void vdc_requeue_inflight(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + u32 idx; + + for (idx = dr->cons; idx != dr->prod; idx = vio_dring_next(dr, idx)) { + struct vio_disk_desc *desc = vio_dring_entry(dr, idx); + struct vdc_req_entry *rqe = &port->rq_arr[idx]; + struct request *req; + + ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); + desc->hdr.state = VIO_DESC_FREE; + dr->cons = vio_dring_next(dr, idx); + + req = rqe->req; + if (req == NULL) { + vdc_end_special(port, desc); + continue; + } + + rqe->req = NULL; + blk_requeue_request(port->disk->queue, req); + } +} + +static void vdc_queue_drain(struct vdc_port *port) +{ + struct request *req; + + while ((req = blk_fetch_request(port->disk->queue)) != NULL) + __blk_end_request_all(req, -EIO); +} + +static void vdc_ldc_reset_timer(unsigned long _arg) +{ + struct vdc_port *port = (struct vdc_port *) _arg; + struct vio_driver_state *vio = &port->vio; + unsigned long flags; + + spin_lock_irqsave(&vio->lock, flags); + if (!(port->vio.hs_state & VIO_HS_COMPLETE)) { + pr_warn(PFX "%s ldc down %llu seconds, draining queue\n", + port->disk_name, port->ldc_timeout); + vdc_queue_drain(port); + vdc_blk_queue_start(port); + } + spin_unlock_irqrestore(&vio->lock, flags); +} + +static void vdc_ldc_reset_work(struct work_struct *work) +{ + struct vdc_port *port; + struct vio_driver_state *vio; + unsigned long flags; + + port = container_of(work, struct vdc_port, ldc_reset_work); + vio = &port->vio; + + spin_lock_irqsave(&vio->lock, flags); + vdc_ldc_reset(port); + spin_unlock_irqrestore(&vio->lock, flags); +} + +static void vdc_ldc_reset(struct vdc_port *port) +{ + int err; + + assert_spin_locked(&port->vio.lock); + + pr_warn(PFX "%s ldc link reset\n", port->disk_name); + blk_stop_queue(port->disk->queue); + vdc_requeue_inflight(port); + vdc_port_down(port); + + err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port); + if (err) { + pr_err(PFX "%s vio_ldc_alloc:%d\n", port->disk_name, err); + return; + } + + err = vdc_alloc_tx_ring(port); + if (err) { + pr_err(PFX "%s vio_alloc_tx_ring:%d\n", port->disk_name, err); + goto err_free_ldc; + } + + if (port->ldc_timeout) + mod_timer(&port->ldc_reset_timer, + round_jiffies(jiffies + HZ * port->ldc_timeout)); + mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); + return; + +err_free_ldc: + vio_ldc_free(&port->vio); +} + +static const struct vio_device_id vdc_port_match[] = { + { + .type = "vdc-port", + }, + {}, +}; +MODULE_DEVICE_TABLE(vio, vdc_port_match); + +static struct vio_driver vdc_port_driver = { + .id_table = vdc_port_match, + .probe = vdc_port_probe, + .remove = vdc_port_remove, + .name = "vdc_port", +}; + +static int __init vdc_init(void) +{ + int err; + + sunvdc_wq = alloc_workqueue("sunvdc", 0, 0); + if (!sunvdc_wq) + return -ENOMEM; + + err = register_blkdev(0, VDCBLK_NAME); + if (err < 0) + goto out_free_wq; + + vdc_major = err; + + err = vio_register_driver(&vdc_port_driver); + if (err) + goto out_unregister_blkdev; + + return 0; + +out_unregister_blkdev: + unregister_blkdev(vdc_major, VDCBLK_NAME); + vdc_major = 0; + +out_free_wq: + destroy_workqueue(sunvdc_wq); + return err; +} + +static void __exit vdc_exit(void) +{ + vio_unregister_driver(&vdc_port_driver); + unregister_blkdev(vdc_major, VDCBLK_NAME); + destroy_workqueue(sunvdc_wq); +} + +module_init(vdc_init); +module_exit(vdc_exit); diff --git a/kernel/drivers/block/swim.c b/kernel/drivers/block/swim.c new file mode 100644 index 000000000..b5afd495d --- /dev/null +++ b/kernel/drivers/block/swim.c @@ -0,0 +1,994 @@ +/* + * Driver for SWIM (Sander Woz Integrated Machine) floppy controller + * + * Copyright (C) 2004,2008 Laurent Vivier + * + * based on Alastair Bridgewater SWIM analysis, 2001 + * based on SWIM3 driver (c) Paul Mackerras, 1996 + * based on netBSD IWM driver (c) 1997, 1998 Hauke Fath. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 2004-08-21 (lv) - Initial implementation + * 2008-10-30 (lv) - Port to 2.6 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CARDNAME "swim" + +struct sector_header { + unsigned char side; + unsigned char track; + unsigned char sector; + unsigned char size; + unsigned char crc0; + unsigned char crc1; +} __attribute__((packed)); + +#define DRIVER_VERSION "Version 0.2 (2008-10-30)" + +#define REG(x) unsigned char x, x ## _pad[0x200 - 1]; + +struct swim { + REG(write_data) + REG(write_mark) + REG(write_CRC) + REG(write_parameter) + REG(write_phase) + REG(write_setup) + REG(write_mode0) + REG(write_mode1) + + REG(read_data) + REG(read_mark) + REG(read_error) + REG(read_parameter) + REG(read_phase) + REG(read_setup) + REG(read_status) + REG(read_handshake) +} __attribute__((packed)); + +#define swim_write(base, reg, v) out_8(&(base)->write_##reg, (v)) +#define swim_read(base, reg) in_8(&(base)->read_##reg) + +/* IWM registers */ + +struct iwm { + REG(ph0L) + REG(ph0H) + REG(ph1L) + REG(ph1H) + REG(ph2L) + REG(ph2H) + REG(ph3L) + REG(ph3H) + REG(mtrOff) + REG(mtrOn) + REG(intDrive) + REG(extDrive) + REG(q6L) + REG(q6H) + REG(q7L) + REG(q7H) +} __attribute__((packed)); + +#define iwm_write(base, reg, v) out_8(&(base)->reg, (v)) +#define iwm_read(base, reg) in_8(&(base)->reg) + +/* bits in phase register */ + +#define SEEK_POSITIVE 0x070 +#define SEEK_NEGATIVE 0x074 +#define STEP 0x071 +#define MOTOR_ON 0x072 +#define MOTOR_OFF 0x076 +#define INDEX 0x073 +#define EJECT 0x077 +#define SETMFM 0x171 +#define SETGCR 0x175 + +#define RELAX 0x033 +#define LSTRB 0x008 + +#define CA_MASK 0x077 + +/* Select values for swim_select and swim_readbit */ + +#define READ_DATA_0 0x074 +#define TWOMEG_DRIVE 0x075 +#define SINGLE_SIDED 0x076 +#define DRIVE_PRESENT 0x077 +#define DISK_IN 0x170 +#define WRITE_PROT 0x171 +#define TRACK_ZERO 0x172 +#define TACHO 0x173 +#define READ_DATA_1 0x174 +#define MFM_MODE 0x175 +#define SEEK_COMPLETE 0x176 +#define ONEMEG_MEDIA 0x177 + +/* Bits in handshake register */ + +#define MARK_BYTE 0x01 +#define CRC_ZERO 0x02 +#define RDDATA 0x04 +#define SENSE 0x08 +#define MOTEN 0x10 +#define ERROR 0x20 +#define DAT2BYTE 0x40 +#define DAT1BYTE 0x80 + +/* bits in setup register */ + +#define S_INV_WDATA 0x01 +#define S_3_5_SELECT 0x02 +#define S_GCR 0x04 +#define S_FCLK_DIV2 0x08 +#define S_ERROR_CORR 0x10 +#define S_IBM_DRIVE 0x20 +#define S_GCR_WRITE 0x40 +#define S_TIMEOUT 0x80 + +/* bits in mode register */ + +#define CLFIFO 0x01 +#define ENBL1 0x02 +#define ENBL2 0x04 +#define ACTION 0x08 +#define WRITE_MODE 0x10 +#define HEDSEL 0x20 +#define MOTON 0x80 + +/*----------------------------------------------------------------------------*/ + +enum drive_location { + INTERNAL_DRIVE = 0x02, + EXTERNAL_DRIVE = 0x04, +}; + +enum media_type { + DD_MEDIA, + HD_MEDIA, +}; + +struct floppy_state { + + /* physical properties */ + + enum drive_location location; /* internal or external drive */ + int head_number; /* single- or double-sided drive */ + + /* media */ + + int disk_in; + int ejected; + enum media_type type; + int write_protected; + + int total_secs; + int secpercyl; + int secpertrack; + + /* in-use information */ + + int track; + int ref_count; + + struct gendisk *disk; + + /* parent controller */ + + struct swim_priv *swd; +}; + +enum motor_action { + OFF, + ON, +}; + +enum head { + LOWER_HEAD = 0, + UPPER_HEAD = 1, +}; + +#define FD_MAX_UNIT 2 + +struct swim_priv { + struct swim __iomem *base; + spinlock_t lock; + struct request_queue *queue; + int floppy_count; + struct floppy_state unit[FD_MAX_UNIT]; +}; + +extern int swim_read_sector_header(struct swim __iomem *base, + struct sector_header *header); +extern int swim_read_sector_data(struct swim __iomem *base, + unsigned char *data); + +static DEFINE_MUTEX(swim_mutex); +static inline void set_swim_mode(struct swim __iomem *base, int enable) +{ + struct iwm __iomem *iwm_base; + unsigned long flags; + + if (!enable) { + swim_write(base, mode0, 0xf8); + return; + } + + iwm_base = (struct iwm __iomem *)base; + local_irq_save(flags); + + iwm_read(iwm_base, q7L); + iwm_read(iwm_base, mtrOff); + iwm_read(iwm_base, q6H); + + iwm_write(iwm_base, q7H, 0x57); + iwm_write(iwm_base, q7H, 0x17); + iwm_write(iwm_base, q7H, 0x57); + iwm_write(iwm_base, q7H, 0x57); + + local_irq_restore(flags); +} + +static inline int get_swim_mode(struct swim __iomem *base) +{ + unsigned long flags; + + local_irq_save(flags); + + swim_write(base, phase, 0xf5); + if (swim_read(base, phase) != 0xf5) + goto is_iwm; + swim_write(base, phase, 0xf6); + if (swim_read(base, phase) != 0xf6) + goto is_iwm; + swim_write(base, phase, 0xf7); + if (swim_read(base, phase) != 0xf7) + goto is_iwm; + local_irq_restore(flags); + return 1; +is_iwm: + local_irq_restore(flags); + return 0; +} + +static inline void swim_select(struct swim __iomem *base, int sel) +{ + swim_write(base, phase, RELAX); + + via1_set_head(sel & 0x100); + + swim_write(base, phase, sel & CA_MASK); +} + +static inline void swim_action(struct swim __iomem *base, int action) +{ + unsigned long flags; + + local_irq_save(flags); + + swim_select(base, action); + udelay(1); + swim_write(base, phase, (LSTRB<<4) | LSTRB); + udelay(1); + swim_write(base, phase, (LSTRB<<4) | ((~LSTRB) & 0x0F)); + udelay(1); + + local_irq_restore(flags); +} + +static inline int swim_readbit(struct swim __iomem *base, int bit) +{ + int stat; + + swim_select(base, bit); + + udelay(10); + + stat = swim_read(base, handshake); + + return (stat & SENSE) == 0; +} + +static inline void swim_drive(struct swim __iomem *base, + enum drive_location location) +{ + if (location == INTERNAL_DRIVE) { + swim_write(base, mode0, EXTERNAL_DRIVE); /* clear drive 1 bit */ + swim_write(base, mode1, INTERNAL_DRIVE); /* set drive 0 bit */ + } else if (location == EXTERNAL_DRIVE) { + swim_write(base, mode0, INTERNAL_DRIVE); /* clear drive 0 bit */ + swim_write(base, mode1, EXTERNAL_DRIVE); /* set drive 1 bit */ + } +} + +static inline void swim_motor(struct swim __iomem *base, + enum motor_action action) +{ + if (action == ON) { + int i; + + swim_action(base, MOTOR_ON); + + for (i = 0; i < 2*HZ; i++) { + swim_select(base, RELAX); + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); + swim_select(base, RELAX); + } +} + +static inline void swim_eject(struct swim __iomem *base) +{ + int i; + + swim_action(base, EJECT); + + for (i = 0; i < 2*HZ; i++) { + swim_select(base, RELAX); + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + swim_select(base, RELAX); +} + +static inline void swim_head(struct swim __iomem *base, enum head head) +{ + /* wait drive is ready */ + + if (head == UPPER_HEAD) + swim_select(base, READ_DATA_1); + else if (head == LOWER_HEAD) + swim_select(base, READ_DATA_0); +} + +static inline int swim_step(struct swim __iomem *base) +{ + int wait; + + swim_action(base, STEP); + + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) + return 0; + } + return -1; +} + +static inline int swim_track00(struct swim __iomem *base) +{ + int try; + + swim_action(base, SEEK_NEGATIVE); + + for (try = 0; try < 100; try++) { + + swim_select(base, RELAX); + if (swim_readbit(base, TRACK_ZERO)) + break; + + if (swim_step(base)) + return -1; + } + + if (swim_readbit(base, TRACK_ZERO)) + return 0; + + return -1; +} + +static inline int swim_seek(struct swim __iomem *base, int step) +{ + if (step == 0) + return 0; + + if (step < 0) { + swim_action(base, SEEK_NEGATIVE); + step = -step; + } else + swim_action(base, SEEK_POSITIVE); + + for ( ; step > 0; step--) { + if (swim_step(base)) + return -1; + } + + return 0; +} + +static inline int swim_track(struct floppy_state *fs, int track) +{ + struct swim __iomem *base = fs->swd->base; + int ret; + + ret = swim_seek(base, track - fs->track); + + if (ret == 0) + fs->track = track; + else { + swim_track00(base); + fs->track = 0; + } + + return ret; +} + +static int floppy_eject(struct floppy_state *fs) +{ + struct swim __iomem *base = fs->swd->base; + + swim_drive(base, fs->location); + swim_motor(base, OFF); + swim_eject(base); + + fs->disk_in = 0; + fs->ejected = 1; + + return 0; +} + +static inline int swim_read_sector(struct floppy_state *fs, + int side, int track, + int sector, unsigned char *buffer) +{ + struct swim __iomem *base = fs->swd->base; + unsigned long flags; + struct sector_header header; + int ret = -1; + short i; + + swim_track(fs, track); + + swim_write(base, mode1, MOTON); + swim_head(base, side); + swim_write(base, mode0, side); + + local_irq_save(flags); + for (i = 0; i < 36; i++) { + ret = swim_read_sector_header(base, &header); + if (!ret && (header.sector == sector)) { + /* found */ + + ret = swim_read_sector_data(base, buffer); + break; + } + } + local_irq_restore(flags); + + swim_write(base, mode0, MOTON); + + if ((header.side != side) || (header.track != track) || + (header.sector != sector)) + return 0; + + return ret; +} + +static int floppy_read_sectors(struct floppy_state *fs, + int req_sector, int sectors_nb, + unsigned char *buffer) +{ + struct swim __iomem *base = fs->swd->base; + int ret; + int side, track, sector; + int i, try; + + + swim_drive(base, fs->location); + for (i = req_sector; i < req_sector + sectors_nb; i++) { + int x; + track = i / fs->secpercyl; + x = i % fs->secpercyl; + side = x / fs->secpertrack; + sector = x % fs->secpertrack + 1; + + try = 5; + do { + ret = swim_read_sector(fs, side, track, sector, + buffer); + if (try-- == 0) + return -EIO; + } while (ret != 512); + + buffer += ret; + } + + return 0; +} + +static void redo_fd_request(struct request_queue *q) +{ + struct request *req; + struct floppy_state *fs; + + req = blk_fetch_request(q); + while (req) { + int err = -EIO; + + fs = req->rq_disk->private_data; + if (blk_rq_pos(req) >= fs->total_secs) + goto done; + if (!fs->disk_in) + goto done; + if (rq_data_dir(req) == WRITE && fs->write_protected) + goto done; + + switch (rq_data_dir(req)) { + case WRITE: + /* NOT IMPLEMENTED */ + break; + case READ: + err = floppy_read_sectors(fs, blk_rq_pos(req), + blk_rq_cur_sectors(req), + bio_data(req->bio)); + break; + } + done: + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); + } +} + +static void do_fd_request(struct request_queue *q) +{ + redo_fd_request(q); +} + +static struct floppy_struct floppy_type[4] = { + { 0, 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing */ + { 720, 9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/ + { 1440, 9, 2, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 720KB 3.5" */ + { 2880, 18, 2, 80, 0, 0x1B, 0x00, 0xCF, 0x6C, NULL }, /* 1.44MB 3.5" */ +}; + +static int get_floppy_geometry(struct floppy_state *fs, int type, + struct floppy_struct **g) +{ + if (type >= ARRAY_SIZE(floppy_type)) + return -EINVAL; + + if (type) + *g = &floppy_type[type]; + else if (fs->type == HD_MEDIA) /* High-Density media */ + *g = &floppy_type[3]; + else if (fs->head_number == 2) /* double-sided */ + *g = &floppy_type[2]; + else + *g = &floppy_type[1]; + + return 0; +} + +static void setup_medium(struct floppy_state *fs) +{ + struct swim __iomem *base = fs->swd->base; + + if (swim_readbit(base, DISK_IN)) { + struct floppy_struct *g; + fs->disk_in = 1; + fs->write_protected = swim_readbit(base, WRITE_PROT); + fs->type = swim_readbit(base, ONEMEG_MEDIA); + + if (swim_track00(base)) + printk(KERN_ERR + "SWIM: cannot move floppy head to track 0\n"); + + swim_track00(base); + + get_floppy_geometry(fs, 0, &g); + fs->total_secs = g->size; + fs->secpercyl = g->head * g->sect; + fs->secpertrack = g->sect; + fs->track = 0; + } else { + fs->disk_in = 0; + } +} + +static int floppy_open(struct block_device *bdev, fmode_t mode) +{ + struct floppy_state *fs = bdev->bd_disk->private_data; + struct swim __iomem *base = fs->swd->base; + int err; + + if (fs->ref_count == -1 || (fs->ref_count && mode & FMODE_EXCL)) + return -EBUSY; + + if (mode & FMODE_EXCL) + fs->ref_count = -1; + else + fs->ref_count++; + + swim_write(base, setup, S_IBM_DRIVE | S_FCLK_DIV2); + udelay(10); + swim_drive(base, INTERNAL_DRIVE); + swim_motor(base, ON); + swim_action(base, SETMFM); + if (fs->ejected) + setup_medium(fs); + if (!fs->disk_in) { + err = -ENXIO; + goto out; + } + + if (mode & FMODE_NDELAY) + return 0; + + if (mode & (FMODE_READ|FMODE_WRITE)) { + check_disk_change(bdev); + if ((mode & FMODE_WRITE) && fs->write_protected) { + err = -EROFS; + goto out; + } + } + return 0; +out: + if (fs->ref_count < 0) + fs->ref_count = 0; + else if (fs->ref_count > 0) + --fs->ref_count; + + if (fs->ref_count == 0) + swim_motor(base, OFF); + return err; +} + +static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + mutex_lock(&swim_mutex); + ret = floppy_open(bdev, mode); + mutex_unlock(&swim_mutex); + + return ret; +} + +static void floppy_release(struct gendisk *disk, fmode_t mode) +{ + struct floppy_state *fs = disk->private_data; + struct swim __iomem *base = fs->swd->base; + + mutex_lock(&swim_mutex); + if (fs->ref_count < 0) + fs->ref_count = 0; + else if (fs->ref_count > 0) + --fs->ref_count; + + if (fs->ref_count == 0) + swim_motor(base, OFF); + mutex_unlock(&swim_mutex); +} + +static int floppy_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + struct floppy_state *fs = bdev->bd_disk->private_data; + int err; + + if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case FDEJECT: + if (fs->ref_count != 1) + return -EBUSY; + mutex_lock(&swim_mutex); + err = floppy_eject(fs); + mutex_unlock(&swim_mutex); + return err; + + case FDGETPRM: + if (copy_to_user((void __user *) param, (void *) &floppy_type, + sizeof(struct floppy_struct))) + return -EFAULT; + break; + + default: + printk(KERN_DEBUG "SWIM floppy_ioctl: unknown cmd %d\n", + cmd); + return -ENOSYS; + } + return 0; +} + +static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct floppy_state *fs = bdev->bd_disk->private_data; + struct floppy_struct *g; + int ret; + + ret = get_floppy_geometry(fs, 0, &g); + if (ret) + return ret; + + geo->heads = g->head; + geo->sectors = g->sect; + geo->cylinders = g->track; + + return 0; +} + +static unsigned int floppy_check_events(struct gendisk *disk, + unsigned int clearing) +{ + struct floppy_state *fs = disk->private_data; + + return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0; +} + +static int floppy_revalidate(struct gendisk *disk) +{ + struct floppy_state *fs = disk->private_data; + struct swim __iomem *base = fs->swd->base; + + swim_drive(base, fs->location); + + if (fs->ejected) + setup_medium(fs); + + if (!fs->disk_in) + swim_motor(base, OFF); + else + fs->ejected = 0; + + return !fs->disk_in; +} + +static const struct block_device_operations floppy_fops = { + .owner = THIS_MODULE, + .open = floppy_unlocked_open, + .release = floppy_release, + .ioctl = floppy_ioctl, + .getgeo = floppy_getgeo, + .check_events = floppy_check_events, + .revalidate_disk = floppy_revalidate, +}; + +static struct kobject *floppy_find(dev_t dev, int *part, void *data) +{ + struct swim_priv *swd = data; + int drive = (*part & 3); + + if (drive > swd->floppy_count) + return NULL; + + *part = 0; + return get_disk(swd->unit[drive].disk); +} + +static int swim_add_floppy(struct swim_priv *swd, enum drive_location location) +{ + struct floppy_state *fs = &swd->unit[swd->floppy_count]; + struct swim __iomem *base = swd->base; + + fs->location = location; + + swim_drive(base, location); + + swim_motor(base, OFF); + + if (swim_readbit(base, SINGLE_SIDED)) + fs->head_number = 1; + else + fs->head_number = 2; + fs->ref_count = 0; + fs->ejected = 1; + + swd->floppy_count++; + + return 0; +} + +static int swim_floppy_init(struct swim_priv *swd) +{ + int err; + int drive; + struct swim __iomem *base = swd->base; + + /* scan floppy drives */ + + swim_drive(base, INTERNAL_DRIVE); + if (swim_readbit(base, DRIVE_PRESENT)) + swim_add_floppy(swd, INTERNAL_DRIVE); + swim_drive(base, EXTERNAL_DRIVE); + if (swim_readbit(base, DRIVE_PRESENT)) + swim_add_floppy(swd, EXTERNAL_DRIVE); + + /* register floppy drives */ + + err = register_blkdev(FLOPPY_MAJOR, "fd"); + if (err) { + printk(KERN_ERR "Unable to get major %d for SWIM floppy\n", + FLOPPY_MAJOR); + return -EBUSY; + } + + for (drive = 0; drive < swd->floppy_count; drive++) { + swd->unit[drive].disk = alloc_disk(1); + if (swd->unit[drive].disk == NULL) { + err = -ENOMEM; + goto exit_put_disks; + } + swd->unit[drive].swd = swd; + } + + spin_lock_init(&swd->lock); + swd->queue = blk_init_queue(do_fd_request, &swd->lock); + if (!swd->queue) { + err = -ENOMEM; + goto exit_put_disks; + } + + for (drive = 0; drive < swd->floppy_count; drive++) { + swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE; + swd->unit[drive].disk->major = FLOPPY_MAJOR; + swd->unit[drive].disk->first_minor = drive; + sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); + swd->unit[drive].disk->fops = &floppy_fops; + swd->unit[drive].disk->private_data = &swd->unit[drive]; + swd->unit[drive].disk->queue = swd->queue; + set_capacity(swd->unit[drive].disk, 2880); + add_disk(swd->unit[drive].disk); + } + + blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE, + floppy_find, NULL, swd); + + return 0; + +exit_put_disks: + unregister_blkdev(FLOPPY_MAJOR, "fd"); + while (drive--) + put_disk(swd->unit[drive].disk); + return err; +} + +static int swim_probe(struct platform_device *dev) +{ + struct resource *res; + struct swim __iomem *swim_base; + struct swim_priv *swd; + int ret; + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (!res) { + ret = -ENODEV; + goto out; + } + + if (!request_mem_region(res->start, resource_size(res), CARDNAME)) { + ret = -EBUSY; + goto out; + } + + swim_base = ioremap(res->start, resource_size(res)); + if (!swim_base) { + ret = -ENOMEM; + goto out_release_io; + } + + /* probe device */ + + set_swim_mode(swim_base, 1); + if (!get_swim_mode(swim_base)) { + printk(KERN_INFO "SWIM device not found !\n"); + ret = -ENODEV; + goto out_iounmap; + } + + /* set platform driver data */ + + swd = kzalloc(sizeof(struct swim_priv), GFP_KERNEL); + if (!swd) { + ret = -ENOMEM; + goto out_iounmap; + } + platform_set_drvdata(dev, swd); + + swd->base = swim_base; + + ret = swim_floppy_init(swd); + if (ret) + goto out_kfree; + + return 0; + +out_kfree: + kfree(swd); +out_iounmap: + iounmap(swim_base); +out_release_io: + release_mem_region(res->start, resource_size(res)); +out: + return ret; +} + +static int swim_remove(struct platform_device *dev) +{ + struct swim_priv *swd = platform_get_drvdata(dev); + int drive; + struct resource *res; + + blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); + + for (drive = 0; drive < swd->floppy_count; drive++) { + del_gendisk(swd->unit[drive].disk); + put_disk(swd->unit[drive].disk); + } + + unregister_blkdev(FLOPPY_MAJOR, "fd"); + + blk_cleanup_queue(swd->queue); + + /* eject floppies */ + + for (drive = 0; drive < swd->floppy_count; drive++) + floppy_eject(&swd->unit[drive]); + + iounmap(swd->base); + + res = platform_get_resource(dev, IORESOURCE_MEM, 0); + if (res) + release_mem_region(res->start, resource_size(res)); + + kfree(swd); + + return 0; +} + +static struct platform_driver swim_driver = { + .probe = swim_probe, + .remove = swim_remove, + .driver = { + .name = CARDNAME, + }, +}; + +static int __init swim_init(void) +{ + printk(KERN_INFO "SWIM floppy driver %s\n", DRIVER_VERSION); + + return platform_driver_register(&swim_driver); +} +module_init(swim_init); + +static void __exit swim_exit(void) +{ + platform_driver_unregister(&swim_driver); +} +module_exit(swim_exit); + +MODULE_DESCRIPTION("Driver for SWIM floppy controller"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Laurent Vivier "); +MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR); diff --git a/kernel/drivers/block/swim3.c b/kernel/drivers/block/swim3.c new file mode 100644 index 000000000..c264f2d28 --- /dev/null +++ b/kernel/drivers/block/swim3.c @@ -0,0 +1,1289 @@ +/* + * Driver for the SWIM3 (Super Woz Integrated Machine 3) + * floppy controller found on Power Macintoshes. + * + * Copyright (C) 1996 Paul Mackerras. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * TODO: + * handle 2 drives + * handle GCR disks + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_FLOPPIES 2 + +static DEFINE_MUTEX(swim3_mutex); +static struct gendisk *disks[MAX_FLOPPIES]; + +enum swim_state { + idle, + locating, + seeking, + settling, + do_transfer, + jogging, + available, + revalidating, + ejecting +}; + +#define REG(x) unsigned char x; char x ## _pad[15]; + +/* + * The names for these registers mostly represent speculation on my part. + * It will be interesting to see how close they are to the names Apple uses. + */ +struct swim3 { + REG(data); + REG(timer); /* counts down at 1MHz */ + REG(error); + REG(mode); + REG(select); /* controls CA0, CA1, CA2 and LSTRB signals */ + REG(setup); + REG(control); /* writing bits clears them */ + REG(status); /* writing bits sets them in control */ + REG(intr); + REG(nseek); /* # tracks to seek */ + REG(ctrack); /* current track number */ + REG(csect); /* current sector number */ + REG(gap3); /* size of gap 3 in track format */ + REG(sector); /* sector # to read or write */ + REG(nsect); /* # sectors to read or write */ + REG(intr_enable); +}; + +#define control_bic control +#define control_bis status + +/* Bits in select register */ +#define CA_MASK 7 +#define LSTRB 8 + +/* Bits in control register */ +#define DO_SEEK 0x80 +#define FORMAT 0x40 +#define SELECT 0x20 +#define WRITE_SECTORS 0x10 +#define DO_ACTION 0x08 +#define DRIVE2_ENABLE 0x04 +#define DRIVE_ENABLE 0x02 +#define INTR_ENABLE 0x01 + +/* Bits in status register */ +#define FIFO_1BYTE 0x80 +#define FIFO_2BYTE 0x40 +#define ERROR 0x20 +#define DATA 0x08 +#define RDDATA 0x04 +#define INTR_PENDING 0x02 +#define MARK_BYTE 0x01 + +/* Bits in intr and intr_enable registers */ +#define ERROR_INTR 0x20 +#define DATA_CHANGED 0x10 +#define TRANSFER_DONE 0x08 +#define SEEN_SECTOR 0x04 +#define SEEK_DONE 0x02 +#define TIMER_DONE 0x01 + +/* Bits in error register */ +#define ERR_DATA_CRC 0x80 +#define ERR_ADDR_CRC 0x40 +#define ERR_OVERRUN 0x04 +#define ERR_UNDERRUN 0x01 + +/* Bits in setup register */ +#define S_SW_RESET 0x80 +#define S_GCR_WRITE 0x40 +#define S_IBM_DRIVE 0x20 +#define S_TEST_MODE 0x10 +#define S_FCLK_DIV2 0x08 +#define S_GCR 0x04 +#define S_COPY_PROT 0x02 +#define S_INV_WDATA 0x01 + +/* Select values for swim3_action */ +#define SEEK_POSITIVE 0 +#define SEEK_NEGATIVE 4 +#define STEP 1 +#define MOTOR_ON 2 +#define MOTOR_OFF 6 +#define INDEX 3 +#define EJECT 7 +#define SETMFM 9 +#define SETGCR 13 + +/* Select values for swim3_select and swim3_readbit */ +#define STEP_DIR 0 +#define STEPPING 1 +#define MOTOR_ON 2 +#define RELAX 3 /* also eject in progress */ +#define READ_DATA_0 4 +#define TWOMEG_DRIVE 5 +#define SINGLE_SIDED 6 /* drive or diskette is 4MB type? */ +#define DRIVE_PRESENT 7 +#define DISK_IN 8 +#define WRITE_PROT 9 +#define TRACK_ZERO 10 +#define TACHO 11 +#define READ_DATA_1 12 +#define MFM_MODE 13 +#define SEEK_COMPLETE 14 +#define ONEMEG_MEDIA 15 + +/* Definitions of values used in writing and formatting */ +#define DATA_ESCAPE 0x99 +#define GCR_SYNC_EXC 0x3f +#define GCR_SYNC_CONV 0x80 +#define GCR_FIRST_MARK 0xd5 +#define GCR_SECOND_MARK 0xaa +#define GCR_ADDR_MARK "\xd5\xaa\x00" +#define GCR_DATA_MARK "\xd5\xaa\x0b" +#define GCR_SLIP_BYTE "\x27\xaa" +#define GCR_SELF_SYNC "\x3f\xbf\x1e\x34\x3c\x3f" + +#define DATA_99 "\x99\x99" +#define MFM_ADDR_MARK "\x99\xa1\x99\xa1\x99\xa1\x99\xfe" +#define MFM_INDEX_MARK "\x99\xc2\x99\xc2\x99\xc2\x99\xfc" +#define MFM_GAP_LEN 12 + +struct floppy_state { + enum swim_state state; + struct swim3 __iomem *swim3; /* hardware registers */ + struct dbdma_regs __iomem *dma; /* DMA controller registers */ + int swim3_intr; /* interrupt number for SWIM3 */ + int dma_intr; /* interrupt number for DMA channel */ + int cur_cyl; /* cylinder head is on, or -1 */ + int cur_sector; /* last sector we saw go past */ + int req_cyl; /* the cylinder for the current r/w request */ + int head; /* head number ditto */ + int req_sector; /* sector number ditto */ + int scount; /* # sectors we're transferring at present */ + int retries; + int settle_time; + int secpercyl; /* disk geometry information */ + int secpertrack; + int total_secs; + int write_prot; /* 1 if write-protected, 0 if not, -1 dunno */ + struct dbdma_cmd *dma_cmd; + int ref_count; + int expect_cyl; + struct timer_list timeout; + int timeout_pending; + int ejected; + wait_queue_head_t wait; + int wanted; + struct macio_dev *mdev; + char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)]; + int index; + struct request *cur_req; +}; + +#define swim3_err(fmt, arg...) dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) +#define swim3_warn(fmt, arg...) dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) +#define swim3_info(fmt, arg...) dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) + +#ifdef DEBUG +#define swim3_dbg(fmt, arg...) dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg) +#else +#define swim3_dbg(fmt, arg...) do { } while(0) +#endif + +static struct floppy_state floppy_states[MAX_FLOPPIES]; +static int floppy_count = 0; +static DEFINE_SPINLOCK(swim3_lock); + +static unsigned short write_preamble[] = { + 0x4e4e, 0x4e4e, 0x4e4e, 0x4e4e, 0x4e4e, /* gap field */ + 0, 0, 0, 0, 0, 0, /* sync field */ + 0x99a1, 0x99a1, 0x99a1, 0x99fb, /* data address mark */ + 0x990f /* no escape for 512 bytes */ +}; + +static unsigned short write_postamble[] = { + 0x9904, /* insert CRC */ + 0x4e4e, 0x4e4e, + 0x9908, /* stop writing */ + 0, 0, 0, 0, 0, 0 +}; + +static void seek_track(struct floppy_state *fs, int n); +static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count); +static void act(struct floppy_state *fs); +static void scan_timeout(unsigned long data); +static void seek_timeout(unsigned long data); +static void settle_timeout(unsigned long data); +static void xfer_timeout(unsigned long data); +static irqreturn_t swim3_interrupt(int irq, void *dev_id); +/*static void fd_dma_interrupt(int irq, void *dev_id);*/ +static int grab_drive(struct floppy_state *fs, enum swim_state state, + int interruptible); +static void release_drive(struct floppy_state *fs); +static int fd_eject(struct floppy_state *fs); +static int floppy_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param); +static int floppy_open(struct block_device *bdev, fmode_t mode); +static void floppy_release(struct gendisk *disk, fmode_t mode); +static unsigned int floppy_check_events(struct gendisk *disk, + unsigned int clearing); +static int floppy_revalidate(struct gendisk *disk); + +static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes) +{ + struct request *req = fs->cur_req; + int rc; + + swim3_dbg(" end request, err=%d nr_bytes=%d, cur_req=%p\n", + err, nr_bytes, req); + + if (err) + nr_bytes = blk_rq_cur_bytes(req); + rc = __blk_end_request(req, err, nr_bytes); + if (rc) + return true; + fs->cur_req = NULL; + return false; +} + +static void swim3_select(struct floppy_state *fs, int sel) +{ + struct swim3 __iomem *sw = fs->swim3; + + out_8(&sw->select, RELAX); + if (sel & 8) + out_8(&sw->control_bis, SELECT); + else + out_8(&sw->control_bic, SELECT); + out_8(&sw->select, sel & CA_MASK); +} + +static void swim3_action(struct floppy_state *fs, int action) +{ + struct swim3 __iomem *sw = fs->swim3; + + swim3_select(fs, action); + udelay(1); + out_8(&sw->select, sw->select | LSTRB); + udelay(2); + out_8(&sw->select, sw->select & ~LSTRB); + udelay(1); +} + +static int swim3_readbit(struct floppy_state *fs, int bit) +{ + struct swim3 __iomem *sw = fs->swim3; + int stat; + + swim3_select(fs, bit); + udelay(1); + stat = in_8(&sw->status); + return (stat & DATA) == 0; +} + +static void start_request(struct floppy_state *fs) +{ + struct request *req; + unsigned long x; + + swim3_dbg("start request, initial state=%d\n", fs->state); + + if (fs->state == idle && fs->wanted) { + fs->state = available; + wake_up(&fs->wait); + return; + } + while (fs->state == idle) { + swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req); + if (!fs->cur_req) { + fs->cur_req = blk_fetch_request(disks[fs->index]->queue); + swim3_dbg(" fetched request %p\n", fs->cur_req); + if (!fs->cur_req) + break; + } + req = fs->cur_req; + + if (fs->mdev->media_bay && + check_media_bay(fs->mdev->media_bay) != MB_FD) { + swim3_dbg("%s", " media bay absent, dropping req\n"); + swim3_end_request(fs, -ENODEV, 0); + continue; + } + +#if 0 /* This is really too verbose */ + swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", + req->rq_disk->disk_name, req->cmd, + (long)blk_rq_pos(req), blk_rq_sectors(req), + bio_data(req->bio)); + swim3_dbg(" errors=%d current_nr_sectors=%u\n", + req->errors, blk_rq_cur_sectors(req)); +#endif + + if (blk_rq_pos(req) >= fs->total_secs) { + swim3_dbg(" pos out of bounds (%ld, max is %ld)\n", + (long)blk_rq_pos(req), (long)fs->total_secs); + swim3_end_request(fs, -EIO, 0); + continue; + } + if (fs->ejected) { + swim3_dbg("%s", " disk ejected\n"); + swim3_end_request(fs, -EIO, 0); + continue; + } + + if (rq_data_dir(req) == WRITE) { + if (fs->write_prot < 0) + fs->write_prot = swim3_readbit(fs, WRITE_PROT); + if (fs->write_prot) { + swim3_dbg("%s", " try to write, disk write protected\n"); + swim3_end_request(fs, -EIO, 0); + continue; + } + } + + /* Do not remove the cast. blk_rq_pos(req) is now a + * sector_t and can be 64 bits, but it will never go + * past 32 bits for this driver anyway, so we can + * safely cast it down and not have to do a 64/32 + * division + */ + fs->req_cyl = ((long)blk_rq_pos(req)) / fs->secpercyl; + x = ((long)blk_rq_pos(req)) % fs->secpercyl; + fs->head = x / fs->secpertrack; + fs->req_sector = x % fs->secpertrack + 1; + fs->state = do_transfer; + fs->retries = 0; + + act(fs); + } +} + +static void do_fd_request(struct request_queue * q) +{ + start_request(q->queuedata); +} + +static void set_timeout(struct floppy_state *fs, int nticks, + void (*proc)(unsigned long)) +{ + if (fs->timeout_pending) + del_timer(&fs->timeout); + fs->timeout.expires = jiffies + nticks; + fs->timeout.function = proc; + fs->timeout.data = (unsigned long) fs; + add_timer(&fs->timeout); + fs->timeout_pending = 1; +} + +static inline void scan_track(struct floppy_state *fs) +{ + struct swim3 __iomem *sw = fs->swim3; + + swim3_select(fs, READ_DATA_0); + in_8(&sw->intr); /* clear SEEN_SECTOR bit */ + in_8(&sw->error); + out_8(&sw->intr_enable, SEEN_SECTOR); + out_8(&sw->control_bis, DO_ACTION); + /* enable intr when track found */ + set_timeout(fs, HZ, scan_timeout); /* enable timeout */ +} + +static inline void seek_track(struct floppy_state *fs, int n) +{ + struct swim3 __iomem *sw = fs->swim3; + + if (n >= 0) { + swim3_action(fs, SEEK_POSITIVE); + sw->nseek = n; + } else { + swim3_action(fs, SEEK_NEGATIVE); + sw->nseek = -n; + } + fs->expect_cyl = (fs->cur_cyl >= 0)? fs->cur_cyl + n: -1; + swim3_select(fs, STEP); + in_8(&sw->error); + /* enable intr when seek finished */ + out_8(&sw->intr_enable, SEEK_DONE); + out_8(&sw->control_bis, DO_SEEK); + set_timeout(fs, 3*HZ, seek_timeout); /* enable timeout */ + fs->settle_time = 0; +} + +static inline void init_dma(struct dbdma_cmd *cp, int cmd, + void *buf, int count) +{ + cp->req_count = cpu_to_le16(count); + cp->command = cpu_to_le16(cmd); + cp->phy_addr = cpu_to_le32(virt_to_bus(buf)); + cp->xfer_status = 0; +} + +static inline void setup_transfer(struct floppy_state *fs) +{ + int n; + struct swim3 __iomem *sw = fs->swim3; + struct dbdma_cmd *cp = fs->dma_cmd; + struct dbdma_regs __iomem *dr = fs->dma; + struct request *req = fs->cur_req; + + if (blk_rq_cur_sectors(req) <= 0) { + swim3_warn("%s", "Transfer 0 sectors ?\n"); + return; + } + if (rq_data_dir(req) == WRITE) + n = 1; + else { + n = fs->secpertrack - fs->req_sector + 1; + if (n > blk_rq_cur_sectors(req)) + n = blk_rq_cur_sectors(req); + } + + swim3_dbg(" setup xfer at sect %d (of %d) head %d for %d\n", + fs->req_sector, fs->secpertrack, fs->head, n); + + fs->scount = n; + swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0); + out_8(&sw->sector, fs->req_sector); + out_8(&sw->nsect, n); + out_8(&sw->gap3, 0); + out_le32(&dr->cmdptr, virt_to_bus(cp)); + if (rq_data_dir(req) == WRITE) { + /* Set up 3 dma commands: write preamble, data, postamble */ + init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble)); + ++cp; + init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512); + ++cp; + init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble)); + } else { + init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512); + } + ++cp; + out_le16(&cp->command, DBDMA_STOP); + out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); + in_8(&sw->error); + out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); + if (rq_data_dir(req) == WRITE) + out_8(&sw->control_bis, WRITE_SECTORS); + in_8(&sw->intr); + out_le32(&dr->control, (RUN << 16) | RUN); + /* enable intr when transfer complete */ + out_8(&sw->intr_enable, TRANSFER_DONE); + out_8(&sw->control_bis, DO_ACTION); + set_timeout(fs, 2*HZ, xfer_timeout); /* enable timeout */ +} + +static void act(struct floppy_state *fs) +{ + for (;;) { + swim3_dbg(" act loop, state=%d, req_cyl=%d, cur_cyl=%d\n", + fs->state, fs->req_cyl, fs->cur_cyl); + + switch (fs->state) { + case idle: + return; /* XXX shouldn't get here */ + + case locating: + if (swim3_readbit(fs, TRACK_ZERO)) { + swim3_dbg("%s", " locate track 0\n"); + fs->cur_cyl = 0; + if (fs->req_cyl == 0) + fs->state = do_transfer; + else + fs->state = seeking; + break; + } + scan_track(fs); + return; + + case seeking: + if (fs->cur_cyl < 0) { + fs->expect_cyl = -1; + fs->state = locating; + break; + } + if (fs->req_cyl == fs->cur_cyl) { + swim3_warn("%s", "Whoops, seeking 0\n"); + fs->state = do_transfer; + break; + } + seek_track(fs, fs->req_cyl - fs->cur_cyl); + return; + + case settling: + /* check for SEEK_COMPLETE after 30ms */ + fs->settle_time = (HZ + 32) / 33; + set_timeout(fs, fs->settle_time, settle_timeout); + return; + + case do_transfer: + if (fs->cur_cyl != fs->req_cyl) { + if (fs->retries > 5) { + swim3_err("Wrong cylinder in transfer, want: %d got %d\n", + fs->req_cyl, fs->cur_cyl); + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + return; + } + fs->state = seeking; + break; + } + setup_transfer(fs); + return; + + case jogging: + seek_track(fs, -5); + return; + + default: + swim3_err("Unknown state %d\n", fs->state); + return; + } + } +} + +static void scan_timeout(unsigned long data) +{ + struct floppy_state *fs = (struct floppy_state *) data; + struct swim3 __iomem *sw = fs->swim3; + unsigned long flags; + + swim3_dbg("* scan timeout, state=%d\n", fs->state); + + spin_lock_irqsave(&swim3_lock, flags); + fs->timeout_pending = 0; + out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); + out_8(&sw->select, RELAX); + out_8(&sw->intr_enable, 0); + fs->cur_cyl = -1; + if (fs->retries > 5) { + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + start_request(fs); + } else { + fs->state = jogging; + act(fs); + } + spin_unlock_irqrestore(&swim3_lock, flags); +} + +static void seek_timeout(unsigned long data) +{ + struct floppy_state *fs = (struct floppy_state *) data; + struct swim3 __iomem *sw = fs->swim3; + unsigned long flags; + + swim3_dbg("* seek timeout, state=%d\n", fs->state); + + spin_lock_irqsave(&swim3_lock, flags); + fs->timeout_pending = 0; + out_8(&sw->control_bic, DO_SEEK); + out_8(&sw->select, RELAX); + out_8(&sw->intr_enable, 0); + swim3_err("%s", "Seek timeout\n"); + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + start_request(fs); + spin_unlock_irqrestore(&swim3_lock, flags); +} + +static void settle_timeout(unsigned long data) +{ + struct floppy_state *fs = (struct floppy_state *) data; + struct swim3 __iomem *sw = fs->swim3; + unsigned long flags; + + swim3_dbg("* settle timeout, state=%d\n", fs->state); + + spin_lock_irqsave(&swim3_lock, flags); + fs->timeout_pending = 0; + if (swim3_readbit(fs, SEEK_COMPLETE)) { + out_8(&sw->select, RELAX); + fs->state = locating; + act(fs); + goto unlock; + } + out_8(&sw->select, RELAX); + if (fs->settle_time < 2*HZ) { + ++fs->settle_time; + set_timeout(fs, 1, settle_timeout); + goto unlock; + } + swim3_err("%s", "Seek settle timeout\n"); + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + start_request(fs); + unlock: + spin_unlock_irqrestore(&swim3_lock, flags); +} + +static void xfer_timeout(unsigned long data) +{ + struct floppy_state *fs = (struct floppy_state *) data; + struct swim3 __iomem *sw = fs->swim3; + struct dbdma_regs __iomem *dr = fs->dma; + unsigned long flags; + int n; + + swim3_dbg("* xfer timeout, state=%d\n", fs->state); + + spin_lock_irqsave(&swim3_lock, flags); + fs->timeout_pending = 0; + out_le32(&dr->control, RUN << 16); + /* We must wait a bit for dbdma to stop */ + for (n = 0; (in_le32(&dr->status) & ACTIVE) && n < 1000; n++) + udelay(1); + out_8(&sw->intr_enable, 0); + out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION); + out_8(&sw->select, RELAX); + swim3_err("Timeout %sing sector %ld\n", + (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"), + (long)blk_rq_pos(fs->cur_req)); + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + start_request(fs); + spin_unlock_irqrestore(&swim3_lock, flags); +} + +static irqreturn_t swim3_interrupt(int irq, void *dev_id) +{ + struct floppy_state *fs = (struct floppy_state *) dev_id; + struct swim3 __iomem *sw = fs->swim3; + int intr, err, n; + int stat, resid; + struct dbdma_regs __iomem *dr; + struct dbdma_cmd *cp; + unsigned long flags; + struct request *req = fs->cur_req; + + swim3_dbg("* interrupt, state=%d\n", fs->state); + + spin_lock_irqsave(&swim3_lock, flags); + intr = in_8(&sw->intr); + err = (intr & ERROR_INTR)? in_8(&sw->error): 0; + if ((intr & ERROR_INTR) && fs->state != do_transfer) + swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n", + fs->state, rq_data_dir(req), intr, err); + switch (fs->state) { + case locating: + if (intr & SEEN_SECTOR) { + out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS); + out_8(&sw->select, RELAX); + out_8(&sw->intr_enable, 0); + del_timer(&fs->timeout); + fs->timeout_pending = 0; + if (sw->ctrack == 0xff) { + swim3_err("%s", "Seen sector but cyl=ff?\n"); + fs->cur_cyl = -1; + if (fs->retries > 5) { + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + start_request(fs); + } else { + fs->state = jogging; + act(fs); + } + break; + } + fs->cur_cyl = sw->ctrack; + fs->cur_sector = sw->csect; + if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl) + swim3_err("Expected cyl %d, got %d\n", + fs->expect_cyl, fs->cur_cyl); + fs->state = do_transfer; + act(fs); + } + break; + case seeking: + case jogging: + if (sw->nseek == 0) { + out_8(&sw->control_bic, DO_SEEK); + out_8(&sw->select, RELAX); + out_8(&sw->intr_enable, 0); + del_timer(&fs->timeout); + fs->timeout_pending = 0; + if (fs->state == seeking) + ++fs->retries; + fs->state = settling; + act(fs); + } + break; + case settling: + out_8(&sw->intr_enable, 0); + del_timer(&fs->timeout); + fs->timeout_pending = 0; + act(fs); + break; + case do_transfer: + if ((intr & (ERROR_INTR | TRANSFER_DONE)) == 0) + break; + out_8(&sw->intr_enable, 0); + out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION); + out_8(&sw->select, RELAX); + del_timer(&fs->timeout); + fs->timeout_pending = 0; + dr = fs->dma; + cp = fs->dma_cmd; + if (rq_data_dir(req) == WRITE) + ++cp; + /* + * Check that the main data transfer has finished. + * On writing, the swim3 sometimes doesn't use + * up all the bytes of the postamble, so we can still + * see DMA active here. That doesn't matter as long + * as all the sector data has been transferred. + */ + if ((intr & ERROR_INTR) == 0 && cp->xfer_status == 0) { + /* wait a little while for DMA to complete */ + for (n = 0; n < 100; ++n) { + if (cp->xfer_status != 0) + break; + udelay(1); + barrier(); + } + } + /* turn off DMA */ + out_le32(&dr->control, (RUN | PAUSE) << 16); + stat = le16_to_cpu(cp->xfer_status); + resid = le16_to_cpu(cp->res_count); + if (intr & ERROR_INTR) { + n = fs->scount - 1 - resid / 512; + if (n > 0) { + blk_update_request(req, 0, n << 9); + fs->req_sector += n; + } + if (fs->retries < 5) { + ++fs->retries; + act(fs); + } else { + swim3_err("Error %sing block %ld (err=%x)\n", + rq_data_dir(req) == WRITE? "writ": "read", + (long)blk_rq_pos(req), err); + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + } + } else { + if ((stat & ACTIVE) == 0 || resid != 0) { + /* musta been an error */ + swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid); + swim3_err(" state=%d, dir=%x, intr=%x, err=%x\n", + fs->state, rq_data_dir(req), intr, err); + swim3_end_request(fs, -EIO, 0); + fs->state = idle; + start_request(fs); + break; + } + fs->retries = 0; + if (swim3_end_request(fs, 0, fs->scount << 9)) { + fs->req_sector += fs->scount; + if (fs->req_sector > fs->secpertrack) { + fs->req_sector -= fs->secpertrack; + if (++fs->head > 1) { + fs->head = 0; + ++fs->req_cyl; + } + } + act(fs); + } else + fs->state = idle; + } + if (fs->state == idle) + start_request(fs); + break; + default: + swim3_err("Don't know what to do in state %d\n", fs->state); + } + spin_unlock_irqrestore(&swim3_lock, flags); + return IRQ_HANDLED; +} + +/* +static void fd_dma_interrupt(int irq, void *dev_id) +{ +} +*/ + +/* Called under the mutex to grab exclusive access to a drive */ +static int grab_drive(struct floppy_state *fs, enum swim_state state, + int interruptible) +{ + unsigned long flags; + + swim3_dbg("%s", "-> grab drive\n"); + + spin_lock_irqsave(&swim3_lock, flags); + if (fs->state != idle && fs->state != available) { + ++fs->wanted; + /* this will enable irqs in order to sleep */ + if (!interruptible) + wait_event_lock_irq(fs->wait, + fs->state == available, + swim3_lock); + else if (wait_event_interruptible_lock_irq(fs->wait, + fs->state == available, + swim3_lock)) { + --fs->wanted; + spin_unlock_irqrestore(&swim3_lock, flags); + return -EINTR; + } + --fs->wanted; + } + fs->state = state; + spin_unlock_irqrestore(&swim3_lock, flags); + + return 0; +} + +static void release_drive(struct floppy_state *fs) +{ + unsigned long flags; + + swim3_dbg("%s", "-> release drive\n"); + + spin_lock_irqsave(&swim3_lock, flags); + fs->state = idle; + start_request(fs); + spin_unlock_irqrestore(&swim3_lock, flags); +} + +static int fd_eject(struct floppy_state *fs) +{ + int err, n; + + err = grab_drive(fs, ejecting, 1); + if (err) + return err; + swim3_action(fs, EJECT); + for (n = 20; n > 0; --n) { + if (signal_pending(current)) { + err = -EINTR; + break; + } + swim3_select(fs, RELAX); + schedule_timeout_interruptible(1); + if (swim3_readbit(fs, DISK_IN) == 0) + break; + } + swim3_select(fs, RELAX); + udelay(150); + fs->ejected = 1; + release_drive(fs); + return err; +} + +static struct floppy_struct floppy_type = + { 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL }; /* 7 1.44MB 3.5" */ + +static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + struct floppy_state *fs = bdev->bd_disk->private_data; + int err; + + if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (fs->mdev->media_bay && + check_media_bay(fs->mdev->media_bay) != MB_FD) + return -ENXIO; + + switch (cmd) { + case FDEJECT: + if (fs->ref_count != 1) + return -EBUSY; + err = fd_eject(fs); + return err; + case FDGETPRM: + if (copy_to_user((void __user *) param, &floppy_type, + sizeof(struct floppy_struct))) + return -EFAULT; + return 0; + } + return -ENOTTY; +} + +static int floppy_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long param) +{ + int ret; + + mutex_lock(&swim3_mutex); + ret = floppy_locked_ioctl(bdev, mode, cmd, param); + mutex_unlock(&swim3_mutex); + + return ret; +} + +static int floppy_open(struct block_device *bdev, fmode_t mode) +{ + struct floppy_state *fs = bdev->bd_disk->private_data; + struct swim3 __iomem *sw = fs->swim3; + int n, err = 0; + + if (fs->ref_count == 0) { + if (fs->mdev->media_bay && + check_media_bay(fs->mdev->media_bay) != MB_FD) + return -ENXIO; + out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2); + out_8(&sw->control_bic, 0xff); + out_8(&sw->mode, 0x95); + udelay(10); + out_8(&sw->intr_enable, 0); + out_8(&sw->control_bis, DRIVE_ENABLE | INTR_ENABLE); + swim3_action(fs, MOTOR_ON); + fs->write_prot = -1; + fs->cur_cyl = -1; + for (n = 0; n < 2 * HZ; ++n) { + if (n >= HZ/30 && swim3_readbit(fs, SEEK_COMPLETE)) + break; + if (signal_pending(current)) { + err = -EINTR; + break; + } + swim3_select(fs, RELAX); + schedule_timeout_interruptible(1); + } + if (err == 0 && (swim3_readbit(fs, SEEK_COMPLETE) == 0 + || swim3_readbit(fs, DISK_IN) == 0)) + err = -ENXIO; + swim3_action(fs, SETMFM); + swim3_select(fs, RELAX); + + } else if (fs->ref_count == -1 || mode & FMODE_EXCL) + return -EBUSY; + + if (err == 0 && (mode & FMODE_NDELAY) == 0 + && (mode & (FMODE_READ|FMODE_WRITE))) { + check_disk_change(bdev); + if (fs->ejected) + err = -ENXIO; + } + + if (err == 0 && (mode & FMODE_WRITE)) { + if (fs->write_prot < 0) + fs->write_prot = swim3_readbit(fs, WRITE_PROT); + if (fs->write_prot) + err = -EROFS; + } + + if (err) { + if (fs->ref_count == 0) { + swim3_action(fs, MOTOR_OFF); + out_8(&sw->control_bic, DRIVE_ENABLE | INTR_ENABLE); + swim3_select(fs, RELAX); + } + return err; + } + + if (mode & FMODE_EXCL) + fs->ref_count = -1; + else + ++fs->ref_count; + + return 0; +} + +static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode) +{ + int ret; + + mutex_lock(&swim3_mutex); + ret = floppy_open(bdev, mode); + mutex_unlock(&swim3_mutex); + + return ret; +} + +static void floppy_release(struct gendisk *disk, fmode_t mode) +{ + struct floppy_state *fs = disk->private_data; + struct swim3 __iomem *sw = fs->swim3; + + mutex_lock(&swim3_mutex); + if (fs->ref_count > 0 && --fs->ref_count == 0) { + swim3_action(fs, MOTOR_OFF); + out_8(&sw->control_bic, 0xff); + swim3_select(fs, RELAX); + } + mutex_unlock(&swim3_mutex); +} + +static unsigned int floppy_check_events(struct gendisk *disk, + unsigned int clearing) +{ + struct floppy_state *fs = disk->private_data; + return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0; +} + +static int floppy_revalidate(struct gendisk *disk) +{ + struct floppy_state *fs = disk->private_data; + struct swim3 __iomem *sw; + int ret, n; + + if (fs->mdev->media_bay && + check_media_bay(fs->mdev->media_bay) != MB_FD) + return -ENXIO; + + sw = fs->swim3; + grab_drive(fs, revalidating, 0); + out_8(&sw->intr_enable, 0); + out_8(&sw->control_bis, DRIVE_ENABLE); + swim3_action(fs, MOTOR_ON); /* necessary? */ + fs->write_prot = -1; + fs->cur_cyl = -1; + mdelay(1); + for (n = HZ; n > 0; --n) { + if (swim3_readbit(fs, SEEK_COMPLETE)) + break; + if (signal_pending(current)) + break; + swim3_select(fs, RELAX); + schedule_timeout_interruptible(1); + } + ret = swim3_readbit(fs, SEEK_COMPLETE) == 0 + || swim3_readbit(fs, DISK_IN) == 0; + if (ret) + swim3_action(fs, MOTOR_OFF); + else { + fs->ejected = 0; + swim3_action(fs, SETMFM); + } + swim3_select(fs, RELAX); + + release_drive(fs); + return ret; +} + +static const struct block_device_operations floppy_fops = { + .open = floppy_unlocked_open, + .release = floppy_release, + .ioctl = floppy_ioctl, + .check_events = floppy_check_events, + .revalidate_disk= floppy_revalidate, +}; + +static void swim3_mb_event(struct macio_dev* mdev, int mb_state) +{ + struct floppy_state *fs = macio_get_drvdata(mdev); + struct swim3 __iomem *sw; + + if (!fs) + return; + + sw = fs->swim3; + + if (mb_state != MB_FD) + return; + + /* Clear state */ + out_8(&sw->intr_enable, 0); + in_8(&sw->intr); + in_8(&sw->error); +} + +static int swim3_add_device(struct macio_dev *mdev, int index) +{ + struct device_node *swim = mdev->ofdev.dev.of_node; + struct floppy_state *fs = &floppy_states[index]; + int rc = -EBUSY; + + /* Do this first for message macros */ + memset(fs, 0, sizeof(*fs)); + fs->mdev = mdev; + fs->index = index; + + /* Check & Request resources */ + if (macio_resource_count(mdev) < 2) { + swim3_err("%s", "No address in device-tree\n"); + return -ENXIO; + } + if (macio_irq_count(mdev) < 1) { + swim3_err("%s", "No interrupt in device-tree\n"); + return -ENXIO; + } + if (macio_request_resource(mdev, 0, "swim3 (mmio)")) { + swim3_err("%s", "Can't request mmio resource\n"); + return -EBUSY; + } + if (macio_request_resource(mdev, 1, "swim3 (dma)")) { + swim3_err("%s", "Can't request dma resource\n"); + macio_release_resource(mdev, 0); + return -EBUSY; + } + dev_set_drvdata(&mdev->ofdev.dev, fs); + + if (mdev->media_bay == NULL) + pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1); + + fs->state = idle; + fs->swim3 = (struct swim3 __iomem *) + ioremap(macio_resource_start(mdev, 0), 0x200); + if (fs->swim3 == NULL) { + swim3_err("%s", "Couldn't map mmio registers\n"); + rc = -ENOMEM; + goto out_release; + } + fs->dma = (struct dbdma_regs __iomem *) + ioremap(macio_resource_start(mdev, 1), 0x200); + if (fs->dma == NULL) { + swim3_err("%s", "Couldn't map dma registers\n"); + iounmap(fs->swim3); + rc = -ENOMEM; + goto out_release; + } + fs->swim3_intr = macio_irq(mdev, 0); + fs->dma_intr = macio_irq(mdev, 1); + fs->cur_cyl = -1; + fs->cur_sector = -1; + fs->secpercyl = 36; + fs->secpertrack = 18; + fs->total_secs = 2880; + init_waitqueue_head(&fs->wait); + + fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space); + memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd)); + fs->dma_cmd[1].command = cpu_to_le16(DBDMA_STOP); + + if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD) + swim3_mb_event(mdev, MB_FD); + + if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) { + swim3_err("%s", "Couldn't request interrupt\n"); + pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0); + goto out_unmap; + return -EBUSY; + } + + init_timer(&fs->timeout); + + swim3_info("SWIM3 floppy controller %s\n", + mdev->media_bay ? "in media bay" : ""); + + return 0; + + out_unmap: + iounmap(fs->dma); + iounmap(fs->swim3); + + out_release: + macio_release_resource(mdev, 0); + macio_release_resource(mdev, 1); + + return rc; +} + +static int swim3_attach(struct macio_dev *mdev, + const struct of_device_id *match) +{ + struct gendisk *disk; + int index, rc; + + index = floppy_count++; + if (index >= MAX_FLOPPIES) + return -ENXIO; + + /* Add the drive */ + rc = swim3_add_device(mdev, index); + if (rc) + return rc; + /* Now register that disk. Same comment about failure handling */ + disk = disks[index] = alloc_disk(1); + if (disk == NULL) + return -ENOMEM; + disk->queue = blk_init_queue(do_fd_request, &swim3_lock); + if (disk->queue == NULL) { + put_disk(disk); + return -ENOMEM; + } + disk->queue->queuedata = &floppy_states[index]; + + if (index == 0) { + /* If we failed, there isn't much we can do as the driver is still + * too dumb to remove the device, just bail out + */ + if (register_blkdev(FLOPPY_MAJOR, "fd")) + return 0; + } + + disk->major = FLOPPY_MAJOR; + disk->first_minor = index; + disk->fops = &floppy_fops; + disk->private_data = &floppy_states[index]; + disk->flags |= GENHD_FL_REMOVABLE; + sprintf(disk->disk_name, "fd%d", index); + set_capacity(disk, 2880); + add_disk(disk); + + return 0; +} + +static struct of_device_id swim3_match[] = +{ + { + .name = "swim3", + }, + { + .compatible = "ohare-swim3" + }, + { + .compatible = "swim3" + }, + { /* end of list */ } +}; + +static struct macio_driver swim3_driver = +{ + .driver = { + .name = "swim3", + .of_match_table = swim3_match, + }, + .probe = swim3_attach, +#ifdef CONFIG_PMAC_MEDIABAY + .mediabay_event = swim3_mb_event, +#endif +#if 0 + .suspend = swim3_suspend, + .resume = swim3_resume, +#endif +}; + + +int swim3_init(void) +{ + macio_register_driver(&swim3_driver); + return 0; +} + +module_init(swim3_init) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul Mackerras"); +MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR); diff --git a/kernel/drivers/block/swim_asm.S b/kernel/drivers/block/swim_asm.S new file mode 100644 index 000000000..c96682068 --- /dev/null +++ b/kernel/drivers/block/swim_asm.S @@ -0,0 +1,247 @@ +/* + * low-level functions for the SWIM floppy controller + * + * needs assembly language because is very timing dependent + * this controller exists only on macintosh 680x0 based + * + * Copyright (C) 2004,2008 Laurent Vivier + * + * based on Alastair Bridgewater SWIM analysis, 2001 + * based on netBSD IWM driver (c) 1997, 1998 Hauke Fath. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 2004-08-21 (lv) - Initial implementation + * 2008-11-05 (lv) - add get_swim_mode + */ + + .equ write_data, 0x0000 + .equ write_mark, 0x0200 + .equ write_CRC, 0x0400 + .equ write_parameter,0x0600 + .equ write_phase, 0x0800 + .equ write_setup, 0x0a00 + .equ write_mode0, 0x0c00 + .equ write_mode1, 0x0e00 + .equ read_data, 0x1000 + .equ read_mark, 0x1200 + .equ read_error, 0x1400 + .equ read_parameter, 0x1600 + .equ read_phase, 0x1800 + .equ read_setup, 0x1a00 + .equ read_status, 0x1c00 + .equ read_handshake, 0x1e00 + + .equ o_side, 0 + .equ o_track, 1 + .equ o_sector, 2 + .equ o_size, 3 + .equ o_crc0, 4 + .equ o_crc1, 5 + + .equ seek_time, 30000 + .equ max_retry, 40 + .equ sector_size, 512 + + .global swim_read_sector_header +swim_read_sector_header: + link %a6, #0 + moveml %d1-%d5/%a0-%a4,%sp@- + movel %a6@(0x0c), %a4 + bsr mfm_read_addrmark + moveml %sp@+, %d1-%d5/%a0-%a4 + unlk %a6 + rts + +sector_address_mark: + .byte 0xa1, 0xa1, 0xa1, 0xfe +sector_data_mark: + .byte 0xa1, 0xa1, 0xa1, 0xfb + +mfm_read_addrmark: + movel %a6@(0x08), %a3 + lea %a3@(read_handshake), %a2 + lea %a3@(read_mark), %a3 + moveq #-1, %d0 + movew #seek_time, %d2 + +wait_header_init: + tstb %a3@(read_error - read_mark) + moveb #0x18, %a3@(write_mode0 - read_mark) + moveb #0x01, %a3@(write_mode1 - read_mark) + moveb #0x01, %a3@(write_mode0 - read_mark) + tstb %a3@(read_error - read_mark) + moveb #0x08, %a3@(write_mode1 - read_mark) + + lea sector_address_mark, %a0 + moveq #3, %d1 + +wait_addr_mark_byte: + + tstb %a2@ + dbmi %d2, wait_addr_mark_byte + bpl header_exit + + moveb %a3@, %d3 + cmpb %a0@+, %d3 + dbne %d1, wait_addr_mark_byte + bne wait_header_init + + moveq #max_retry, %d2 + +amark0: tstb %a2@ + dbmi %d2, amark0 + bpl signal_nonyb + + moveb %a3@, %a4@(o_track) + + moveq #max_retry, %d2 + +amark1: tstb %a2@ + dbmi %d2, amark1 + bpl signal_nonyb + + moveb %a3@, %a4@(o_side) + + moveq #max_retry, %d2 + +amark2: tstb %a2@ + dbmi %d2, amark2 + bpl signal_nonyb + + moveb %a3@, %a4@(o_sector) + + moveq #max_retry, %d2 + +amark3: tstb %a2@ + dbmi %d2, amark3 + bpl signal_nonyb + + moveb %a3@, %a4@(o_size) + + moveq #max_retry, %d2 + +crc0: tstb %a2@ + dbmi %d2, crc0 + bpl signal_nonyb + + moveb %a3@, %a4@(o_crc0) + + moveq #max_retry, %d2 + +crc1: tstb %a2@ + dbmi %d2, crc1 + bpl signal_nonyb + + moveb %a3@, %a4@(o_crc1) + + tstb %a3@(read_error - read_mark) + +header_exit: + moveq #0, %d0 + moveb #0x18, %a3@(write_mode0 - read_mark) + rts +signal_nonyb: + moveq #-1, %d0 + moveb #0x18, %a3@(write_mode0 - read_mark) + rts + + .global swim_read_sector_data +swim_read_sector_data: + link %a6, #0 + moveml %d1-%d5/%a0-%a5,%sp@- + movel %a6@(0x0c), %a4 + bsr mfm_read_data + moveml %sp@+, %d1-%d5/%a0-%a5 + unlk %a6 + rts + +mfm_read_data: + movel %a6@(0x08), %a3 + lea %a3@(read_handshake), %a2 + lea %a3@(read_data), %a5 + lea %a3@(read_mark), %a3 + movew #seek_time, %d2 + +wait_data_init: + tstb %a3@(read_error - read_mark) + moveb #0x18, %a3@(write_mode0 - read_mark) + moveb #0x01, %a3@(write_mode1 - read_mark) + moveb #0x01, %a3@(write_mode0 - read_mark) + tstb %a3@(read_error - read_mark) + moveb #0x08, %a3@(write_mode1 - read_mark) + + lea sector_data_mark, %a0 + moveq #3, %d1 + + /* wait data address mark */ + +wait_data_mark_byte: + + tstb %a2@ + dbmi %d2, wait_data_mark_byte + bpl data_exit + + moveb %a3@, %d3 + cmpb %a0@+, %d3 + dbne %d1, wait_data_mark_byte + bne wait_data_init + + /* read data */ + + tstb %a3@(read_error - read_mark) + + movel #sector_size-1, %d4 /* sector size */ +read_new_data: + movew #max_retry, %d2 +read_data_loop: + moveb %a2@, %d5 + andb #0xc0, %d5 + dbne %d2, read_data_loop + beq data_exit + moveb %a5@, %a4@+ + andb #0x40, %d5 + dbne %d4, read_new_data + beq exit_loop + moveb %a5@, %a4@+ + dbra %d4, read_new_data +exit_loop: + + /* read CRC */ + + movew #max_retry, %d2 +data_crc0: + + tstb %a2@ + dbmi %d2, data_crc0 + bpl data_exit + + moveb %a3@, %d5 + + moveq #max_retry, %d2 + +data_crc1: + + tstb %a2@ + dbmi %d2, data_crc1 + bpl data_exit + + moveb %a3@, %d5 + + tstb %a3@(read_error - read_mark) + + moveb #0x18, %a3@(write_mode0 - read_mark) + + /* return number of bytes read */ + + movel #sector_size, %d0 + addw #1, %d4 + subl %d4, %d0 + rts +data_exit: + moveb #0x18, %a3@(write_mode0 - read_mark) + moveq #-1, %d0 + rts diff --git a/kernel/drivers/block/sx8.c b/kernel/drivers/block/sx8.c new file mode 100644 index 000000000..5d552857d --- /dev/null +++ b/kernel/drivers/block/sx8.c @@ -0,0 +1,1749 @@ +/* + * sx8.c: Driver for Promise SATA SX8 looks-like-I2O hardware + * + * Copyright 2004-2005 Red Hat, Inc. + * + * Author/maintainer: Jeff Garzik + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define CARM_DEBUG +#define CARM_VERBOSE_DEBUG +#else +#undef CARM_DEBUG +#undef CARM_VERBOSE_DEBUG +#endif +#undef CARM_NDEBUG + +#define DRV_NAME "sx8" +#define DRV_VERSION "1.0" +#define PFX DRV_NAME ": " + +MODULE_AUTHOR("Jeff Garzik"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Promise SATA SX8 block driver"); +MODULE_VERSION(DRV_VERSION); + +/* + * SX8 hardware has a single message queue for all ATA ports. + * When this driver was written, the hardware (firmware?) would + * corrupt data eventually, if more than one request was outstanding. + * As one can imagine, having 8 ports bottlenecking on a single + * command hurts performance. + * + * Based on user reports, later versions of the hardware (firmware?) + * seem to be able to survive with more than one command queued. + * + * Therefore, we default to the safe option -- 1 command -- but + * allow the user to increase this. + * + * SX8 should be able to support up to ~60 queued commands (CARM_MAX_REQ), + * but problems seem to occur when you exceed ~30, even on newer hardware. + */ +static int max_queue = 1; +module_param(max_queue, int, 0444); +MODULE_PARM_DESC(max_queue, "Maximum number of queued commands. (min==1, max==30, safe==1)"); + + +#define NEXT_RESP(idx) ((idx + 1) % RMSG_Q_LEN) + +/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */ +#define TAG_ENCODE(tag) (((tag) << 16) | 0xf) +#define TAG_DECODE(tag) (((tag) >> 16) & 0x1f) +#define TAG_VALID(tag) ((((tag) & 0xf) == 0xf) && (TAG_DECODE(tag) < 32)) + +/* note: prints function name for you */ +#ifdef CARM_DEBUG +#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) +#ifdef CARM_VERBOSE_DEBUG +#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) +#else +#define VPRINTK(fmt, args...) +#endif /* CARM_VERBOSE_DEBUG */ +#else +#define DPRINTK(fmt, args...) +#define VPRINTK(fmt, args...) +#endif /* CARM_DEBUG */ + +#ifdef CARM_NDEBUG +#define assert(expr) +#else +#define assert(expr) \ + if(unlikely(!(expr))) { \ + printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \ + #expr, __FILE__, __func__, __LINE__); \ + } +#endif + +/* defines only for the constants which don't work well as enums */ +struct carm_host; + +enum { + /* adapter-wide limits */ + CARM_MAX_PORTS = 8, + CARM_SHM_SIZE = (4096 << 7), + CARM_MINORS_PER_MAJOR = 256 / CARM_MAX_PORTS, + CARM_MAX_WAIT_Q = CARM_MAX_PORTS + 1, + + /* command message queue limits */ + CARM_MAX_REQ = 64, /* max command msgs per host */ + CARM_MSG_LOW_WATER = (CARM_MAX_REQ / 4), /* refill mark */ + + /* S/G limits, host-wide and per-request */ + CARM_MAX_REQ_SG = 32, /* max s/g entries per request */ + CARM_MAX_HOST_SG = 600, /* max s/g entries per host */ + CARM_SG_LOW_WATER = (CARM_MAX_HOST_SG / 4), /* re-fill mark */ + + /* hardware registers */ + CARM_IHQP = 0x1c, + CARM_INT_STAT = 0x10, /* interrupt status */ + CARM_INT_MASK = 0x14, /* interrupt mask */ + CARM_HMUC = 0x18, /* host message unit control */ + RBUF_ADDR_LO = 0x20, /* response msg DMA buf low 32 bits */ + RBUF_ADDR_HI = 0x24, /* response msg DMA buf high 32 bits */ + RBUF_BYTE_SZ = 0x28, + CARM_RESP_IDX = 0x2c, + CARM_CMS0 = 0x30, /* command message size reg 0 */ + CARM_LMUC = 0x48, + CARM_HMPHA = 0x6c, + CARM_INITC = 0xb5, + + /* bits in CARM_INT_{STAT,MASK} */ + INT_RESERVED = 0xfffffff0, + INT_WATCHDOG = (1 << 3), /* watchdog timer */ + INT_Q_OVERFLOW = (1 << 2), /* cmd msg q overflow */ + INT_Q_AVAILABLE = (1 << 1), /* cmd msg q has free space */ + INT_RESPONSE = (1 << 0), /* response msg available */ + INT_ACK_MASK = INT_WATCHDOG | INT_Q_OVERFLOW, + INT_DEF_MASK = INT_RESERVED | INT_Q_OVERFLOW | + INT_RESPONSE, + + /* command messages, and related register bits */ + CARM_HAVE_RESP = 0x01, + CARM_MSG_READ = 1, + CARM_MSG_WRITE = 2, + CARM_MSG_VERIFY = 3, + CARM_MSG_GET_CAPACITY = 4, + CARM_MSG_FLUSH = 5, + CARM_MSG_IOCTL = 6, + CARM_MSG_ARRAY = 8, + CARM_MSG_MISC = 9, + CARM_CME = (1 << 2), + CARM_RME = (1 << 1), + CARM_WZBC = (1 << 0), + CARM_RMI = (1 << 0), + CARM_Q_FULL = (1 << 3), + CARM_MSG_SIZE = 288, + CARM_Q_LEN = 48, + + /* CARM_MSG_IOCTL messages */ + CARM_IOC_SCAN_CHAN = 5, /* scan channels for devices */ + CARM_IOC_GET_TCQ = 13, /* get tcq/ncq depth */ + CARM_IOC_SET_TCQ = 14, /* set tcq/ncq depth */ + + IOC_SCAN_CHAN_NODEV = 0x1f, + IOC_SCAN_CHAN_OFFSET = 0x40, + + /* CARM_MSG_ARRAY messages */ + CARM_ARRAY_INFO = 0, + + ARRAY_NO_EXIST = (1 << 31), + + /* response messages */ + RMSG_SZ = 8, /* sizeof(struct carm_response) */ + RMSG_Q_LEN = 48, /* resp. msg list length */ + RMSG_OK = 1, /* bit indicating msg was successful */ + /* length of entire resp. msg buffer */ + RBUF_LEN = RMSG_SZ * RMSG_Q_LEN, + + PDC_SHM_SIZE = (4096 << 7), /* length of entire h/w buffer */ + + /* CARM_MSG_MISC messages */ + MISC_GET_FW_VER = 2, + MISC_ALLOC_MEM = 3, + MISC_SET_TIME = 5, + + /* MISC_GET_FW_VER feature bits */ + FW_VER_4PORT = (1 << 2), /* 1=4 ports, 0=8 ports */ + FW_VER_NON_RAID = (1 << 1), /* 1=non-RAID firmware, 0=RAID */ + FW_VER_ZCR = (1 << 0), /* zero channel RAID (whatever that is) */ + + /* carm_host flags */ + FL_NON_RAID = FW_VER_NON_RAID, + FL_4PORT = FW_VER_4PORT, + FL_FW_VER_MASK = (FW_VER_NON_RAID | FW_VER_4PORT), + FL_DAC = (1 << 16), + FL_DYN_MAJOR = (1 << 17), +}; + +enum { + CARM_SG_BOUNDARY = 0xffffUL, /* s/g segment boundary */ +}; + +enum scatter_gather_types { + SGT_32BIT = 0, + SGT_64BIT = 1, +}; + +enum host_states { + HST_INVALID, /* invalid state; never used */ + HST_ALLOC_BUF, /* setting up master SHM area */ + HST_ERROR, /* we never leave here */ + HST_PORT_SCAN, /* start dev scan */ + HST_DEV_SCAN_START, /* start per-device probe */ + HST_DEV_SCAN, /* continue per-device probe */ + HST_DEV_ACTIVATE, /* activate devices we found */ + HST_PROBE_FINISHED, /* probe is complete */ + HST_PROBE_START, /* initiate probe */ + HST_SYNC_TIME, /* tell firmware what time it is */ + HST_GET_FW_VER, /* get firmware version, adapter port cnt */ +}; + +#ifdef CARM_DEBUG +static const char *state_name[] = { + "HST_INVALID", + "HST_ALLOC_BUF", + "HST_ERROR", + "HST_PORT_SCAN", + "HST_DEV_SCAN_START", + "HST_DEV_SCAN", + "HST_DEV_ACTIVATE", + "HST_PROBE_FINISHED", + "HST_PROBE_START", + "HST_SYNC_TIME", + "HST_GET_FW_VER", +}; +#endif + +struct carm_port { + unsigned int port_no; + struct gendisk *disk; + struct carm_host *host; + + /* attached device characteristics */ + u64 capacity; + char name[41]; + u16 dev_geom_head; + u16 dev_geom_sect; + u16 dev_geom_cyl; +}; + +struct carm_request { + unsigned int tag; + int n_elem; + unsigned int msg_type; + unsigned int msg_subtype; + unsigned int msg_bucket; + struct request *rq; + struct carm_port *port; + struct scatterlist sg[CARM_MAX_REQ_SG]; +}; + +struct carm_host { + unsigned long flags; + void __iomem *mmio; + void *shm; + dma_addr_t shm_dma; + + int major; + int id; + char name[32]; + + spinlock_t lock; + struct pci_dev *pdev; + unsigned int state; + u32 fw_ver; + + struct request_queue *oob_q; + unsigned int n_oob; + + unsigned int hw_sg_used; + + unsigned int resp_idx; + + unsigned int wait_q_prod; + unsigned int wait_q_cons; + struct request_queue *wait_q[CARM_MAX_WAIT_Q]; + + unsigned int n_msgs; + u64 msg_alloc; + struct carm_request req[CARM_MAX_REQ]; + void *msg_base; + dma_addr_t msg_dma; + + int cur_scan_dev; + unsigned long dev_active; + unsigned long dev_present; + struct carm_port port[CARM_MAX_PORTS]; + + struct work_struct fsm_task; + + struct completion probe_comp; +}; + +struct carm_response { + __le32 ret_handle; + __le32 status; +} __attribute__((packed)); + +struct carm_msg_sg { + __le32 start; + __le32 len; +} __attribute__((packed)); + +struct carm_msg_rw { + u8 type; + u8 id; + u8 sg_count; + u8 sg_type; + __le32 handle; + __le32 lba; + __le16 lba_count; + __le16 lba_high; + struct carm_msg_sg sg[32]; +} __attribute__((packed)); + +struct carm_msg_allocbuf { + u8 type; + u8 subtype; + u8 n_sg; + u8 sg_type; + __le32 handle; + __le32 addr; + __le32 len; + __le32 evt_pool; + __le32 n_evt; + __le32 rbuf_pool; + __le32 n_rbuf; + __le32 msg_pool; + __le32 n_msg; + struct carm_msg_sg sg[8]; +} __attribute__((packed)); + +struct carm_msg_ioctl { + u8 type; + u8 subtype; + u8 array_id; + u8 reserved1; + __le32 handle; + __le32 data_addr; + u32 reserved2; +} __attribute__((packed)); + +struct carm_msg_sync_time { + u8 type; + u8 subtype; + u16 reserved1; + __le32 handle; + u32 reserved2; + __le32 timestamp; +} __attribute__((packed)); + +struct carm_msg_get_fw_ver { + u8 type; + u8 subtype; + u16 reserved1; + __le32 handle; + __le32 data_addr; + u32 reserved2; +} __attribute__((packed)); + +struct carm_fw_ver { + __le32 version; + u8 features; + u8 reserved1; + u16 reserved2; +} __attribute__((packed)); + +struct carm_array_info { + __le32 size; + + __le16 size_hi; + __le16 stripe_size; + + __le32 mode; + + __le16 stripe_blk_sz; + __le16 reserved1; + + __le16 cyl; + __le16 head; + + __le16 sect; + u8 array_id; + u8 reserved2; + + char name[40]; + + __le32 array_status; + + /* device list continues beyond this point? */ +} __attribute__((packed)); + +static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent); +static void carm_remove_one (struct pci_dev *pdev); +static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo); + +static const struct pci_device_id carm_pci_tbl[] = { + { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, + { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, }, + { } /* terminate list */ +}; +MODULE_DEVICE_TABLE(pci, carm_pci_tbl); + +static struct pci_driver carm_driver = { + .name = DRV_NAME, + .id_table = carm_pci_tbl, + .probe = carm_init_one, + .remove = carm_remove_one, +}; + +static const struct block_device_operations carm_bd_ops = { + .owner = THIS_MODULE, + .getgeo = carm_bdev_getgeo, +}; + +static unsigned int carm_host_id; +static unsigned long carm_major_alloc; + + + +static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct carm_port *port = bdev->bd_disk->private_data; + + geo->heads = (u8) port->dev_geom_head; + geo->sectors = (u8) port->dev_geom_sect; + geo->cylinders = port->dev_geom_cyl; + return 0; +} + +static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE }; + +static inline int carm_lookup_bucket(u32 msg_size) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(msg_sizes); i++) + if (msg_size <= msg_sizes[i]) + return i; + + return -ENOENT; +} + +static void carm_init_buckets(void __iomem *mmio) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(msg_sizes); i++) + writel(msg_sizes[i], mmio + CARM_CMS0 + (4 * i)); +} + +static inline void *carm_ref_msg(struct carm_host *host, + unsigned int msg_idx) +{ + return host->msg_base + (msg_idx * CARM_MSG_SIZE); +} + +static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host, + unsigned int msg_idx) +{ + return host->msg_dma + (msg_idx * CARM_MSG_SIZE); +} + +static int carm_send_msg(struct carm_host *host, + struct carm_request *crq) +{ + void __iomem *mmio = host->mmio; + u32 msg = (u32) carm_ref_msg_dma(host, crq->tag); + u32 cm_bucket = crq->msg_bucket; + u32 tmp; + int rc = 0; + + VPRINTK("ENTER\n"); + + tmp = readl(mmio + CARM_HMUC); + if (tmp & CARM_Q_FULL) { +#if 0 + tmp = readl(mmio + CARM_INT_MASK); + tmp |= INT_Q_AVAILABLE; + writel(tmp, mmio + CARM_INT_MASK); + readl(mmio + CARM_INT_MASK); /* flush */ +#endif + DPRINTK("host msg queue full\n"); + rc = -EBUSY; + } else { + writel(msg | (cm_bucket << 1), mmio + CARM_IHQP); + readl(mmio + CARM_IHQP); /* flush */ + } + + return rc; +} + +static struct carm_request *carm_get_request(struct carm_host *host) +{ + unsigned int i; + + /* obey global hardware limit on S/G entries */ + if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG)) + return NULL; + + for (i = 0; i < max_queue; i++) + if ((host->msg_alloc & (1ULL << i)) == 0) { + struct carm_request *crq = &host->req[i]; + crq->port = NULL; + crq->n_elem = 0; + + host->msg_alloc |= (1ULL << i); + host->n_msgs++; + + assert(host->n_msgs <= CARM_MAX_REQ); + sg_init_table(crq->sg, CARM_MAX_REQ_SG); + return crq; + } + + DPRINTK("no request available, returning NULL\n"); + return NULL; +} + +static int carm_put_request(struct carm_host *host, struct carm_request *crq) +{ + assert(crq->tag < max_queue); + + if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0)) + return -EINVAL; /* tried to clear a tag that was not active */ + + assert(host->hw_sg_used >= crq->n_elem); + + host->msg_alloc &= ~(1ULL << crq->tag); + host->hw_sg_used -= crq->n_elem; + host->n_msgs--; + + return 0; +} + +static struct carm_request *carm_get_special(struct carm_host *host) +{ + unsigned long flags; + struct carm_request *crq = NULL; + struct request *rq; + int tries = 5000; + + while (tries-- > 0) { + spin_lock_irqsave(&host->lock, flags); + crq = carm_get_request(host); + spin_unlock_irqrestore(&host->lock, flags); + + if (crq) + break; + msleep(10); + } + + if (!crq) + return NULL; + + rq = blk_get_request(host->oob_q, WRITE /* bogus */, GFP_KERNEL); + if (IS_ERR(rq)) { + spin_lock_irqsave(&host->lock, flags); + carm_put_request(host, crq); + spin_unlock_irqrestore(&host->lock, flags); + return NULL; + } + + crq->rq = rq; + return crq; +} + +static int carm_array_info (struct carm_host *host, unsigned int array_idx) +{ + struct carm_msg_ioctl *ioc; + unsigned int idx; + u32 msg_data; + dma_addr_t msg_dma; + struct carm_request *crq; + int rc; + + crq = carm_get_special(host); + if (!crq) { + rc = -ENOMEM; + goto err_out; + } + + idx = crq->tag; + + ioc = carm_ref_msg(host, idx); + msg_dma = carm_ref_msg_dma(host, idx); + msg_data = (u32) (msg_dma + sizeof(struct carm_array_info)); + + crq->msg_type = CARM_MSG_ARRAY; + crq->msg_subtype = CARM_ARRAY_INFO; + rc = carm_lookup_bucket(sizeof(struct carm_msg_ioctl) + + sizeof(struct carm_array_info)); + BUG_ON(rc < 0); + crq->msg_bucket = (u32) rc; + + memset(ioc, 0, sizeof(*ioc)); + ioc->type = CARM_MSG_ARRAY; + ioc->subtype = CARM_ARRAY_INFO; + ioc->array_id = (u8) array_idx; + ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); + ioc->data_addr = cpu_to_le32(msg_data); + + spin_lock_irq(&host->lock); + assert(host->state == HST_DEV_SCAN_START || + host->state == HST_DEV_SCAN); + spin_unlock_irq(&host->lock); + + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); + crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->special = crq; + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); + + return 0; + +err_out: + spin_lock_irq(&host->lock); + host->state = HST_ERROR; + spin_unlock_irq(&host->lock); + return rc; +} + +typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *); + +static int carm_send_special (struct carm_host *host, carm_sspc_t func) +{ + struct carm_request *crq; + struct carm_msg_ioctl *ioc; + void *mem; + unsigned int idx, msg_size; + int rc; + + crq = carm_get_special(host); + if (!crq) + return -ENOMEM; + + idx = crq->tag; + + mem = carm_ref_msg(host, idx); + + msg_size = func(host, idx, mem); + + ioc = mem; + crq->msg_type = ioc->type; + crq->msg_subtype = ioc->subtype; + rc = carm_lookup_bucket(msg_size); + BUG_ON(rc < 0); + crq->msg_bucket = (u32) rc; + + DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); + crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->special = crq; + blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); + + return 0; +} + +static unsigned int carm_fill_sync_time(struct carm_host *host, + unsigned int idx, void *mem) +{ + struct timeval tv; + struct carm_msg_sync_time *st = mem; + + do_gettimeofday(&tv); + + memset(st, 0, sizeof(*st)); + st->type = CARM_MSG_MISC; + st->subtype = MISC_SET_TIME; + st->handle = cpu_to_le32(TAG_ENCODE(idx)); + st->timestamp = cpu_to_le32(tv.tv_sec); + + return sizeof(struct carm_msg_sync_time); +} + +static unsigned int carm_fill_alloc_buf(struct carm_host *host, + unsigned int idx, void *mem) +{ + struct carm_msg_allocbuf *ab = mem; + + memset(ab, 0, sizeof(*ab)); + ab->type = CARM_MSG_MISC; + ab->subtype = MISC_ALLOC_MEM; + ab->handle = cpu_to_le32(TAG_ENCODE(idx)); + ab->n_sg = 1; + ab->sg_type = SGT_32BIT; + ab->addr = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1)); + ab->len = cpu_to_le32(PDC_SHM_SIZE >> 1); + ab->evt_pool = cpu_to_le32(host->shm_dma + (16 * 1024)); + ab->n_evt = cpu_to_le32(1024); + ab->rbuf_pool = cpu_to_le32(host->shm_dma); + ab->n_rbuf = cpu_to_le32(RMSG_Q_LEN); + ab->msg_pool = cpu_to_le32(host->shm_dma + RBUF_LEN); + ab->n_msg = cpu_to_le32(CARM_Q_LEN); + ab->sg[0].start = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1)); + ab->sg[0].len = cpu_to_le32(65536); + + return sizeof(struct carm_msg_allocbuf); +} + +static unsigned int carm_fill_scan_channels(struct carm_host *host, + unsigned int idx, void *mem) +{ + struct carm_msg_ioctl *ioc = mem; + u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + + IOC_SCAN_CHAN_OFFSET); + + memset(ioc, 0, sizeof(*ioc)); + ioc->type = CARM_MSG_IOCTL; + ioc->subtype = CARM_IOC_SCAN_CHAN; + ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); + ioc->data_addr = cpu_to_le32(msg_data); + + /* fill output data area with "no device" default values */ + mem += IOC_SCAN_CHAN_OFFSET; + memset(mem, IOC_SCAN_CHAN_NODEV, CARM_MAX_PORTS); + + return IOC_SCAN_CHAN_OFFSET + CARM_MAX_PORTS; +} + +static unsigned int carm_fill_get_fw_ver(struct carm_host *host, + unsigned int idx, void *mem) +{ + struct carm_msg_get_fw_ver *ioc = mem; + u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + sizeof(*ioc)); + + memset(ioc, 0, sizeof(*ioc)); + ioc->type = CARM_MSG_MISC; + ioc->subtype = MISC_GET_FW_VER; + ioc->handle = cpu_to_le32(TAG_ENCODE(idx)); + ioc->data_addr = cpu_to_le32(msg_data); + + return sizeof(struct carm_msg_get_fw_ver) + + sizeof(struct carm_fw_ver); +} + +static inline void carm_end_request_queued(struct carm_host *host, + struct carm_request *crq, + int error) +{ + struct request *req = crq->rq; + int rc; + + __blk_end_request_all(req, error); + + rc = carm_put_request(host, crq); + assert(rc == 0); +} + +static inline void carm_push_q (struct carm_host *host, struct request_queue *q) +{ + unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q; + + blk_stop_queue(q); + VPRINTK("STOPPED QUEUE %p\n", q); + + host->wait_q[idx] = q; + host->wait_q_prod++; + BUG_ON(host->wait_q_prod == host->wait_q_cons); /* overrun */ +} + +static inline struct request_queue *carm_pop_q(struct carm_host *host) +{ + unsigned int idx; + + if (host->wait_q_prod == host->wait_q_cons) + return NULL; + + idx = host->wait_q_cons % CARM_MAX_WAIT_Q; + host->wait_q_cons++; + + return host->wait_q[idx]; +} + +static inline void carm_round_robin(struct carm_host *host) +{ + struct request_queue *q = carm_pop_q(host); + if (q) { + blk_start_queue(q); + VPRINTK("STARTED QUEUE %p\n", q); + } +} + +static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq, + int error) +{ + carm_end_request_queued(host, crq, error); + if (max_queue == 1) + carm_round_robin(host); + else if ((host->n_msgs <= CARM_MSG_LOW_WATER) && + (host->hw_sg_used <= CARM_SG_LOW_WATER)) { + carm_round_robin(host); + } +} + +static void carm_oob_rq_fn(struct request_queue *q) +{ + struct carm_host *host = q->queuedata; + struct carm_request *crq; + struct request *rq; + int rc; + + while (1) { + DPRINTK("get req\n"); + rq = blk_fetch_request(q); + if (!rq) + break; + + crq = rq->special; + assert(crq != NULL); + assert(crq->rq == rq); + + crq->n_elem = 0; + + DPRINTK("send req\n"); + rc = carm_send_msg(host, crq); + if (rc) { + blk_requeue_request(q, rq); + carm_push_q(host, q); + return; /* call us again later, eventually */ + } + } +} + +static void carm_rq_fn(struct request_queue *q) +{ + struct carm_port *port = q->queuedata; + struct carm_host *host = port->host; + struct carm_msg_rw *msg; + struct carm_request *crq; + struct request *rq; + struct scatterlist *sg; + int writing = 0, pci_dir, i, n_elem, rc; + u32 tmp; + unsigned int msg_size; + +queue_one_request: + VPRINTK("get req\n"); + rq = blk_peek_request(q); + if (!rq) + return; + + crq = carm_get_request(host); + if (!crq) { + carm_push_q(host, q); + return; /* call us again later, eventually */ + } + crq->rq = rq; + + blk_start_request(rq); + + if (rq_data_dir(rq) == WRITE) { + writing = 1; + pci_dir = PCI_DMA_TODEVICE; + } else { + pci_dir = PCI_DMA_FROMDEVICE; + } + + /* get scatterlist from block layer */ + sg = &crq->sg[0]; + n_elem = blk_rq_map_sg(q, rq, sg); + if (n_elem <= 0) { + carm_end_rq(host, crq, -EIO); + return; /* request with no s/g entries? */ + } + + /* map scatterlist to PCI bus addresses */ + n_elem = pci_map_sg(host->pdev, sg, n_elem, pci_dir); + if (n_elem <= 0) { + carm_end_rq(host, crq, -EIO); + return; /* request with no s/g entries? */ + } + crq->n_elem = n_elem; + crq->port = port; + host->hw_sg_used += n_elem; + + /* + * build read/write message + */ + + VPRINTK("build msg\n"); + msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag); + + if (writing) { + msg->type = CARM_MSG_WRITE; + crq->msg_type = CARM_MSG_WRITE; + } else { + msg->type = CARM_MSG_READ; + crq->msg_type = CARM_MSG_READ; + } + + msg->id = port->port_no; + msg->sg_count = n_elem; + msg->sg_type = SGT_32BIT; + msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag)); + msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); + tmp = (blk_rq_pos(rq) >> 16) >> 16; + msg->lba_high = cpu_to_le16( (u16) tmp ); + msg->lba_count = cpu_to_le16(blk_rq_sectors(rq)); + + msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg); + for (i = 0; i < n_elem; i++) { + struct carm_msg_sg *carm_sg = &msg->sg[i]; + carm_sg->start = cpu_to_le32(sg_dma_address(&crq->sg[i])); + carm_sg->len = cpu_to_le32(sg_dma_len(&crq->sg[i])); + msg_size += sizeof(struct carm_msg_sg); + } + + rc = carm_lookup_bucket(msg_size); + BUG_ON(rc < 0); + crq->msg_bucket = (u32) rc; + + /* + * queue read/write message to hardware + */ + + VPRINTK("send msg, tag == %u\n", crq->tag); + rc = carm_send_msg(host, crq); + if (rc) { + carm_put_request(host, crq); + blk_requeue_request(q, rq); + carm_push_q(host, q); + return; /* call us again later, eventually */ + } + + goto queue_one_request; +} + +static void carm_handle_array_info(struct carm_host *host, + struct carm_request *crq, u8 *mem, + int error) +{ + struct carm_port *port; + u8 *msg_data = mem + sizeof(struct carm_array_info); + struct carm_array_info *desc = (struct carm_array_info *) msg_data; + u64 lo, hi; + int cur_port; + size_t slen; + + DPRINTK("ENTER\n"); + + carm_end_rq(host, crq, error); + + if (error) + goto out; + if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST) + goto out; + + cur_port = host->cur_scan_dev; + + /* should never occur */ + if ((cur_port < 0) || (cur_port >= CARM_MAX_PORTS)) { + printk(KERN_ERR PFX "BUG: cur_scan_dev==%d, array_id==%d\n", + cur_port, (int) desc->array_id); + goto out; + } + + port = &host->port[cur_port]; + + lo = (u64) le32_to_cpu(desc->size); + hi = (u64) le16_to_cpu(desc->size_hi); + + port->capacity = lo | (hi << 32); + port->dev_geom_head = le16_to_cpu(desc->head); + port->dev_geom_sect = le16_to_cpu(desc->sect); + port->dev_geom_cyl = le16_to_cpu(desc->cyl); + + host->dev_active |= (1 << cur_port); + + strncpy(port->name, desc->name, sizeof(port->name)); + port->name[sizeof(port->name) - 1] = 0; + slen = strlen(port->name); + while (slen && (port->name[slen - 1] == ' ')) { + port->name[slen - 1] = 0; + slen--; + } + + printk(KERN_INFO DRV_NAME "(%s): port %u device %Lu sectors\n", + pci_name(host->pdev), port->port_no, + (unsigned long long) port->capacity); + printk(KERN_INFO DRV_NAME "(%s): port %u device \"%s\"\n", + pci_name(host->pdev), port->port_no, port->name); + +out: + assert(host->state == HST_DEV_SCAN); + schedule_work(&host->fsm_task); +} + +static void carm_handle_scan_chan(struct carm_host *host, + struct carm_request *crq, u8 *mem, + int error) +{ + u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET; + unsigned int i, dev_count = 0; + int new_state = HST_DEV_SCAN_START; + + DPRINTK("ENTER\n"); + + carm_end_rq(host, crq, error); + + if (error) { + new_state = HST_ERROR; + goto out; + } + + /* TODO: scan and support non-disk devices */ + for (i = 0; i < 8; i++) + if (msg_data[i] == 0) { /* direct-access device (disk) */ + host->dev_present |= (1 << i); + dev_count++; + } + + printk(KERN_INFO DRV_NAME "(%s): found %u interesting devices\n", + pci_name(host->pdev), dev_count); + +out: + assert(host->state == HST_PORT_SCAN); + host->state = new_state; + schedule_work(&host->fsm_task); +} + +static void carm_handle_generic(struct carm_host *host, + struct carm_request *crq, int error, + int cur_state, int next_state) +{ + DPRINTK("ENTER\n"); + + carm_end_rq(host, crq, error); + + assert(host->state == cur_state); + if (error) + host->state = HST_ERROR; + else + host->state = next_state; + schedule_work(&host->fsm_task); +} + +static inline void carm_handle_rw(struct carm_host *host, + struct carm_request *crq, int error) +{ + int pci_dir; + + VPRINTK("ENTER\n"); + + if (rq_data_dir(crq->rq) == WRITE) + pci_dir = PCI_DMA_TODEVICE; + else + pci_dir = PCI_DMA_FROMDEVICE; + + pci_unmap_sg(host->pdev, &crq->sg[0], crq->n_elem, pci_dir); + + carm_end_rq(host, crq, error); +} + +static inline void carm_handle_resp(struct carm_host *host, + __le32 ret_handle_le, u32 status) +{ + u32 handle = le32_to_cpu(ret_handle_le); + unsigned int msg_idx; + struct carm_request *crq; + int error = (status == RMSG_OK) ? 0 : -EIO; + u8 *mem; + + VPRINTK("ENTER, handle == 0x%x\n", handle); + + if (unlikely(!TAG_VALID(handle))) { + printk(KERN_ERR DRV_NAME "(%s): BUG: invalid tag 0x%x\n", + pci_name(host->pdev), handle); + return; + } + + msg_idx = TAG_DECODE(handle); + VPRINTK("tag == %u\n", msg_idx); + + crq = &host->req[msg_idx]; + + /* fast path */ + if (likely(crq->msg_type == CARM_MSG_READ || + crq->msg_type == CARM_MSG_WRITE)) { + carm_handle_rw(host, crq, error); + return; + } + + mem = carm_ref_msg(host, msg_idx); + + switch (crq->msg_type) { + case CARM_MSG_IOCTL: { + switch (crq->msg_subtype) { + case CARM_IOC_SCAN_CHAN: + carm_handle_scan_chan(host, crq, mem, error); + break; + default: + /* unknown / invalid response */ + goto err_out; + } + break; + } + + case CARM_MSG_MISC: { + switch (crq->msg_subtype) { + case MISC_ALLOC_MEM: + carm_handle_generic(host, crq, error, + HST_ALLOC_BUF, HST_SYNC_TIME); + break; + case MISC_SET_TIME: + carm_handle_generic(host, crq, error, + HST_SYNC_TIME, HST_GET_FW_VER); + break; + case MISC_GET_FW_VER: { + struct carm_fw_ver *ver = (struct carm_fw_ver *) + (mem + sizeof(struct carm_msg_get_fw_ver)); + if (!error) { + host->fw_ver = le32_to_cpu(ver->version); + host->flags |= (ver->features & FL_FW_VER_MASK); + } + carm_handle_generic(host, crq, error, + HST_GET_FW_VER, HST_PORT_SCAN); + break; + } + default: + /* unknown / invalid response */ + goto err_out; + } + break; + } + + case CARM_MSG_ARRAY: { + switch (crq->msg_subtype) { + case CARM_ARRAY_INFO: + carm_handle_array_info(host, crq, mem, error); + break; + default: + /* unknown / invalid response */ + goto err_out; + } + break; + } + + default: + /* unknown / invalid response */ + goto err_out; + } + + return; + +err_out: + printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n", + pci_name(host->pdev), crq->msg_type, crq->msg_subtype); + carm_end_rq(host, crq, -EIO); +} + +static inline void carm_handle_responses(struct carm_host *host) +{ + void __iomem *mmio = host->mmio; + struct carm_response *resp = (struct carm_response *) host->shm; + unsigned int work = 0; + unsigned int idx = host->resp_idx % RMSG_Q_LEN; + + while (1) { + u32 status = le32_to_cpu(resp[idx].status); + + if (status == 0xffffffff) { + VPRINTK("ending response on index %u\n", idx); + writel(idx << 3, mmio + CARM_RESP_IDX); + break; + } + + /* response to a message we sent */ + else if ((status & (1 << 31)) == 0) { + VPRINTK("handling msg response on index %u\n", idx); + carm_handle_resp(host, resp[idx].ret_handle, status); + resp[idx].status = cpu_to_le32(0xffffffff); + } + + /* asynchronous events the hardware throws our way */ + else if ((status & 0xff000000) == (1 << 31)) { + u8 *evt_type_ptr = (u8 *) &resp[idx]; + u8 evt_type = *evt_type_ptr; + printk(KERN_WARNING DRV_NAME "(%s): unhandled event type %d\n", + pci_name(host->pdev), (int) evt_type); + resp[idx].status = cpu_to_le32(0xffffffff); + } + + idx = NEXT_RESP(idx); + work++; + } + + VPRINTK("EXIT, work==%u\n", work); + host->resp_idx += work; +} + +static irqreturn_t carm_interrupt(int irq, void *__host) +{ + struct carm_host *host = __host; + void __iomem *mmio; + u32 mask; + int handled = 0; + unsigned long flags; + + if (!host) { + VPRINTK("no host\n"); + return IRQ_NONE; + } + + spin_lock_irqsave(&host->lock, flags); + + mmio = host->mmio; + + /* reading should also clear interrupts */ + mask = readl(mmio + CARM_INT_STAT); + + if (mask == 0 || mask == 0xffffffff) { + VPRINTK("no work, mask == 0x%x\n", mask); + goto out; + } + + if (mask & INT_ACK_MASK) + writel(mask, mmio + CARM_INT_STAT); + + if (unlikely(host->state == HST_INVALID)) { + VPRINTK("not initialized yet, mask = 0x%x\n", mask); + goto out; + } + + if (mask & CARM_HAVE_RESP) { + handled = 1; + carm_handle_responses(host); + } + +out: + spin_unlock_irqrestore(&host->lock, flags); + VPRINTK("EXIT\n"); + return IRQ_RETVAL(handled); +} + +static void carm_fsm_task (struct work_struct *work) +{ + struct carm_host *host = + container_of(work, struct carm_host, fsm_task); + unsigned long flags; + unsigned int state; + int rc, i, next_dev; + int reschedule = 0; + int new_state = HST_INVALID; + + spin_lock_irqsave(&host->lock, flags); + state = host->state; + spin_unlock_irqrestore(&host->lock, flags); + + DPRINTK("ENTER, state == %s\n", state_name[state]); + + switch (state) { + case HST_PROBE_START: + new_state = HST_ALLOC_BUF; + reschedule = 1; + break; + + case HST_ALLOC_BUF: + rc = carm_send_special(host, carm_fill_alloc_buf); + if (rc) { + new_state = HST_ERROR; + reschedule = 1; + } + break; + + case HST_SYNC_TIME: + rc = carm_send_special(host, carm_fill_sync_time); + if (rc) { + new_state = HST_ERROR; + reschedule = 1; + } + break; + + case HST_GET_FW_VER: + rc = carm_send_special(host, carm_fill_get_fw_ver); + if (rc) { + new_state = HST_ERROR; + reschedule = 1; + } + break; + + case HST_PORT_SCAN: + rc = carm_send_special(host, carm_fill_scan_channels); + if (rc) { + new_state = HST_ERROR; + reschedule = 1; + } + break; + + case HST_DEV_SCAN_START: + host->cur_scan_dev = -1; + new_state = HST_DEV_SCAN; + reschedule = 1; + break; + + case HST_DEV_SCAN: + next_dev = -1; + for (i = host->cur_scan_dev + 1; i < CARM_MAX_PORTS; i++) + if (host->dev_present & (1 << i)) { + next_dev = i; + break; + } + + if (next_dev >= 0) { + host->cur_scan_dev = next_dev; + rc = carm_array_info(host, next_dev); + if (rc) { + new_state = HST_ERROR; + reschedule = 1; + } + } else { + new_state = HST_DEV_ACTIVATE; + reschedule = 1; + } + break; + + case HST_DEV_ACTIVATE: { + int activated = 0; + for (i = 0; i < CARM_MAX_PORTS; i++) + if (host->dev_active & (1 << i)) { + struct carm_port *port = &host->port[i]; + struct gendisk *disk = port->disk; + + set_capacity(disk, port->capacity); + add_disk(disk); + activated++; + } + + printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n", + pci_name(host->pdev), activated); + + new_state = HST_PROBE_FINISHED; + reschedule = 1; + break; + } + + case HST_PROBE_FINISHED: + complete(&host->probe_comp); + break; + + case HST_ERROR: + /* FIXME: TODO */ + break; + + default: + /* should never occur */ + printk(KERN_ERR PFX "BUG: unknown state %d\n", state); + assert(0); + break; + } + + if (new_state != HST_INVALID) { + spin_lock_irqsave(&host->lock, flags); + host->state = new_state; + spin_unlock_irqrestore(&host->lock, flags); + } + if (reschedule) + schedule_work(&host->fsm_task); +} + +static int carm_init_wait(void __iomem *mmio, u32 bits, unsigned int test_bit) +{ + unsigned int i; + + for (i = 0; i < 50000; i++) { + u32 tmp = readl(mmio + CARM_LMUC); + udelay(100); + + if (test_bit) { + if ((tmp & bits) == bits) + return 0; + } else { + if ((tmp & bits) == 0) + return 0; + } + + cond_resched(); + } + + printk(KERN_ERR PFX "carm_init_wait timeout, bits == 0x%x, test_bit == %s\n", + bits, test_bit ? "yes" : "no"); + return -EBUSY; +} + +static void carm_init_responses(struct carm_host *host) +{ + void __iomem *mmio = host->mmio; + unsigned int i; + struct carm_response *resp = (struct carm_response *) host->shm; + + for (i = 0; i < RMSG_Q_LEN; i++) + resp[i].status = cpu_to_le32(0xffffffff); + + writel(0, mmio + CARM_RESP_IDX); +} + +static int carm_init_host(struct carm_host *host) +{ + void __iomem *mmio = host->mmio; + u32 tmp; + u8 tmp8; + int rc; + + DPRINTK("ENTER\n"); + + writel(0, mmio + CARM_INT_MASK); + + tmp8 = readb(mmio + CARM_INITC); + if (tmp8 & 0x01) { + tmp8 &= ~0x01; + writeb(tmp8, mmio + CARM_INITC); + readb(mmio + CARM_INITC); /* flush */ + + DPRINTK("snooze...\n"); + msleep(5000); + } + + tmp = readl(mmio + CARM_HMUC); + if (tmp & CARM_CME) { + DPRINTK("CME bit present, waiting\n"); + rc = carm_init_wait(mmio, CARM_CME, 1); + if (rc) { + DPRINTK("EXIT, carm_init_wait 1 failed\n"); + return rc; + } + } + if (tmp & CARM_RME) { + DPRINTK("RME bit present, waiting\n"); + rc = carm_init_wait(mmio, CARM_RME, 1); + if (rc) { + DPRINTK("EXIT, carm_init_wait 2 failed\n"); + return rc; + } + } + + tmp &= ~(CARM_RME | CARM_CME); + writel(tmp, mmio + CARM_HMUC); + readl(mmio + CARM_HMUC); /* flush */ + + rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 0); + if (rc) { + DPRINTK("EXIT, carm_init_wait 3 failed\n"); + return rc; + } + + carm_init_buckets(mmio); + + writel(host->shm_dma & 0xffffffff, mmio + RBUF_ADDR_LO); + writel((host->shm_dma >> 16) >> 16, mmio + RBUF_ADDR_HI); + writel(RBUF_LEN, mmio + RBUF_BYTE_SZ); + + tmp = readl(mmio + CARM_HMUC); + tmp |= (CARM_RME | CARM_CME | CARM_WZBC); + writel(tmp, mmio + CARM_HMUC); + readl(mmio + CARM_HMUC); /* flush */ + + rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 1); + if (rc) { + DPRINTK("EXIT, carm_init_wait 4 failed\n"); + return rc; + } + + writel(0, mmio + CARM_HMPHA); + writel(INT_DEF_MASK, mmio + CARM_INT_MASK); + + carm_init_responses(host); + + /* start initialization, probing state machine */ + spin_lock_irq(&host->lock); + assert(host->state == HST_INVALID); + host->state = HST_PROBE_START; + spin_unlock_irq(&host->lock); + schedule_work(&host->fsm_task); + + DPRINTK("EXIT\n"); + return 0; +} + +static int carm_init_disks(struct carm_host *host) +{ + unsigned int i; + int rc = 0; + + for (i = 0; i < CARM_MAX_PORTS; i++) { + struct gendisk *disk; + struct request_queue *q; + struct carm_port *port; + + port = &host->port[i]; + port->host = host; + port->port_no = i; + + disk = alloc_disk(CARM_MINORS_PER_MAJOR); + if (!disk) { + rc = -ENOMEM; + break; + } + + port->disk = disk; + sprintf(disk->disk_name, DRV_NAME "/%u", + (unsigned int) (host->id * CARM_MAX_PORTS) + i); + disk->major = host->major; + disk->first_minor = i * CARM_MINORS_PER_MAJOR; + disk->fops = &carm_bd_ops; + disk->private_data = port; + + q = blk_init_queue(carm_rq_fn, &host->lock); + if (!q) { + rc = -ENOMEM; + break; + } + disk->queue = q; + blk_queue_max_segments(q, CARM_MAX_REQ_SG); + blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); + + q->queuedata = port; + } + + return rc; +} + +static void carm_free_disks(struct carm_host *host) +{ + unsigned int i; + + for (i = 0; i < CARM_MAX_PORTS; i++) { + struct gendisk *disk = host->port[i].disk; + if (disk) { + struct request_queue *q = disk->queue; + + if (disk->flags & GENHD_FL_UP) + del_gendisk(disk); + if (q) + blk_cleanup_queue(q); + put_disk(disk); + } + } +} + +static int carm_init_shm(struct carm_host *host) +{ + host->shm = pci_alloc_consistent(host->pdev, CARM_SHM_SIZE, + &host->shm_dma); + if (!host->shm) + return -ENOMEM; + + host->msg_base = host->shm + RBUF_LEN; + host->msg_dma = host->shm_dma + RBUF_LEN; + + memset(host->shm, 0xff, RBUF_LEN); + memset(host->msg_base, 0, PDC_SHM_SIZE - RBUF_LEN); + + return 0; +} + +static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) +{ + struct carm_host *host; + unsigned int pci_dac; + int rc; + struct request_queue *q; + unsigned int i; + + printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + + rc = pci_enable_device(pdev); + if (rc) + return rc; + + rc = pci_request_regions(pdev, DRV_NAME); + if (rc) + goto err_out; + +#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */ + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (!rc) { + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) { + printk(KERN_ERR DRV_NAME "(%s): consistent DMA mask failure\n", + pci_name(pdev)); + goto err_out_regions; + } + pci_dac = 1; + } else { +#endif + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) { + printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n", + pci_name(pdev)); + goto err_out_regions; + } + pci_dac = 0; +#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */ + } +#endif + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) { + printk(KERN_ERR DRV_NAME "(%s): memory alloc failure\n", + pci_name(pdev)); + rc = -ENOMEM; + goto err_out_regions; + } + + host->pdev = pdev; + host->flags = pci_dac ? FL_DAC : 0; + spin_lock_init(&host->lock); + INIT_WORK(&host->fsm_task, carm_fsm_task); + init_completion(&host->probe_comp); + + for (i = 0; i < ARRAY_SIZE(host->req); i++) + host->req[i].tag = i; + + host->mmio = ioremap(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + if (!host->mmio) { + printk(KERN_ERR DRV_NAME "(%s): MMIO alloc failure\n", + pci_name(pdev)); + rc = -ENOMEM; + goto err_out_kfree; + } + + rc = carm_init_shm(host); + if (rc) { + printk(KERN_ERR DRV_NAME "(%s): DMA SHM alloc failure\n", + pci_name(pdev)); + goto err_out_iounmap; + } + + q = blk_init_queue(carm_oob_rq_fn, &host->lock); + if (!q) { + printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n", + pci_name(pdev)); + rc = -ENOMEM; + goto err_out_pci_free; + } + host->oob_q = q; + q->queuedata = host; + + /* + * Figure out which major to use: 160, 161, or dynamic + */ + if (!test_and_set_bit(0, &carm_major_alloc)) + host->major = 160; + else if (!test_and_set_bit(1, &carm_major_alloc)) + host->major = 161; + else + host->flags |= FL_DYN_MAJOR; + + host->id = carm_host_id; + sprintf(host->name, DRV_NAME "%d", carm_host_id); + + rc = register_blkdev(host->major, host->name); + if (rc < 0) + goto err_out_free_majors; + if (host->flags & FL_DYN_MAJOR) + host->major = rc; + + rc = carm_init_disks(host); + if (rc) + goto err_out_blkdev_disks; + + pci_set_master(pdev); + + rc = request_irq(pdev->irq, carm_interrupt, IRQF_SHARED, DRV_NAME, host); + if (rc) { + printk(KERN_ERR DRV_NAME "(%s): irq alloc failure\n", + pci_name(pdev)); + goto err_out_blkdev_disks; + } + + rc = carm_init_host(host); + if (rc) + goto err_out_free_irq; + + DPRINTK("waiting for probe_comp\n"); + wait_for_completion(&host->probe_comp); + + printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n", + host->name, pci_name(pdev), (int) CARM_MAX_PORTS, + (unsigned long long)pci_resource_start(pdev, 0), + pdev->irq, host->major); + + carm_host_id++; + pci_set_drvdata(pdev, host); + return 0; + +err_out_free_irq: + free_irq(pdev->irq, host); +err_out_blkdev_disks: + carm_free_disks(host); + unregister_blkdev(host->major, host->name); +err_out_free_majors: + if (host->major == 160) + clear_bit(0, &carm_major_alloc); + else if (host->major == 161) + clear_bit(1, &carm_major_alloc); + blk_cleanup_queue(host->oob_q); +err_out_pci_free: + pci_free_consistent(pdev, CARM_SHM_SIZE, host->shm, host->shm_dma); +err_out_iounmap: + iounmap(host->mmio); +err_out_kfree: + kfree(host); +err_out_regions: + pci_release_regions(pdev); +err_out: + pci_disable_device(pdev); + return rc; +} + +static void carm_remove_one (struct pci_dev *pdev) +{ + struct carm_host *host = pci_get_drvdata(pdev); + + if (!host) { + printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n", + pci_name(pdev)); + return; + } + + free_irq(pdev->irq, host); + carm_free_disks(host); + unregister_blkdev(host->major, host->name); + if (host->major == 160) + clear_bit(0, &carm_major_alloc); + else if (host->major == 161) + clear_bit(1, &carm_major_alloc); + blk_cleanup_queue(host->oob_q); + pci_free_consistent(pdev, CARM_SHM_SIZE, host->shm, host->shm_dma); + iounmap(host->mmio); + kfree(host); + pci_release_regions(pdev); + pci_disable_device(pdev); +} + +module_pci_driver(carm_driver); diff --git a/kernel/drivers/block/umem.c b/kernel/drivers/block/umem.c new file mode 100644 index 000000000..4cf81b5bf --- /dev/null +++ b/kernel/drivers/block/umem.c @@ -0,0 +1,1136 @@ +/* + * mm.c - Micro Memory(tm) PCI memory board block device driver - v2.3 + * + * (C) 2001 San Mehat + * (C) 2001 Johannes Erdfelt + * (C) 2001 NeilBrown + * + * This driver for the Micro Memory PCI Memory Module with Battery Backup + * is Copyright Micro Memory Inc 2001-2002. All rights reserved. + * + * This driver is released to the public under the terms of the + * GNU GENERAL PUBLIC LICENSE version 2 + * See the file COPYING for details. + * + * This driver provides a standard block device interface for Micro Memory(tm) + * PCI based RAM boards. + * 10/05/01: Phap Nguyen - Rebuilt the driver + * 10/22/01: Phap Nguyen - v2.1 Added disk partitioning + * 29oct2001:NeilBrown - Use make_request_fn instead of request_fn + * - use stand disk partitioning (so fdisk works). + * 08nov2001:NeilBrown - change driver name from "mm" to "umem" + * - incorporate into main kernel + * 08apr2002:NeilBrown - Move some of interrupt handle to tasklet + * - use spin_lock_bh instead of _irq + * - Never block on make_request. queue + * bh's instead. + * - unregister umem from devfs at mod unload + * - Change version to 2.3 + * 07Nov2001:Phap Nguyen - Select pci read command: 06, 12, 15 (Decimal) + * 07Jan2002: P. Nguyen - Used PCI Memory Write & Invalidate for DMA + * 15May2002:NeilBrown - convert to bio for 2.5 + * 17May2002:NeilBrown - remove init_mem initialisation. Instead detect + * - a sequence of writes that cover the card, and + * - set initialised bit then. + */ + +#undef DEBUG /* #define DEBUG if you want debugging info (pr_debug) */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* O_ACCMODE */ +#include /* HDIO_GETGEO */ + +#include "umem.h" + +#include +#include + +#define MM_MAXCARDS 4 +#define MM_RAHEAD 2 /* two sectors */ +#define MM_BLKSIZE 1024 /* 1k blocks */ +#define MM_HARDSECT 512 /* 512-byte hardware sectors */ +#define MM_SHIFT 6 /* max 64 partitions on 4 cards */ + +/* + * Version Information + */ + +#define DRIVER_NAME "umem" +#define DRIVER_VERSION "v2.3" +#define DRIVER_AUTHOR "San Mehat, Johannes Erdfelt, NeilBrown" +#define DRIVER_DESC "Micro Memory(tm) PCI memory board block driver" + +static int debug; +/* #define HW_TRACE(x) writeb(x,cards[0].csr_remap + MEMCTRLSTATUS_MAGIC) */ +#define HW_TRACE(x) + +#define DEBUG_LED_ON_TRANSFER 0x01 +#define DEBUG_BATTERY_POLLING 0x02 + +module_param(debug, int, 0644); +MODULE_PARM_DESC(debug, "Debug bitmask"); + +static int pci_read_cmd = 0x0C; /* Read Multiple */ +module_param(pci_read_cmd, int, 0); +MODULE_PARM_DESC(pci_read_cmd, "PCI read command"); + +static int pci_write_cmd = 0x0F; /* Write and Invalidate */ +module_param(pci_write_cmd, int, 0); +MODULE_PARM_DESC(pci_write_cmd, "PCI write command"); + +static int pci_cmds; + +static int major_nr; + +#include +#include + +struct cardinfo { + struct pci_dev *dev; + + unsigned char __iomem *csr_remap; + unsigned int mm_size; /* size in kbytes */ + + unsigned int init_size; /* initial segment, in sectors, + * that we know to + * have been written + */ + struct bio *bio, *currentbio, **biotail; + struct bvec_iter current_iter; + + struct request_queue *queue; + + struct mm_page { + dma_addr_t page_dma; + struct mm_dma_desc *desc; + int cnt, headcnt; + struct bio *bio, **biotail; + struct bvec_iter iter; + } mm_pages[2]; +#define DESC_PER_PAGE ((PAGE_SIZE*2)/sizeof(struct mm_dma_desc)) + + int Active, Ready; + + struct tasklet_struct tasklet; + unsigned int dma_status; + + struct { + int good; + int warned; + unsigned long last_change; + } battery[2]; + + spinlock_t lock; + int check_batteries; + + int flags; +}; + +static struct cardinfo cards[MM_MAXCARDS]; +static struct timer_list battery_timer; + +static int num_cards; + +static struct gendisk *mm_gendisk[MM_MAXCARDS]; + +static void check_batteries(struct cardinfo *card); + +static int get_userbit(struct cardinfo *card, int bit) +{ + unsigned char led; + + led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL); + return led & bit; +} + +static int set_userbit(struct cardinfo *card, int bit, unsigned char state) +{ + unsigned char led; + + led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL); + if (state) + led |= bit; + else + led &= ~bit; + writeb(led, card->csr_remap + MEMCTRLCMD_LEDCTRL); + + return 0; +} + +/* + * NOTE: For the power LED, use the LED_POWER_* macros since they differ + */ +static void set_led(struct cardinfo *card, int shift, unsigned char state) +{ + unsigned char led; + + led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL); + if (state == LED_FLIP) + led ^= (1<csr_remap + MEMCTRLCMD_LEDCTRL); + +} + +#ifdef MM_DIAG +static void dump_regs(struct cardinfo *card) +{ + unsigned char *p; + int i, i1; + + p = card->csr_remap; + for (i = 0; i < 8; i++) { + printk(KERN_DEBUG "%p ", p); + + for (i1 = 0; i1 < 16; i1++) + printk("%02x ", *p++); + + printk("\n"); + } +} +#endif + +static void dump_dmastat(struct cardinfo *card, unsigned int dmastat) +{ + dev_printk(KERN_DEBUG, &card->dev->dev, "DMAstat - "); + if (dmastat & DMASCR_ANY_ERR) + printk(KERN_CONT "ANY_ERR "); + if (dmastat & DMASCR_MBE_ERR) + printk(KERN_CONT "MBE_ERR "); + if (dmastat & DMASCR_PARITY_ERR_REP) + printk(KERN_CONT "PARITY_ERR_REP "); + if (dmastat & DMASCR_PARITY_ERR_DET) + printk(KERN_CONT "PARITY_ERR_DET "); + if (dmastat & DMASCR_SYSTEM_ERR_SIG) + printk(KERN_CONT "SYSTEM_ERR_SIG "); + if (dmastat & DMASCR_TARGET_ABT) + printk(KERN_CONT "TARGET_ABT "); + if (dmastat & DMASCR_MASTER_ABT) + printk(KERN_CONT "MASTER_ABT "); + if (dmastat & DMASCR_CHAIN_COMPLETE) + printk(KERN_CONT "CHAIN_COMPLETE "); + if (dmastat & DMASCR_DMA_COMPLETE) + printk(KERN_CONT "DMA_COMPLETE "); + printk("\n"); +} + +/* + * Theory of request handling + * + * Each bio is assigned to one mm_dma_desc - which may not be enough FIXME + * We have two pages of mm_dma_desc, holding about 64 descriptors + * each. These are allocated at init time. + * One page is "Ready" and is either full, or can have request added. + * The other page might be "Active", which DMA is happening on it. + * + * Whenever IO on the active page completes, the Ready page is activated + * and the ex-Active page is clean out and made Ready. + * Otherwise the Ready page is only activated when it becomes full. + * + * If a request arrives while both pages a full, it is queued, and b_rdev is + * overloaded to record whether it was a read or a write. + * + * The interrupt handler only polls the device to clear the interrupt. + * The processing of the result is done in a tasklet. + */ + +static void mm_start_io(struct cardinfo *card) +{ + /* we have the lock, we know there is + * no IO active, and we know that card->Active + * is set + */ + struct mm_dma_desc *desc; + struct mm_page *page; + int offset; + + /* make the last descriptor end the chain */ + page = &card->mm_pages[card->Active]; + pr_debug("start_io: %d %d->%d\n", + card->Active, page->headcnt, page->cnt - 1); + desc = &page->desc[page->cnt-1]; + + desc->control_bits |= cpu_to_le32(DMASCR_CHAIN_COMP_EN); + desc->control_bits &= ~cpu_to_le32(DMASCR_CHAIN_EN); + desc->sem_control_bits = desc->control_bits; + + + if (debug & DEBUG_LED_ON_TRANSFER) + set_led(card, LED_REMOVE, LED_ON); + + desc = &page->desc[page->headcnt]; + writel(0, card->csr_remap + DMA_PCI_ADDR); + writel(0, card->csr_remap + DMA_PCI_ADDR + 4); + + writel(0, card->csr_remap + DMA_LOCAL_ADDR); + writel(0, card->csr_remap + DMA_LOCAL_ADDR + 4); + + writel(0, card->csr_remap + DMA_TRANSFER_SIZE); + writel(0, card->csr_remap + DMA_TRANSFER_SIZE + 4); + + writel(0, card->csr_remap + DMA_SEMAPHORE_ADDR); + writel(0, card->csr_remap + DMA_SEMAPHORE_ADDR + 4); + + offset = ((char *)desc) - ((char *)page->desc); + writel(cpu_to_le32((page->page_dma+offset) & 0xffffffff), + card->csr_remap + DMA_DESCRIPTOR_ADDR); + /* Force the value to u64 before shifting otherwise >> 32 is undefined C + * and on some ports will do nothing ! */ + writel(cpu_to_le32(((u64)page->page_dma)>>32), + card->csr_remap + DMA_DESCRIPTOR_ADDR + 4); + + /* Go, go, go */ + writel(cpu_to_le32(DMASCR_GO | DMASCR_CHAIN_EN | pci_cmds), + card->csr_remap + DMA_STATUS_CTRL); +} + +static int add_bio(struct cardinfo *card); + +static void activate(struct cardinfo *card) +{ + /* if No page is Active, and Ready is + * not empty, then switch Ready page + * to active and start IO. + * Then add any bh's that are available to Ready + */ + + do { + while (add_bio(card)) + ; + + if (card->Active == -1 && + card->mm_pages[card->Ready].cnt > 0) { + card->Active = card->Ready; + card->Ready = 1-card->Ready; + mm_start_io(card); + } + + } while (card->Active == -1 && add_bio(card)); +} + +static inline void reset_page(struct mm_page *page) +{ + page->cnt = 0; + page->headcnt = 0; + page->bio = NULL; + page->biotail = &page->bio; +} + +/* + * If there is room on Ready page, take + * one bh off list and add it. + * return 1 if there was room, else 0. + */ +static int add_bio(struct cardinfo *card) +{ + struct mm_page *p; + struct mm_dma_desc *desc; + dma_addr_t dma_handle; + int offset; + struct bio *bio; + struct bio_vec vec; + int rw; + + bio = card->currentbio; + if (!bio && card->bio) { + card->currentbio = card->bio; + card->current_iter = card->bio->bi_iter; + card->bio = card->bio->bi_next; + if (card->bio == NULL) + card->biotail = &card->bio; + card->currentbio->bi_next = NULL; + return 1; + } + if (!bio) + return 0; + + rw = bio_rw(bio); + if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) + return 0; + + vec = bio_iter_iovec(bio, card->current_iter); + + dma_handle = pci_map_page(card->dev, + vec.bv_page, + vec.bv_offset, + vec.bv_len, + (rw == READ) ? + PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + + p = &card->mm_pages[card->Ready]; + desc = &p->desc[p->cnt]; + p->cnt++; + if (p->bio == NULL) + p->iter = card->current_iter; + if ((p->biotail) != &bio->bi_next) { + *(p->biotail) = bio; + p->biotail = &(bio->bi_next); + bio->bi_next = NULL; + } + + desc->data_dma_handle = dma_handle; + + desc->pci_addr = cpu_to_le64((u64)desc->data_dma_handle); + desc->local_addr = cpu_to_le64(card->current_iter.bi_sector << 9); + desc->transfer_size = cpu_to_le32(vec.bv_len); + offset = (((char *)&desc->sem_control_bits) - ((char *)p->desc)); + desc->sem_addr = cpu_to_le64((u64)(p->page_dma+offset)); + desc->zero1 = desc->zero2 = 0; + offset = (((char *)(desc+1)) - ((char *)p->desc)); + desc->next_desc_addr = cpu_to_le64(p->page_dma+offset); + desc->control_bits = cpu_to_le32(DMASCR_GO|DMASCR_ERR_INT_EN| + DMASCR_PARITY_INT_EN| + DMASCR_CHAIN_EN | + DMASCR_SEM_EN | + pci_cmds); + if (rw == WRITE) + desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); + desc->sem_control_bits = desc->control_bits; + + + bio_advance_iter(bio, &card->current_iter, vec.bv_len); + if (!card->current_iter.bi_size) + card->currentbio = NULL; + + return 1; +} + +static void process_page(unsigned long data) +{ + /* check if any of the requests in the page are DMA_COMPLETE, + * and deal with them appropriately. + * If we find a descriptor without DMA_COMPLETE in the semaphore, then + * dma must have hit an error on that descriptor, so use dma_status + * instead and assume that all following descriptors must be re-tried. + */ + struct mm_page *page; + struct bio *return_bio = NULL; + struct cardinfo *card = (struct cardinfo *)data; + unsigned int dma_status = card->dma_status; + + spin_lock_bh(&card->lock); + if (card->Active < 0) + goto out_unlock; + page = &card->mm_pages[card->Active]; + + while (page->headcnt < page->cnt) { + struct bio *bio = page->bio; + struct mm_dma_desc *desc = &page->desc[page->headcnt]; + int control = le32_to_cpu(desc->sem_control_bits); + int last = 0; + struct bio_vec vec; + + if (!(control & DMASCR_DMA_COMPLETE)) { + control = dma_status; + last = 1; + } + + page->headcnt++; + vec = bio_iter_iovec(bio, page->iter); + bio_advance_iter(bio, &page->iter, vec.bv_len); + + if (!page->iter.bi_size) { + page->bio = bio->bi_next; + if (page->bio) + page->iter = page->bio->bi_iter; + } + + pci_unmap_page(card->dev, desc->data_dma_handle, + vec.bv_len, + (control & DMASCR_TRANSFER_READ) ? + PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); + if (control & DMASCR_HARD_ERROR) { + /* error */ + clear_bit(BIO_UPTODATE, &bio->bi_flags); + dev_printk(KERN_WARNING, &card->dev->dev, + "I/O error on sector %d/%d\n", + le32_to_cpu(desc->local_addr)>>9, + le32_to_cpu(desc->transfer_size)); + dump_dmastat(card, control); + } else if ((bio->bi_rw & REQ_WRITE) && + le32_to_cpu(desc->local_addr) >> 9 == + card->init_size) { + card->init_size += le32_to_cpu(desc->transfer_size) >> 9; + if (card->init_size >> 1 >= card->mm_size) { + dev_printk(KERN_INFO, &card->dev->dev, + "memory now initialised\n"); + set_userbit(card, MEMORY_INITIALIZED, 1); + } + } + if (bio != page->bio) { + bio->bi_next = return_bio; + return_bio = bio; + } + + if (last) + break; + } + + if (debug & DEBUG_LED_ON_TRANSFER) + set_led(card, LED_REMOVE, LED_OFF); + + if (card->check_batteries) { + card->check_batteries = 0; + check_batteries(card); + } + if (page->headcnt >= page->cnt) { + reset_page(page); + card->Active = -1; + activate(card); + } else { + /* haven't finished with this one yet */ + pr_debug("do some more\n"); + mm_start_io(card); + } + out_unlock: + spin_unlock_bh(&card->lock); + + while (return_bio) { + struct bio *bio = return_bio; + + return_bio = bio->bi_next; + bio->bi_next = NULL; + bio_endio(bio, 0); + } +} + +static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct cardinfo *card = cb->data; + + spin_lock_irq(&card->lock); + activate(card); + spin_unlock_irq(&card->lock); + kfree(cb); +} + +static int mm_check_plugged(struct cardinfo *card) +{ + return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); +} + +static void mm_make_request(struct request_queue *q, struct bio *bio) +{ + struct cardinfo *card = q->queuedata; + pr_debug("mm_make_request %llu %u\n", + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size); + + spin_lock_irq(&card->lock); + *card->biotail = bio; + bio->bi_next = NULL; + card->biotail = &bio->bi_next; + if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card)) + activate(card); + spin_unlock_irq(&card->lock); + + return; +} + +static irqreturn_t mm_interrupt(int irq, void *__card) +{ + struct cardinfo *card = (struct cardinfo *) __card; + unsigned int dma_status; + unsigned short cfg_status; + +HW_TRACE(0x30); + + dma_status = le32_to_cpu(readl(card->csr_remap + DMA_STATUS_CTRL)); + + if (!(dma_status & (DMASCR_ERROR_MASK | DMASCR_CHAIN_COMPLETE))) { + /* interrupt wasn't for me ... */ + return IRQ_NONE; + } + + /* clear COMPLETION interrupts */ + if (card->flags & UM_FLAG_NO_BYTE_STATUS) + writel(cpu_to_le32(DMASCR_DMA_COMPLETE|DMASCR_CHAIN_COMPLETE), + card->csr_remap + DMA_STATUS_CTRL); + else + writeb((DMASCR_DMA_COMPLETE|DMASCR_CHAIN_COMPLETE) >> 16, + card->csr_remap + DMA_STATUS_CTRL + 2); + + /* log errors and clear interrupt status */ + if (dma_status & DMASCR_ANY_ERR) { + unsigned int data_log1, data_log2; + unsigned int addr_log1, addr_log2; + unsigned char stat, count, syndrome, check; + + stat = readb(card->csr_remap + MEMCTRLCMD_ERRSTATUS); + + data_log1 = le32_to_cpu(readl(card->csr_remap + + ERROR_DATA_LOG)); + data_log2 = le32_to_cpu(readl(card->csr_remap + + ERROR_DATA_LOG + 4)); + addr_log1 = le32_to_cpu(readl(card->csr_remap + + ERROR_ADDR_LOG)); + addr_log2 = readb(card->csr_remap + ERROR_ADDR_LOG + 4); + + count = readb(card->csr_remap + ERROR_COUNT); + syndrome = readb(card->csr_remap + ERROR_SYNDROME); + check = readb(card->csr_remap + ERROR_CHECK); + + dump_dmastat(card, dma_status); + + if (stat & 0x01) + dev_printk(KERN_ERR, &card->dev->dev, + "Memory access error detected (err count %d)\n", + count); + if (stat & 0x02) + dev_printk(KERN_ERR, &card->dev->dev, + "Multi-bit EDC error\n"); + + dev_printk(KERN_ERR, &card->dev->dev, + "Fault Address 0x%02x%08x, Fault Data 0x%08x%08x\n", + addr_log2, addr_log1, data_log2, data_log1); + dev_printk(KERN_ERR, &card->dev->dev, + "Fault Check 0x%02x, Fault Syndrome 0x%02x\n", + check, syndrome); + + writeb(0, card->csr_remap + ERROR_COUNT); + } + + if (dma_status & DMASCR_PARITY_ERR_REP) { + dev_printk(KERN_ERR, &card->dev->dev, + "PARITY ERROR REPORTED\n"); + pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); + pci_write_config_word(card->dev, PCI_STATUS, cfg_status); + } + + if (dma_status & DMASCR_PARITY_ERR_DET) { + dev_printk(KERN_ERR, &card->dev->dev, + "PARITY ERROR DETECTED\n"); + pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); + pci_write_config_word(card->dev, PCI_STATUS, cfg_status); + } + + if (dma_status & DMASCR_SYSTEM_ERR_SIG) { + dev_printk(KERN_ERR, &card->dev->dev, "SYSTEM ERROR\n"); + pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); + pci_write_config_word(card->dev, PCI_STATUS, cfg_status); + } + + if (dma_status & DMASCR_TARGET_ABT) { + dev_printk(KERN_ERR, &card->dev->dev, "TARGET ABORT\n"); + pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); + pci_write_config_word(card->dev, PCI_STATUS, cfg_status); + } + + if (dma_status & DMASCR_MASTER_ABT) { + dev_printk(KERN_ERR, &card->dev->dev, "MASTER ABORT\n"); + pci_read_config_word(card->dev, PCI_STATUS, &cfg_status); + pci_write_config_word(card->dev, PCI_STATUS, cfg_status); + } + + /* and process the DMA descriptors */ + card->dma_status = dma_status; + tasklet_schedule(&card->tasklet); + +HW_TRACE(0x36); + + return IRQ_HANDLED; +} + +/* + * If both batteries are good, no LED + * If either battery has been warned, solid LED + * If both batteries are bad, flash the LED quickly + * If either battery is bad, flash the LED semi quickly + */ +static void set_fault_to_battery_status(struct cardinfo *card) +{ + if (card->battery[0].good && card->battery[1].good) + set_led(card, LED_FAULT, LED_OFF); + else if (card->battery[0].warned || card->battery[1].warned) + set_led(card, LED_FAULT, LED_ON); + else if (!card->battery[0].good && !card->battery[1].good) + set_led(card, LED_FAULT, LED_FLASH_7_0); + else + set_led(card, LED_FAULT, LED_FLASH_3_5); +} + +static void init_battery_timer(void); + +static int check_battery(struct cardinfo *card, int battery, int status) +{ + if (status != card->battery[battery].good) { + card->battery[battery].good = !card->battery[battery].good; + card->battery[battery].last_change = jiffies; + + if (card->battery[battery].good) { + dev_printk(KERN_ERR, &card->dev->dev, + "Battery %d now good\n", battery + 1); + card->battery[battery].warned = 0; + } else + dev_printk(KERN_ERR, &card->dev->dev, + "Battery %d now FAILED\n", battery + 1); + + return 1; + } else if (!card->battery[battery].good && + !card->battery[battery].warned && + time_after_eq(jiffies, card->battery[battery].last_change + + (HZ * 60 * 60 * 5))) { + dev_printk(KERN_ERR, &card->dev->dev, + "Battery %d still FAILED after 5 hours\n", battery + 1); + card->battery[battery].warned = 1; + + return 1; + } + + return 0; +} + +static void check_batteries(struct cardinfo *card) +{ + /* NOTE: this must *never* be called while the card + * is doing (bus-to-card) DMA, or you will need the + * reset switch + */ + unsigned char status; + int ret1, ret2; + + status = readb(card->csr_remap + MEMCTRLSTATUS_BATTERY); + if (debug & DEBUG_BATTERY_POLLING) + dev_printk(KERN_DEBUG, &card->dev->dev, + "checking battery status, 1 = %s, 2 = %s\n", + (status & BATTERY_1_FAILURE) ? "FAILURE" : "OK", + (status & BATTERY_2_FAILURE) ? "FAILURE" : "OK"); + + ret1 = check_battery(card, 0, !(status & BATTERY_1_FAILURE)); + ret2 = check_battery(card, 1, !(status & BATTERY_2_FAILURE)); + + if (ret1 || ret2) + set_fault_to_battery_status(card); +} + +static void check_all_batteries(unsigned long ptr) +{ + int i; + + for (i = 0; i < num_cards; i++) + if (!(cards[i].flags & UM_FLAG_NO_BATT)) { + struct cardinfo *card = &cards[i]; + spin_lock_bh(&card->lock); + if (card->Active >= 0) + card->check_batteries = 1; + else + check_batteries(card); + spin_unlock_bh(&card->lock); + } + + init_battery_timer(); +} + +static void init_battery_timer(void) +{ + init_timer(&battery_timer); + battery_timer.function = check_all_batteries; + battery_timer.expires = jiffies + (HZ * 60); + add_timer(&battery_timer); +} + +static void del_battery_timer(void) +{ + del_timer(&battery_timer); +} + +/* + * Note no locks taken out here. In a worst case scenario, we could drop + * a chunk of system memory. But that should never happen, since validation + * happens at open or mount time, when locks are held. + * + * That's crap, since doing that while some partitions are opened + * or mounted will give you really nasty results. + */ +static int mm_revalidate(struct gendisk *disk) +{ + struct cardinfo *card = disk->private_data; + set_capacity(disk, card->mm_size << 1); + return 0; +} + +static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct cardinfo *card = bdev->bd_disk->private_data; + int size = card->mm_size * (1024 / MM_HARDSECT); + + /* + * get geometry: we have to fake one... trim the size to a + * multiple of 2048 (1M): tell we have 32 sectors, 64 heads, + * whatever cylinders. + */ + geo->heads = 64; + geo->sectors = 32; + geo->cylinders = size / (geo->heads * geo->sectors); + return 0; +} + +static const struct block_device_operations mm_fops = { + .owner = THIS_MODULE, + .getgeo = mm_getgeo, + .revalidate_disk = mm_revalidate, +}; + +static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + int ret = -ENODEV; + struct cardinfo *card = &cards[num_cards]; + unsigned char mem_present; + unsigned char batt_status; + unsigned int saved_bar, data; + unsigned long csr_base; + unsigned long csr_len; + int magic_number; + static int printed_version; + + if (!printed_version++) + printk(KERN_INFO DRIVER_VERSION " : " DRIVER_DESC "\n"); + + ret = pci_enable_device(dev); + if (ret) + return ret; + + pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0xF8); + pci_set_master(dev); + + card->dev = dev; + + csr_base = pci_resource_start(dev, 0); + csr_len = pci_resource_len(dev, 0); + if (!csr_base || !csr_len) + return -ENODEV; + + dev_printk(KERN_INFO, &dev->dev, + "Micro Memory(tm) controller found (PCI Mem Module (Battery Backup))\n"); + + if (pci_set_dma_mask(dev, DMA_BIT_MASK(64)) && + pci_set_dma_mask(dev, DMA_BIT_MASK(32))) { + dev_printk(KERN_WARNING, &dev->dev, "NO suitable DMA found\n"); + return -ENOMEM; + } + + ret = pci_request_regions(dev, DRIVER_NAME); + if (ret) { + dev_printk(KERN_ERR, &card->dev->dev, + "Unable to request memory region\n"); + goto failed_req_csr; + } + + card->csr_remap = ioremap_nocache(csr_base, csr_len); + if (!card->csr_remap) { + dev_printk(KERN_ERR, &card->dev->dev, + "Unable to remap memory region\n"); + ret = -ENOMEM; + + goto failed_remap_csr; + } + + dev_printk(KERN_INFO, &card->dev->dev, + "CSR 0x%08lx -> 0x%p (0x%lx)\n", + csr_base, card->csr_remap, csr_len); + + switch (card->dev->device) { + case 0x5415: + card->flags |= UM_FLAG_NO_BYTE_STATUS | UM_FLAG_NO_BATTREG; + magic_number = 0x59; + break; + + case 0x5425: + card->flags |= UM_FLAG_NO_BYTE_STATUS; + magic_number = 0x5C; + break; + + case 0x6155: + card->flags |= UM_FLAG_NO_BYTE_STATUS | + UM_FLAG_NO_BATTREG | UM_FLAG_NO_BATT; + magic_number = 0x99; + break; + + default: + magic_number = 0x100; + break; + } + + if (readb(card->csr_remap + MEMCTRLSTATUS_MAGIC) != magic_number) { + dev_printk(KERN_ERR, &card->dev->dev, "Magic number invalid\n"); + ret = -ENOMEM; + goto failed_magic; + } + + card->mm_pages[0].desc = pci_alloc_consistent(card->dev, + PAGE_SIZE * 2, + &card->mm_pages[0].page_dma); + card->mm_pages[1].desc = pci_alloc_consistent(card->dev, + PAGE_SIZE * 2, + &card->mm_pages[1].page_dma); + if (card->mm_pages[0].desc == NULL || + card->mm_pages[1].desc == NULL) { + dev_printk(KERN_ERR, &card->dev->dev, "alloc failed\n"); + goto failed_alloc; + } + reset_page(&card->mm_pages[0]); + reset_page(&card->mm_pages[1]); + card->Ready = 0; /* page 0 is ready */ + card->Active = -1; /* no page is active */ + card->bio = NULL; + card->biotail = &card->bio; + + card->queue = blk_alloc_queue(GFP_KERNEL); + if (!card->queue) + goto failed_alloc; + + blk_queue_make_request(card->queue, mm_make_request); + card->queue->queue_lock = &card->lock; + card->queue->queuedata = card; + + tasklet_init(&card->tasklet, process_page, (unsigned long)card); + + card->check_batteries = 0; + + mem_present = readb(card->csr_remap + MEMCTRLSTATUS_MEMORY); + switch (mem_present) { + case MEM_128_MB: + card->mm_size = 1024 * 128; + break; + case MEM_256_MB: + card->mm_size = 1024 * 256; + break; + case MEM_512_MB: + card->mm_size = 1024 * 512; + break; + case MEM_1_GB: + card->mm_size = 1024 * 1024; + break; + case MEM_2_GB: + card->mm_size = 1024 * 2048; + break; + default: + card->mm_size = 0; + break; + } + + /* Clear the LED's we control */ + set_led(card, LED_REMOVE, LED_OFF); + set_led(card, LED_FAULT, LED_OFF); + + batt_status = readb(card->csr_remap + MEMCTRLSTATUS_BATTERY); + + card->battery[0].good = !(batt_status & BATTERY_1_FAILURE); + card->battery[1].good = !(batt_status & BATTERY_2_FAILURE); + card->battery[0].last_change = card->battery[1].last_change = jiffies; + + if (card->flags & UM_FLAG_NO_BATT) + dev_printk(KERN_INFO, &card->dev->dev, + "Size %d KB\n", card->mm_size); + else { + dev_printk(KERN_INFO, &card->dev->dev, + "Size %d KB, Battery 1 %s (%s), Battery 2 %s (%s)\n", + card->mm_size, + batt_status & BATTERY_1_DISABLED ? "Disabled" : "Enabled", + card->battery[0].good ? "OK" : "FAILURE", + batt_status & BATTERY_2_DISABLED ? "Disabled" : "Enabled", + card->battery[1].good ? "OK" : "FAILURE"); + + set_fault_to_battery_status(card); + } + + pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, &saved_bar); + data = 0xffffffff; + pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, data); + pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, &data); + pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, saved_bar); + data &= 0xfffffff0; + data = ~data; + data += 1; + + if (request_irq(dev->irq, mm_interrupt, IRQF_SHARED, DRIVER_NAME, + card)) { + dev_printk(KERN_ERR, &card->dev->dev, + "Unable to allocate IRQ\n"); + ret = -ENODEV; + goto failed_req_irq; + } + + dev_printk(KERN_INFO, &card->dev->dev, + "Window size %d bytes, IRQ %d\n", data, dev->irq); + + spin_lock_init(&card->lock); + + pci_set_drvdata(dev, card); + + if (pci_write_cmd != 0x0F) /* If not Memory Write & Invalidate */ + pci_write_cmd = 0x07; /* then Memory Write command */ + + if (pci_write_cmd & 0x08) { /* use Memory Write and Invalidate */ + unsigned short cfg_command; + pci_read_config_word(dev, PCI_COMMAND, &cfg_command); + cfg_command |= 0x10; /* Memory Write & Invalidate Enable */ + pci_write_config_word(dev, PCI_COMMAND, cfg_command); + } + pci_cmds = (pci_read_cmd << 28) | (pci_write_cmd << 24); + + num_cards++; + + if (!get_userbit(card, MEMORY_INITIALIZED)) { + dev_printk(KERN_INFO, &card->dev->dev, + "memory NOT initialized. Consider over-writing whole device.\n"); + card->init_size = 0; + } else { + dev_printk(KERN_INFO, &card->dev->dev, + "memory already initialized\n"); + card->init_size = card->mm_size; + } + + /* Enable ECC */ + writeb(EDC_STORE_CORRECT, card->csr_remap + MEMCTRLCMD_ERRCTRL); + + return 0; + + failed_req_irq: + failed_alloc: + if (card->mm_pages[0].desc) + pci_free_consistent(card->dev, PAGE_SIZE*2, + card->mm_pages[0].desc, + card->mm_pages[0].page_dma); + if (card->mm_pages[1].desc) + pci_free_consistent(card->dev, PAGE_SIZE*2, + card->mm_pages[1].desc, + card->mm_pages[1].page_dma); + failed_magic: + iounmap(card->csr_remap); + failed_remap_csr: + pci_release_regions(dev); + failed_req_csr: + + return ret; +} + +static void mm_pci_remove(struct pci_dev *dev) +{ + struct cardinfo *card = pci_get_drvdata(dev); + + tasklet_kill(&card->tasklet); + free_irq(dev->irq, card); + iounmap(card->csr_remap); + + if (card->mm_pages[0].desc) + pci_free_consistent(card->dev, PAGE_SIZE*2, + card->mm_pages[0].desc, + card->mm_pages[0].page_dma); + if (card->mm_pages[1].desc) + pci_free_consistent(card->dev, PAGE_SIZE*2, + card->mm_pages[1].desc, + card->mm_pages[1].page_dma); + blk_cleanup_queue(card->queue); + + pci_release_regions(dev); + pci_disable_device(dev); +} + +static const struct pci_device_id mm_pci_ids[] = { + {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_5415CN)}, + {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_5425CN)}, + {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_6155)}, + { + .vendor = 0x8086, + .device = 0xB555, + .subvendor = 0x1332, + .subdevice = 0x5460, + .class = 0x050000, + .class_mask = 0, + }, { /* end: all zeroes */ } +}; + +MODULE_DEVICE_TABLE(pci, mm_pci_ids); + +static struct pci_driver mm_pci_driver = { + .name = DRIVER_NAME, + .id_table = mm_pci_ids, + .probe = mm_pci_probe, + .remove = mm_pci_remove, +}; + +static int __init mm_init(void) +{ + int retval, i; + int err; + + retval = pci_register_driver(&mm_pci_driver); + if (retval) + return -ENOMEM; + + err = major_nr = register_blkdev(0, DRIVER_NAME); + if (err < 0) { + pci_unregister_driver(&mm_pci_driver); + return -EIO; + } + + for (i = 0; i < num_cards; i++) { + mm_gendisk[i] = alloc_disk(1 << MM_SHIFT); + if (!mm_gendisk[i]) + goto out; + } + + for (i = 0; i < num_cards; i++) { + struct gendisk *disk = mm_gendisk[i]; + sprintf(disk->disk_name, "umem%c", 'a'+i); + spin_lock_init(&cards[i].lock); + disk->major = major_nr; + disk->first_minor = i << MM_SHIFT; + disk->fops = &mm_fops; + disk->private_data = &cards[i]; + disk->queue = cards[i].queue; + set_capacity(disk, cards[i].mm_size << 1); + add_disk(disk); + } + + init_battery_timer(); + printk(KERN_INFO "MM: desc_per_page = %ld\n", DESC_PER_PAGE); +/* printk("mm_init: Done. 10-19-01 9:00\n"); */ + return 0; + +out: + pci_unregister_driver(&mm_pci_driver); + unregister_blkdev(major_nr, DRIVER_NAME); + while (i--) + put_disk(mm_gendisk[i]); + return -ENOMEM; +} + +static void __exit mm_cleanup(void) +{ + int i; + + del_battery_timer(); + + for (i = 0; i < num_cards ; i++) { + del_gendisk(mm_gendisk[i]); + put_disk(mm_gendisk[i]); + } + + pci_unregister_driver(&mm_pci_driver); + + unregister_blkdev(major_nr, DRIVER_NAME); +} + +module_init(mm_init); +module_exit(mm_cleanup); + +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/umem.h b/kernel/drivers/block/umem.h new file mode 100644 index 000000000..375c68974 --- /dev/null +++ b/kernel/drivers/block/umem.h @@ -0,0 +1,133 @@ + +/* + * This file contains defines for the + * Micro Memory MM5415 + * family PCI Memory Module with Battery Backup. + * + * Copyright Micro Memory INC 2001. All rights reserved. + * Release under the terms of the GNU GENERAL PUBLIC LICENSE version 2. + * See the file COPYING. + */ + +#ifndef _DRIVERS_BLOCK_MM_H +#define _DRIVERS_BLOCK_MM_H + + +#define IRQ_TIMEOUT (1 * HZ) + +/* CSR register definition */ +#define MEMCTRLSTATUS_MAGIC 0x00 +#define MM_MAGIC_VALUE (unsigned char)0x59 + +#define MEMCTRLSTATUS_BATTERY 0x04 +#define BATTERY_1_DISABLED 0x01 +#define BATTERY_1_FAILURE 0x02 +#define BATTERY_2_DISABLED 0x04 +#define BATTERY_2_FAILURE 0x08 + +#define MEMCTRLSTATUS_MEMORY 0x07 +#define MEM_128_MB 0xfe +#define MEM_256_MB 0xfc +#define MEM_512_MB 0xf8 +#define MEM_1_GB 0xf0 +#define MEM_2_GB 0xe0 + +#define MEMCTRLCMD_LEDCTRL 0x08 +#define LED_REMOVE 2 +#define LED_FAULT 4 +#define LED_POWER 6 +#define LED_FLIP 255 +#define LED_OFF 0x00 +#define LED_ON 0x01 +#define LED_FLASH_3_5 0x02 +#define LED_FLASH_7_0 0x03 +#define LED_POWER_ON 0x00 +#define LED_POWER_OFF 0x01 +#define USER_BIT1 0x01 +#define USER_BIT2 0x02 + +#define MEMORY_INITIALIZED USER_BIT1 + +#define MEMCTRLCMD_ERRCTRL 0x0C +#define EDC_NONE_DEFAULT 0x00 +#define EDC_NONE 0x01 +#define EDC_STORE_READ 0x02 +#define EDC_STORE_CORRECT 0x03 + +#define MEMCTRLCMD_ERRCNT 0x0D +#define MEMCTRLCMD_ERRSTATUS 0x0E + +#define ERROR_DATA_LOG 0x20 +#define ERROR_ADDR_LOG 0x28 +#define ERROR_COUNT 0x3D +#define ERROR_SYNDROME 0x3E +#define ERROR_CHECK 0x3F + +#define DMA_PCI_ADDR 0x40 +#define DMA_LOCAL_ADDR 0x48 +#define DMA_TRANSFER_SIZE 0x50 +#define DMA_DESCRIPTOR_ADDR 0x58 +#define DMA_SEMAPHORE_ADDR 0x60 +#define DMA_STATUS_CTRL 0x68 +#define DMASCR_GO 0x00001 +#define DMASCR_TRANSFER_READ 0x00002 +#define DMASCR_CHAIN_EN 0x00004 +#define DMASCR_SEM_EN 0x00010 +#define DMASCR_DMA_COMP_EN 0x00020 +#define DMASCR_CHAIN_COMP_EN 0x00040 +#define DMASCR_ERR_INT_EN 0x00080 +#define DMASCR_PARITY_INT_EN 0x00100 +#define DMASCR_ANY_ERR 0x00800 +#define DMASCR_MBE_ERR 0x01000 +#define DMASCR_PARITY_ERR_REP 0x02000 +#define DMASCR_PARITY_ERR_DET 0x04000 +#define DMASCR_SYSTEM_ERR_SIG 0x08000 +#define DMASCR_TARGET_ABT 0x10000 +#define DMASCR_MASTER_ABT 0x20000 +#define DMASCR_DMA_COMPLETE 0x40000 +#define DMASCR_CHAIN_COMPLETE 0x80000 + +/* +3.SOME PCs HAVE HOST BRIDGES WHICH APPARENTLY DO NOT CORRECTLY HANDLE +READ-LINE (0xE) OR READ-MULTIPLE (0xC) PCI COMMAND CODES DURING DMA +TRANSFERS. IN OTHER SYSTEMS THESE COMMAND CODES WILL CAUSE THE HOST BRIDGE +TO ALLOW LONGER BURSTS DURING DMA READ OPERATIONS. THE UPPER FOUR BITS +(31..28) OF THE DMA CSR HAVE BEEN MADE PROGRAMMABLE, SO THAT EITHER A 0x6, +AN 0xE OR A 0xC CAN BE WRITTEN TO THEM TO SET THE COMMAND CODE USED DURING +DMA READ OPERATIONS. +*/ +#define DMASCR_READ 0x60000000 +#define DMASCR_READLINE 0xE0000000 +#define DMASCR_READMULTI 0xC0000000 + + +#define DMASCR_ERROR_MASK (DMASCR_MASTER_ABT | DMASCR_TARGET_ABT | DMASCR_SYSTEM_ERR_SIG | DMASCR_PARITY_ERR_DET | DMASCR_MBE_ERR | DMASCR_ANY_ERR) +#define DMASCR_HARD_ERROR (DMASCR_MASTER_ABT | DMASCR_TARGET_ABT | DMASCR_SYSTEM_ERR_SIG | DMASCR_PARITY_ERR_DET | DMASCR_MBE_ERR) + +#define WINDOWMAP_WINNUM 0x7B + +#define DMA_READ_FROM_HOST 0 +#define DMA_WRITE_TO_HOST 1 + +struct mm_dma_desc { + __le64 pci_addr; + __le64 local_addr; + __le32 transfer_size; + u32 zero1; + __le64 next_desc_addr; + __le64 sem_addr; + __le32 control_bits; + u32 zero2; + + dma_addr_t data_dma_handle; + + /* Copy of the bits */ + __le64 sem_control_bits; +} __attribute__((aligned(8))); + +/* bits for card->flags */ +#define UM_FLAG_DMA_IN_REGS 1 +#define UM_FLAG_NO_BYTE_STATUS 2 +#define UM_FLAG_NO_BATTREG 4 +#define UM_FLAG_NO_BATT 8 +#endif diff --git a/kernel/drivers/block/virtio_blk.c b/kernel/drivers/block/virtio_blk.c new file mode 100644 index 000000000..5ea2f0bbb --- /dev/null +++ b/kernel/drivers/block/virtio_blk.c @@ -0,0 +1,901 @@ +//#define DEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PART_BITS 4 +#define VQ_NAME_LEN 16 + +static int major; +static DEFINE_IDA(vd_index_ida); + +static struct workqueue_struct *virtblk_wq; + +struct virtio_blk_vq { + struct virtqueue *vq; + spinlock_t lock; + char name[VQ_NAME_LEN]; +} ____cacheline_aligned_in_smp; + +struct virtio_blk { + struct virtio_device *vdev; + + /* The disk structure for the kernel. */ + struct gendisk *disk; + + /* Block layer tags. */ + struct blk_mq_tag_set tag_set; + + /* Process context for config space updates */ + struct work_struct config_work; + + /* What host tells us, plus 2 for header & tailer. */ + unsigned int sg_elems; + + /* Ida index - used to track minor number allocations. */ + int index; + + /* num of vqs */ + int num_vqs; + struct virtio_blk_vq *vqs; +}; + +struct virtblk_req { + struct request *req; + struct virtio_blk_outhdr out_hdr; + struct virtio_scsi_inhdr in_hdr; + u8 status; + struct scatterlist sg[]; +}; + +static inline int virtblk_result(struct virtblk_req *vbr) +{ + switch (vbr->status) { + case VIRTIO_BLK_S_OK: + return 0; + case VIRTIO_BLK_S_UNSUPP: + return -ENOTTY; + default: + return -EIO; + } +} + +static int __virtblk_add_req(struct virtqueue *vq, + struct virtblk_req *vbr, + struct scatterlist *data_sg, + bool have_data) +{ + struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; + unsigned int num_out = 0, num_in = 0; + __virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT); + + sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); + sgs[num_out++] = &hdr; + + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information. + */ + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { + sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); + sgs[num_out++] = &cmd; + } + + if (have_data) { + if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) + sgs[num_out++] = data_sg; + else + sgs[num_out + num_in++] = data_sg; + } + + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { + sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); + sgs[num_out + num_in++] = &sense; + sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); + sgs[num_out + num_in++] = &inhdr; + } + + sg_init_one(&status, &vbr->status, sizeof(vbr->status)); + sgs[num_out + num_in++] = &status; + + return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC); +} + +static inline void virtblk_request_done(struct request *req) +{ + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + struct virtio_blk *vblk = req->q->queuedata; + int error = virtblk_result(vbr); + + if (req->cmd_type == REQ_TYPE_BLOCK_PC) { + req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); + req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); + req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); + } else if (req->cmd_type == REQ_TYPE_SPECIAL) { + req->errors = (error != 0); + } + + blk_mq_end_request(req, error); +} + +static void virtblk_done(struct virtqueue *vq) +{ + struct virtio_blk *vblk = vq->vdev->priv; + bool req_done = false; + int qid = vq->index; + struct virtblk_req *vbr; + unsigned long flags; + unsigned int len; + + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); + do { + virtqueue_disable_cb(vq); + while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { + blk_mq_complete_request(vbr->req); + req_done = true; + } + if (unlikely(virtqueue_is_broken(vq))) + break; + } while (!virtqueue_enable_cb(vq)); + + /* In case queue is stopped waiting for more buffers. */ + if (req_done) + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); +} + +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + unsigned long flags; + unsigned int num; + int qid = hctx->queue_num; + int err; + bool notify = false; + + BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); + + vbr->req = req; + if (req->cmd_flags & REQ_FLUSH) { + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH); + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); + } else { + switch (req->cmd_type) { + case REQ_TYPE_FS: + vbr->out_hdr.type = 0; + vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req)); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); + break; + case REQ_TYPE_BLOCK_PC: + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD); + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); + break; + case REQ_TYPE_SPECIAL: + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); + vbr->out_hdr.sector = 0; + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); + break; + default: + /* We don't put anything else in the queue. */ + BUG(); + } + } + + blk_mq_start_request(req); + + num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); + if (num) { + if (rq_data_dir(vbr->req) == WRITE) + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); + else + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); + } + + spin_lock_irqsave(&vblk->vqs[qid].lock, flags); + err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num); + if (err) { + virtqueue_kick(vblk->vqs[qid].vq); + blk_mq_stop_hw_queue(hctx); + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + /* Out of mem doesn't actually happen, since we fall back + * to direct descriptors */ + if (err == -ENOMEM || err == -ENOSPC) + return BLK_MQ_RQ_QUEUE_BUSY; + return BLK_MQ_RQ_QUEUE_ERROR; + } + + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + notify = true; + spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); + + if (notify) + virtqueue_notify(vblk->vqs[qid].vq); + return BLK_MQ_RQ_QUEUE_OK; +} + +/* return id (s/n) string for *disk to *id_str + */ +static int virtblk_get_id(struct gendisk *disk, char *id_str) +{ + struct virtio_blk *vblk = disk->private_data; + struct request *req; + struct bio *bio; + int err; + + bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, + GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL); + if (IS_ERR(req)) { + bio_put(bio); + return PTR_ERR(req); + } + + req->cmd_type = REQ_TYPE_SPECIAL; + err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); + blk_put_request(req); + + return err; +} + +static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long data) +{ + struct gendisk *disk = bdev->bd_disk; + struct virtio_blk *vblk = disk->private_data; + + /* + * Only allow the generic SCSI ioctls if the host can support it. + */ + if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) + return -ENOTTY; + + return scsi_cmd_blk_ioctl(bdev, mode, cmd, + (void __user *)data); +} + +/* We provide getgeo only to please some old bootloader/partitioning tools */ +static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + struct virtio_blk *vblk = bd->bd_disk->private_data; + + /* see if the host passed in geometry config */ + if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) { + virtio_cread(vblk->vdev, struct virtio_blk_config, + geometry.cylinders, &geo->cylinders); + virtio_cread(vblk->vdev, struct virtio_blk_config, + geometry.heads, &geo->heads); + virtio_cread(vblk->vdev, struct virtio_blk_config, + geometry.sectors, &geo->sectors); + } else { + /* some standard values, similar to sd */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + } + return 0; +} + +static const struct block_device_operations virtblk_fops = { + .ioctl = virtblk_ioctl, + .owner = THIS_MODULE, + .getgeo = virtblk_getgeo, +}; + +static int index_to_minor(int index) +{ + return index << PART_BITS; +} + +static int minor_to_index(int minor) +{ + return minor >> PART_BITS; +} + +static ssize_t virtblk_serial_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + int err; + + /* sysfs gives us a PAGE_SIZE buffer */ + BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES); + + buf[VIRTIO_BLK_ID_BYTES] = '\0'; + err = virtblk_get_id(disk, buf); + if (!err) + return strlen(buf); + + if (err == -EIO) /* Unsupported? Make it empty. */ + return 0; + + return err; +} + +static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); + +static void virtblk_config_changed_work(struct work_struct *work) +{ + struct virtio_blk *vblk = + container_of(work, struct virtio_blk, config_work); + struct virtio_device *vdev = vblk->vdev; + struct request_queue *q = vblk->disk->queue; + char cap_str_2[10], cap_str_10[10]; + char *envp[] = { "RESIZE=1", NULL }; + u64 capacity; + + /* Host must always specify the capacity. */ + virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity); + + /* If capacity is too big, truncate with warning. */ + if ((sector_t)capacity != capacity) { + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", + (unsigned long long)capacity); + capacity = (sector_t)-1; + } + + string_get_size(capacity, queue_logical_block_size(q), + STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); + string_get_size(capacity, queue_logical_block_size(q), + STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); + + dev_notice(&vdev->dev, + "new size: %llu %d-byte logical blocks (%s/%s)\n", + (unsigned long long)capacity, + queue_logical_block_size(q), + cap_str_10, cap_str_2); + + set_capacity(vblk->disk, capacity); + revalidate_disk(vblk->disk); + kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp); +} + +static void virtblk_config_changed(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + + queue_work(virtblk_wq, &vblk->config_work); +} + +static int init_vq(struct virtio_blk *vblk) +{ + int err = 0; + int i; + vq_callback_t **callbacks; + const char **names; + struct virtqueue **vqs; + unsigned short num_vqs; + struct virtio_device *vdev = vblk->vdev; + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ, + struct virtio_blk_config, num_queues, + &num_vqs); + if (err) + num_vqs = 1; + + vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL); + if (!vblk->vqs) { + err = -ENOMEM; + goto out; + } + + names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL); + if (!names) + goto err_names; + + callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL); + if (!callbacks) + goto err_callbacks; + + vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL); + if (!vqs) + goto err_vqs; + + for (i = 0; i < num_vqs; i++) { + callbacks[i] = virtblk_done; + snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i); + names[i] = vblk->vqs[i].name; + } + + /* Discover virtqueues and write information to configuration. */ + err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names); + if (err) + goto err_find_vqs; + + for (i = 0; i < num_vqs; i++) { + spin_lock_init(&vblk->vqs[i].lock); + vblk->vqs[i].vq = vqs[i]; + } + vblk->num_vqs = num_vqs; + + err_find_vqs: + kfree(vqs); + err_vqs: + kfree(callbacks); + err_callbacks: + kfree(names); + err_names: + if (err) + kfree(vblk->vqs); + out: + return err; +} + +/* + * Legacy naming scheme used for virtio devices. We are stuck with it for + * virtio blk but don't ever use it for any new driver. + */ +static int virtblk_name_format(char *prefix, int index, char *buf, int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + +static int virtblk_get_cache_mode(struct virtio_device *vdev) +{ + u8 writeback; + int err; + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE, + struct virtio_blk_config, wce, + &writeback); + if (err) + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) || + virtio_has_feature(vdev, VIRTIO_F_VERSION_1); + + return writeback; +} + +static void virtblk_update_cache_mode(struct virtio_device *vdev) +{ + u8 writeback = virtblk_get_cache_mode(vdev); + struct virtio_blk *vblk = vdev->priv; + + if (writeback) + blk_queue_flush(vblk->disk->queue, REQ_FLUSH); + else + blk_queue_flush(vblk->disk->queue, 0); + + revalidate_disk(vblk->disk); +} + +static const char *const virtblk_cache_types[] = { + "write through", "write back" +}; + +static ssize_t +virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = disk->private_data; + struct virtio_device *vdev = vblk->vdev; + int i; + + BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); + for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; ) + if (sysfs_streq(buf, virtblk_cache_types[i])) + break; + + if (i < 0) + return -EINVAL; + + virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i); + virtblk_update_cache_mode(vdev); + return count; +} + +static ssize_t +virtblk_cache_type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = disk->private_data; + u8 writeback = virtblk_get_cache_mode(vblk->vdev); + + BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types)); + return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]); +} + +static const struct device_attribute dev_attr_cache_type_ro = + __ATTR(cache_type, S_IRUGO, + virtblk_cache_type_show, NULL); +static const struct device_attribute dev_attr_cache_type_rw = + __ATTR(cache_type, S_IRUGO|S_IWUSR, + virtblk_cache_type_show, virtblk_cache_type_store); + +static int virtblk_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct virtio_blk *vblk = data; + struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq); + + sg_init_table(vbr->sg, vblk->sg_elems); + return 0; +} + +static struct blk_mq_ops virtio_mq_ops = { + .queue_rq = virtio_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = virtblk_request_done, + .init_request = virtblk_init_request, +}; + +static unsigned int virtblk_queue_depth; +module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); + +static int virtblk_probe(struct virtio_device *vdev) +{ + struct virtio_blk *vblk; + struct request_queue *q; + int err, index; + + u64 cap; + u32 v, blk_size, sg_elems, opt_io_size; + u16 min_io_size; + u8 physical_block_exp, alignment_offset; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS), + GFP_KERNEL); + if (err < 0) + goto out; + index = err; + + /* We need to know how many segments before we allocate. */ + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX, + struct virtio_blk_config, seg_max, + &sg_elems); + + /* We need at least one SG element, whatever they say. */ + if (err || !sg_elems) + sg_elems = 1; + + /* We need an extra sg elements at head and tail. */ + sg_elems += 2; + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); + if (!vblk) { + err = -ENOMEM; + goto out_free_index; + } + + vblk->vdev = vdev; + vblk->sg_elems = sg_elems; + + INIT_WORK(&vblk->config_work, virtblk_config_changed_work); + + err = init_vq(vblk); + if (err) + goto out_free_vblk; + + /* FIXME: How many partitions? How long is a piece of string? */ + vblk->disk = alloc_disk(1 << PART_BITS); + if (!vblk->disk) { + err = -ENOMEM; + goto out_free_vq; + } + + /* Default queue sizing is to fill the ring. */ + if (!virtblk_queue_depth) { + virtblk_queue_depth = vblk->vqs[0].vq->num_free; + /* ... but without indirect descs, we use 2 descs per req */ + if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) + virtblk_queue_depth /= 2; + } + + memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.queue_depth = virtblk_queue_depth; + vblk->tag_set.numa_node = NUMA_NO_NODE; + vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + vblk->tag_set.cmd_size = + sizeof(struct virtblk_req) + + sizeof(struct scatterlist) * sg_elems; + vblk->tag_set.driver_data = vblk; + vblk->tag_set.nr_hw_queues = vblk->num_vqs; + + err = blk_mq_alloc_tag_set(&vblk->tag_set); + if (err) + goto out_put_disk; + + q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); + if (IS_ERR(q)) { + err = -ENOMEM; + goto out_free_tags; + } + + q->queuedata = vblk; + + virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); + + vblk->disk->major = major; + vblk->disk->first_minor = index_to_minor(index); + vblk->disk->private_data = vblk; + vblk->disk->fops = &virtblk_fops; + vblk->disk->driverfs_dev = &vdev->dev; + vblk->index = index; + + /* configure queue flush support */ + virtblk_update_cache_mode(vdev); + + /* If disk is read-only in the host, the guest should obey */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) + set_disk_ro(vblk->disk, 1); + + /* Host must always specify the capacity. */ + virtio_cread(vdev, struct virtio_blk_config, capacity, &cap); + + /* If capacity is too big, truncate with warning. */ + if ((sector_t)cap != cap) { + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", + (unsigned long long)cap); + cap = (sector_t)-1; + } + set_capacity(vblk->disk, cap); + + /* We can handle whatever the host told us to handle. */ + blk_queue_max_segments(q, vblk->sg_elems-2); + + /* No need to bounce any requests */ + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + + /* No real sector limit. */ + blk_queue_max_hw_sectors(q, -1U); + + /* Host can optionally specify maximum segment size and number of + * segments. */ + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX, + struct virtio_blk_config, size_max, &v); + if (!err) + blk_queue_max_segment_size(q, v); + else + blk_queue_max_segment_size(q, -1U); + + /* Host can optionally specify the block size of the device */ + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, + struct virtio_blk_config, blk_size, + &blk_size); + if (!err) + blk_queue_logical_block_size(q, blk_size); + else + blk_size = queue_logical_block_size(q); + + /* Use topology information if available */ + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, physical_block_exp, + &physical_block_exp); + if (!err && physical_block_exp) + blk_queue_physical_block_size(q, + blk_size * (1 << physical_block_exp)); + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, alignment_offset, + &alignment_offset); + if (!err && alignment_offset) + blk_queue_alignment_offset(q, blk_size * alignment_offset); + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, min_io_size, + &min_io_size); + if (!err && min_io_size) + blk_queue_io_min(q, blk_size * min_io_size); + + err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, + struct virtio_blk_config, opt_io_size, + &opt_io_size); + if (!err && opt_io_size) + blk_queue_io_opt(q, blk_size * opt_io_size); + + virtio_device_ready(vdev); + + add_disk(vblk->disk); + err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); + if (err) + goto out_del_disk; + + if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) + err = device_create_file(disk_to_dev(vblk->disk), + &dev_attr_cache_type_rw); + else + err = device_create_file(disk_to_dev(vblk->disk), + &dev_attr_cache_type_ro); + if (err) + goto out_del_disk; + return 0; + +out_del_disk: + del_gendisk(vblk->disk); + blk_cleanup_queue(vblk->disk->queue); +out_free_tags: + blk_mq_free_tag_set(&vblk->tag_set); +out_put_disk: + put_disk(vblk->disk); +out_free_vq: + vdev->config->del_vqs(vdev); +out_free_vblk: + kfree(vblk); +out_free_index: + ida_simple_remove(&vd_index_ida, index); +out: + return err; +} + +static void virtblk_remove(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + int index = vblk->index; + int refc; + + /* Make sure no work handler is accessing the device. */ + flush_work(&vblk->config_work); + + del_gendisk(vblk->disk); + blk_cleanup_queue(vblk->disk->queue); + + blk_mq_free_tag_set(&vblk->tag_set); + + /* Stop all the virtqueues. */ + vdev->config->reset(vdev); + + refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount); + put_disk(vblk->disk); + vdev->config->del_vqs(vdev); + kfree(vblk->vqs); + kfree(vblk); + + /* Only free device id if we don't have any users */ + if (refc == 1) + ida_simple_remove(&vd_index_ida, index); +} + +#ifdef CONFIG_PM_SLEEP +static int virtblk_freeze(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + + /* Ensure we don't receive any more interrupts */ + vdev->config->reset(vdev); + + /* Make sure no work handler is accessing the device. */ + flush_work(&vblk->config_work); + + blk_mq_stop_hw_queues(vblk->disk->queue); + + vdev->config->del_vqs(vdev); + return 0; +} + +static int virtblk_restore(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + int ret; + + ret = init_vq(vdev->priv); + if (ret) + return ret; + + virtio_device_ready(vdev); + + blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + return 0; +} +#endif + +static const struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features_legacy[] = { + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, + VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, + VIRTIO_BLK_F_MQ, +} +; +static unsigned int features[] = { + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, + VIRTIO_BLK_F_TOPOLOGY, + VIRTIO_BLK_F_MQ, +}; + +static struct virtio_driver virtio_blk = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .feature_table_legacy = features_legacy, + .feature_table_size_legacy = ARRAY_SIZE(features_legacy), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtblk_probe, + .remove = virtblk_remove, + .config_changed = virtblk_config_changed, +#ifdef CONFIG_PM_SLEEP + .freeze = virtblk_freeze, + .restore = virtblk_restore, +#endif +}; + +static int __init init(void) +{ + int error; + + virtblk_wq = alloc_workqueue("virtio-blk", 0, 0); + if (!virtblk_wq) + return -ENOMEM; + + major = register_blkdev(0, "virtblk"); + if (major < 0) { + error = major; + goto out_destroy_workqueue; + } + + error = register_virtio_driver(&virtio_blk); + if (error) + goto out_unregister_blkdev; + return 0; + +out_unregister_blkdev: + unregister_blkdev(major, "virtblk"); +out_destroy_workqueue: + destroy_workqueue(virtblk_wq); + return error; +} + +static void __exit fini(void) +{ + unregister_virtio_driver(&virtio_blk); + unregister_blkdev(major, "virtblk"); + destroy_workqueue(virtblk_wq); +} +module_init(init); +module_exit(fini); + +MODULE_DEVICE_TABLE(virtio, id_table); +MODULE_DESCRIPTION("Virtio block driver"); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/xen-blkback/Makefile b/kernel/drivers/block/xen-blkback/Makefile new file mode 100644 index 000000000..e491c1b76 --- /dev/null +++ b/kernel/drivers/block/xen-blkback/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o + +xen-blkback-y := blkback.o xenbus.o diff --git a/kernel/drivers/block/xen-blkback/blkback.c b/kernel/drivers/block/xen-blkback/blkback.c new file mode 100644 index 000000000..713fc9ff1 --- /dev/null +++ b/kernel/drivers/block/xen-blkback/blkback.c @@ -0,0 +1,1456 @@ +/****************************************************************************** + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * drivers/block/xen-blkfront.c + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen-blkback: " fmt + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +/* + * Maximum number of unused free pages to keep in the internal buffer. + * Setting this to a value too low will reduce memory used in each backend, + * but can have a performance penalty. + * + * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can + * be set to a lower value that might degrade performance on some intensive + * IO workloads. + */ + +static int xen_blkif_max_buffer_pages = 1024; +module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644); +MODULE_PARM_DESC(max_buffer_pages, +"Maximum number of free pages to keep in each block backend buffer"); + +/* + * Maximum number of grants to map persistently in blkback. For maximum + * performance this should be the total numbers of grants that can be used + * to fill the ring, but since this might become too high, specially with + * the use of indirect descriptors, we set it to a value that provides good + * performance without using too much memory. + * + * When the list of persistent grants is full we clean it up using a LRU + * algorithm. + */ + +static int xen_blkif_max_pgrants = 1056; +module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); +MODULE_PARM_DESC(max_persistent_grants, + "Maximum number of grants to map persistently"); + +/* + * The LRU mechanism to clean the lists of persistent grants needs to + * be executed periodically. The time interval between consecutive executions + * of the purge mechanism is set in ms. + */ +#define LRU_INTERVAL 100 + +/* + * When the persistent grants list is full we will remove unused grants + * from the list. The percent number of grants to be removed at each LRU + * execution. + */ +#define LRU_PERCENT_CLEAN 5 + +/* Run-time switchable: /sys/module/blkback/parameters/ */ +static unsigned int log_stats; +module_param(log_stats, int, 0644); + +#define BLKBACK_INVALID_HANDLE (~0) + +/* Number of free pages to remove on each call to gnttab_free_pages */ +#define NUM_BATCH_FREE_PAGES 10 + +static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +{ + unsigned long flags; + + spin_lock_irqsave(&blkif->free_pages_lock, flags); + if (list_empty(&blkif->free_pages)) { + BUG_ON(blkif->free_pages_num != 0); + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + return gnttab_alloc_pages(1, page); + } + BUG_ON(blkif->free_pages_num == 0); + page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + list_del(&page[0]->lru); + blkif->free_pages_num--; + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + + return 0; +} + +static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, + int num) +{ + unsigned long flags; + int i; + + spin_lock_irqsave(&blkif->free_pages_lock, flags); + for (i = 0; i < num; i++) + list_add(&page[i]->lru, &blkif->free_pages); + blkif->free_pages_num += num; + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); +} + +static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +{ + /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ + struct page *page[NUM_BATCH_FREE_PAGES]; + unsigned int num_pages = 0; + unsigned long flags; + + spin_lock_irqsave(&blkif->free_pages_lock, flags); + while (blkif->free_pages_num > num) { + BUG_ON(list_empty(&blkif->free_pages)); + page[num_pages] = list_first_entry(&blkif->free_pages, + struct page, lru); + list_del(&page[num_pages]->lru); + blkif->free_pages_num--; + if (++num_pages == NUM_BATCH_FREE_PAGES) { + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + gnttab_free_pages(num_pages, page); + spin_lock_irqsave(&blkif->free_pages_lock, flags); + num_pages = 0; + } + } + spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + if (num_pages != 0) + gnttab_free_pages(num_pages, page); +} + +#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) + +static int do_block_io_op(struct xen_blkif *blkif); +static int dispatch_rw_block_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req); +static void make_response(struct xen_blkif *blkif, u64 id, + unsigned short op, int st); + +#define foreach_grant_safe(pos, n, rbtree, node) \ + for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \ + (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \ + &(pos)->node != NULL; \ + (pos) = container_of(n, typeof(*(pos)), node), \ + (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) + + +/* + * We don't need locking around the persistent grant helpers + * because blkback uses a single-thread for each backed, so we + * can be sure that this functions will never be called recursively. + * + * The only exception to that is put_persistent_grant, that can be called + * from interrupt context (by xen_blkbk_unmap), so we have to use atomic + * bit operations to modify the flags of a persistent grant and to count + * the number of used grants. + */ +static int add_persistent_gnt(struct xen_blkif *blkif, + struct persistent_gnt *persistent_gnt) +{ + struct rb_node **new = NULL, *parent = NULL; + struct persistent_gnt *this; + + if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (!blkif->vbd.overflow_max_grants) + blkif->vbd.overflow_max_grants = 1; + return -EBUSY; + } + /* Figure out where to put new node */ + new = &blkif->persistent_gnts.rb_node; + while (*new) { + this = container_of(*new, struct persistent_gnt, node); + + parent = *new; + if (persistent_gnt->gnt < this->gnt) + new = &((*new)->rb_left); + else if (persistent_gnt->gnt > this->gnt) + new = &((*new)->rb_right); + else { + pr_alert_ratelimited("trying to add a gref that's already in the tree\n"); + return -EINVAL; + } + } + + bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE); + set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); + /* Add new node and rebalance tree. */ + rb_link_node(&(persistent_gnt->node), parent, new); + rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); + blkif->persistent_gnt_c++; + atomic_inc(&blkif->persistent_gnt_in_use); + return 0; +} + +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, + grant_ref_t gref) +{ + struct persistent_gnt *data; + struct rb_node *node = NULL; + + node = blkif->persistent_gnts.rb_node; + while (node) { + data = container_of(node, struct persistent_gnt, node); + + if (gref < data->gnt) + node = node->rb_left; + else if (gref > data->gnt) + node = node->rb_right; + else { + if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) { + pr_alert_ratelimited("requesting a grant already in use\n"); + return NULL; + } + set_bit(PERSISTENT_GNT_ACTIVE, data->flags); + atomic_inc(&blkif->persistent_gnt_in_use); + return data; + } + } + return NULL; +} + +static void put_persistent_gnt(struct xen_blkif *blkif, + struct persistent_gnt *persistent_gnt) +{ + if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) + pr_alert_ratelimited("freeing a grant already unused\n"); + set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); + clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); + atomic_dec(&blkif->persistent_gnt_in_use); +} + +static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, + unsigned int num) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct persistent_gnt *persistent_gnt; + struct rb_node *n; + int segs_to_unmap = 0; + struct gntab_unmap_queue_data unmap_data; + + unmap_data.pages = pages; + unmap_data.unmap_ops = unmap; + unmap_data.kunmap_ops = NULL; + + foreach_grant_safe(persistent_gnt, n, root, node) { + BUG_ON(persistent_gnt->handle == + BLKBACK_INVALID_HANDLE); + gnttab_set_unmap_op(&unmap[segs_to_unmap], + (unsigned long) pfn_to_kaddr(page_to_pfn( + persistent_gnt->page)), + GNTMAP_host_map, + persistent_gnt->handle); + + pages[segs_to_unmap] = persistent_gnt->page; + + if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || + !rb_next(&persistent_gnt->node)) { + + unmap_data.count = segs_to_unmap; + BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); + + put_free_pages(blkif, pages, segs_to_unmap); + segs_to_unmap = 0; + } + + rb_erase(&persistent_gnt->node, root); + kfree(persistent_gnt); + num--; + } + BUG_ON(num != 0); +} + +void xen_blkbk_unmap_purged_grants(struct work_struct *work) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct persistent_gnt *persistent_gnt; + int segs_to_unmap = 0; + struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct gntab_unmap_queue_data unmap_data; + + unmap_data.pages = pages; + unmap_data.unmap_ops = unmap; + unmap_data.kunmap_ops = NULL; + + while(!list_empty(&blkif->persistent_purge_list)) { + persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + struct persistent_gnt, + remove_node); + list_del(&persistent_gnt->remove_node); + + gnttab_set_unmap_op(&unmap[segs_to_unmap], + vaddr(persistent_gnt->page), + GNTMAP_host_map, + persistent_gnt->handle); + + pages[segs_to_unmap] = persistent_gnt->page; + + if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { + unmap_data.count = segs_to_unmap; + BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); + put_free_pages(blkif, pages, segs_to_unmap); + segs_to_unmap = 0; + } + kfree(persistent_gnt); + } + if (segs_to_unmap > 0) { + unmap_data.count = segs_to_unmap; + BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); + put_free_pages(blkif, pages, segs_to_unmap); + } +} + +static void purge_persistent_gnt(struct xen_blkif *blkif) +{ + struct persistent_gnt *persistent_gnt; + struct rb_node *n; + unsigned int num_clean, total; + bool scan_used = false, clean_used = false; + struct rb_root *root; + + if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || + (blkif->persistent_gnt_c == xen_blkif_max_pgrants && + !blkif->vbd.overflow_max_grants)) { + return; + } + + if (work_pending(&blkif->persistent_purge_work)) { + pr_alert_ratelimited("Scheduled work from previous purge is still pending, cannot purge list\n"); + return; + } + + num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; + num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(blkif->persistent_gnt_c, num_clean); + if ((num_clean == 0) || + (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) + return; + + /* + * At this point, we can assure that there will be no calls + * to get_persistent_grant (because we are executing this code from + * xen_blkif_schedule), there can only be calls to put_persistent_gnt, + * which means that the number of currently used grants will go down, + * but never up, so we will always be able to remove the requested + * number of grants. + */ + + total = num_clean; + + pr_debug("Going to purge %u persistent grants\n", num_clean); + + BUG_ON(!list_empty(&blkif->persistent_purge_list)); + root = &blkif->persistent_gnts; +purge_list: + foreach_grant_safe(persistent_gnt, n, root, node) { + BUG_ON(persistent_gnt->handle == + BLKBACK_INVALID_HANDLE); + + if (clean_used) { + clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); + continue; + } + + if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) + continue; + if (!scan_used && + (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags))) + continue; + + rb_erase(&persistent_gnt->node, root); + list_add(&persistent_gnt->remove_node, + &blkif->persistent_purge_list); + if (--num_clean == 0) + goto finished; + } + /* + * If we get here it means we also need to start cleaning + * grants that were used since last purge in order to cope + * with the requested num + */ + if (!scan_used && !clean_used) { + pr_debug("Still missing %u purged frames\n", num_clean); + scan_used = true; + goto purge_list; + } +finished: + if (!clean_used) { + pr_debug("Finished scanning for grants to clean, removing used flag\n"); + clean_used = true; + goto purge_list; + } + + blkif->persistent_gnt_c -= (total - num_clean); + blkif->vbd.overflow_max_grants = 0; + + /* We can defer this work */ + schedule_work(&blkif->persistent_purge_work); + pr_debug("Purged %u/%u\n", (total - num_clean), total); + return; +} + +/* + * Retrieve from the 'pending_reqs' a free pending_req structure to be used. + */ +static struct pending_req *alloc_req(struct xen_blkif *blkif) +{ + struct pending_req *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&blkif->pending_free_lock, flags); + if (!list_empty(&blkif->pending_free)) { + req = list_entry(blkif->pending_free.next, struct pending_req, + free_list); + list_del(&req->free_list); + } + spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + return req; +} + +/* + * Return the 'pending_req' structure back to the freepool. We also + * wake up the thread if it was waiting for a free page. + */ +static void free_req(struct xen_blkif *blkif, struct pending_req *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&blkif->pending_free_lock, flags); + was_empty = list_empty(&blkif->pending_free); + list_add(&req->free_list, &blkif->pending_free); + spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + if (was_empty) + wake_up(&blkif->pending_free_wq); +} + +/* + * Routines for managing virtual block devices (vbds). + */ +static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, + int operation) +{ + struct xen_vbd *vbd = &blkif->vbd; + int rc = -EACCES; + + if ((operation != READ) && vbd->readonly) + goto out; + + if (likely(req->nr_sects)) { + blkif_sector_t end = req->sector_number + req->nr_sects; + + if (unlikely(end < req->sector_number)) + goto out; + if (unlikely(end > vbd_sz(vbd))) + goto out; + } + + req->dev = vbd->pdevice; + req->bdev = vbd->bdev; + rc = 0; + + out: + return rc; +} + +static void xen_vbd_resize(struct xen_blkif *blkif) +{ + struct xen_vbd *vbd = &blkif->vbd; + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); + unsigned long long new_size = vbd_sz(vbd); + + pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n", + blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); + pr_info("VBD Resize: new size %llu\n", new_size); + vbd->size = new_size; +again: + err = xenbus_transaction_start(&xbt); + if (err) { + pr_warn("Error starting transaction\n"); + return; + } + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + (unsigned long long)vbd_sz(vbd)); + if (err) { + pr_warn("Error writing new size\n"); + goto abort; + } + /* + * Write the current state; we will use this to synchronize + * the front-end. If the current state is "connected" the + * front-end will get the new size information online. + */ + err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); + if (err) { + pr_warn("Error writing the state\n"); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + pr_warn("Error ending transaction\n"); + return; +abort: + xenbus_transaction_end(xbt, 1); +} + +/* + * Notification from the guest OS. + */ +static void blkif_notify_work(struct xen_blkif *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t xen_blkif_be_int(int irq, void *dev_id) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + +/* + * SCHEDULER FUNCTIONS + */ + +static void print_stats(struct xen_blkif *blkif) +{ + pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" + " | ds %4llu | pg: %4u/%4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req, + blkif->st_f_req, blkif->st_ds_req, + blkif->persistent_gnt_c, + xen_blkif_max_pgrants); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; + blkif->st_ds_req = 0; +} + +int xen_blkif_schedule(void *arg) +{ + struct xen_blkif *blkif = arg; + struct xen_vbd *vbd = &blkif->vbd; + unsigned long timeout; + int ret; + + xen_blkif_get(blkif); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + if (unlikely(vbd->size != vbd_sz(vbd))) + xen_vbd_resize(blkif); + + timeout = msecs_to_jiffies(LRU_INTERVAL); + + timeout = wait_event_interruptible_timeout( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop(), + timeout); + if (timeout == 0) + goto purge_gnt_list; + timeout = wait_event_interruptible_timeout( + blkif->pending_free_wq, + !list_empty(&blkif->pending_free) || + kthread_should_stop(), + timeout); + if (timeout == 0) + goto purge_gnt_list; + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + ret = do_block_io_op(blkif); + if (ret > 0) + blkif->waiting_reqs = 1; + if (ret == -EACCES) + wait_event_interruptible(blkif->shutdown_wq, + kthread_should_stop()); + +purge_gnt_list: + if (blkif->vbd.feature_gnt_persistent && + time_after(jiffies, blkif->next_lru)) { + purge_persistent_gnt(blkif); + blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + } + + /* Shrink if we have more than xen_blkif_max_buffer_pages */ + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + /* Drain pending purge work */ + flush_work(&blkif->persistent_purge_work); + + if (log_stats) + print_stats(blkif); + + blkif->xenblkd = NULL; + xen_blkif_put(blkif); + + return 0; +} + +/* + * Remove persistent grants and empty the pool of free pages + */ +void xen_blkbk_free_caches(struct xen_blkif *blkif) +{ + /* Free all persistent grant pages */ + if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) + free_persistent_gnts(blkif, &blkif->persistent_gnts, + blkif->persistent_gnt_c); + + BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); + blkif->persistent_gnt_c = 0; + + /* Since we are shutting down remove all pages from the buffer */ + shrink_free_pagepool(blkif, 0 /* All */); +} + +static unsigned int xen_blkbk_unmap_prepare( + struct xen_blkif *blkif, + struct grant_page **pages, + unsigned int num, + struct gnttab_unmap_grant_ref *unmap_ops, + struct page **unmap_pages) +{ + unsigned int i, invcount = 0; + + for (i = 0; i < num; i++) { + if (pages[i]->persistent_gnt != NULL) { + put_persistent_gnt(blkif, pages[i]->persistent_gnt); + continue; + } + if (pages[i]->handle == BLKBACK_INVALID_HANDLE) + continue; + unmap_pages[invcount] = pages[i]->page; + gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page), + GNTMAP_host_map, pages[i]->handle); + pages[i]->handle = BLKBACK_INVALID_HANDLE; + invcount++; + } + + return invcount; +} + +static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) +{ + struct pending_req* pending_req = (struct pending_req*) (data->data); + struct xen_blkif *blkif = pending_req->blkif; + + /* BUG_ON used to reproduce existing behaviour, + but is this the best way to deal with this? */ + BUG_ON(result); + + put_free_pages(blkif, data->pages, data->count); + make_response(blkif, pending_req->id, + pending_req->operation, pending_req->status); + free_req(blkif, pending_req); + /* + * Make sure the request is freed before releasing blkif, + * or there could be a race between free_req and the + * cleanup done in xen_blkif_free during shutdown. + * + * NB: The fact that we might try to wake up pending_free_wq + * before drain_complete (in case there's a drain going on) + * it's not a problem with our current implementation + * because we can assure there's no thread waiting on + * pending_free_wq if there's a drain going on, but it has + * to be taken into account if the current model is changed. + */ + if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + complete(&blkif->drain_complete); + } + xen_blkif_put(blkif); +} + +static void xen_blkbk_unmap_and_respond(struct pending_req *req) +{ + struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; + struct xen_blkif *blkif = req->blkif; + struct grant_page **pages = req->segments; + unsigned int invcount; + + invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_pages, + req->unmap, req->unmap_pages); + + work->data = req; + work->done = xen_blkbk_unmap_and_respond_callback; + work->unmap_ops = req->unmap; + work->kunmap_ops = NULL; + work->pages = req->unmap_pages; + work->count = invcount; + + gnttab_unmap_refs_async(&req->gnttab_unmap_data); +} + + +/* + * Unmap the grant references. + * + * This could accumulate ops up to the batch size to reduce the number + * of hypercalls, but since this is only used in error paths there's + * no real need. + */ +static void xen_blkbk_unmap(struct xen_blkif *blkif, + struct grant_page *pages[], + int num) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int invcount = 0; + int ret; + + while (num) { + unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); + + invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + unmap, unmap_pages); + if (invcount) { + ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); + BUG_ON(ret); + put_free_pages(blkif, unmap_pages, invcount); + } + pages += batch; + num -= batch; + } +} + +static int xen_blkbk_map(struct xen_blkif *blkif, + struct grant_page *pages[], + int num, bool ro) +{ + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct persistent_gnt *persistent_gnt = NULL; + phys_addr_t addr = 0; + int i, seg_idx, new_map_idx; + int segs_to_map = 0; + int ret = 0; + int last_map = 0, map_until = 0; + int use_persistent_gnts; + + use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); + + /* + * Fill out preq.nr_sects with proper amount of sectors, and setup + * assign map[..] with the PFN of the page in our domain with the + * corresponding grant reference for each page. + */ +again: + for (i = map_until; i < num; i++) { + uint32_t flags; + + if (use_persistent_gnts) + persistent_gnt = get_persistent_gnt( + blkif, + pages[i]->gref); + + if (persistent_gnt) { + /* + * We are using persistent grants and + * the grant is already mapped + */ + pages[i]->page = persistent_gnt->page; + pages[i]->persistent_gnt = persistent_gnt; + } else { + if (get_free_page(blkif, &pages[i]->page)) + goto out_of_memory; + addr = vaddr(pages[i]->page); + pages_to_gnt[segs_to_map] = pages[i]->page; + pages[i]->persistent_gnt = NULL; + flags = GNTMAP_host_map; + if (!use_persistent_gnts && ro) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[segs_to_map++], addr, + flags, pages[i]->gref, + blkif->domid); + } + map_until = i + 1; + if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST) + break; + } + + if (segs_to_map) { + ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map); + BUG_ON(ret); + } + + /* + * Now swizzle the MFN in our domain with the MFN from the other domain + * so that when we access vaddr(pending_req,i) it has the contents of + * the page from the other domain. + */ + for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) { + if (!pages[seg_idx]->persistent_gnt) { + /* This is a newly mapped grant */ + BUG_ON(new_map_idx >= segs_to_map); + if (unlikely(map[new_map_idx].status != 0)) { + pr_debug("invalid buffer -- could not remap it\n"); + put_free_pages(blkif, &pages[seg_idx]->page, 1); + pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; + ret |= 1; + goto next; + } + pages[seg_idx]->handle = map[new_map_idx].handle; + } else { + continue; + } + if (use_persistent_gnts && + blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + /* + * We are using persistent grants, the grant is + * not mapped but we might have room for it. + */ + persistent_gnt = kmalloc(sizeof(struct persistent_gnt), + GFP_KERNEL); + if (!persistent_gnt) { + /* + * If we don't have enough memory to + * allocate the persistent_gnt struct + * map this grant non-persistenly + */ + goto next; + } + persistent_gnt->gnt = map[new_map_idx].ref; + persistent_gnt->handle = map[new_map_idx].handle; + persistent_gnt->page = pages[seg_idx]->page; + if (add_persistent_gnt(blkif, + persistent_gnt)) { + kfree(persistent_gnt); + persistent_gnt = NULL; + goto next; + } + pages[seg_idx]->persistent_gnt = persistent_gnt; + pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", + persistent_gnt->gnt, blkif->persistent_gnt_c, + xen_blkif_max_pgrants); + goto next; + } + if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) { + blkif->vbd.overflow_max_grants = 1; + pr_debug("domain %u, device %#x is using maximum number of persistent grants\n", + blkif->domid, blkif->vbd.handle); + } + /* + * We could not map this grant persistently, so use it as + * a non-persistent grant. + */ +next: + new_map_idx++; + } + segs_to_map = 0; + last_map = map_until; + if (map_until != num) + goto again; + + return ret; + +out_of_memory: + pr_alert("%s: out of memory\n", __func__); + put_free_pages(blkif, pages_to_gnt, segs_to_map); + return -ENOMEM; +} + +static int xen_blkbk_map_seg(struct pending_req *pending_req) +{ + int rc; + + rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + pending_req->nr_pages, + (pending_req->operation != BLKIF_OP_READ)); + + return rc; +} + +static int xen_blkbk_parse_indirect(struct blkif_request *req, + struct pending_req *pending_req, + struct seg_buf seg[], + struct phys_req *preq) +{ + struct grant_page **pages = pending_req->indirect_pages; + struct xen_blkif *blkif = pending_req->blkif; + int indirect_grefs, rc, n, nseg, i; + struct blkif_request_segment *segments = NULL; + + nseg = pending_req->nr_pages; + indirect_grefs = INDIRECT_PAGES(nseg); + BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); + + for (i = 0; i < indirect_grefs; i++) + pages[i]->gref = req->u.indirect.indirect_grefs[i]; + + rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + if (rc) + goto unmap; + + for (n = 0, i = 0; n < nseg; n++) { + if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { + /* Map indirect segments */ + if (segments) + kunmap_atomic(segments); + segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); + } + i = n % SEGS_PER_INDIRECT_FRAME; + pending_req->segments[n]->gref = segments[i].gref; + seg[n].nsec = segments[i].last_sect - + segments[i].first_sect + 1; + seg[n].offset = (segments[i].first_sect << 9); + if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || + (segments[i].last_sect < segments[i].first_sect)) { + rc = -EINVAL; + goto unmap; + } + preq->nr_sects += seg[n].nsec; + } + +unmap: + if (segments) + kunmap_atomic(segments); + xen_blkbk_unmap(blkif, pages, indirect_grefs); + return rc; +} + +static int dispatch_discard_io(struct xen_blkif *blkif, + struct blkif_request *req) +{ + int err = 0; + int status = BLKIF_RSP_OKAY; + struct block_device *bdev = blkif->vbd.bdev; + unsigned long secure; + struct phys_req preq; + + xen_blkif_get(blkif); + + preq.sector_number = req->u.discard.sector_number; + preq.nr_sects = req->u.discard.nr_sectors; + + err = xen_vbd_translate(&preq, blkif, WRITE); + if (err) { + pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n", + preq.sector_number, + preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); + goto fail_response; + } + blkif->st_ds_req++; + + secure = (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? + BLKDEV_DISCARD_SECURE : 0; + + err = blkdev_issue_discard(bdev, req->u.discard.sector_number, + req->u.discard.nr_sectors, + GFP_KERNEL, secure); +fail_response: + if (err == -EOPNOTSUPP) { + pr_debug("discard op failed, not supported\n"); + status = BLKIF_RSP_EOPNOTSUPP; + } else if (err) + status = BLKIF_RSP_ERROR; + + make_response(blkif, req->u.discard.id, req->operation, status); + xen_blkif_put(blkif); + return err; +} + +static int dispatch_other_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req) +{ + free_req(blkif, pending_req); + make_response(blkif, req->u.other.id, req->operation, + BLKIF_RSP_EOPNOTSUPP); + return -EIO; +} + +static void xen_blk_drain_io(struct xen_blkif *blkif) +{ + atomic_set(&blkif->drain, 1); + do { + if (atomic_read(&blkif->inflight) == 0) + break; + wait_for_completion_interruptible_timeout( + &blkif->drain_complete, HZ); + + if (!atomic_read(&blkif->drain)) + break; + } while (!kthread_should_stop()); + atomic_set(&blkif->drain, 0); +} + +/* + * Completion callback on the bio's. Called as bh->b_end_io() + */ + +static void __end_block_io_op(struct pending_req *pending_req, int error) +{ + /* An error fails the entire request. */ + if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && + (error == -EOPNOTSUPP)) { + pr_debug("flush diskcache op failed, not supported\n"); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + pr_debug("write barrier op failed, not supported\n"); + xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + pr_debug("Buffer not up-to-date at end of operation," + " error=%d\n", error); + pending_req->status = BLKIF_RSP_ERROR; + } + + /* + * If all of the bio's have completed it is time to unmap + * the grant references associated with 'request' and provide + * the proper response on the ring. + */ + if (atomic_dec_and_test(&pending_req->pendcnt)) + xen_blkbk_unmap_and_respond(pending_req); +} + +/* + * bio callback. + */ +static void end_block_io_op(struct bio *bio, int error) +{ + __end_block_io_op(bio->bi_private, error); + bio_put(bio); +} + + + +/* + * Function to copy the from the ring buffer the 'struct blkif_request' + * (which has the sectors we want, number of them, grant references, etc), + * and transmute it to the block API to hand it over to the proper block disk. + */ +static int +__do_block_io_op(struct xen_blkif *blkif) +{ + union blkif_back_rings *blk_rings = &blkif->blk_rings; + struct blkif_request req; + struct pending_req *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { + rc = blk_rings->common.rsp_prod_pvt; + pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n", + rp, rc, rp - rc, blkif->vbd.pdevice); + return -EACCES; + } + while (rc != rp) { + + if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) + break; + + if (kthread_should_stop()) { + more_to_do = 1; + break; + } + + pending_req = alloc_req(blkif); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); + break; + case BLKIF_PROTOCOL_X86_32: + blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); + break; + case BLKIF_PROTOCOL_X86_64: + blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); + break; + default: + BUG(); + } + blk_rings->common.req_cons = ++rc; /* before make_response() */ + + /* Apply all sanity checks to /private copy/ of request. */ + barrier(); + + switch (req.operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + case BLKIF_OP_INDIRECT: + if (dispatch_rw_block_io(blkif, &req, pending_req)) + goto done; + break; + case BLKIF_OP_DISCARD: + free_req(blkif, pending_req); + if (dispatch_discard_io(blkif, &req)) + goto done; + break; + default: + if (dispatch_other_io(blkif, &req, pending_req)) + goto done; + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } +done: + return more_to_do; +} + +static int +do_block_io_op(struct xen_blkif *blkif) +{ + union blkif_back_rings *blk_rings = &blkif->blk_rings; + int more_to_do; + + do { + more_to_do = __do_block_io_op(blkif); + if (more_to_do) + break; + + RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); + } while (more_to_do); + + return more_to_do; +} +/* + * Transmutation of the 'struct blkif_request' to a proper 'struct bio' + * and call the 'submit_bio' to pass it to the underlying storage. + */ +static int dispatch_rw_block_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req) +{ + struct phys_req preq; + struct seg_buf *seg = pending_req->seg; + unsigned int nseg; + struct bio *bio = NULL; + struct bio **biolist = pending_req->biolist; + int i, nbio = 0; + int operation; + struct blk_plug plug; + bool drain = false; + struct grant_page **pages = pending_req->segments; + unsigned short req_operation; + + req_operation = req->operation == BLKIF_OP_INDIRECT ? + req->u.indirect.indirect_op : req->operation; + if ((req->operation == BLKIF_OP_INDIRECT) && + (req_operation != BLKIF_OP_READ) && + (req_operation != BLKIF_OP_WRITE)) { + pr_debug("Invalid indirect operation (%u)\n", req_operation); + goto fail_response; + } + + switch (req_operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + operation = READ; + break; + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + operation = WRITE_ODIRECT; + break; + case BLKIF_OP_WRITE_BARRIER: + drain = true; + case BLKIF_OP_FLUSH_DISKCACHE: + blkif->st_f_req++; + operation = WRITE_FLUSH; + break; + default: + operation = 0; /* make gcc happy */ + goto fail_response; + break; + } + + /* Check that the number of segments is sane. */ + nseg = req->operation == BLKIF_OP_INDIRECT ? + req->u.indirect.nr_segments : req->u.rw.nr_segments; + + if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || + unlikely((req->operation != BLKIF_OP_INDIRECT) && + (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || + unlikely((req->operation == BLKIF_OP_INDIRECT) && + (nseg > MAX_INDIRECT_SEGMENTS))) { + pr_debug("Bad number of segments in request (%d)\n", nseg); + /* Haven't submitted any bio's yet. */ + goto fail_response; + } + + preq.nr_sects = 0; + + pending_req->blkif = blkif; + pending_req->id = req->u.rw.id; + pending_req->operation = req_operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + if (req->operation != BLKIF_OP_INDIRECT) { + preq.dev = req->u.rw.handle; + preq.sector_number = req->u.rw.sector_number; + for (i = 0; i < nseg; i++) { + pages[i]->gref = req->u.rw.seg[i].gref; + seg[i].nsec = req->u.rw.seg[i].last_sect - + req->u.rw.seg[i].first_sect + 1; + seg[i].offset = (req->u.rw.seg[i].first_sect << 9); + if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (req->u.rw.seg[i].last_sect < + req->u.rw.seg[i].first_sect)) + goto fail_response; + preq.nr_sects += seg[i].nsec; + } + } else { + preq.dev = req->u.indirect.handle; + preq.sector_number = req->u.indirect.sector_number; + if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq)) + goto fail_response; + } + + if (xen_vbd_translate(&preq, blkif, operation) != 0) { + pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, + blkif->vbd.pdevice); + goto fail_response; + } + + /* + * This check _MUST_ be done after xen_vbd_translate as the preq.bdev + * is set there. + */ + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { + pr_debug("Misaligned I/O request from domain %d\n", + blkif->domid); + goto fail_response; + } + } + + /* Wait on all outstanding I/O's and once that has been completed + * issue the WRITE_FLUSH. + */ + if (drain) + xen_blk_drain_io(pending_req->blkif); + + /* + * If we have failed at this point, we need to undo the M2P override, + * set gnttab_set_unmap_op on all of the grant references and perform + * the hypercall to unmap the grants - that is all done in + * xen_blkbk_unmap. + */ + if (xen_blkbk_map_seg(pending_req)) + goto fail_flush; + + /* + * This corresponding xen_blkif_put is done in __end_block_io_op, or + * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. + */ + xen_blkif_get(blkif); + atomic_inc(&blkif->inflight); + + for (i = 0; i < nseg; i++) { + while ((bio == NULL) || + (bio_add_page(bio, + pages[i]->page, + seg[i].nsec << 9, + seg[i].offset) == 0)) { + + int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES); + bio = bio_alloc(GFP_KERNEL, nr_iovecs); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + biolist[nbio++] = bio; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_iter.bi_sector = preq.sector_number; + } + + preq.sector_number += seg[i].nsec; + } + + /* This will be hit if the operation was a flush or discard. */ + if (!bio) { + BUG_ON(operation != WRITE_FLUSH); + + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + biolist[nbio++] = bio; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + } + + atomic_set(&pending_req->pendcnt, nbio); + blk_start_plug(&plug); + + for (i = 0; i < nbio; i++) + submit_bio(operation, biolist[i]); + + /* Let the I/Os go.. */ + blk_finish_plug(&plug); + + if (operation == READ) + blkif->st_rd_sect += preq.nr_sects; + else if (operation & WRITE) + blkif->st_wr_sect += preq.nr_sects; + + return 0; + + fail_flush: + xen_blkbk_unmap(blkif, pending_req->segments, + pending_req->nr_pages); + fail_response: + /* Haven't submitted any bio's yet. */ + make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(blkif, pending_req); + msleep(1); /* back off a bit */ + return -EIO; + + fail_put_bio: + for (i = 0; i < nbio; i++) + bio_put(biolist[i]); + atomic_set(&pending_req->pendcnt, 1); + __end_block_io_op(pending_req, -EINVAL); + msleep(1); /* back off a bit */ + return -EIO; +} + + + +/* + * Put a response on the ring on how the operation fared. + */ +static void make_response(struct xen_blkif *blkif, u64 id, + unsigned short op, int st) +{ + struct blkif_response resp; + unsigned long flags; + union blkif_back_rings *blk_rings = &blkif->blk_rings; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init xen_blkif_init(void) +{ + int rc = 0; + + if (!xen_domain()) + return -ENODEV; + + rc = xen_blkif_interface_init(); + if (rc) + goto failed_init; + + rc = xen_blkif_xenbus_init(); + if (rc) + goto failed_init; + + failed_init: + return rc; +} + +module_init(xen_blkif_init); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:vbd"); diff --git a/kernel/drivers/block/xen-blkback/common.h b/kernel/drivers/block/xen-blkback/common.h new file mode 100644 index 000000000..f620b5d3f --- /dev/null +++ b/kernel/drivers/block/xen-blkback/common.h @@ -0,0 +1,492 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BLKIF__BACKEND__COMMON_H__ +#define __XEN_BLKIF__BACKEND__COMMON_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is the maximum number of segments that would be allowed in indirect + * requests. This value will also be passed to the frontend. + */ +#define MAX_INDIRECT_SEGMENTS 256 + +#define SEGS_PER_INDIRECT_FRAME \ + (PAGE_SIZE/sizeof(struct blkif_request_segment)) +#define MAX_INDIRECT_PAGES \ + ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +#define INDIRECT_PAGES(_segs) \ + ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +struct blkif_x86_32_request_rw { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +} __attribute__((__packed__)); + +struct blkif_x86_32_request_discard { + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ + blkif_vdev_t _pad1; /* was "handle" for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + uint64_t nr_sectors; +} __attribute__((__packed__)); + +struct blkif_x86_32_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + +struct blkif_x86_32_request_indirect { + uint8_t indirect_op; + uint16_t nr_segments; + uint64_t id; + blkif_sector_t sector_number; + blkif_vdev_t handle; + uint16_t _pad1; + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; + /* + * The maximum number of indirect segments (and pages) that will + * be used is determined by MAX_INDIRECT_SEGMENTS, this value + * is also exported to the guest (via xenstore + * feature-max-indirect-segments entry), so the frontend knows how + * many indirect segments the backend supports. + */ + uint64_t _pad2; /* make it 64 byte aligned */ +} __attribute__((__packed__)); + +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + union { + struct blkif_x86_32_request_rw rw; + struct blkif_x86_32_request_discard discard; + struct blkif_x86_32_request_other other; + struct blkif_x86_32_request_indirect indirect; + } u; +} __attribute__((__packed__)); + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +#pragma pack(pop) +/* x86_64 protocol version */ + +struct blkif_x86_64_request_rw { + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */ + uint64_t id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +} __attribute__((__packed__)); + +struct blkif_x86_64_request_discard { + uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */ + blkif_vdev_t _pad1; /* was "handle" for read/write requests */ + uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */ + uint64_t id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + uint64_t nr_sectors; +} __attribute__((__packed__)); + +struct blkif_x86_64_request_other { + uint8_t _pad1; + blkif_vdev_t _pad2; + uint32_t _pad3; /* offsetof(blkif_..,u.discard.id)==8 */ + uint64_t id; /* private guest value, echoed in resp */ +} __attribute__((__packed__)); + +struct blkif_x86_64_request_indirect { + uint8_t indirect_op; + uint16_t nr_segments; + uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */ + uint64_t id; + blkif_sector_t sector_number; + blkif_vdev_t handle; + uint16_t _pad2; + grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST]; + /* + * The maximum number of indirect segments (and pages) that will + * be used is determined by MAX_INDIRECT_SEGMENTS, this value + * is also exported to the guest (via xenstore + * feature-max-indirect-segments entry), so the frontend knows how + * many indirect segments the backend supports. + */ + uint32_t _pad3; /* make it 64 byte aligned */ +} __attribute__((__packed__)); + +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + union { + struct blkif_x86_64_request_rw rw; + struct blkif_x86_64_request_discard discard; + struct blkif_x86_64_request_other other; + struct blkif_x86_64_request_indirect indirect; + } u; +} __attribute__((__packed__)); + +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, + struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, + struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, + struct blkif_x86_64_response); + +union blkif_back_rings { + struct blkif_back_ring native; + struct blkif_common_back_ring common; + struct blkif_x86_32_back_ring x86_32; + struct blkif_x86_64_back_ring x86_64; +}; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + +/* + * Default protocol if the frontend doesn't specify one. + */ +#ifdef CONFIG_X86 +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32 +#else +# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE +#endif + +struct xen_vbd { + /* What the domain refers to this vbd as. */ + blkif_vdev_t handle; + /* Non-zero -> read-only */ + unsigned char readonly; + /* VDISK_xxx */ + unsigned char type; + /* phys device that this vbd maps to. */ + u32 pdevice; + struct block_device *bdev; + /* Cached size parameter. */ + sector_t size; + unsigned int flush_support:1; + unsigned int discard_secure:1; + unsigned int feature_gnt_persistent:1; + unsigned int overflow_max_grants:1; +}; + +struct backend_info; + +/* Number of available flags */ +#define PERSISTENT_GNT_FLAGS_SIZE 2 +/* This persistent grant is currently in use */ +#define PERSISTENT_GNT_ACTIVE 0 +/* + * This persistent grant has been used, this flag is set when we remove the + * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently. + */ +#define PERSISTENT_GNT_WAS_ACTIVE 1 + +/* Number of requests that we can fit in a ring */ +#define XEN_BLKIF_REQS 32 + +struct persistent_gnt { + struct page *page; + grant_ref_t gnt; + grant_handle_t handle; + DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE); + struct rb_node node; + struct list_head remove_node; +}; + +struct xen_blkif { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + union blkif_back_rings blk_rings; + void *blk_ring; + /* The VBD attached to this interface. */ + struct xen_vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. */ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; + atomic_t inflight; + /* One thread per one blkif. */ + struct task_struct *xenblkd; + unsigned int waiting_reqs; + + /* tree to store persistent grants */ + struct rb_root persistent_gnts; + unsigned int persistent_gnt_c; + atomic_t persistent_gnt_in_use; + unsigned long next_lru; + + /* used by the kworker that offload work from the persistent purge */ + struct list_head persistent_purge_list; + struct work_struct persistent_purge_work; + + /* buffer of free pages to map grant refs */ + spinlock_t free_pages_lock; + int free_pages_num; + struct list_head free_pages; + + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + + /* statistics */ + unsigned long st_print; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; + + struct work_struct free_work; + /* Thread shutdown wait queue. */ + wait_queue_head_t shutdown_wq; +}; + +struct seg_buf { + unsigned long offset; + unsigned int nsec; +}; + +struct grant_page { + struct page *page; + struct persistent_gnt *persistent_gnt; + grant_handle_t handle; + grant_ref_t gref; +}; + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +struct pending_req { + struct xen_blkif *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; + struct grant_page *segments[MAX_INDIRECT_SEGMENTS]; + /* Indirect descriptors */ + struct grant_page *indirect_pages[MAX_INDIRECT_PAGES]; + struct seg_buf seg[MAX_INDIRECT_SEGMENTS]; + struct bio *biolist[MAX_INDIRECT_SEGMENTS]; + struct gnttab_unmap_grant_ref unmap[MAX_INDIRECT_SEGMENTS]; + struct page *unmap_pages[MAX_INDIRECT_SEGMENTS]; + struct gntab_unmap_queue_data gnttab_unmap_data; +}; + + +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : \ + get_capacity((_v)->bdev->bd_disk)) + +#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define xen_blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + schedule_work(&(_b)->free_work);\ + } while (0) + +struct phys_req { + unsigned short dev; + blkif_sector_t nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; +int xen_blkif_interface_init(void); + +int xen_blkif_xenbus_init(void); + +irqreturn_t xen_blkif_be_int(int irq, void *dev_id); +int xen_blkif_schedule(void *arg); +int xen_blkif_purge_persistent(void *arg); +void xen_blkbk_free_caches(struct xen_blkif *blkif); + +int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, + struct backend_info *be, int state); + +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); +struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); +void xen_blkbk_unmap_purged_grants(struct work_struct *work); + +static inline void blkif_get_x86_32_req(struct blkif_request *dst, + struct blkif_x86_32_request *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; + dst->operation = src->operation; + switch (src->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = src->u.rw.nr_segments; + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; + dst->u.rw.sector_number = src->u.rw.sector_number; + barrier(); + if (n > dst->u.rw.nr_segments) + n = dst->u.rw.nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + case BLKIF_OP_INDIRECT: + dst->u.indirect.indirect_op = src->u.indirect.indirect_op; + dst->u.indirect.nr_segments = src->u.indirect.nr_segments; + dst->u.indirect.handle = src->u.indirect.handle; + dst->u.indirect.id = src->u.indirect.id; + dst->u.indirect.sector_number = src->u.indirect.sector_number; + barrier(); + j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); + for (i = 0; i < j; i++) + dst->u.indirect.indirect_grefs[i] = + src->u.indirect.indirect_grefs[i]; + break; + default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; + break; + } +} + +static inline void blkif_get_x86_64_req(struct blkif_request *dst, + struct blkif_x86_64_request *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; + dst->operation = src->operation; + switch (src->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.nr_segments = src->u.rw.nr_segments; + dst->u.rw.handle = src->u.rw.handle; + dst->u.rw.id = src->u.rw.id; + dst->u.rw.sector_number = src->u.rw.sector_number; + barrier(); + if (n > dst->u.rw.nr_segments) + n = dst->u.rw.nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + case BLKIF_OP_DISCARD: + dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + case BLKIF_OP_INDIRECT: + dst->u.indirect.indirect_op = src->u.indirect.indirect_op; + dst->u.indirect.nr_segments = src->u.indirect.nr_segments; + dst->u.indirect.handle = src->u.indirect.handle; + dst->u.indirect.id = src->u.indirect.id; + dst->u.indirect.sector_number = src->u.indirect.sector_number; + barrier(); + j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments)); + for (i = 0; i < j; i++) + dst->u.indirect.indirect_grefs[i] = + src->u.indirect.indirect_grefs[i]; + break; + default: + /* + * Don't know how to translate this op. Only get the + * ID so failure can be reported to the frontend. + */ + dst->u.other.id = src->u.other.id; + break; + } +} + +#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */ diff --git a/kernel/drivers/block/xen-blkback/xenbus.c b/kernel/drivers/block/xen-blkback/xenbus.c new file mode 100644 index 000000000..6ab69ad61 --- /dev/null +++ b/kernel/drivers/block/xen-blkback/xenbus.c @@ -0,0 +1,934 @@ +/* Xenbus code for blkif backend + Copyright (C) 2005 Rusty Russell + Copyright (C) 2005 XenSource Ltd + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + +*/ + +#define pr_fmt(fmt) "xen-blkback: " fmt + +#include +#include +#include +#include +#include +#include "common.h" + +/* Enlarge the array size in order to fully show blkback name. */ +#define BLKBACK_NAME_LEN (20) + +struct backend_info { + struct xenbus_device *dev; + struct xen_blkif *blkif; + struct xenbus_watch backend_watch; + unsigned major; + unsigned minor; + char *mode; +}; + +static struct kmem_cache *xen_blkif_cachep; +static void connect(struct backend_info *); +static int connect_ring(struct backend_info *); +static void backend_changed(struct xenbus_watch *, const char **, + unsigned int); +static void xen_blkif_free(struct xen_blkif *blkif); +static void xen_vbd_free(struct xen_vbd *vbd); + +struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be) +{ + return be->dev; +} + +/* + * The last request could free the device from softirq context and + * xen_blkif_free() can sleep. + */ +static void xen_blkif_deferred_free(struct work_struct *work) +{ + struct xen_blkif *blkif; + + blkif = container_of(work, struct xen_blkif, free_work); + xen_blkif_free(blkif); +} + +static int blkback_name(struct xen_blkif *blkif, char *buf) +{ + char *devpath, *devname; + struct xenbus_device *dev = blkif->be->dev; + + devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); + if (IS_ERR(devpath)) + return PTR_ERR(devpath); + + devname = strstr(devpath, "/dev/"); + if (devname != NULL) + devname += strlen("/dev/"); + else + devname = devpath; + + snprintf(buf, BLKBACK_NAME_LEN, "blkback.%d.%s", blkif->domid, devname); + kfree(devpath); + + return 0; +} + +static void xen_update_blkif_status(struct xen_blkif *blkif) +{ + int err; + char name[BLKBACK_NAME_LEN]; + + /* Not ready to connect? */ + if (!blkif->irq || !blkif->vbd.bdev) + return; + + /* Already connected? */ + if (blkif->be->dev->state == XenbusStateConnected) + return; + + /* Attempt to connect: exit if we fail to. */ + connect(blkif->be); + if (blkif->be->dev->state != XenbusStateConnected) + return; + + err = blkback_name(blkif, name); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); + return; + } + + err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "block flush"); + return; + } + invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); + + blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); + if (IS_ERR(blkif->xenblkd)) { + err = PTR_ERR(blkif->xenblkd); + blkif->xenblkd = NULL; + xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); + return; + } +} + +static struct xen_blkif *xen_blkif_alloc(domid_t domid) +{ + struct xen_blkif *blkif; + struct pending_req *req, *n; + int i, j; + + BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); + + blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + blkif->domid = domid; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + init_waitqueue_head(&blkif->wq); + init_completion(&blkif->drain_complete); + atomic_set(&blkif->drain, 0); + blkif->st_print = jiffies; + blkif->persistent_gnts.rb_node = NULL; + spin_lock_init(&blkif->free_pages_lock); + INIT_LIST_HEAD(&blkif->free_pages); + INIT_LIST_HEAD(&blkif->persistent_purge_list); + blkif->free_pages_num = 0; + atomic_set(&blkif->persistent_gnt_in_use, 0); + atomic_set(&blkif->inflight, 0); + INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); + + INIT_LIST_HEAD(&blkif->pending_free); + INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); + + for (i = 0; i < XEN_BLKIF_REQS; i++) { + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto fail; + list_add_tail(&req->free_list, + &blkif->pending_free); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + req->segments[j] = kzalloc(sizeof(*req->segments[0]), + GFP_KERNEL); + if (!req->segments[j]) + goto fail; + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), + GFP_KERNEL); + if (!req->indirect_pages[j]) + goto fail; + } + } + spin_lock_init(&blkif->pending_free_lock); + init_waitqueue_head(&blkif->pending_free_wq); + init_waitqueue_head(&blkif->shutdown_wq); + + return blkif; + +fail: + list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { + list_del(&req->free_list); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + if (!req->segments[j]) + break; + kfree(req->segments[j]); + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + if (!req->indirect_pages[j]) + break; + kfree(req->indirect_pages[j]); + } + kfree(req); + } + + kmem_cache_free(xen_blkif_cachep, blkif); + + return ERR_PTR(-ENOMEM); +} + +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, + unsigned int evtchn) +{ + int err; + + /* Already connected through? */ + if (blkif->irq) + return 0; + + err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1, + &blkif->blk_ring); + if (err < 0) + return err; + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + struct blkif_sring *sring; + sring = (struct blkif_sring *)blkif->blk_ring; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + struct blkif_x86_32_sring *sring_x86_32; + sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + struct blkif_x86_64_sring *sring_x86_64; + sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, + xen_blkif_be_int, 0, + "blkif-backend", blkif); + if (err < 0) { + xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +static int xen_blkif_disconnect(struct xen_blkif *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + wake_up(&blkif->shutdown_wq); + blkif->xenblkd = NULL; + } + + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. + */ + if (atomic_read(&blkif->inflight) > 0) + return -EBUSY; + + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + + if (blkif->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); + blkif->blk_rings.common.sring = NULL; + } + + /* Remove all persistent grants and the cache of ballooned pages. */ + xen_blkbk_free_caches(blkif); + + return 0; +} + +static void xen_blkif_free(struct xen_blkif *blkif) +{ + struct pending_req *req, *n; + int i = 0, j; + + xen_blkif_disconnect(blkif); + xen_vbd_free(&blkif->vbd); + + /* Make sure everything is drained before shutting down */ + BUG_ON(blkif->persistent_gnt_c != 0); + BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); + BUG_ON(blkif->free_pages_num != 0); + BUG_ON(!list_empty(&blkif->persistent_purge_list)); + BUG_ON(!list_empty(&blkif->free_pages)); + BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); + + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { + list_del(&req->free_list); + + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); + + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } + + WARN_ON(i != XEN_BLKIF_REQS); + + kmem_cache_free(xen_blkif_cachep, blkif); +} + +int __init xen_blkif_interface_init(void) +{ + xen_blkif_cachep = kmem_cache_create("blkif_cache", + sizeof(struct xen_blkif), + 0, 0, NULL); + if (!xen_blkif_cachep) + return -ENOMEM; + + return 0; +} + +/* + * sysfs interface for VBD I/O requests + */ + +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); +VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); +VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); +VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); + +static struct attribute *xen_vbdstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_f_req.attr, + &dev_attr_ds_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group xen_vbdstat_group = { + .name = "statistics", + .attrs = xen_vbdstat_attrs, +}; + +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); +VBD_SHOW(mode, "%s\n", be->mode); + +static int xenvbd_sysfs_addif(struct xenbus_device *dev) +{ + int error; + + error = device_create_file(&dev->dev, &dev_attr_physical_device); + if (error) + goto fail1; + + error = device_create_file(&dev->dev, &dev_attr_mode); + if (error) + goto fail2; + + error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group); + if (error) + goto fail3; + + return 0; + +fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); +fail2: device_remove_file(&dev->dev, &dev_attr_mode); +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); + return error; +} + +static void xenvbd_sysfs_delif(struct xenbus_device *dev) +{ + sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); + device_remove_file(&dev->dev, &dev_attr_mode); + device_remove_file(&dev->dev, &dev_attr_physical_device); +} + + +static void xen_vbd_free(struct xen_vbd *vbd) +{ + if (vbd->bdev) + blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE); + vbd->bdev = NULL; +} + +static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, + unsigned major, unsigned minor, int readonly, + int cdrom) +{ + struct xen_vbd *vbd; + struct block_device *bdev; + struct request_queue *q; + + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; + + vbd->pdevice = MKDEV(major, minor); + + bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? + FMODE_READ : FMODE_WRITE, NULL); + + if (IS_ERR(bdev)) { + pr_warn("xen_vbd_create: device %08x could not be opened\n", + vbd->pdevice); + return -ENOENT; + } + + vbd->bdev = bdev; + if (vbd->bdev->bd_disk == NULL) { + pr_warn("xen_vbd_create: device %08x doesn't exist\n", + vbd->pdevice); + xen_vbd_free(vbd); + return -ENOENT; + } + vbd->size = vbd_sz(vbd); + + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; + + q = bdev_get_queue(bdev); + if (q && q->flush_flags) + vbd->flush_support = true; + + if (q && blk_queue_secdiscard(q)) + vbd->discard_secure = true; + + pr_debug("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; +} +static int xen_blkbk_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev_get_drvdata(&dev->dev); + + pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id); + + if (be->major || be->minor) + xenvbd_sysfs_delif(dev); + + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + + dev_set_drvdata(&dev->dev, NULL); + + if (be->blkif) { + xen_blkif_disconnect(be->blkif); + xen_blkif_put(be->blkif); + } + + kfree(be->mode); + kfree(be); + return 0; +} + +int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache", + "%d", state); + if (err) + dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err); + + return err; +} + +static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + struct xen_blkif *blkif = be->blkif; + int err; + int state = 0, discard_enable; + struct block_device *bdev = be->blkif->vbd.bdev; + struct request_queue *q = bdev_get_queue(bdev); + + err = xenbus_scanf(XBT_NIL, dev->nodename, "discard-enable", "%d", + &discard_enable); + if (err == 1 && !discard_enable) + return; + + if (blk_queue_discard(q)) { + err = xenbus_printf(xbt, dev->nodename, + "discard-granularity", "%u", + q->limits.discard_granularity); + if (err) { + dev_warn(&dev->dev, "writing discard-granularity (%d)", err); + return; + } + err = xenbus_printf(xbt, dev->nodename, + "discard-alignment", "%u", + q->limits.discard_alignment); + if (err) { + dev_warn(&dev->dev, "writing discard-alignment (%d)", err); + return; + } + state = 1; + /* Optional. */ + err = xenbus_printf(xbt, dev->nodename, + "discard-secure", "%d", + blkif->vbd.discard_secure); + if (err) { + dev_warn(&dev->dev, "writing discard-secure (%d)", err); + return; + } + } + err = xenbus_printf(xbt, dev->nodename, "feature-discard", + "%d", state); + if (err) + dev_warn(&dev->dev, "writing feature-discard (%d)", err); +} +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + dev_warn(&dev->dev, "writing feature-barrier (%d)", err); + + return err; +} + +/* + * Entry point to this code when a new device is created. Allocate the basic + * structures, and watch the store waiting for the hotplug scripts to tell us + * the device's physical major and minor numbers. Switch to InitWait. + */ +static int xen_blkbk_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + + /* match the pr_debug in xen_blkbk_remove */ + pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id); + + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + be->dev = dev; + dev_set_drvdata(&dev->dev, be); + + be->blkif = xen_blkif_alloc(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + + err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, + "%s/%s", dev->nodename, "physical-device"); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + pr_warn("%s failed\n", __func__); + xen_blkbk_remove(dev); + return err; +} + + +/* + * Callback received when the hotplug scripts have placed the physical-device + * node. Read it and the mode node, and create a vbd. If the frontend is + * ready, connect. + */ +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned major; + unsigned minor; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + int cdrom = 0; + unsigned long handle; + char *device_type; + + pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id); + + err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", + &major, &minor); + if (XENBUS_EXIST_ERR(err)) { + /* + * Since this watch will fire once immediately after it is + * registered, we expect this. Ignore it, and wait for the + * hotplug scripts. + */ + return; + } + if (err != 2) { + xenbus_dev_fatal(dev, err, "reading physical-device"); + return; + } + + if (be->major | be->minor) { + if (be->major != major || be->minor != minor) + pr_warn("changing physical device (from %x:%x to %x:%x) not supported.\n", + be->major, be->minor, major, minor); + return; + } + + be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); + if (IS_ERR(be->mode)) { + err = PTR_ERR(be->mode); + be->mode = NULL; + xenbus_dev_fatal(dev, err, "reading mode"); + return; + } + + device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); + if (!IS_ERR(device_type)) { + cdrom = strcmp(device_type, "cdrom") == 0; + kfree(device_type); + } + + /* Front end dir is a number, which is used as the handle. */ + err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + if (err) + return; + + be->major = major; + be->minor = minor; + + err = xen_vbd_create(be->blkif, handle, major, minor, + !strchr(be->mode, 'w'), cdrom); + + if (err) + xenbus_dev_fatal(dev, err, "creating vbd structure"); + else { + err = xenvbd_sysfs_addif(dev); + if (err) { + xen_vbd_free(&be->blkif->vbd); + xenbus_dev_fatal(dev, err, "creating sysfs entries"); + } + } + + if (err) { + kfree(be->mode); + be->mode = NULL; + be->major = 0; + be->minor = 0; + } else { + /* We're potentially connected now */ + xen_update_blkif_status(be->blkif); + } +} + + +/* + * Callback received when the frontend's state changes. + */ +static void frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev_get_drvdata(&dev->dev); + int err; + + pr_debug("%s %p %s\n", __func__, dev, xenbus_strstate(frontend_state)); + + switch (frontend_state) { + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + pr_info("%s: prepare for reconnect\n", dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + /* + * Ensure we connect even when two watches fire in + * close succession and we miss the intermediate value + * of frontend_state. + */ + if (dev->state == XenbusStateConnected) + break; + + /* + * Enforce precondition before potential leak point. + * xen_blkif_disconnect() is idempotent. + */ + err = xen_blkif_disconnect(be->blkif); + if (err) { + xenbus_dev_fatal(dev, err, "pending I/O"); + break; + } + + err = connect_ring(be); + if (err) + break; + xen_update_blkif_status(be->blkif); + break; + + case XenbusStateClosing: + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xen_blkif_disconnect(be->blkif); + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + /* implies xen_blkif_disconnect() via xen_blkbk_remove() */ + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +/* ** Connection ** */ + + +/* + * Write the physical details regarding the block device to the store, and + * switch to Connected state. + */ +static void connect(struct backend_info *be) +{ + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = be->dev; + + pr_debug("%s %s\n", __func__, dev->otherend); + + /* Supply the information about the device the frontend needs */ +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + return; + } + + /* If we can't advertise it is OK. */ + xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); + + xen_blkbk_discard(xbt, be); + + xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); + + err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/feature-persistent", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u", + MAX_INDIRECT_SEGMENTS); + if (err) + dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)", + dev->nodename, err); + + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + (unsigned long long)vbd_sz(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sectors", + dev->nodename); + goto abort; + } + + /* FIXME: use a typename instead */ + err = xenbus_printf(xbt, dev->nodename, "info", "%u", + be->blkif->vbd.type | + (be->blkif->vbd.readonly ? VDISK_READONLY : 0)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/info", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", + (unsigned long) + bdev_logical_block_size(be->blkif->vbd.bdev)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sector-size", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u", + bdev_physical_block_size(be->blkif->vbd.bdev)); + if (err) + xenbus_dev_error(dev, err, "writing %s/physical-sector-size", + dev->nodename); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(dev, err, "ending transaction"); + + err = xenbus_switch_state(dev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(dev, err, "%s: switching to Connected state", + dev->nodename); + + return; + abort: + xenbus_transaction_end(xbt, 1); +} + + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + unsigned int pers_grants; + char protocol[64] = ""; + int err; + + pr_debug("%s %s\n", __func__, dev->otherend); + + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", + &ring_ref, "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming default"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + err = xenbus_gather(XBT_NIL, dev->otherend, + "feature-persistent", "%u", + &pers_grants, NULL); + if (err) + pers_grants = 0; + + be->blkif->vbd.feature_gnt_persistent = pers_grants; + be->blkif->vbd.overflow_max_grants = 0; + + pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol, + pers_grants ? "persistent grants" : ""); + + /* Map the shared frame, irq etc. */ + err = xen_blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + +static const struct xenbus_device_id xen_blkbk_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver xen_blkbk_driver = { + .ids = xen_blkbk_ids, + .probe = xen_blkbk_probe, + .remove = xen_blkbk_remove, + .otherend_changed = frontend_changed +}; + +int xen_blkif_xenbus_init(void) +{ + return xenbus_register_backend(&xen_blkbk_driver); +} diff --git a/kernel/drivers/block/xen-blkfront.c b/kernel/drivers/block/xen-blkfront.c new file mode 100644 index 000000000..2c61cf8c6 --- /dev/null +++ b/kernel/drivers/block/xen-blkfront.c @@ -0,0 +1,2126 @@ +/* + * blkfront.c + * + * XenLinux virtual block device driver. + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Modifications by Mark A. Williamson are (c) Intel Research Cambridge + * Copyright (c) 2004, Christian Limpach + * Copyright (c) 2004, Andrew Warfield + * Copyright (c) 2005, Christopher Clark + * Copyright (c) 2005, XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +enum blkif_state { + BLKIF_STATE_DISCONNECTED, + BLKIF_STATE_CONNECTED, + BLKIF_STATE_SUSPENDED, +}; + +struct grant { + grant_ref_t gref; + unsigned long pfn; + struct list_head node; +}; + +struct blk_shadow { + struct blkif_request req; + struct request *request; + struct grant **grants_used; + struct grant **indirect_grants; + struct scatterlist *sg; +}; + +struct split_bio { + struct bio *bio; + atomic_t pending; + int err; +}; + +static DEFINE_MUTEX(blkfront_mutex); +static const struct block_device_operations xlvbd_block_fops; + +/* + * Maximum number of segments in indirect requests, the actual value used by + * the frontend driver is the minimum of this value and the value provided + * by the backend driver. + */ + +static unsigned int xen_blkif_max_segments = 32; +module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); +MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); + +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + spinlock_t io_lock; + struct mutex mutex; + struct xenbus_device *xbdev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + enum blkif_state connected; + int ring_ref; + struct blkif_front_ring ring; + unsigned int evtchn, irq; + struct request_queue *rq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + struct list_head grants; + struct list_head indirect_pages; + unsigned int persistent_gnts_c; + unsigned long shadow_free; + unsigned int feature_flush; + unsigned int feature_discard:1; + unsigned int feature_secdiscard:1; + unsigned int discard_granularity; + unsigned int discard_alignment; + unsigned int feature_persistent:1; + unsigned int max_indirect_segments; + int is_ready; +}; + +static unsigned int nr_minors; +static unsigned long *minors; +static DEFINE_SPINLOCK(minor_lock); + +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) +#define GRANT_INVALID_REF 0 + +#define PARTS_PER_DISK 16 +#define PARTS_PER_EXT_DISK 256 + +#define BLKIF_MAJOR(dev) ((dev)>>8) +#define BLKIF_MINOR(dev) ((dev) & 0xff) + +#define EXT_SHIFT 28 +#define EXTENDED (1<shadow_free; + BUG_ON(free >= BLK_RING_SIZE); + info->shadow_free = info->shadow[free].req.u.rw.id; + info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + return free; +} + +static int add_id_to_freelist(struct blkfront_info *info, + unsigned long id) +{ + if (info->shadow[id].req.u.rw.id != id) + return -EINVAL; + if (info->shadow[id].request == NULL) + return -EINVAL; + info->shadow[id].req.u.rw.id = info->shadow_free; + info->shadow[id].request = NULL; + info->shadow_free = id; + return 0; +} + +static int fill_grant_buffer(struct blkfront_info *info, int num) +{ + struct page *granted_page; + struct grant *gnt_list_entry, *n; + int i = 0; + + while(i < num) { + gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); + if (!gnt_list_entry) + goto out_of_memory; + + if (info->feature_persistent) { + granted_page = alloc_page(GFP_NOIO); + if (!granted_page) { + kfree(gnt_list_entry); + goto out_of_memory; + } + gnt_list_entry->pfn = page_to_pfn(granted_page); + } + + gnt_list_entry->gref = GRANT_INVALID_REF; + list_add(&gnt_list_entry->node, &info->grants); + i++; + } + + return 0; + +out_of_memory: + list_for_each_entry_safe(gnt_list_entry, n, + &info->grants, node) { + list_del(&gnt_list_entry->node); + if (info->feature_persistent) + __free_page(pfn_to_page(gnt_list_entry->pfn)); + kfree(gnt_list_entry); + i--; + } + BUG_ON(i != 0); + return -ENOMEM; +} + +static struct grant *get_grant(grant_ref_t *gref_head, + unsigned long pfn, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry; + unsigned long buffer_mfn; + + BUG_ON(list_empty(&info->grants)); + gnt_list_entry = list_first_entry(&info->grants, struct grant, + node); + list_del(&gnt_list_entry->node); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) { + info->persistent_gnts_c--; + return gnt_list_entry; + } + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + if (!info->feature_persistent) { + BUG_ON(!pfn); + gnt_list_entry->pfn = pfn; + } + buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + buffer_mfn, 0); + return gnt_list_entry; +} + +static const char *op_name(int op) +{ + static const char *const names[] = { + [BLKIF_OP_READ] = "read", + [BLKIF_OP_WRITE] = "write", + [BLKIF_OP_WRITE_BARRIER] = "barrier", + [BLKIF_OP_FLUSH_DISKCACHE] = "flush", + [BLKIF_OP_DISCARD] = "discard" }; + + if (op < 0 || op >= ARRAY_SIZE(names)) + return "unknown"; + + if (!names[op]) + return "reserved"; + + return names[op]; +} +static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) +{ + unsigned int end = minor + nr; + int rc; + + if (end > nr_minors) { + unsigned long *bitmap, *old; + + bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap), + GFP_KERNEL); + if (bitmap == NULL) + return -ENOMEM; + + spin_lock(&minor_lock); + if (end > nr_minors) { + old = minors; + memcpy(bitmap, minors, + BITS_TO_LONGS(nr_minors) * sizeof(*bitmap)); + minors = bitmap; + nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG; + } else + old = bitmap; + spin_unlock(&minor_lock); + kfree(old); + } + + spin_lock(&minor_lock); + if (find_next_bit(minors, end, minor) >= end) { + bitmap_set(minors, minor, nr); + rc = 0; + } else + rc = -EBUSY; + spin_unlock(&minor_lock); + + return rc; +} + +static void xlbd_release_minors(unsigned int minor, unsigned int nr) +{ + unsigned int end = minor + nr; + + BUG_ON(end > nr_minors); + spin_lock(&minor_lock); + bitmap_clear(minors, minor, nr); + spin_unlock(&minor_lock); +} + +static void blkif_restart_queue_callback(void *arg) +{ + struct blkfront_info *info = (struct blkfront_info *)arg; + schedule_work(&info->work); +} + +static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) +{ + /* We don't have real geometry info, but let's at least return + values consistent with the size of the device */ + sector_t nsect = get_capacity(bd->bd_disk); + sector_t cylinders = nsect; + + hg->heads = 0xff; + hg->sectors = 0x3f; + sector_div(cylinders, hg->heads * hg->sectors); + hg->cylinders = cylinders; + if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) + hg->cylinders = 0xffff; + return 0; +} + +static int blkif_ioctl(struct block_device *bdev, fmode_t mode, + unsigned command, unsigned long argument) +{ + struct blkfront_info *info = bdev->bd_disk->private_data; + int i; + + dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n", + command, (long)argument); + + switch (command) { + case CDROMMULTISESSION: + dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n"); + for (i = 0; i < sizeof(struct cdrom_multisession); i++) + if (put_user(0, (char __user *)(argument + i))) + return -EFAULT; + return 0; + + case CDROM_GET_CAPABILITY: { + struct gendisk *gd = info->gd; + if (gd->flags & GENHD_FL_CD) + return 0; + return -EINVAL; + } + + default: + /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", + command);*/ + return -EINVAL; /* same return as native Linux */ + } + + return 0; +} + +/* + * Generate a Xen blkfront IO request from a blk layer request. Reads + * and writes are handled as expected. + * + * @req: a request struct + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + struct blkif_request *ring_req; + unsigned long id; + unsigned int fsect, lsect; + int i, ref, n; + struct blkif_request_segment *segments = NULL; + + /* + * Used to store if we are able to queue the request by just using + * existing persistent grants, or if we have to get new grants, + * as there are not sufficiently many free. + */ + bool new_persistent_gnts; + grant_ref_t gref_head; + struct grant *gnt_list_entry = NULL; + struct scatterlist *sg; + int nseg, max_grefs; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + max_grefs = req->nr_phys_segments; + if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) + /* + * If we are using indirect segments we need to account + * for the indirect grefs used in the request. + */ + max_grefs += INDIRECT_GREFS(req->nr_phys_segments); + + /* Check if we have enough grants to allocate a requests */ + if (info->persistent_gnts_c < max_grefs) { + new_persistent_gnts = 1; + if (gnttab_alloc_grant_references( + max_grefs - info->persistent_gnts_c, + &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + max_grefs); + return 1; + } + } else + new_persistent_gnts = 0; + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = get_id_from_freelist(info); + info->shadow[id].request = req; + + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { + ring_req->operation = BLKIF_OP_DISCARD; + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; + } else { + BUG_ON(info->max_indirect_segments == 0 && + req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + req->nr_phys_segments > info->max_indirect_segments); + nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + ring_req->u.rw.id = id; + if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = nseg; + } else { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + /* + * Ideally we can do an unordered flush-to-disk. In case the + * backend onlysupports barriers, use that. A barrier request + * a superset of FUA, so we can implement it the same + * way. (It's also a FLUSH+FUA, since it is + * guaranteed ordered WRT previous writes.) + */ + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; + } + } + ring_req->u.rw.nr_segments = nseg; + } + for_each_sg(info->shadow[id].sg, sg, nseg, i) { + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; + + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (i % SEGS_PER_INDIRECT_FRAME == 0)) { + unsigned long uninitialized_var(pfn); + + if (segments) + kunmap_atomic(segments); + + n = i / SEGS_PER_INDIRECT_FRAME; + if (!info->feature_persistent) { + struct page *indirect_page; + + /* Fetch a pre-allocated page to use for indirect grefs */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + pfn = page_to_pfn(indirect_page); + } + gnt_list_entry = get_grant(&gref_head, pfn, info); + info->shadow[id].indirect_grants[n] = gnt_list_entry; + segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + + gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); + ref = gnt_list_entry->gref; + + info->shadow[id].grants_used[i] = gnt_list_entry; + + if (rq_data_dir(req) && info->feature_persistent) { + char *bvec_data; + void *shared_data; + + BUG_ON(sg->offset + sg->length > PAGE_SIZE); + + shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); + bvec_data = kmap_atomic(sg_page(sg)); + + /* + * this does not wipe data stored outside the + * range sg->offset..sg->offset+sg->length. + * Therefore, blkback *could* see data from + * previous requests. This is OK as long as + * persistent grants are shared with just one + * domain. It may need refactoring if this + * changes + */ + memcpy(shared_data + sg->offset, + bvec_data + sg->offset, + sg->length); + + kunmap_atomic(bvec_data); + kunmap_atomic(shared_data); + } + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + n = i % SEGS_PER_INDIRECT_FRAME; + segments[n] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } + } + if (segments) + kunmap_atomic(segments); + } + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + if (new_persistent_gnts) + gnttab_free_grant_references(gref_head); + + return 0; +} + + +static inline void flush_requests(struct blkfront_info *info) +{ + int notify; + + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + + if (notify) + notify_remote_via_irq(info->irq); +} + +static inline bool blkif_request_flush_invalid(struct request *req, + struct blkfront_info *info) +{ + return ((req->cmd_type != REQ_TYPE_FS) || + ((req->cmd_flags & REQ_FLUSH) && + !(info->feature_flush & REQ_FLUSH)) || + ((req->cmd_flags & REQ_FUA) && + !(info->feature_flush & REQ_FUA))); +} + +/* + * do_blkif_request + * read a block; request is in a request queue + */ +static void do_blkif_request(struct request_queue *rq) +{ + struct blkfront_info *info = NULL; + struct request *req; + int queued; + + pr_debug("Entered do_blkif_request\n"); + + queued = 0; + + while ((req = blk_peek_request(rq)) != NULL) { + info = req->rq_disk->private_data; + + if (RING_FULL(&info->ring)) + goto wait; + + blk_start_request(req); + + if (blkif_request_flush_invalid(req, info)) { + __blk_end_request_all(req, -EOPNOTSUPP); + continue; + } + + pr_debug("do_blk_req %p: cmd %p, sec %lx, " + "(%u/%u) [%s]\n", + req, req->cmd, (unsigned long)blk_rq_pos(req), + blk_rq_cur_sectors(req), blk_rq_sectors(req), + rq_data_dir(req) ? "write" : "read"); + + if (blkif_queue_request(req)) { + blk_requeue_request(rq, req); +wait: + /* Avoid pointless unplugs. */ + blk_stop_queue(rq); + break; + } + + queued++; + } + + if (queued != 0) + flush_requests(info); +} + +static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, + unsigned int physical_sector_size, + unsigned int segments) +{ + struct request_queue *rq; + struct blkfront_info *info = gd->private_data; + + rq = blk_init_queue(do_blkif_request, &info->io_lock); + if (rq == NULL) + return -1; + + queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); + + if (info->feature_discard) { + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq); + blk_queue_max_discard_sectors(rq, get_capacity(gd)); + rq->limits.discard_granularity = info->discard_granularity; + rq->limits.discard_alignment = info->discard_alignment; + if (info->feature_secdiscard) + queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); + } + + /* Hard sector size and max sectors impersonate the equiv. hardware. */ + blk_queue_logical_block_size(rq, sector_size); + blk_queue_physical_block_size(rq, physical_sector_size); + blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); + + /* Each segment in a request is up to an aligned page in size. */ + blk_queue_segment_boundary(rq, PAGE_SIZE - 1); + blk_queue_max_segment_size(rq, PAGE_SIZE); + + /* Ensure a merged request will fit in a single I/O ring slot. */ + blk_queue_max_segments(rq, segments); + + /* Make sure buffer addresses are sector-aligned. */ + blk_queue_dma_alignment(rq, 511); + + /* Make sure we don't use bounce buffers. */ + blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); + + gd->queue = rq; + + return 0; +} + +static const char *flush_info(unsigned int feature_flush) +{ + switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + return "barrier: enabled;"; + case REQ_FLUSH: + return "flush diskcache: enabled;"; + default: + return "barrier or flush: disabled;"; + } +} + +static void xlvbd_flush(struct blkfront_info *info) +{ + blk_queue_flush(info->rq, info->feature_flush); + pr_info("blkfront: %s: %s %s %s %s %s\n", + info->gd->disk_name, flush_info(info->feature_flush), + "persistent grants:", info->feature_persistent ? + "enabled;" : "disabled;", "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); +} + +static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) +{ + int major; + major = BLKIF_MAJOR(vdevice); + *minor = BLKIF_MINOR(vdevice); + switch (major) { + case XEN_IDE0_MAJOR: + *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET; + *minor = ((*minor / 64) * PARTS_PER_DISK) + + EMULATED_HD_DISK_MINOR_OFFSET; + break; + case XEN_IDE1_MAJOR: + *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET; + *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) + + EMULATED_HD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK0_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK1_MAJOR: + case XEN_SCSI_DISK2_MAJOR: + case XEN_SCSI_DISK3_MAJOR: + case XEN_SCSI_DISK4_MAJOR: + case XEN_SCSI_DISK5_MAJOR: + case XEN_SCSI_DISK6_MAJOR: + case XEN_SCSI_DISK7_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + + ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) + + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + + ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) + + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XEN_SCSI_DISK8_MAJOR: + case XEN_SCSI_DISK9_MAJOR: + case XEN_SCSI_DISK10_MAJOR: + case XEN_SCSI_DISK11_MAJOR: + case XEN_SCSI_DISK12_MAJOR: + case XEN_SCSI_DISK13_MAJOR: + case XEN_SCSI_DISK14_MAJOR: + case XEN_SCSI_DISK15_MAJOR: + *offset = (*minor / PARTS_PER_DISK) + + ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) + + EMULATED_SD_DISK_NAME_OFFSET; + *minor = *minor + + ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) + + EMULATED_SD_DISK_MINOR_OFFSET; + break; + case XENVBD_MAJOR: + *offset = *minor / PARTS_PER_DISK; + break; + default: + printk(KERN_WARNING "blkfront: your disk configuration is " + "incorrect, please use an xvd device instead\n"); + return -ENODEV; + } + return 0; +} + +static char *encode_disk_name(char *ptr, unsigned int n) +{ + if (n >= 26) + ptr = encode_disk_name(ptr, n / 26 - 1); + *ptr = 'a' + n % 26; + return ptr + 1; +} + +static int xlvbd_alloc_gendisk(blkif_sector_t capacity, + struct blkfront_info *info, + u16 vdisk_info, u16 sector_size, + unsigned int physical_sector_size) +{ + struct gendisk *gd; + int nr_minors = 1; + int err; + unsigned int offset; + int minor; + int nr_parts; + char *ptr; + + BUG_ON(info->gd != NULL); + BUG_ON(info->rq != NULL); + + if ((info->vdevice>>EXT_SHIFT) > 1) { + /* this is above the extended range; something is wrong */ + printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice); + return -ENODEV; + } + + if (!VDEV_IS_EXTENDED(info->vdevice)) { + err = xen_translate_vdev(info->vdevice, &minor, &offset); + if (err) + return err; + nr_parts = PARTS_PER_DISK; + } else { + minor = BLKIF_MINOR_EXT(info->vdevice); + nr_parts = PARTS_PER_EXT_DISK; + offset = minor / nr_parts; + if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4) + printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with " + "emulated IDE disks,\n\t choose an xvd device name" + "from xvde on\n", info->vdevice); + } + if (minor >> MINORBITS) { + pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n", + info->vdevice, minor); + return -ENODEV; + } + + if ((minor % nr_parts) == 0) + nr_minors = nr_parts; + + err = xlbd_reserve_minors(minor, nr_minors); + if (err) + goto out; + err = -ENODEV; + + gd = alloc_disk(nr_minors); + if (gd == NULL) + goto release; + + strcpy(gd->disk_name, DEV_NAME); + ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); + BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN); + if (nr_minors > 1) + *ptr = 0; + else + snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr, + "%d", minor & (nr_parts - 1)); + + gd->major = XENVBD_MAJOR; + gd->first_minor = minor; + gd->fops = &xlvbd_block_fops; + gd->private_data = info; + gd->driverfs_dev = &(info->xbdev->dev); + set_capacity(gd, capacity); + + if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, + info->max_indirect_segments ? : + BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + del_gendisk(gd); + goto release; + } + + info->rq = gd->queue; + info->gd = gd; + + xlvbd_flush(info); + + if (vdisk_info & VDISK_READONLY) + set_disk_ro(gd, 1); + + if (vdisk_info & VDISK_REMOVABLE) + gd->flags |= GENHD_FL_REMOVABLE; + + if (vdisk_info & VDISK_CDROM) + gd->flags |= GENHD_FL_CD; + + return 0; + + release: + xlbd_release_minors(minor, nr_minors); + out: + return err; +} + +static void xlvbd_release_gendisk(struct blkfront_info *info) +{ + unsigned int minor, nr_minors; + unsigned long flags; + + if (info->rq == NULL) + return; + + spin_lock_irqsave(&info->io_lock, flags); + + /* No more blkif_request(). */ + blk_stop_queue(info->rq); + + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irqrestore(&info->io_lock, flags); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&info->work); + + del_gendisk(info->gd); + + minor = info->gd->first_minor; + nr_minors = info->gd->minors; + xlbd_release_minors(minor, nr_minors); + + blk_cleanup_queue(info->rq); + info->rq = NULL; + + put_disk(info->gd); + info->gd = NULL; +} + +static void kick_pending_request_queues(struct blkfront_info *info) +{ + if (!RING_FULL(&info->ring)) { + /* Re-enable calldowns. */ + blk_start_queue(info->rq); + /* Kick things off immediately. */ + do_blkif_request(info->rq); + } +} + +static void blkif_restart_queue(struct work_struct *work) +{ + struct blkfront_info *info = container_of(work, struct blkfront_info, work); + + spin_lock_irq(&info->io_lock); + if (info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(info); + spin_unlock_irq(&info->io_lock); +} + +static void blkif_free(struct blkfront_info *info, int suspend) +{ + struct grant *persistent_gnt; + struct grant *n; + int i, j, segs; + + /* Prevent new requests being issued until we fix things up. */ + spin_lock_irq(&info->io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_stop_queue(info->rq); + + /* Remove all persistent grants */ + if (!list_empty(&info->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &info->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + info->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + } + BUG_ON(info->persistent_gnts_c != 0); + + /* + * Remove indirect pages, this only happens when using indirect + * descriptors but not persistent grants + */ + if (!list_empty(&info->indirect_pages)) { + struct page *indirect_page, *n; + + BUG_ON(info->feature_persistent); + list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_del(&indirect_page->lru); + __free_page(indirect_page); + } + } + + for (i = 0; i < BLK_RING_SIZE; i++) { + /* + * Clear persistent grants present in requests already + * on the shared ring + */ + if (!info->shadow[i].request) + goto free_shadow; + + segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + info->shadow[i].req.u.indirect.nr_segments : + info->shadow[i].req.u.rw.nr_segments; + for (j = 0; j < segs; j++) { + persistent_gnt = info->shadow[i].grants_used[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + if (info->feature_persistent) + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + + if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + /* + * If this is not an indirect operation don't try to + * free indirect segments + */ + goto free_shadow; + + for (j = 0; j < INDIRECT_GREFS(segs); j++) { + persistent_gnt = info->shadow[i].indirect_grants[j]; + gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); + __free_page(pfn_to_page(persistent_gnt->pfn)); + kfree(persistent_gnt); + } + +free_shadow: + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + } + + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&info->callback); + spin_unlock_irq(&info->io_lock); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&info->work); + + /* Free resources associated with old device channel. */ + if (info->ring_ref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref, 0, + (unsigned long)info->ring.sring); + info->ring_ref = GRANT_INVALID_REF; + info->ring.sring = NULL; + } + if (info->irq) + unbind_from_irqhandler(info->irq, info); + info->evtchn = info->irq = 0; + +} + +static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, + struct blkif_response *bret) +{ + int i = 0; + struct scatterlist *sg; + char *bvec_data; + void *shared_data; + int nseg; + + nseg = s->req.operation == BLKIF_OP_INDIRECT ? + s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { + /* + * Copy the data received from the backend into the bvec. + * Since bv_offset can be different than 0, and bv_len different + * than PAGE_SIZE, we have to keep track of the current offset, + * to be sure we are copying the data from the right shared page. + */ + for_each_sg(s->sg, sg, nseg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); + shared_data = kmap_atomic( + pfn_to_page(s->grants_used[i]->pfn)); + bvec_data = kmap_atomic(sg_page(sg)); + memcpy(bvec_data + sg->offset, + shared_data + sg->offset, + sg->length); + kunmap_atomic(bvec_data); + kunmap_atomic(shared_data); + } + } + /* Add the persistent grant into the list of free grants */ + for (i = 0; i < nseg; i++) { + if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { + /* + * If the grant is still mapped by the backend (the + * backend has chosen to make this grant persistent) + * we add it at the head of the list, so it will be + * reused first. + */ + if (!info->feature_persistent) + pr_alert_ratelimited("backed has not unmapped grant: %u\n", + s->grants_used[i]->gref); + list_add(&s->grants_used[i]->node, &info->grants); + info->persistent_gnts_c++; + } else { + /* + * If the grant is not mapped by the backend we end the + * foreign access and add it to the tail of the list, + * so it will not be picked again unless we run out of + * persistent grants. + */ + gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); + s->grants_used[i]->gref = GRANT_INVALID_REF; + list_add_tail(&s->grants_used[i]->node, &info->grants); + } + } + if (s->req.operation == BLKIF_OP_INDIRECT) { + for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { + if (!info->feature_persistent) + pr_alert_ratelimited("backed has not unmapped grant: %u\n", + s->indirect_grants[i]->gref); + list_add(&s->indirect_grants[i]->node, &info->grants); + info->persistent_gnts_c++; + } else { + struct page *indirect_page; + + gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL); + /* + * Add the used indirect page back to the list of + * available pages for indirect grefs. + */ + indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + list_add(&indirect_page->lru, &info->indirect_pages); + s->indirect_grants[i]->gref = GRANT_INVALID_REF; + list_add_tail(&s->indirect_grants[i]->node, &info->grants); + } + } + } +} + +static irqreturn_t blkif_interrupt(int irq, void *dev_id) +{ + struct request *req; + struct blkif_response *bret; + RING_IDX i, rp; + unsigned long flags; + struct blkfront_info *info = (struct blkfront_info *)dev_id; + int error; + + spin_lock_irqsave(&info->io_lock, flags); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + spin_unlock_irqrestore(&info->io_lock, flags); + return IRQ_HANDLED; + } + + again: + rp = info->ring.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + for (i = info->ring.rsp_cons; i != rp; i++) { + unsigned long id; + + bret = RING_GET_RESPONSE(&info->ring, i); + id = bret->id; + /* + * The backend has messed up and given us an id that we would + * never have given to it (we stamp it up to BLK_RING_SIZE - + * look in get_id_from_freelist. + */ + if (id >= BLK_RING_SIZE) { + WARN(1, "%s: response to %s has incorrect id (%ld)\n", + info->gd->disk_name, op_name(bret->operation), id); + /* We can't safely get the 'struct request' as + * the id is busted. */ + continue; + } + req = info->shadow[id].request; + + if (bret->operation != BLKIF_OP_DISCARD) + blkif_completion(&info->shadow[id], info, bret); + + if (add_id_to_freelist(info, id)) { + WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", + info->gd->disk_name, op_name(bret->operation), id); + continue; + } + + error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; + switch (bret->operation) { + case BLKIF_OP_DISCARD: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + struct request_queue *rq = info->rq; + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); + error = -EOPNOTSUPP; + info->feature_discard = 0; + info->feature_secdiscard = 0; + queue_flag_clear(QUEUE_FLAG_DISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); + } + __blk_end_request_all(req, error); + break; + case BLKIF_OP_FLUSH_DISKCACHE: + case BLKIF_OP_WRITE_BARRIER: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); + error = -EOPNOTSUPP; + } + if (unlikely(bret->status == BLKIF_RSP_ERROR && + info->shadow[id].req.u.rw.nr_segments == 0)) { + printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); + error = -EOPNOTSUPP; + } + if (unlikely(error)) { + if (error == -EOPNOTSUPP) + error = 0; + info->feature_flush = 0; + xlvbd_flush(info); + } + /* fall through */ + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + if (unlikely(bret->status != BLKIF_RSP_OKAY)) + dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " + "request: %x\n", bret->status); + + __blk_end_request_all(req, error); + break; + default: + BUG(); + } + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) { + int more_to_do; + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + if (more_to_do) + goto again; + } else + info->ring.sring->rsp_event = i + 1; + + kick_pending_request_queues(info); + + spin_unlock_irqrestore(&info->io_lock, flags); + + return IRQ_HANDLED; +} + + +static int setup_blkring(struct xenbus_device *dev, + struct blkfront_info *info) +{ + struct blkif_sring *sring; + grant_ref_t gref; + int err; + + info->ring_ref = GRANT_INVALID_REF; + + sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); + if (!sring) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); + return -ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref); + if (err < 0) { + free_page((unsigned long)sring); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref = gref; + + err = xenbus_alloc_evtchn(dev, &info->evtchn); + if (err) + goto fail; + + err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, + "blkif", info); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_evtchn_to_irqhandler failed"); + goto fail; + } + info->irq = err; + + return 0; +fail: + blkif_free(info, 0); + return err; +} + + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_blkback(struct xenbus_device *dev, + struct blkfront_info *info) +{ + const char *message = NULL; + struct xenbus_transaction xbt; + int err; + + /* Create shared ring, alloc event channel. */ + err = setup_blkring(dev, info); + if (err) + goto out; + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_blkring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", info->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, + "event-channel", "%u", info->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", + XEN_IO_PROTO_ABI_NATIVE); + if (err) { + message = "writing protocol"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, + "feature-persistent", "%u", 1); + if (err) + dev_warn(&dev->dev, + "writing persistent grants feature to xenbus"); + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_blkring; + } + + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(dev, err, "%s", message); + destroy_blkring: + blkif_free(info, 0); + out: + return err; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffer for communication with the backend, and + * inform the backend of the appropriate details for those. Switch to + * Initialised state. + */ +static int blkfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err, vdevice, i; + struct blkfront_info *info; + + /* FIXME: Use dynamic device id if this is not set. */ + err = xenbus_scanf(XBT_NIL, dev->nodename, + "virtual-device", "%i", &vdevice); + if (err != 1) { + /* go looking in the extended area instead */ + err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext", + "%i", &vdevice); + if (err != 1) { + xenbus_dev_fatal(dev, err, "reading virtual-device"); + return err; + } + } + + if (xen_hvm_domain()) { + char *type; + int len; + /* no unplug has been done: do not hook devices != xen vbds */ + if (xen_has_pv_and_legacy_disk_devices()) { + int major; + + if (!VDEV_IS_EXTENDED(vdevice)) + major = BLKIF_MAJOR(vdevice); + else + major = XENVBD_MAJOR; + + if (major != XENVBD_MAJOR) { + printk(KERN_INFO + "%s: HVM does not support vbd %d as xen block device\n", + __func__, vdevice); + return -ENODEV; + } + } + /* do not create a PV cdrom device if we are an HVM guest */ + type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len); + if (IS_ERR(type)) + return -ENODEV; + if (strncmp(type, "cdrom", 5) == 0) { + kfree(type); + return -ENODEV; + } + kfree(type); + } + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); + return -ENOMEM; + } + + mutex_init(&info->mutex); + spin_lock_init(&info->io_lock); + info->xbdev = dev; + info->vdevice = vdevice; + INIT_LIST_HEAD(&info->grants); + INIT_LIST_HEAD(&info->indirect_pages); + info->persistent_gnts_c = 0; + info->connected = BLKIF_STATE_DISCONNECTED; + INIT_WORK(&info->work, blkif_restart_queue); + + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.u.rw.id = i+1; + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + + /* Front end dir is a number, which is used as the id. */ + info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); + dev_set_drvdata(&dev->dev, info); + + err = talk_to_blkback(dev, info); + if (err) { + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + return err; + } + + return 0; +} + +static void split_bio_end(struct bio *bio, int error) +{ + struct split_bio *split_bio = bio->bi_private; + + if (error) + split_bio->err = error; + + if (atomic_dec_and_test(&split_bio->pending)) { + split_bio->bio->bi_phys_segments = 0; + bio_endio(split_bio->bio, split_bio->err); + kfree(split_bio); + } + bio_put(bio); +} + +static int blkif_recover(struct blkfront_info *info) +{ + int i; + struct request *req, *n; + struct blk_shadow *copy; + int rc; + struct bio *bio, *cloned_bio; + struct bio_list bio_list, merge_bio; + unsigned int segs, offset; + int pending, size; + struct split_bio *split_bio; + struct list_head requests; + + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(info->shadow, sizeof(info->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&info->shadow, 0, sizeof(info->shadow)); + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.u.rw.id = i+1; + info->shadow_free = info->ring.req_prod_pvt; + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(info); + if (rc) { + kfree(copy); + return rc; + } + + segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; + blk_queue_max_segments(info->rq, segs); + bio_list_init(&bio_list); + INIT_LIST_HEAD(&requests); + for (i = 0; i < BLK_RING_SIZE; i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + + /* + * Get the bios in the request so we can re-queue them. + */ + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); + } + + kfree(copy); + + /* + * Empty the queue, this is important because we might have + * requests in the queue with more segments than what we + * can handle now. + */ + spin_lock_irq(&info->io_lock); + while ((req = blk_fetch_request(info->rq)) != NULL) { + if (req->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + list_add(&req->queuelist, &requests); + continue; + } + merge_bio.head = req->bio; + merge_bio.tail = req->biotail; + bio_list_merge(&bio_list, &merge_bio); + req->bio = NULL; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) + pr_alert("diskcache flush request found!\n"); + __blk_end_request_all(req, 0); + } + spin_unlock_irq(&info->io_lock); + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + spin_lock_irq(&info->io_lock); + + /* Now safe for us to use the shared ring */ + info->connected = BLKIF_STATE_CONNECTED; + + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(info); + + list_for_each_entry_safe(req, n, &requests, queuelist) { + /* Requeue pending requests (flush or discard) */ + list_del_init(&req->queuelist); + BUG_ON(req->nr_phys_segments > segs); + blk_requeue_request(info->rq, req); + } + spin_unlock_irq(&info->io_lock); + + while ((bio = bio_list_pop(&bio_list)) != NULL) { + /* Traverse the list of pending bios and re-queue them */ + if (bio_segments(bio) > segs) { + /* + * This bio has more segments than what we can + * handle, we have to split it. + */ + pending = (bio_segments(bio) + segs - 1) / segs; + split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO); + BUG_ON(split_bio == NULL); + atomic_set(&split_bio->pending, pending); + split_bio->bio = bio; + for (i = 0; i < pending; i++) { + offset = (i * segs * PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + (unsigned int)bio_sectors(bio) - offset); + cloned_bio = bio_clone(bio, GFP_NOIO); + BUG_ON(cloned_bio == NULL); + bio_trim(cloned_bio, offset, size); + cloned_bio->bi_private = split_bio; + cloned_bio->bi_end_io = split_bio_end; + submit_bio(cloned_bio->bi_rw, cloned_bio); + } + /* + * Now we have to wait for all those smaller bios to + * end, so we can also end the "parent" bio. + */ + continue; + } + /* We don't need to split this bio */ + submit_bio(bio->bi_rw, bio); + } + + return 0; +} + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our blkif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + int err; + + dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); + + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); + + err = talk_to_blkback(dev, info); + + /* + * We have to wait for the backend to switch to + * connected state, since we want to read which + * features it supports. + */ + + return err; +} + +static void +blkfront_closing(struct blkfront_info *info) +{ + struct xenbus_device *xbdev = info->xbdev; + struct block_device *bdev = NULL; + + mutex_lock(&info->mutex); + + if (xbdev->state == XenbusStateClosing) { + mutex_unlock(&info->mutex); + return; + } + + if (info->gd) + bdev = bdget_disk(info->gd, 0); + + mutex_unlock(&info->mutex); + + if (!bdev) { + xenbus_frontend_closed(xbdev); + return; + } + + mutex_lock(&bdev->bd_mutex); + + if (bdev->bd_openers) { + xenbus_dev_error(xbdev, -EBUSY, + "Device in use; refusing to close"); + xenbus_switch_state(xbdev, XenbusStateClosing); + } else { + xlvbd_release_gendisk(info); + xenbus_frontend_closed(xbdev); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); +} + +static void blkfront_setup_discard(struct blkfront_info *info) +{ + int err; + unsigned int discard_granularity; + unsigned int discard_alignment; + unsigned int discard_secure; + + info->feature_discard = 1; + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-granularity", "%u", &discard_granularity, + "discard-alignment", "%u", &discard_alignment, + NULL); + if (!err) { + info->discard_granularity = discard_granularity; + info->discard_alignment = discard_alignment; + } + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%d", &discard_secure, + NULL); + if (!err) + info->feature_secdiscard = !!discard_secure; +} + +static int blkfront_setup_indirect(struct blkfront_info *info) +{ + unsigned int indirect_segments, segs; + int err, i; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, + NULL); + if (err) { + info->max_indirect_segments = 0; + segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; + } else { + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + segs = info->max_indirect_segments; + } + + err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + if (err) + goto out_of_memory; + + if (!info->feature_persistent && info->max_indirect_segments) { + /* + * We are using indirect descriptors but not persistent + * grants, we need to allocate a set of pages that can be + * used for mapping indirect grefs + */ + int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; + + BUG_ON(!list_empty(&info->indirect_pages)); + for (i = 0; i < num; i++) { + struct page *indirect_page = alloc_page(GFP_NOIO); + if (!indirect_page) + goto out_of_memory; + list_add(&indirect_page->lru, &info->indirect_pages); + } + } + + for (i = 0; i < BLK_RING_SIZE; i++) { + info->shadow[i].grants_used = kzalloc( + sizeof(info->shadow[i].grants_used[0]) * segs, + GFP_NOIO); + info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + if (info->max_indirect_segments) + info->shadow[i].indirect_grants = kzalloc( + sizeof(info->shadow[i].indirect_grants[0]) * + INDIRECT_GREFS(segs), + GFP_NOIO); + if ((info->shadow[i].grants_used == NULL) || + (info->shadow[i].sg == NULL) || + (info->max_indirect_segments && + (info->shadow[i].indirect_grants == NULL))) + goto out_of_memory; + sg_init_table(info->shadow[i].sg, segs); + } + + + return 0; + +out_of_memory: + for (i = 0; i < BLK_RING_SIZE; i++) { + kfree(info->shadow[i].grants_used); + info->shadow[i].grants_used = NULL; + kfree(info->shadow[i].sg); + info->shadow[i].sg = NULL; + kfree(info->shadow[i].indirect_grants); + info->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&info->indirect_pages)) { + struct page *indirect_page, *n; + list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_del(&indirect_page->lru); + __free_page(indirect_page); + } + } + return -ENOMEM; +} + +/* + * Invoked when the backend is finally 'ready' (and has told produced + * the details about the physical device - #sectors, size, etc). + */ +static void blkfront_connect(struct blkfront_info *info) +{ + unsigned long long sectors; + unsigned long sector_size; + unsigned int physical_sector_size; + unsigned int binfo; + int err; + int barrier, flush, discard, persistent; + + switch (info->connected) { + case BLKIF_STATE_CONNECTED: + /* + * Potentially, the back-end may be signalling + * a capacity change; update the capacity. + */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "sectors", "%Lu", §ors); + if (XENBUS_EXIST_ERR(err)) + return; + printk(KERN_INFO "Setting capacity to %Lu\n", + sectors); + set_capacity(info->gd, sectors); + revalidate_disk(info->gd); + + return; + case BLKIF_STATE_SUSPENDED: + /* + * If we are recovering from suspension, we need to wait + * for the backend to announce it's features before + * reconnecting, at least we need to know if the backend + * supports indirect descriptors, and how many. + */ + blkif_recover(info); + return; + + default: + break; + } + + dev_dbg(&info->xbdev->dev, "%s:%s.\n", + __func__, info->xbdev->otherend); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "sectors", "%llu", §ors, + "info", "%u", &binfo, + "sector-size", "%lu", §or_size, + NULL); + if (err) { + xenbus_dev_fatal(info->xbdev, err, + "reading backend fields at %s", + info->xbdev->otherend); + return; + } + + /* + * physcial-sector-size is a newer field, so old backends may not + * provide this. Assume physical sector size to be the same as + * sector_size in that case. + */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "physical-sector-size", "%u", &physical_sector_size); + if (err != 1) + physical_sector_size = sector_size; + + info->feature_flush = 0; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%d", &barrier, + NULL); + + /* + * If there's no "feature-barrier" defined, then it means + * we're dealing with a very old backend which writes + * synchronously; nothing to do. + * + * If there are barriers, then we use flush. + */ + if (!err && barrier) + info->feature_flush = REQ_FLUSH | REQ_FUA; + /* + * And if there is "feature-flush-cache" use that above + * barriers. + */ + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush, + NULL); + + if (!err && flush) + info->feature_flush = REQ_FLUSH; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard, + NULL); + + if (!err && discard) + blkfront_setup_discard(info); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-persistent", "%u", &persistent, + NULL); + if (err) + info->feature_persistent = 0; + else + info->feature_persistent = persistent; + + err = blkfront_setup_indirect(info); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + return; + } + + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, + physical_sector_size); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", + info->xbdev->otherend); + return; + } + + xenbus_switch_state(info->xbdev, XenbusStateConnected); + + /* Kick pending requests. */ + spin_lock_irq(&info->io_lock); + info->connected = BLKIF_STATE_CONNECTED; + kick_pending_request_queues(info); + spin_unlock_irq(&info->io_lock); + + add_disk(info->gd); + + info->is_ready = 1; +} + +/** + * Callback received when the backend's state changes. + */ +static void blkback_changed(struct xenbus_device *dev, + enum xenbus_state backend_state) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + + dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateReconfiguring: + case XenbusStateReconfigured: + case XenbusStateUnknown: + break; + + case XenbusStateConnected: + blkfront_connect(info); + break; + + case XenbusStateClosed: + if (dev->state == XenbusStateClosed) + break; + /* Missed the backend's Closing state -- fallthrough */ + case XenbusStateClosing: + blkfront_closing(info); + break; + } +} + +static int blkfront_remove(struct xenbus_device *xbdev) +{ + struct blkfront_info *info = dev_get_drvdata(&xbdev->dev); + struct block_device *bdev = NULL; + struct gendisk *disk; + + dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename); + + blkif_free(info, 0); + + mutex_lock(&info->mutex); + + disk = info->gd; + if (disk) + bdev = bdget_disk(disk, 0); + + info->xbdev = NULL; + mutex_unlock(&info->mutex); + + if (!bdev) { + kfree(info); + return 0; + } + + /* + * The xbdev was removed before we reached the Closed + * state. See if it's safe to remove the disk. If the bdev + * isn't closed yet, we let release take care of it. + */ + + mutex_lock(&bdev->bd_mutex); + info = disk->private_data; + + dev_warn(disk_to_dev(disk), + "%s was hot-unplugged, %d stale handles\n", + xbdev->nodename, bdev->bd_openers); + + if (info && !bdev->bd_openers) { + xlvbd_release_gendisk(info); + disk->private_data = NULL; + kfree(info); + } + + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + + return 0; +} + +static int blkfront_is_ready(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev_get_drvdata(&dev->dev); + + return info->is_ready && info->xbdev; +} + +static int blkif_open(struct block_device *bdev, fmode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + struct blkfront_info *info; + int err = 0; + + mutex_lock(&blkfront_mutex); + + info = disk->private_data; + if (!info) { + /* xbdev gone */ + err = -ERESTARTSYS; + goto out; + } + + mutex_lock(&info->mutex); + + if (!info->gd) + /* xbdev is closed */ + err = -ERESTARTSYS; + + mutex_unlock(&info->mutex); + +out: + mutex_unlock(&blkfront_mutex); + return err; +} + +static void blkif_release(struct gendisk *disk, fmode_t mode) +{ + struct blkfront_info *info = disk->private_data; + struct block_device *bdev; + struct xenbus_device *xbdev; + + mutex_lock(&blkfront_mutex); + + bdev = bdget_disk(disk, 0); + + if (!bdev) { + WARN(1, "Block device %s yanked out from us!\n", disk->disk_name); + goto out_mutex; + } + if (bdev->bd_openers) + goto out; + + /* + * Check if we have been instructed to close. We will have + * deferred this request, because the bdev was still open. + */ + + mutex_lock(&info->mutex); + xbdev = info->xbdev; + + if (xbdev && xbdev->state == XenbusStateClosing) { + /* pending switch to state closed */ + dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); + xlvbd_release_gendisk(info); + xenbus_frontend_closed(info->xbdev); + } + + mutex_unlock(&info->mutex); + + if (!xbdev) { + /* sudden device removal */ + dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); + xlvbd_release_gendisk(info); + disk->private_data = NULL; + kfree(info); + } + +out: + bdput(bdev); +out_mutex: + mutex_unlock(&blkfront_mutex); +} + +static const struct block_device_operations xlvbd_block_fops = +{ + .owner = THIS_MODULE, + .open = blkif_open, + .release = blkif_release, + .getgeo = blkif_getgeo, + .ioctl = blkif_ioctl, +}; + + +static const struct xenbus_device_id blkfront_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkfront_driver = { + .ids = blkfront_ids, + .probe = blkfront_probe, + .remove = blkfront_remove, + .resume = blkfront_resume, + .otherend_changed = blkback_changed, + .is_ready = blkfront_is_ready, +}; + +static int __init xlblk_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + if (!xen_has_pv_disk_devices()) + return -ENODEV; + + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { + printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", + XENVBD_MAJOR, DEV_NAME); + return -ENODEV; + } + + ret = xenbus_register_frontend(&blkfront_driver); + if (ret) { + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + return ret; + } + + return 0; +} +module_init(xlblk_init); + + +static void __exit xlblk_exit(void) +{ + xenbus_unregister_driver(&blkfront_driver); + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + kfree(minors); +} +module_exit(xlblk_exit); + +MODULE_DESCRIPTION("Xen virtual block device frontend"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); +MODULE_ALIAS("xen:vbd"); +MODULE_ALIAS("xenblk"); diff --git a/kernel/drivers/block/xsysace.c b/kernel/drivers/block/xsysace.c new file mode 100644 index 000000000..c4328d9d9 --- /dev/null +++ b/kernel/drivers/block/xsysace.c @@ -0,0 +1,1245 @@ +/* + * Xilinx SystemACE device driver + * + * Copyright 2007 Secret Lab Technologies Ltd. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +/* + * The SystemACE chip is designed to configure FPGAs by loading an FPGA + * bitstream from a file on a CF card and squirting it into FPGAs connected + * to the SystemACE JTAG chain. It also has the advantage of providing an + * MPU interface which can be used to control the FPGA configuration process + * and to use the attached CF card for general purpose storage. + * + * This driver is a block device driver for the SystemACE. + * + * Initialization: + * The driver registers itself as a platform_device driver at module + * load time. The platform bus will take care of calling the + * ace_probe() method for all SystemACE instances in the system. Any + * number of SystemACE instances are supported. ace_probe() calls + * ace_setup() which initialized all data structures, reads the CF + * id structure and registers the device. + * + * Processing: + * Just about all of the heavy lifting in this driver is performed by + * a Finite State Machine (FSM). The driver needs to wait on a number + * of events; some raised by interrupts, some which need to be polled + * for. Describing all of the behaviour in a FSM seems to be the + * easiest way to keep the complexity low and make it easy to + * understand what the driver is doing. If the block ops or the + * request function need to interact with the hardware, then they + * simply need to flag the request and kick of FSM processing. + * + * The FSM itself is atomic-safe code which can be run from any + * context. The general process flow is: + * 1. obtain the ace->lock spinlock. + * 2. loop on ace_fsm_dostate() until the ace->fsm_continue flag is + * cleared. + * 3. release the lock. + * + * Individual states do not sleep in any way. If a condition needs to + * be waited for then the state much clear the fsm_continue flag and + * either schedule the FSM to be run again at a later time, or expect + * an interrupt to call the FSM when the desired condition is met. + * + * In normal operation, the FSM is processed at interrupt context + * either when the driver's tasklet is scheduled, or when an irq is + * raised by the hardware. The tasklet can be scheduled at any time. + * The request method in particular schedules the tasklet when a new + * request has been indicated by the block layer. Once started, the + * FSM proceeds as far as it can processing the request until it + * needs on a hardware event. At this point, it must yield execution. + * + * A state has two options when yielding execution: + * 1. ace_fsm_yield() + * - Call if need to poll for event. + * - clears the fsm_continue flag to exit the processing loop + * - reschedules the tasklet to run again as soon as possible + * 2. ace_fsm_yieldirq() + * - Call if an irq is expected from the HW + * - clears the fsm_continue flag to exit the processing loop + * - does not reschedule the tasklet so the FSM will not be processed + * again until an irq is received. + * After calling a yield function, the state must return control back + * to the FSM main loop. + * + * Additionally, the driver maintains a kernel timer which can process + * the FSM. If the FSM gets stalled, typically due to a missed + * interrupt, then the kernel timer will expire and the driver can + * continue where it left off. + * + * To Do: + * - Add FPGA configuration control interface. + * - Request major number from lanana + */ + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_OF) +#include +#include +#include +#endif + +MODULE_AUTHOR("Grant Likely "); +MODULE_DESCRIPTION("Xilinx SystemACE device driver"); +MODULE_LICENSE("GPL"); + +/* SystemACE register definitions */ +#define ACE_BUSMODE (0x00) + +#define ACE_STATUS (0x04) +#define ACE_STATUS_CFGLOCK (0x00000001) +#define ACE_STATUS_MPULOCK (0x00000002) +#define ACE_STATUS_CFGERROR (0x00000004) /* config controller error */ +#define ACE_STATUS_CFCERROR (0x00000008) /* CF controller error */ +#define ACE_STATUS_CFDETECT (0x00000010) +#define ACE_STATUS_DATABUFRDY (0x00000020) +#define ACE_STATUS_DATABUFMODE (0x00000040) +#define ACE_STATUS_CFGDONE (0x00000080) +#define ACE_STATUS_RDYFORCFCMD (0x00000100) +#define ACE_STATUS_CFGMODEPIN (0x00000200) +#define ACE_STATUS_CFGADDR_MASK (0x0000e000) +#define ACE_STATUS_CFBSY (0x00020000) +#define ACE_STATUS_CFRDY (0x00040000) +#define ACE_STATUS_CFDWF (0x00080000) +#define ACE_STATUS_CFDSC (0x00100000) +#define ACE_STATUS_CFDRQ (0x00200000) +#define ACE_STATUS_CFCORR (0x00400000) +#define ACE_STATUS_CFERR (0x00800000) + +#define ACE_ERROR (0x08) +#define ACE_CFGLBA (0x0c) +#define ACE_MPULBA (0x10) + +#define ACE_SECCNTCMD (0x14) +#define ACE_SECCNTCMD_RESET (0x0100) +#define ACE_SECCNTCMD_IDENTIFY (0x0200) +#define ACE_SECCNTCMD_READ_DATA (0x0300) +#define ACE_SECCNTCMD_WRITE_DATA (0x0400) +#define ACE_SECCNTCMD_ABORT (0x0600) + +#define ACE_VERSION (0x16) +#define ACE_VERSION_REVISION_MASK (0x00FF) +#define ACE_VERSION_MINOR_MASK (0x0F00) +#define ACE_VERSION_MAJOR_MASK (0xF000) + +#define ACE_CTRL (0x18) +#define ACE_CTRL_FORCELOCKREQ (0x0001) +#define ACE_CTRL_LOCKREQ (0x0002) +#define ACE_CTRL_FORCECFGADDR (0x0004) +#define ACE_CTRL_FORCECFGMODE (0x0008) +#define ACE_CTRL_CFGMODE (0x0010) +#define ACE_CTRL_CFGSTART (0x0020) +#define ACE_CTRL_CFGSEL (0x0040) +#define ACE_CTRL_CFGRESET (0x0080) +#define ACE_CTRL_DATABUFRDYIRQ (0x0100) +#define ACE_CTRL_ERRORIRQ (0x0200) +#define ACE_CTRL_CFGDONEIRQ (0x0400) +#define ACE_CTRL_RESETIRQ (0x0800) +#define ACE_CTRL_CFGPROG (0x1000) +#define ACE_CTRL_CFGADDR_MASK (0xe000) + +#define ACE_FATSTAT (0x1c) + +#define ACE_NUM_MINORS 16 +#define ACE_SECTOR_SIZE (512) +#define ACE_FIFO_SIZE (32) +#define ACE_BUF_PER_SECTOR (ACE_SECTOR_SIZE / ACE_FIFO_SIZE) + +#define ACE_BUS_WIDTH_8 0 +#define ACE_BUS_WIDTH_16 1 + +struct ace_reg_ops; + +struct ace_device { + /* driver state data */ + int id; + int media_change; + int users; + struct list_head list; + + /* finite state machine data */ + struct tasklet_struct fsm_tasklet; + uint fsm_task; /* Current activity (ACE_TASK_*) */ + uint fsm_state; /* Current state (ACE_FSM_STATE_*) */ + uint fsm_continue_flag; /* cleared to exit FSM mainloop */ + uint fsm_iter_num; + struct timer_list stall_timer; + + /* Transfer state/result, use for both id and block request */ + struct request *req; /* request being processed */ + void *data_ptr; /* pointer to I/O buffer */ + int data_count; /* number of buffers remaining */ + int data_result; /* Result of transfer; 0 := success */ + + int id_req_count; /* count of id requests */ + int id_result; + struct completion id_completion; /* used when id req finishes */ + int in_irq; + + /* Details of hardware device */ + resource_size_t physaddr; + void __iomem *baseaddr; + int irq; + int bus_width; /* 0 := 8 bit; 1 := 16 bit */ + struct ace_reg_ops *reg_ops; + int lock_count; + + /* Block device data structures */ + spinlock_t lock; + struct device *dev; + struct request_queue *queue; + struct gendisk *gd; + + /* Inserted CF card parameters */ + u16 cf_id[ATA_ID_WORDS]; +}; + +static DEFINE_MUTEX(xsysace_mutex); +static int ace_major; + +/* --------------------------------------------------------------------- + * Low level register access + */ + +struct ace_reg_ops { + u16(*in) (struct ace_device * ace, int reg); + void (*out) (struct ace_device * ace, int reg, u16 val); + void (*datain) (struct ace_device * ace); + void (*dataout) (struct ace_device * ace); +}; + +/* 8 Bit bus width */ +static u16 ace_in_8(struct ace_device *ace, int reg) +{ + void __iomem *r = ace->baseaddr + reg; + return in_8(r) | (in_8(r + 1) << 8); +} + +static void ace_out_8(struct ace_device *ace, int reg, u16 val) +{ + void __iomem *r = ace->baseaddr + reg; + out_8(r, val); + out_8(r + 1, val >> 8); +} + +static void ace_datain_8(struct ace_device *ace) +{ + void __iomem *r = ace->baseaddr + 0x40; + u8 *dst = ace->data_ptr; + int i = ACE_FIFO_SIZE; + while (i--) + *dst++ = in_8(r++); + ace->data_ptr = dst; +} + +static void ace_dataout_8(struct ace_device *ace) +{ + void __iomem *r = ace->baseaddr + 0x40; + u8 *src = ace->data_ptr; + int i = ACE_FIFO_SIZE; + while (i--) + out_8(r++, *src++); + ace->data_ptr = src; +} + +static struct ace_reg_ops ace_reg_8_ops = { + .in = ace_in_8, + .out = ace_out_8, + .datain = ace_datain_8, + .dataout = ace_dataout_8, +}; + +/* 16 bit big endian bus attachment */ +static u16 ace_in_be16(struct ace_device *ace, int reg) +{ + return in_be16(ace->baseaddr + reg); +} + +static void ace_out_be16(struct ace_device *ace, int reg, u16 val) +{ + out_be16(ace->baseaddr + reg, val); +} + +static void ace_datain_be16(struct ace_device *ace) +{ + int i = ACE_FIFO_SIZE / 2; + u16 *dst = ace->data_ptr; + while (i--) + *dst++ = in_le16(ace->baseaddr + 0x40); + ace->data_ptr = dst; +} + +static void ace_dataout_be16(struct ace_device *ace) +{ + int i = ACE_FIFO_SIZE / 2; + u16 *src = ace->data_ptr; + while (i--) + out_le16(ace->baseaddr + 0x40, *src++); + ace->data_ptr = src; +} + +/* 16 bit little endian bus attachment */ +static u16 ace_in_le16(struct ace_device *ace, int reg) +{ + return in_le16(ace->baseaddr + reg); +} + +static void ace_out_le16(struct ace_device *ace, int reg, u16 val) +{ + out_le16(ace->baseaddr + reg, val); +} + +static void ace_datain_le16(struct ace_device *ace) +{ + int i = ACE_FIFO_SIZE / 2; + u16 *dst = ace->data_ptr; + while (i--) + *dst++ = in_be16(ace->baseaddr + 0x40); + ace->data_ptr = dst; +} + +static void ace_dataout_le16(struct ace_device *ace) +{ + int i = ACE_FIFO_SIZE / 2; + u16 *src = ace->data_ptr; + while (i--) + out_be16(ace->baseaddr + 0x40, *src++); + ace->data_ptr = src; +} + +static struct ace_reg_ops ace_reg_be16_ops = { + .in = ace_in_be16, + .out = ace_out_be16, + .datain = ace_datain_be16, + .dataout = ace_dataout_be16, +}; + +static struct ace_reg_ops ace_reg_le16_ops = { + .in = ace_in_le16, + .out = ace_out_le16, + .datain = ace_datain_le16, + .dataout = ace_dataout_le16, +}; + +static inline u16 ace_in(struct ace_device *ace, int reg) +{ + return ace->reg_ops->in(ace, reg); +} + +static inline u32 ace_in32(struct ace_device *ace, int reg) +{ + return ace_in(ace, reg) | (ace_in(ace, reg + 2) << 16); +} + +static inline void ace_out(struct ace_device *ace, int reg, u16 val) +{ + ace->reg_ops->out(ace, reg, val); +} + +static inline void ace_out32(struct ace_device *ace, int reg, u32 val) +{ + ace_out(ace, reg, val); + ace_out(ace, reg + 2, val >> 16); +} + +/* --------------------------------------------------------------------- + * Debug support functions + */ + +#if defined(DEBUG) +static void ace_dump_mem(void *base, int len) +{ + const char *ptr = base; + int i, j; + + for (i = 0; i < len; i += 16) { + printk(KERN_INFO "%.8x:", i); + for (j = 0; j < 16; j++) { + if (!(j % 4)) + printk(" "); + printk("%.2x", ptr[i + j]); + } + printk(" "); + for (j = 0; j < 16; j++) + printk("%c", isprint(ptr[i + j]) ? ptr[i + j] : '.'); + printk("\n"); + } +} +#else +static inline void ace_dump_mem(void *base, int len) +{ +} +#endif + +static void ace_dump_regs(struct ace_device *ace) +{ + dev_info(ace->dev, + " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n" + " status:%.8x mpu_lba:%.8x busmode:%4x\n" + " error: %.8x cfg_lba:%.8x fatstat:%.4x\n", + ace_in32(ace, ACE_CTRL), + ace_in(ace, ACE_SECCNTCMD), + ace_in(ace, ACE_VERSION), + ace_in32(ace, ACE_STATUS), + ace_in32(ace, ACE_MPULBA), + ace_in(ace, ACE_BUSMODE), + ace_in32(ace, ACE_ERROR), + ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT)); +} + +static void ace_fix_driveid(u16 *id) +{ +#if defined(__BIG_ENDIAN) + int i; + + /* All half words have wrong byte order; swap the bytes */ + for (i = 0; i < ATA_ID_WORDS; i++, id++) + *id = le16_to_cpu(*id); +#endif +} + +/* --------------------------------------------------------------------- + * Finite State Machine (FSM) implementation + */ + +/* FSM tasks; used to direct state transitions */ +#define ACE_TASK_IDLE 0 +#define ACE_TASK_IDENTIFY 1 +#define ACE_TASK_READ 2 +#define ACE_TASK_WRITE 3 +#define ACE_FSM_NUM_TASKS 4 + +/* FSM state definitions */ +#define ACE_FSM_STATE_IDLE 0 +#define ACE_FSM_STATE_REQ_LOCK 1 +#define ACE_FSM_STATE_WAIT_LOCK 2 +#define ACE_FSM_STATE_WAIT_CFREADY 3 +#define ACE_FSM_STATE_IDENTIFY_PREPARE 4 +#define ACE_FSM_STATE_IDENTIFY_TRANSFER 5 +#define ACE_FSM_STATE_IDENTIFY_COMPLETE 6 +#define ACE_FSM_STATE_REQ_PREPARE 7 +#define ACE_FSM_STATE_REQ_TRANSFER 8 +#define ACE_FSM_STATE_REQ_COMPLETE 9 +#define ACE_FSM_STATE_ERROR 10 +#define ACE_FSM_NUM_STATES 11 + +/* Set flag to exit FSM loop and reschedule tasklet */ +static inline void ace_fsm_yield(struct ace_device *ace) +{ + dev_dbg(ace->dev, "ace_fsm_yield()\n"); + tasklet_schedule(&ace->fsm_tasklet); + ace->fsm_continue_flag = 0; +} + +/* Set flag to exit FSM loop and wait for IRQ to reschedule tasklet */ +static inline void ace_fsm_yieldirq(struct ace_device *ace) +{ + dev_dbg(ace->dev, "ace_fsm_yieldirq()\n"); + + if (!ace->irq) + /* No IRQ assigned, so need to poll */ + tasklet_schedule(&ace->fsm_tasklet); + ace->fsm_continue_flag = 0; +} + +/* Get the next read/write request; ending requests that we don't handle */ +static struct request *ace_get_next_request(struct request_queue *q) +{ + struct request *req; + + while ((req = blk_peek_request(q)) != NULL) { + if (req->cmd_type == REQ_TYPE_FS) + break; + blk_start_request(req); + __blk_end_request_all(req, -EIO); + } + return req; +} + +static void ace_fsm_dostate(struct ace_device *ace) +{ + struct request *req; + u32 status; + u16 val; + int count; + +#if defined(DEBUG) + dev_dbg(ace->dev, "fsm_state=%i, id_req_count=%i\n", + ace->fsm_state, ace->id_req_count); +#endif + + /* Verify that there is actually a CF in the slot. If not, then + * bail out back to the idle state and wake up all the waiters */ + status = ace_in32(ace, ACE_STATUS); + if ((status & ACE_STATUS_CFDETECT) == 0) { + ace->fsm_state = ACE_FSM_STATE_IDLE; + ace->media_change = 1; + set_capacity(ace->gd, 0); + dev_info(ace->dev, "No CF in slot\n"); + + /* Drop all in-flight and pending requests */ + if (ace->req) { + __blk_end_request_all(ace->req, -EIO); + ace->req = NULL; + } + while ((req = blk_fetch_request(ace->queue)) != NULL) + __blk_end_request_all(req, -EIO); + + /* Drop back to IDLE state and notify waiters */ + ace->fsm_state = ACE_FSM_STATE_IDLE; + ace->id_result = -EIO; + while (ace->id_req_count) { + complete(&ace->id_completion); + ace->id_req_count--; + } + } + + switch (ace->fsm_state) { + case ACE_FSM_STATE_IDLE: + /* See if there is anything to do */ + if (ace->id_req_count || ace_get_next_request(ace->queue)) { + ace->fsm_iter_num++; + ace->fsm_state = ACE_FSM_STATE_REQ_LOCK; + mod_timer(&ace->stall_timer, jiffies + HZ); + if (!timer_pending(&ace->stall_timer)) + add_timer(&ace->stall_timer); + break; + } + del_timer(&ace->stall_timer); + ace->fsm_continue_flag = 0; + break; + + case ACE_FSM_STATE_REQ_LOCK: + if (ace_in(ace, ACE_STATUS) & ACE_STATUS_MPULOCK) { + /* Already have the lock, jump to next state */ + ace->fsm_state = ACE_FSM_STATE_WAIT_CFREADY; + break; + } + + /* Request the lock */ + val = ace_in(ace, ACE_CTRL); + ace_out(ace, ACE_CTRL, val | ACE_CTRL_LOCKREQ); + ace->fsm_state = ACE_FSM_STATE_WAIT_LOCK; + break; + + case ACE_FSM_STATE_WAIT_LOCK: + if (ace_in(ace, ACE_STATUS) & ACE_STATUS_MPULOCK) { + /* got the lock; move to next state */ + ace->fsm_state = ACE_FSM_STATE_WAIT_CFREADY; + break; + } + + /* wait a bit for the lock */ + ace_fsm_yield(ace); + break; + + case ACE_FSM_STATE_WAIT_CFREADY: + status = ace_in32(ace, ACE_STATUS); + if (!(status & ACE_STATUS_RDYFORCFCMD) || + (status & ACE_STATUS_CFBSY)) { + /* CF card isn't ready; it needs to be polled */ + ace_fsm_yield(ace); + break; + } + + /* Device is ready for command; determine what to do next */ + if (ace->id_req_count) + ace->fsm_state = ACE_FSM_STATE_IDENTIFY_PREPARE; + else + ace->fsm_state = ACE_FSM_STATE_REQ_PREPARE; + break; + + case ACE_FSM_STATE_IDENTIFY_PREPARE: + /* Send identify command */ + ace->fsm_task = ACE_TASK_IDENTIFY; + ace->data_ptr = ace->cf_id; + ace->data_count = ACE_BUF_PER_SECTOR; + ace_out(ace, ACE_SECCNTCMD, ACE_SECCNTCMD_IDENTIFY); + + /* As per datasheet, put config controller in reset */ + val = ace_in(ace, ACE_CTRL); + ace_out(ace, ACE_CTRL, val | ACE_CTRL_CFGRESET); + + /* irq handler takes over from this point; wait for the + * transfer to complete */ + ace->fsm_state = ACE_FSM_STATE_IDENTIFY_TRANSFER; + ace_fsm_yieldirq(ace); + break; + + case ACE_FSM_STATE_IDENTIFY_TRANSFER: + /* Check that the sysace is ready to receive data */ + status = ace_in32(ace, ACE_STATUS); + if (status & ACE_STATUS_CFBSY) { + dev_dbg(ace->dev, "CFBSY set; t=%i iter=%i dc=%i\n", + ace->fsm_task, ace->fsm_iter_num, + ace->data_count); + ace_fsm_yield(ace); + break; + } + if (!(status & ACE_STATUS_DATABUFRDY)) { + ace_fsm_yield(ace); + break; + } + + /* Transfer the next buffer */ + ace->reg_ops->datain(ace); + ace->data_count--; + + /* If there are still buffers to be transfers; jump out here */ + if (ace->data_count != 0) { + ace_fsm_yieldirq(ace); + break; + } + + /* transfer finished; kick state machine */ + dev_dbg(ace->dev, "identify finished\n"); + ace->fsm_state = ACE_FSM_STATE_IDENTIFY_COMPLETE; + break; + + case ACE_FSM_STATE_IDENTIFY_COMPLETE: + ace_fix_driveid(ace->cf_id); + ace_dump_mem(ace->cf_id, 512); /* Debug: Dump out disk ID */ + + if (ace->data_result) { + /* Error occurred, disable the disk */ + ace->media_change = 1; + set_capacity(ace->gd, 0); + dev_err(ace->dev, "error fetching CF id (%i)\n", + ace->data_result); + } else { + ace->media_change = 0; + + /* Record disk parameters */ + set_capacity(ace->gd, + ata_id_u32(ace->cf_id, ATA_ID_LBA_CAPACITY)); + dev_info(ace->dev, "capacity: %i sectors\n", + ata_id_u32(ace->cf_id, ATA_ID_LBA_CAPACITY)); + } + + /* We're done, drop to IDLE state and notify waiters */ + ace->fsm_state = ACE_FSM_STATE_IDLE; + ace->id_result = ace->data_result; + while (ace->id_req_count) { + complete(&ace->id_completion); + ace->id_req_count--; + } + break; + + case ACE_FSM_STATE_REQ_PREPARE: + req = ace_get_next_request(ace->queue); + if (!req) { + ace->fsm_state = ACE_FSM_STATE_IDLE; + break; + } + blk_start_request(req); + + /* Okay, it's a data request, set it up for transfer */ + dev_dbg(ace->dev, + "request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n", + (unsigned long long)blk_rq_pos(req), + blk_rq_sectors(req), blk_rq_cur_sectors(req), + rq_data_dir(req)); + + ace->req = req; + ace->data_ptr = bio_data(req->bio); + ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; + ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); + + count = blk_rq_sectors(req); + if (rq_data_dir(req)) { + /* Kick off write request */ + dev_dbg(ace->dev, "write data\n"); + ace->fsm_task = ACE_TASK_WRITE; + ace_out(ace, ACE_SECCNTCMD, + count | ACE_SECCNTCMD_WRITE_DATA); + } else { + /* Kick off read request */ + dev_dbg(ace->dev, "read data\n"); + ace->fsm_task = ACE_TASK_READ; + ace_out(ace, ACE_SECCNTCMD, + count | ACE_SECCNTCMD_READ_DATA); + } + + /* As per datasheet, put config controller in reset */ + val = ace_in(ace, ACE_CTRL); + ace_out(ace, ACE_CTRL, val | ACE_CTRL_CFGRESET); + + /* Move to the transfer state. The systemace will raise + * an interrupt once there is something to do + */ + ace->fsm_state = ACE_FSM_STATE_REQ_TRANSFER; + if (ace->fsm_task == ACE_TASK_READ) + ace_fsm_yieldirq(ace); /* wait for data ready */ + break; + + case ACE_FSM_STATE_REQ_TRANSFER: + /* Check that the sysace is ready to receive data */ + status = ace_in32(ace, ACE_STATUS); + if (status & ACE_STATUS_CFBSY) { + dev_dbg(ace->dev, + "CFBSY set; t=%i iter=%i c=%i dc=%i irq=%i\n", + ace->fsm_task, ace->fsm_iter_num, + blk_rq_cur_sectors(ace->req) * 16, + ace->data_count, ace->in_irq); + ace_fsm_yield(ace); /* need to poll CFBSY bit */ + break; + } + if (!(status & ACE_STATUS_DATABUFRDY)) { + dev_dbg(ace->dev, + "DATABUF not set; t=%i iter=%i c=%i dc=%i irq=%i\n", + ace->fsm_task, ace->fsm_iter_num, + blk_rq_cur_sectors(ace->req) * 16, + ace->data_count, ace->in_irq); + ace_fsm_yieldirq(ace); + break; + } + + /* Transfer the next buffer */ + if (ace->fsm_task == ACE_TASK_WRITE) + ace->reg_ops->dataout(ace); + else + ace->reg_ops->datain(ace); + ace->data_count--; + + /* If there are still buffers to be transfers; jump out here */ + if (ace->data_count != 0) { + ace_fsm_yieldirq(ace); + break; + } + + /* bio finished; is there another one? */ + if (__blk_end_request_cur(ace->req, 0)) { + /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", + * blk_rq_sectors(ace->req), + * blk_rq_cur_sectors(ace->req)); + */ + ace->data_ptr = bio_data(ace->req->bio); + ace->data_count = blk_rq_cur_sectors(ace->req) * 16; + ace_fsm_yieldirq(ace); + break; + } + + ace->fsm_state = ACE_FSM_STATE_REQ_COMPLETE; + break; + + case ACE_FSM_STATE_REQ_COMPLETE: + ace->req = NULL; + + /* Finished request; go to idle state */ + ace->fsm_state = ACE_FSM_STATE_IDLE; + break; + + default: + ace->fsm_state = ACE_FSM_STATE_IDLE; + break; + } +} + +static void ace_fsm_tasklet(unsigned long data) +{ + struct ace_device *ace = (void *)data; + unsigned long flags; + + spin_lock_irqsave(&ace->lock, flags); + + /* Loop over state machine until told to stop */ + ace->fsm_continue_flag = 1; + while (ace->fsm_continue_flag) + ace_fsm_dostate(ace); + + spin_unlock_irqrestore(&ace->lock, flags); +} + +static void ace_stall_timer(unsigned long data) +{ + struct ace_device *ace = (void *)data; + unsigned long flags; + + dev_warn(ace->dev, + "kicking stalled fsm; state=%i task=%i iter=%i dc=%i\n", + ace->fsm_state, ace->fsm_task, ace->fsm_iter_num, + ace->data_count); + spin_lock_irqsave(&ace->lock, flags); + + /* Rearm the stall timer *before* entering FSM (which may then + * delete the timer) */ + mod_timer(&ace->stall_timer, jiffies + HZ); + + /* Loop over state machine until told to stop */ + ace->fsm_continue_flag = 1; + while (ace->fsm_continue_flag) + ace_fsm_dostate(ace); + + spin_unlock_irqrestore(&ace->lock, flags); +} + +/* --------------------------------------------------------------------- + * Interrupt handling routines + */ +static int ace_interrupt_checkstate(struct ace_device *ace) +{ + u32 sreg = ace_in32(ace, ACE_STATUS); + u16 creg = ace_in(ace, ACE_CTRL); + + /* Check for error occurrence */ + if ((sreg & (ACE_STATUS_CFGERROR | ACE_STATUS_CFCERROR)) && + (creg & ACE_CTRL_ERRORIRQ)) { + dev_err(ace->dev, "transfer failure\n"); + ace_dump_regs(ace); + return -EIO; + } + + return 0; +} + +static irqreturn_t ace_interrupt(int irq, void *dev_id) +{ + u16 creg; + struct ace_device *ace = dev_id; + + /* be safe and get the lock */ + spin_lock(&ace->lock); + ace->in_irq = 1; + + /* clear the interrupt */ + creg = ace_in(ace, ACE_CTRL); + ace_out(ace, ACE_CTRL, creg | ACE_CTRL_RESETIRQ); + ace_out(ace, ACE_CTRL, creg); + + /* check for IO failures */ + if (ace_interrupt_checkstate(ace)) + ace->data_result = -EIO; + + if (ace->fsm_task == 0) { + dev_err(ace->dev, + "spurious irq; stat=%.8x ctrl=%.8x cmd=%.4x\n", + ace_in32(ace, ACE_STATUS), ace_in32(ace, ACE_CTRL), + ace_in(ace, ACE_SECCNTCMD)); + dev_err(ace->dev, "fsm_task=%i fsm_state=%i data_count=%i\n", + ace->fsm_task, ace->fsm_state, ace->data_count); + } + + /* Loop over state machine until told to stop */ + ace->fsm_continue_flag = 1; + while (ace->fsm_continue_flag) + ace_fsm_dostate(ace); + + /* done with interrupt; drop the lock */ + ace->in_irq = 0; + spin_unlock(&ace->lock); + + return IRQ_HANDLED; +} + +/* --------------------------------------------------------------------- + * Block ops + */ +static void ace_request(struct request_queue * q) +{ + struct request *req; + struct ace_device *ace; + + req = ace_get_next_request(q); + + if (req) { + ace = req->rq_disk->private_data; + tasklet_schedule(&ace->fsm_tasklet); + } +} + +static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing) +{ + struct ace_device *ace = gd->private_data; + dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change); + + return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0; +} + +static int ace_revalidate_disk(struct gendisk *gd) +{ + struct ace_device *ace = gd->private_data; + unsigned long flags; + + dev_dbg(ace->dev, "ace_revalidate_disk()\n"); + + if (ace->media_change) { + dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n"); + + spin_lock_irqsave(&ace->lock, flags); + ace->id_req_count++; + spin_unlock_irqrestore(&ace->lock, flags); + + tasklet_schedule(&ace->fsm_tasklet); + wait_for_completion(&ace->id_completion); + } + + dev_dbg(ace->dev, "revalidate complete\n"); + return ace->id_result; +} + +static int ace_open(struct block_device *bdev, fmode_t mode) +{ + struct ace_device *ace = bdev->bd_disk->private_data; + unsigned long flags; + + dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1); + + mutex_lock(&xsysace_mutex); + spin_lock_irqsave(&ace->lock, flags); + ace->users++; + spin_unlock_irqrestore(&ace->lock, flags); + + check_disk_change(bdev); + mutex_unlock(&xsysace_mutex); + + return 0; +} + +static void ace_release(struct gendisk *disk, fmode_t mode) +{ + struct ace_device *ace = disk->private_data; + unsigned long flags; + u16 val; + + dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1); + + mutex_lock(&xsysace_mutex); + spin_lock_irqsave(&ace->lock, flags); + ace->users--; + if (ace->users == 0) { + val = ace_in(ace, ACE_CTRL); + ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ); + } + spin_unlock_irqrestore(&ace->lock, flags); + mutex_unlock(&xsysace_mutex); +} + +static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + struct ace_device *ace = bdev->bd_disk->private_data; + u16 *cf_id = ace->cf_id; + + dev_dbg(ace->dev, "ace_getgeo()\n"); + + geo->heads = cf_id[ATA_ID_HEADS]; + geo->sectors = cf_id[ATA_ID_SECTORS]; + geo->cylinders = cf_id[ATA_ID_CYLS]; + + return 0; +} + +static const struct block_device_operations ace_fops = { + .owner = THIS_MODULE, + .open = ace_open, + .release = ace_release, + .check_events = ace_check_events, + .revalidate_disk = ace_revalidate_disk, + .getgeo = ace_getgeo, +}; + +/* -------------------------------------------------------------------- + * SystemACE device setup/teardown code + */ +static int ace_setup(struct ace_device *ace) +{ + u16 version; + u16 val; + int rc; + + dev_dbg(ace->dev, "ace_setup(ace=0x%p)\n", ace); + dev_dbg(ace->dev, "physaddr=0x%llx irq=%i\n", + (unsigned long long)ace->physaddr, ace->irq); + + spin_lock_init(&ace->lock); + init_completion(&ace->id_completion); + + /* + * Map the device + */ + ace->baseaddr = ioremap(ace->physaddr, 0x80); + if (!ace->baseaddr) + goto err_ioremap; + + /* + * Initialize the state machine tasklet and stall timer + */ + tasklet_init(&ace->fsm_tasklet, ace_fsm_tasklet, (unsigned long)ace); + setup_timer(&ace->stall_timer, ace_stall_timer, (unsigned long)ace); + + /* + * Initialize the request queue + */ + ace->queue = blk_init_queue(ace_request, &ace->lock); + if (ace->queue == NULL) + goto err_blk_initq; + blk_queue_logical_block_size(ace->queue, 512); + + /* + * Allocate and initialize GD structure + */ + ace->gd = alloc_disk(ACE_NUM_MINORS); + if (!ace->gd) + goto err_alloc_disk; + + ace->gd->major = ace_major; + ace->gd->first_minor = ace->id * ACE_NUM_MINORS; + ace->gd->fops = &ace_fops; + ace->gd->queue = ace->queue; + ace->gd->private_data = ace; + snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a'); + + /* set bus width */ + if (ace->bus_width == ACE_BUS_WIDTH_16) { + /* 0x0101 should work regardless of endianess */ + ace_out_le16(ace, ACE_BUSMODE, 0x0101); + + /* read it back to determine endianess */ + if (ace_in_le16(ace, ACE_BUSMODE) == 0x0001) + ace->reg_ops = &ace_reg_le16_ops; + else + ace->reg_ops = &ace_reg_be16_ops; + } else { + ace_out_8(ace, ACE_BUSMODE, 0x00); + ace->reg_ops = &ace_reg_8_ops; + } + + /* Make sure version register is sane */ + version = ace_in(ace, ACE_VERSION); + if ((version == 0) || (version == 0xFFFF)) + goto err_read; + + /* Put sysace in a sane state by clearing most control reg bits */ + ace_out(ace, ACE_CTRL, ACE_CTRL_FORCECFGMODE | + ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ); + + /* Now we can hook up the irq handler */ + if (ace->irq) { + rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace); + if (rc) { + /* Failure - fall back to polled mode */ + dev_err(ace->dev, "request_irq failed\n"); + ace->irq = 0; + } + } + + /* Enable interrupts */ + val = ace_in(ace, ACE_CTRL); + val |= ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ; + ace_out(ace, ACE_CTRL, val); + + /* Print the identification */ + dev_info(ace->dev, "Xilinx SystemACE revision %i.%i.%i\n", + (version >> 12) & 0xf, (version >> 8) & 0x0f, version & 0xff); + dev_dbg(ace->dev, "physaddr 0x%llx, mapped to 0x%p, irq=%i\n", + (unsigned long long) ace->physaddr, ace->baseaddr, ace->irq); + + ace->media_change = 1; + ace_revalidate_disk(ace->gd); + + /* Make the sysace device 'live' */ + add_disk(ace->gd); + + return 0; + +err_read: + put_disk(ace->gd); +err_alloc_disk: + blk_cleanup_queue(ace->queue); +err_blk_initq: + iounmap(ace->baseaddr); +err_ioremap: + dev_info(ace->dev, "xsysace: error initializing device at 0x%llx\n", + (unsigned long long) ace->physaddr); + return -ENOMEM; +} + +static void ace_teardown(struct ace_device *ace) +{ + if (ace->gd) { + del_gendisk(ace->gd); + put_disk(ace->gd); + } + + if (ace->queue) + blk_cleanup_queue(ace->queue); + + tasklet_kill(&ace->fsm_tasklet); + + if (ace->irq) + free_irq(ace->irq, ace); + + iounmap(ace->baseaddr); +} + +static int ace_alloc(struct device *dev, int id, resource_size_t physaddr, + int irq, int bus_width) +{ + struct ace_device *ace; + int rc; + dev_dbg(dev, "ace_alloc(%p)\n", dev); + + if (!physaddr) { + rc = -ENODEV; + goto err_noreg; + } + + /* Allocate and initialize the ace device structure */ + ace = kzalloc(sizeof(struct ace_device), GFP_KERNEL); + if (!ace) { + rc = -ENOMEM; + goto err_alloc; + } + + ace->dev = dev; + ace->id = id; + ace->physaddr = physaddr; + ace->irq = irq; + ace->bus_width = bus_width; + + /* Call the setup code */ + rc = ace_setup(ace); + if (rc) + goto err_setup; + + dev_set_drvdata(dev, ace); + return 0; + +err_setup: + dev_set_drvdata(dev, NULL); + kfree(ace); +err_alloc: +err_noreg: + dev_err(dev, "could not initialize device, err=%i\n", rc); + return rc; +} + +static void ace_free(struct device *dev) +{ + struct ace_device *ace = dev_get_drvdata(dev); + dev_dbg(dev, "ace_free(%p)\n", dev); + + if (ace) { + ace_teardown(ace); + dev_set_drvdata(dev, NULL); + kfree(ace); + } +} + +/* --------------------------------------------------------------------- + * Platform Bus Support + */ + +static int ace_probe(struct platform_device *dev) +{ + resource_size_t physaddr = 0; + int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */ + u32 id = dev->id; + int irq = 0; + int i; + + dev_dbg(&dev->dev, "ace_probe(%p)\n", dev); + + /* device id and bus width */ + if (of_property_read_u32(dev->dev.of_node, "port-number", &id)) + id = 0; + if (of_find_property(dev->dev.of_node, "8-bit", NULL)) + bus_width = ACE_BUS_WIDTH_8; + + for (i = 0; i < dev->num_resources; i++) { + if (dev->resource[i].flags & IORESOURCE_MEM) + physaddr = dev->resource[i].start; + if (dev->resource[i].flags & IORESOURCE_IRQ) + irq = dev->resource[i].start; + } + + /* Call the bus-independent setup code */ + return ace_alloc(&dev->dev, id, physaddr, irq, bus_width); +} + +/* + * Platform bus remove() method + */ +static int ace_remove(struct platform_device *dev) +{ + ace_free(&dev->dev); + return 0; +} + +#if defined(CONFIG_OF) +/* Match table for of_platform binding */ +static const struct of_device_id ace_of_match[] = { + { .compatible = "xlnx,opb-sysace-1.00.b", }, + { .compatible = "xlnx,opb-sysace-1.00.c", }, + { .compatible = "xlnx,xps-sysace-1.00.a", }, + { .compatible = "xlnx,sysace", }, + {}, +}; +MODULE_DEVICE_TABLE(of, ace_of_match); +#else /* CONFIG_OF */ +#define ace_of_match NULL +#endif /* CONFIG_OF */ + +static struct platform_driver ace_platform_driver = { + .probe = ace_probe, + .remove = ace_remove, + .driver = { + .name = "xsysace", + .of_match_table = ace_of_match, + }, +}; + +/* --------------------------------------------------------------------- + * Module init/exit routines + */ +static int __init ace_init(void) +{ + int rc; + + ace_major = register_blkdev(ace_major, "xsysace"); + if (ace_major <= 0) { + rc = -ENOMEM; + goto err_blk; + } + + rc = platform_driver_register(&ace_platform_driver); + if (rc) + goto err_plat; + + pr_info("Xilinx SystemACE device driver, major=%i\n", ace_major); + return 0; + +err_plat: + unregister_blkdev(ace_major, "xsysace"); +err_blk: + printk(KERN_ERR "xsysace: registration failed; err=%i\n", rc); + return rc; +} +module_init(ace_init); + +static void __exit ace_exit(void) +{ + pr_debug("Unregistering Xilinx SystemACE driver\n"); + platform_driver_unregister(&ace_platform_driver); + unregister_blkdev(ace_major, "xsysace"); +} +module_exit(ace_exit); diff --git a/kernel/drivers/block/z2ram.c b/kernel/drivers/block/z2ram.c new file mode 100644 index 000000000..968f9e52e --- /dev/null +++ b/kernel/drivers/block/z2ram.c @@ -0,0 +1,418 @@ +/* +** z2ram - Amiga pseudo-driver to access 16bit-RAM in ZorroII space +** as a block device, to be used as a RAM disk or swap space +** +** Copyright (C) 1994 by Ingo Wilken (Ingo.Wilken@informatik.uni-oldenburg.de) +** +** ++Geert: support for zorro_unused_z2ram, better range checking +** ++roman: translate accesses via an array +** ++Milan: support for ChipRAM usage +** ++yambo: converted to 2.0 kernel +** ++yambo: modularized and support added for 3 minor devices including: +** MAJOR MINOR DESCRIPTION +** ----- ----- ---------------------------------------------- +** 37 0 Use Zorro II and Chip ram +** 37 1 Use only Zorro II ram +** 37 2 Use only Chip ram +** 37 4-7 Use memory list entry 1-4 (first is 0) +** ++jskov: support for 1-4th memory list entry. +** +** Permission to use, copy, modify, and distribute this software and its +** documentation for any purpose and without fee is hereby granted, provided +** that the above copyright notice appear in all copies and that both that +** copyright notice and this permission notice appear in supporting +** documentation. This software is provided "as is" without express or +** implied warranty. +*/ + +#define DEVICE_NAME "Z2RAM" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +#define Z2MINOR_COMBINED (0) +#define Z2MINOR_Z2ONLY (1) +#define Z2MINOR_CHIPONLY (2) +#define Z2MINOR_MEMLIST1 (4) +#define Z2MINOR_MEMLIST2 (5) +#define Z2MINOR_MEMLIST3 (6) +#define Z2MINOR_MEMLIST4 (7) +#define Z2MINOR_COUNT (8) /* Move this down when adding a new minor */ + +#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 ) + +static DEFINE_MUTEX(z2ram_mutex); +static u_long *z2ram_map = NULL; +static u_long z2ram_size = 0; +static int z2_count = 0; +static int chip_count = 0; +static int list_count = 0; +static int current_device = -1; + +static DEFINE_SPINLOCK(z2ram_lock); + +static struct gendisk *z2ram_gendisk; + +static void do_z2_request(struct request_queue *q) +{ + struct request *req; + + req = blk_fetch_request(q); + while (req) { + unsigned long start = blk_rq_pos(req) << 9; + unsigned long len = blk_rq_cur_bytes(req); + int err = 0; + + if (start + len > z2ram_size) { + pr_err(DEVICE_NAME ": bad access: block=%llu, " + "count=%u\n", + (unsigned long long)blk_rq_pos(req), + blk_rq_cur_sectors(req)); + err = -EIO; + goto done; + } + while (len) { + unsigned long addr = start & Z2RAM_CHUNKMASK; + unsigned long size = Z2RAM_CHUNKSIZE - addr; + void *buffer = bio_data(req->bio); + + if (len < size) + size = len; + addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ]; + if (rq_data_dir(req) == READ) + memcpy(buffer, (char *)addr, size); + else + memcpy((char *)addr, buffer, size); + start += size; + len -= size; + } + done: + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); + } +} + +static void +get_z2ram( void ) +{ + int i; + + for ( i = 0; i < Z2RAM_SIZE / Z2RAM_CHUNKSIZE; i++ ) + { + if ( test_bit( i, zorro_unused_z2ram ) ) + { + z2_count++; + z2ram_map[z2ram_size++] = (unsigned long)ZTWO_VADDR(Z2RAM_START) + + (i << Z2RAM_CHUNKSHIFT); + clear_bit( i, zorro_unused_z2ram ); + } + } + + return; +} + +static void +get_chipram( void ) +{ + + while ( amiga_chip_avail() > ( Z2RAM_CHUNKSIZE * 4 ) ) + { + chip_count++; + z2ram_map[ z2ram_size ] = + (u_long)amiga_chip_alloc( Z2RAM_CHUNKSIZE, "z2ram" ); + + if ( z2ram_map[ z2ram_size ] == 0 ) + { + break; + } + + z2ram_size++; + } + + return; +} + +static int z2_open(struct block_device *bdev, fmode_t mode) +{ + int device; + int max_z2_map = ( Z2RAM_SIZE / Z2RAM_CHUNKSIZE ) * + sizeof( z2ram_map[0] ); + int max_chip_map = ( amiga_chip_size / Z2RAM_CHUNKSIZE ) * + sizeof( z2ram_map[0] ); + int rc = -ENOMEM; + + device = MINOR(bdev->bd_dev); + + mutex_lock(&z2ram_mutex); + if ( current_device != -1 && current_device != device ) + { + rc = -EBUSY; + goto err_out; + } + + if ( current_device == -1 ) + { + z2_count = 0; + chip_count = 0; + list_count = 0; + z2ram_size = 0; + + /* Use a specific list entry. */ + if (device >= Z2MINOR_MEMLIST1 && device <= Z2MINOR_MEMLIST4) { + int index = device - Z2MINOR_MEMLIST1 + 1; + unsigned long size, paddr, vaddr; + + if (index >= m68k_realnum_memory) { + printk( KERN_ERR DEVICE_NAME + ": no such entry in z2ram_map\n" ); + goto err_out; + } + + paddr = m68k_memory[index].addr; + size = m68k_memory[index].size & ~(Z2RAM_CHUNKSIZE-1); + +#ifdef __powerpc__ + /* FIXME: ioremap doesn't build correct memory tables. */ + { + vfree(vmalloc (size)); + } + + vaddr = (unsigned long) __ioremap (paddr, size, + _PAGE_WRITETHRU); + +#else + vaddr = (unsigned long)z_remap_nocache_nonser(paddr, size); +#endif + z2ram_map = + kmalloc((size/Z2RAM_CHUNKSIZE)*sizeof(z2ram_map[0]), + GFP_KERNEL); + if ( z2ram_map == NULL ) + { + printk( KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n" ); + goto err_out; + } + + while (size) { + z2ram_map[ z2ram_size++ ] = vaddr; + size -= Z2RAM_CHUNKSIZE; + vaddr += Z2RAM_CHUNKSIZE; + list_count++; + } + + if ( z2ram_size != 0 ) + printk( KERN_INFO DEVICE_NAME + ": using %iK List Entry %d Memory\n", + list_count * Z2RAM_CHUNK1024, index ); + } else + + switch ( device ) + { + case Z2MINOR_COMBINED: + + z2ram_map = kmalloc( max_z2_map + max_chip_map, GFP_KERNEL ); + if ( z2ram_map == NULL ) + { + printk( KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n" ); + goto err_out; + } + + get_z2ram(); + get_chipram(); + + if ( z2ram_size != 0 ) + printk( KERN_INFO DEVICE_NAME + ": using %iK Zorro II RAM and %iK Chip RAM (Total %dK)\n", + z2_count * Z2RAM_CHUNK1024, + chip_count * Z2RAM_CHUNK1024, + ( z2_count + chip_count ) * Z2RAM_CHUNK1024 ); + + break; + + case Z2MINOR_Z2ONLY: + z2ram_map = kmalloc( max_z2_map, GFP_KERNEL ); + if ( z2ram_map == NULL ) + { + printk( KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n" ); + goto err_out; + } + + get_z2ram(); + + if ( z2ram_size != 0 ) + printk( KERN_INFO DEVICE_NAME + ": using %iK of Zorro II RAM\n", + z2_count * Z2RAM_CHUNK1024 ); + + break; + + case Z2MINOR_CHIPONLY: + z2ram_map = kmalloc( max_chip_map, GFP_KERNEL ); + if ( z2ram_map == NULL ) + { + printk( KERN_ERR DEVICE_NAME + ": cannot get mem for z2ram_map\n" ); + goto err_out; + } + + get_chipram(); + + if ( z2ram_size != 0 ) + printk( KERN_INFO DEVICE_NAME + ": using %iK Chip RAM\n", + chip_count * Z2RAM_CHUNK1024 ); + + break; + + default: + rc = -ENODEV; + goto err_out; + + break; + } + + if ( z2ram_size == 0 ) + { + printk( KERN_NOTICE DEVICE_NAME + ": no unused ZII/Chip RAM found\n" ); + goto err_out_kfree; + } + + current_device = device; + z2ram_size <<= Z2RAM_CHUNKSHIFT; + set_capacity(z2ram_gendisk, z2ram_size >> 9); + } + + mutex_unlock(&z2ram_mutex); + return 0; + +err_out_kfree: + kfree(z2ram_map); +err_out: + mutex_unlock(&z2ram_mutex); + return rc; +} + +static void +z2_release(struct gendisk *disk, fmode_t mode) +{ + mutex_lock(&z2ram_mutex); + if ( current_device == -1 ) { + mutex_unlock(&z2ram_mutex); + return; + } + mutex_unlock(&z2ram_mutex); + /* + * FIXME: unmap memory + */ +} + +static const struct block_device_operations z2_fops = +{ + .owner = THIS_MODULE, + .open = z2_open, + .release = z2_release, +}; + +static struct kobject *z2_find(dev_t dev, int *part, void *data) +{ + *part = 0; + return get_disk(z2ram_gendisk); +} + +static struct request_queue *z2_queue; + +static int __init +z2_init(void) +{ + int ret; + + if (!MACH_IS_AMIGA) + return -ENODEV; + + ret = -EBUSY; + if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME)) + goto err; + + ret = -ENOMEM; + z2ram_gendisk = alloc_disk(1); + if (!z2ram_gendisk) + goto out_disk; + + z2_queue = blk_init_queue(do_z2_request, &z2ram_lock); + if (!z2_queue) + goto out_queue; + + z2ram_gendisk->major = Z2RAM_MAJOR; + z2ram_gendisk->first_minor = 0; + z2ram_gendisk->fops = &z2_fops; + sprintf(z2ram_gendisk->disk_name, "z2ram"); + + z2ram_gendisk->queue = z2_queue; + add_disk(z2ram_gendisk); + blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE, + z2_find, NULL, NULL); + + return 0; + +out_queue: + put_disk(z2ram_gendisk); +out_disk: + unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); +err: + return ret; +} + +static void __exit z2_exit(void) +{ + int i, j; + blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT); + unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); + del_gendisk(z2ram_gendisk); + put_disk(z2ram_gendisk); + blk_cleanup_queue(z2_queue); + + if ( current_device != -1 ) + { + i = 0; + + for ( j = 0 ; j < z2_count; j++ ) + { + set_bit( i++, zorro_unused_z2ram ); + } + + for ( j = 0 ; j < chip_count; j++ ) + { + if ( z2ram_map[ i ] ) + { + amiga_chip_free( (void *) z2ram_map[ i++ ] ); + } + } + + if ( z2ram_map != NULL ) + { + kfree( z2ram_map ); + } + } + + return; +} + +module_init(z2_init); +module_exit(z2_exit); +MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/zram/Kconfig b/kernel/drivers/block/zram/Kconfig new file mode 100644 index 000000000..6489c0fd0 --- /dev/null +++ b/kernel/drivers/block/zram/Kconfig @@ -0,0 +1,34 @@ +config ZRAM + tristate "Compressed RAM block device support" + depends on BLOCK && SYSFS && ZSMALLOC + select LZO_COMPRESS + select LZO_DECOMPRESS + default n + help + Creates virtual block devices called /dev/zramX (X = 0, 1, ...). + Pages written to these disks are compressed and stored in memory + itself. These disks allow very fast I/O and compression provides + good amounts of memory savings. + + It has several use cases, for example: /tmp storage, use as swap + disks and maybe many more. + + See zram.txt for more information. + +config ZRAM_LZ4_COMPRESS + bool "Enable LZ4 algorithm support" + depends on ZRAM + select LZ4_COMPRESS + select LZ4_DECOMPRESS + default n + help + This option enables LZ4 compression algorithm support. Compression + algorithm can be changed using `comp_algorithm' device attribute. + +config ZRAM_DEBUG + bool "Compressed RAM block device debug support" + depends on ZRAM + default n + help + This option adds additional debugging code to the compressed + RAM block device driver. diff --git a/kernel/drivers/block/zram/Makefile b/kernel/drivers/block/zram/Makefile new file mode 100644 index 000000000..be0763ff5 --- /dev/null +++ b/kernel/drivers/block/zram/Makefile @@ -0,0 +1,5 @@ +zram-y := zcomp_lzo.o zcomp.o zram_drv.o + +zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o + +obj-$(CONFIG_ZRAM) += zram.o diff --git a/kernel/drivers/block/zram/zcomp.c b/kernel/drivers/block/zram/zcomp.c new file mode 100644 index 000000000..f1ff39a3d --- /dev/null +++ b/kernel/drivers/block/zram/zcomp.c @@ -0,0 +1,353 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "zcomp.h" +#include "zcomp_lzo.h" +#ifdef CONFIG_ZRAM_LZ4_COMPRESS +#include "zcomp_lz4.h" +#endif + +/* + * single zcomp_strm backend + */ +struct zcomp_strm_single { + struct mutex strm_lock; + struct zcomp_strm *zstrm; +}; + +/* + * multi zcomp_strm backend + */ +struct zcomp_strm_multi { + /* protect strm list */ + spinlock_t strm_lock; + /* max possible number of zstrm streams */ + int max_strm; + /* number of available zstrm streams */ + int avail_strm; + /* list of available strms */ + struct list_head idle_strm; + wait_queue_head_t strm_wait; +}; + +static struct zcomp_backend *backends[] = { + &zcomp_lzo, +#ifdef CONFIG_ZRAM_LZ4_COMPRESS + &zcomp_lz4, +#endif + NULL +}; + +static struct zcomp_backend *find_backend(const char *compress) +{ + int i = 0; + while (backends[i]) { + if (sysfs_streq(compress, backends[i]->name)) + break; + i++; + } + return backends[i]; +} + +static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + if (zstrm->private) + comp->backend->destroy(zstrm->private); + free_pages((unsigned long)zstrm->buffer, 1); + kfree(zstrm); +} + +/* + * allocate new zcomp_strm structure with ->private initialized by + * backend, return NULL on error + */ +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) +{ + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + if (!zstrm) + return NULL; + + zstrm->private = comp->backend->create(); + /* + * allocate 2 pages. 1 for compressed data, plus 1 extra for the + * case when compressed size is larger than the original one + */ + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (!zstrm->private || !zstrm->buffer) { + zcomp_strm_free(comp, zstrm); + zstrm = NULL; + } + return zstrm; +} + +/* + * get idle zcomp_strm or wait until other process release + * (zcomp_strm_release()) one for us + */ +static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (1) { + spin_lock(&zs->strm_lock); + if (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + spin_unlock(&zs->strm_lock); + return zstrm; + } + /* zstrm streams limit reached, wait for idle stream */ + if (zs->avail_strm >= zs->max_strm) { + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + /* allocate new zstrm stream */ + zs->avail_strm++; + spin_unlock(&zs->strm_lock); + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + spin_lock(&zs->strm_lock); + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + wait_event(zs->strm_wait, !list_empty(&zs->idle_strm)); + continue; + } + break; + } + return zstrm; +} + +/* add stream back to idle list and wake up waiter or free the stream */ +static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + struct zcomp_strm_multi *zs = comp->stream; + + spin_lock(&zs->strm_lock); + if (zs->avail_strm <= zs->max_strm) { + list_add(&zstrm->list, &zs->idle_strm); + spin_unlock(&zs->strm_lock); + wake_up(&zs->strm_wait); + return; + } + + zs->avail_strm--; + spin_unlock(&zs->strm_lock); + zcomp_strm_free(comp, zstrm); +} + +/* change max_strm limit */ +static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + spin_lock(&zs->strm_lock); + zs->max_strm = num_strm; + /* + * if user has lowered the limit and there are idle streams, + * immediately free as much streams (and memory) as we can. + */ + while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + zs->avail_strm--; + } + spin_unlock(&zs->strm_lock); + return true; +} + +static void zcomp_strm_multi_destroy(struct zcomp *comp) +{ + struct zcomp_strm_multi *zs = comp->stream; + struct zcomp_strm *zstrm; + + while (!list_empty(&zs->idle_strm)) { + zstrm = list_entry(zs->idle_strm.next, + struct zcomp_strm, list); + list_del(&zstrm->list); + zcomp_strm_free(comp, zstrm); + } + kfree(zs); +} + +static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm) +{ + struct zcomp_strm *zstrm; + struct zcomp_strm_multi *zs; + + comp->destroy = zcomp_strm_multi_destroy; + comp->strm_find = zcomp_strm_multi_find; + comp->strm_release = zcomp_strm_multi_release; + comp->set_max_streams = zcomp_strm_multi_set_max_streams; + zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + spin_lock_init(&zs->strm_lock); + INIT_LIST_HEAD(&zs->idle_strm); + init_waitqueue_head(&zs->strm_wait); + zs->max_strm = max_strm; + zs->avail_strm = 1; + + zstrm = zcomp_strm_alloc(comp); + if (!zstrm) { + kfree(zs); + return -ENOMEM; + } + list_add(&zstrm->list, &zs->idle_strm); + return 0; +} + +static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_lock(&zs->strm_lock); + return zs->zstrm; +} + +static void zcomp_strm_single_release(struct zcomp *comp, + struct zcomp_strm *zstrm) +{ + struct zcomp_strm_single *zs = comp->stream; + mutex_unlock(&zs->strm_lock); +} + +static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm) +{ + /* zcomp_strm_single support only max_comp_streams == 1 */ + return false; +} + +static void zcomp_strm_single_destroy(struct zcomp *comp) +{ + struct zcomp_strm_single *zs = comp->stream; + zcomp_strm_free(comp, zs->zstrm); + kfree(zs); +} + +static int zcomp_strm_single_create(struct zcomp *comp) +{ + struct zcomp_strm_single *zs; + + comp->destroy = zcomp_strm_single_destroy; + comp->strm_find = zcomp_strm_single_find; + comp->strm_release = zcomp_strm_single_release; + comp->set_max_streams = zcomp_strm_single_set_max_streams; + zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL); + if (!zs) + return -ENOMEM; + + comp->stream = zs; + mutex_init(&zs->strm_lock); + zs->zstrm = zcomp_strm_alloc(comp); + if (!zs->zstrm) { + kfree(zs); + return -ENOMEM; + } + return 0; +} + +/* show available compressors */ +ssize_t zcomp_available_show(const char *comp, char *buf) +{ + ssize_t sz = 0; + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i]->name)) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", backends[i]->name); + else + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "%s ", backends[i]->name); + i++; + } + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + return sz; +} + +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) +{ + return comp->set_max_streams(comp, num_strm); +} + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +{ + return comp->strm_find(comp); +} + +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +{ + comp->strm_release(comp, zstrm); +} + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len) +{ + return comp->backend->compress(src, zstrm->buffer, dst_len, + zstrm->private); +} + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst) +{ + return comp->backend->decompress(src, src_len, dst); +} + +void zcomp_destroy(struct zcomp *comp) +{ + comp->destroy(comp); + kfree(comp); +} + +/* + * search available compressors for requested algorithm. + * allocate new zcomp and initialize it. return compressing + * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL) + * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in + * case of allocation error. + */ +struct zcomp *zcomp_create(const char *compress, int max_strm) +{ + struct zcomp *comp; + struct zcomp_backend *backend; + + backend = find_backend(compress); + if (!backend) + return ERR_PTR(-EINVAL); + + comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); + if (!comp) + return ERR_PTR(-ENOMEM); + + comp->backend = backend; + if (max_strm > 1) + zcomp_strm_multi_create(comp, max_strm); + else + zcomp_strm_single_create(comp); + if (!comp->stream) { + kfree(comp); + return ERR_PTR(-ENOMEM); + } + return comp; +} diff --git a/kernel/drivers/block/zram/zcomp.h b/kernel/drivers/block/zram/zcomp.h new file mode 100644 index 000000000..c59d1fca7 --- /dev/null +++ b/kernel/drivers/block/zram/zcomp.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_H_ +#define _ZCOMP_H_ + +#include + +struct zcomp_strm { + /* compression/decompression buffer */ + void *buffer; + /* + * The private data of the compression stream, only compression + * stream backend can touch this (e.g. compression algorithm + * working memory) + */ + void *private; + /* used in multi stream backend, protected by backend strm_lock */ + struct list_head list; +}; + +/* static compression backend */ +struct zcomp_backend { + int (*compress)(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private); + + int (*decompress)(const unsigned char *src, size_t src_len, + unsigned char *dst); + + void *(*create)(void); + void (*destroy)(void *private); + + const char *name; +}; + +/* dynamic per-device compression frontend */ +struct zcomp { + void *stream; + struct zcomp_backend *backend; + + struct zcomp_strm *(*strm_find)(struct zcomp *comp); + void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm); + bool (*set_max_streams)(struct zcomp *comp, int num_strm); + void (*destroy)(struct zcomp *comp); +}; + +ssize_t zcomp_available_show(const char *comp, char *buf); + +struct zcomp *zcomp_create(const char *comp, int max_strm); +void zcomp_destroy(struct zcomp *comp); + +struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); +void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); + +int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, + const unsigned char *src, size_t *dst_len); + +int zcomp_decompress(struct zcomp *comp, const unsigned char *src, + size_t src_len, unsigned char *dst); + +bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); +#endif /* _ZCOMP_H_ */ diff --git a/kernel/drivers/block/zram/zcomp_lz4.c b/kernel/drivers/block/zram/zcomp_lz4.c new file mode 100644 index 000000000..f2afb7e98 --- /dev/null +++ b/kernel/drivers/block/zram/zcomp_lz4.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lz4.h" + +static void *zcomp_lz4_create(void) +{ + return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); +} + +static void zcomp_lz4_destroy(void *private) +{ + kfree(private); +} + +static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + /* return : Success if return 0 */ + return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); +} + +static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + /* return : Success if return 0 */ + return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); +} + +struct zcomp_backend zcomp_lz4 = { + .compress = zcomp_lz4_compress, + .decompress = zcomp_lz4_decompress, + .create = zcomp_lz4_create, + .destroy = zcomp_lz4_destroy, + .name = "lz4", +}; diff --git a/kernel/drivers/block/zram/zcomp_lz4.h b/kernel/drivers/block/zram/zcomp_lz4.h new file mode 100644 index 000000000..60613fb29 --- /dev/null +++ b/kernel/drivers/block/zram/zcomp_lz4.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZ4_H_ +#define _ZCOMP_LZ4_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lz4; + +#endif /* _ZCOMP_LZ4_H_ */ diff --git a/kernel/drivers/block/zram/zcomp_lzo.c b/kernel/drivers/block/zram/zcomp_lzo.c new file mode 100644 index 000000000..da1bc47d5 --- /dev/null +++ b/kernel/drivers/block/zram/zcomp_lzo.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "zcomp_lzo.h" + +static void *lzo_create(void) +{ + return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); +} + +static void lzo_destroy(void *private) +{ + kfree(private); +} + +static int lzo_compress(const unsigned char *src, unsigned char *dst, + size_t *dst_len, void *private) +{ + int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); + return ret == LZO_E_OK ? 0 : ret; +} + +static int lzo_decompress(const unsigned char *src, size_t src_len, + unsigned char *dst) +{ + size_t dst_len = PAGE_SIZE; + int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); + return ret == LZO_E_OK ? 0 : ret; +} + +struct zcomp_backend zcomp_lzo = { + .compress = lzo_compress, + .decompress = lzo_decompress, + .create = lzo_create, + .destroy = lzo_destroy, + .name = "lzo", +}; diff --git a/kernel/drivers/block/zram/zcomp_lzo.h b/kernel/drivers/block/zram/zcomp_lzo.h new file mode 100644 index 000000000..128c5807f --- /dev/null +++ b/kernel/drivers/block/zram/zcomp_lzo.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2014 Sergey Senozhatsky. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ZCOMP_LZO_H_ +#define _ZCOMP_LZO_H_ + +#include "zcomp.h" + +extern struct zcomp_backend zcomp_lzo; + +#endif /* _ZCOMP_LZO_H_ */ diff --git a/kernel/drivers/block/zram/zram_drv.c b/kernel/drivers/block/zram/zram_drv.c new file mode 100644 index 000000000..6e134f475 --- /dev/null +++ b/kernel/drivers/block/zram/zram_drv.c @@ -0,0 +1,1322 @@ +/* + * Compressed RAM block device + * + * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the licence that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + * + */ + +#define KMSG_COMPONENT "zram" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#ifdef CONFIG_ZRAM_DEBUG +#define DEBUG +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zram_drv.h" + +/* Globals */ +static int zram_major; +static struct zram *zram_devices; +static const char *default_compressor = "lzo"; + +/* Module params (documentation at end) */ +static unsigned int num_devices = 1; + +static inline void deprecated_attr_warn(const char *name) +{ + pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n", + task_pid_nr(current), + current->comm, + name, + "See zram documentation."); +} + +#define ZRAM_ATTR_RO(name) \ +static ssize_t name##_show(struct device *d, \ + struct device_attribute *attr, char *b) \ +{ \ + struct zram *zram = dev_to_zram(d); \ + \ + deprecated_attr_warn(__stringify(name)); \ + return scnprintf(b, PAGE_SIZE, "%llu\n", \ + (u64)atomic64_read(&zram->stats.name)); \ +} \ +static DEVICE_ATTR_RO(name); + +static inline bool init_done(struct zram *zram) +{ + return zram->disksize; +} + +static inline struct zram *dev_to_zram(struct device *dev) +{ + return (struct zram *)dev_to_disk(dev)->private_data; +} + +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + unsigned long nr_migrated; + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + meta = zram->meta; + nr_migrated = zs_compact(meta->mem_pool); + atomic64_add(nr_migrated, &zram->stats.num_migrated); + up_read(&zram->init_lock); + + return len; +} + +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + +static ssize_t initstate_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = init_done(zram); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%u\n", val); +} + +static ssize_t orig_data_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("orig_data_size"); + return scnprintf(buf, PAGE_SIZE, "%llu\n", + (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT); +} + +static ssize_t mem_used_total_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_used_total"); + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + val = zs_get_total_pages(meta->mem_pool); + } + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%d\n", val); +} + +static ssize_t mem_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_limit"); + down_read(&zram->init_lock); + val = zram->limit_pages; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 limit; + char *tmp; + struct zram *zram = dev_to_zram(dev); + + limit = memparse(buf, &tmp); + if (buf == tmp) /* no chars parsed, invalid input */ + return -EINVAL; + + down_write(&zram->init_lock); + zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; + up_write(&zram->init_lock); + + return len; +} + +static ssize_t mem_used_max_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u64 val = 0; + struct zram *zram = dev_to_zram(dev); + + deprecated_attr_warn("mem_used_max"); + down_read(&zram->init_lock); + if (init_done(zram)) + val = atomic_long_read(&zram->stats.max_used_pages); + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); +} + +static ssize_t mem_used_max_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int err; + unsigned long val; + struct zram *zram = dev_to_zram(dev); + + err = kstrtoul(buf, 10, &val); + if (err || val != 0) + return -EINVAL; + + down_read(&zram->init_lock); + if (init_done(zram)) { + struct zram_meta *meta = zram->meta; + atomic_long_set(&zram->stats.max_used_pages, + zs_get_total_pages(meta->mem_pool)); + } + up_read(&zram->init_lock); + + return len; +} + +static ssize_t max_comp_streams_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int num; + struct zram *zram = dev_to_zram(dev); + int ret; + + ret = kstrtoint(buf, 0, &num); + if (ret < 0) + return ret; + if (num < 1) + return -EINVAL; + + down_write(&zram->init_lock); + if (init_done(zram)) { + if (!zcomp_set_max_streams(zram->comp, num)) { + pr_info("Cannot change max compression streams\n"); + ret = -EINVAL; + goto out; + } + } + + zram->max_comp_streams = num; + ret = len; +out: + up_write(&zram->init_lock); + return ret; +} + +static ssize_t comp_algorithm_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + size_t sz; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->compressor, buf); + up_read(&zram->init_lock); + + return sz; +} + +static ssize_t comp_algorithm_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + down_write(&zram->init_lock); + if (init_done(zram)) { + up_write(&zram->init_lock); + pr_info("Can't change algorithm for initialized device\n"); + return -EBUSY; + } + strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + up_write(&zram->init_lock); + return len; +} + +/* flag operations needs meta->tb_lock */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + return meta->table[index].value & BIT(flag); +} + +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} + +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} + +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline int is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. + */ +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return 0; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return 0; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return 0; + + /* I/O request is valid */ + return 1; +} + +static void zram_meta_free(struct zram_meta *meta, u64 disksize) +{ + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < num_pages; index++) { + unsigned long handle = meta->table[index].handle; + + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + + zs_destroy_pool(meta->mem_pool); + vfree(meta->table); + kfree(meta); +} + +static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) +{ + size_t num_pages; + char pool_name[8]; + struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); + + if (!meta) + return NULL; + + num_pages = disksize >> PAGE_SHIFT; + meta->table = vzalloc(num_pages * sizeof(*meta->table)); + if (!meta->table) { + pr_err("Error allocating zram address table\n"); + goto out_error; + } + + snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); + meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + if (!meta->mem_pool) { + pr_err("Error creating memory pool\n"); + goto out_error; + } + + return meta; + +out_error: + vfree(meta->table); + kfree(meta); + return NULL; +} + +static inline bool zram_meta_get(struct zram *zram) +{ + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; +} + +static inline void zram_meta_put(struct zram *zram) +{ + atomic_dec(&zram->refcount); +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static int page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return 0; + } + + return 1; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); +} + + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ +static void zram_free_page(struct zram *zram, size_t index) +{ + struct zram_meta *meta = zram->meta; + unsigned long handle = meta->table[index].handle; + + if (unlikely(!handle)) { + /* + * No memory is allocated for zero filled pages. + * Simply clear zero page flag. + */ + if (zram_test_flag(meta, index, ZRAM_ZERO)) { + zram_clear_flag(meta, index, ZRAM_ZERO); + atomic64_dec(&zram->stats.zero_pages); + } + return; + } + + zs_free(meta->mem_pool, handle); + + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); + atomic64_dec(&zram->stats.pages_stored); + + meta->table[index].handle = 0; + zram_set_obj_size(meta, index, 0); +} + +static int zram_decompress_page(struct zram *zram, char *mem, u32 index) +{ + int ret = 0; + unsigned char *cmem; + struct zram_meta *meta = zram->meta; + unsigned long handle; + size_t size; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + handle = meta->table[index].handle; + size = zram_get_obj_size(meta, index); + + if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + clear_page(mem); + return 0; + } + + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); + if (size == PAGE_SIZE) + copy_page(mem, cmem); + else + ret = zcomp_decompress(zram->comp, cmem, size, mem); + zs_unmap_object(meta->mem_pool, handle); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + /* Should NEVER happen. Return bio error if it does. */ + if (unlikely(ret)) { + pr_err("Decompression failed! err=%d, page=%u\n", ret, index); + return ret; + } + + return 0; +} + +static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset) +{ + int ret; + struct page *page; + unsigned char *user_mem, *uncmem = NULL; + struct zram_meta *meta = zram->meta; + page = bvec->bv_page; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + if (unlikely(!meta->table[index].handle) || + zram_test_flag(meta, index, ZRAM_ZERO)) { + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + handle_zero_page(bvec); + return 0; + } + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + if (is_partial_io(bvec)) + /* Use a temporary buffer to decompress the page */ + uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + + user_mem = kmap_atomic(page); + if (!is_partial_io(bvec)) + uncmem = user_mem; + + if (!uncmem) { + pr_info("Unable to allocate temp memory\n"); + ret = -ENOMEM; + goto out_cleanup; + } + + ret = zram_decompress_page(zram, uncmem, index); + /* Should NEVER happen. Return bio error if it does. */ + if (unlikely(ret)) + goto out_cleanup; + + if (is_partial_io(bvec)) + memcpy(user_mem + bvec->bv_offset, uncmem + offset, + bvec->bv_len); + + flush_dcache_page(page); + ret = 0; +out_cleanup: + kunmap_atomic(user_mem); + if (is_partial_io(bvec)) + kfree(uncmem); + return ret; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset) +{ + int ret = 0; + size_t clen; + unsigned long handle; + struct page *page; + unsigned char *user_mem, *cmem, *src, *uncmem = NULL; + struct zram_meta *meta = zram->meta; + struct zcomp_strm *zstrm; + bool locked = false; + unsigned long alloced_pages; + + page = bvec->bv_page; + if (is_partial_io(bvec)) { + /* + * This is a partial IO. We need to read the full page + * before to write the changes. + */ + uncmem = kmalloc(PAGE_SIZE, GFP_NOIO); + if (!uncmem) { + ret = -ENOMEM; + goto out; + } + ret = zram_decompress_page(zram, uncmem, index); + if (ret) + goto out; + } + + zstrm = zcomp_strm_find(zram->comp); + locked = true; + user_mem = kmap_atomic(page); + + if (is_partial_io(bvec)) { + memcpy(uncmem + offset, user_mem + bvec->bv_offset, + bvec->bv_len); + kunmap_atomic(user_mem); + user_mem = NULL; + } else { + uncmem = user_mem; + } + + if (page_zero_filled(uncmem)) { + if (user_mem) + kunmap_atomic(user_mem); + /* Free memory associated with this sector now. */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + zram_set_flag(meta, index, ZRAM_ZERO); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + atomic64_inc(&zram->stats.zero_pages); + ret = 0; + goto out; + } + + ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); + if (!is_partial_io(bvec)) { + kunmap_atomic(user_mem); + user_mem = NULL; + uncmem = NULL; + } + + if (unlikely(ret)) { + pr_err("Compression failed! err=%d\n", ret); + goto out; + } + src = zstrm->buffer; + if (unlikely(clen > max_zpage_size)) { + clen = PAGE_SIZE; + if (is_partial_io(bvec)) + src = uncmem; + } + + handle = zs_malloc(meta->mem_pool, clen); + if (!handle) { + pr_info("Error allocating memory for compressed page: %u, size=%zu\n", + index, clen); + ret = -ENOMEM; + goto out; + } + + alloced_pages = zs_get_total_pages(meta->mem_pool); + if (zram->limit_pages && alloced_pages > zram->limit_pages) { + zs_free(meta->mem_pool, handle); + ret = -ENOMEM; + goto out; + } + + update_used_max(zram, alloced_pages); + + cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); + + if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { + src = kmap_atomic(page); + copy_page(cmem, src); + kunmap_atomic(src); + } else { + memcpy(cmem, src, clen); + } + + zcomp_strm_release(zram->comp, zstrm); + locked = false; + zs_unmap_object(meta->mem_pool, handle); + + /* + * Free memory associated with this sector + * before overwriting unused sectors. + */ + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + + meta->table[index].handle = handle; + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + + /* Update stats */ + atomic64_add(clen, &zram->stats.compr_data_size); + atomic64_inc(&zram->stats.pages_stored); +out: + if (locked) + zcomp_strm_release(zram->comp, zstrm); + if (is_partial_io(bvec)) + kfree(uncmem); + return ret; +} + +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) +{ + unsigned long start_time = jiffies; + int ret; + + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); + + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); + } + + generic_end_io_acct(rw, &zram->disk->part0, start_time); + + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); + } + + return ret; +} + +/* + * zram_bio_discard - handler on discard request + * @index: physical block index in PAGE_SIZE units + * @offset: byte offset within physical block + */ +static void zram_bio_discard(struct zram *zram, u32 index, + int offset, struct bio *bio) +{ + size_t n = bio->bi_iter.bi_size; + struct zram_meta *meta = zram->meta; + + /* + * zram manages data in physical block size units. Because logical block + * size isn't identical with physical block size on some arch, we + * could get a discard request pointing to a specific offset within a + * certain physical block. Although we can handle this request by + * reading that physiclal block and decompressing and partially zeroing + * and re-compressing and then re-storing it, this isn't reasonable + * because our intent with a discard request is to save memory. So + * skipping this logical block is appropriate here. + */ + if (offset) { + if (n <= (PAGE_SIZE - offset)) + return; + + n -= (PAGE_SIZE - offset); + index++; + } + + while (n >= PAGE_SIZE) { + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); + index++; + n -= PAGE_SIZE; + } +} + +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; + + down_write(&zram->init_lock); + + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. + */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + zram->max_comp_streams = 1; + + set_capacity(zram->disk, 0); + part_stat_set_all(&zram->disk->part0, 0); + + up_write(&zram->init_lock); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; + struct zram *zram = dev_to_zram(dev); + int err; + + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; + + disksize = PAGE_ALIGN(disksize); + meta = zram_meta_alloc(zram->disk->first_minor, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (IS_ERR(comp)) { + pr_info("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; +} + +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; + + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active device! */ + if (bdev->bd_openers) { + ret = -EBUSY; + goto out; + } + + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + goto out; + + if (!do_reset) { + ret = -EINVAL; + goto out; + } + + /* Make sure all pending I/O is finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + + mutex_unlock(&bdev->bd_mutex); + revalidate_disk(zram->disk); + bdput(bdev); + + return len; + +out: + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} + +static void __zram_make_request(struct zram *zram, struct bio *bio) +{ + int offset, rw; + u32 index; + struct bio_vec bvec; + struct bvec_iter iter; + + index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; + offset = (bio->bi_iter.bi_sector & + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; + + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio, 0); + return; + } + + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + int max_transfer_size = PAGE_SIZE - offset; + + if (bvec.bv_len > max_transfer_size) { + /* + * zram_bvec_rw() can only make operation on a single + * zram page. Split the bio vector. + */ + struct bio_vec bv; + + bv.bv_page = bvec.bv_page; + bv.bv_len = max_transfer_size; + bv.bv_offset = bvec.bv_offset; + + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) + goto out; + + bv.bv_len = bvec.bv_len - max_transfer_size; + bv.bv_offset += max_transfer_size; + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) + goto out; + } else + if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) + goto out; + + update_position(&index, &offset, &bvec); + } + + set_bit(BIO_UPTODATE, &bio->bi_flags); + bio_endio(bio, 0); + return; + +out: + bio_io_error(bio); +} + +/* + * Handler function for all zram I/O requests. + */ +static void zram_make_request(struct request_queue *queue, struct bio *bio) +{ + struct zram *zram = queue->queuedata; + + if (unlikely(!zram_meta_get(zram))) + goto error; + + if (!valid_io_request(zram, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size)) { + atomic64_inc(&zram->stats.invalid_io); + goto put_zram; + } + + __zram_make_request(zram, bio); + zram_meta_put(zram); + return; +put_zram: + zram_meta_put(zram); +error: + bio_io_error(bio); +} + +static void zram_slot_free_notify(struct block_device *bdev, + unsigned long index) +{ + struct zram *zram; + struct zram_meta *meta; + + zram = bdev->bd_disk->private_data; + meta = zram->meta; + + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_free_page(zram, index); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + atomic64_inc(&zram->stats.notify_free); +} + +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err = -EIO; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (unlikely(!zram_meta_get(zram))) + goto out; + + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + err = -EINVAL; + goto put_zram; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +put_zram: + zram_meta_put(zram); +out: + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. + * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} + +static const struct block_device_operations zram_devops = { + .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); + +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + ssize_t ret; + + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; + + down_read(&zram->init_lock); + if (init_done(zram)) + mem_used = zs_get_total_pages(zram->meta->mem_pool); + + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); + + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + (u64)atomic64_read(&zram->stats.num_migrated)); + up_read(&zram->init_lock); + + return ret; +} + +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static struct attribute *zram_disk_attrs[] = { + &dev_attr_disksize.attr, + &dev_attr_initstate.attr, + &dev_attr_reset.attr, + &dev_attr_num_reads.attr, + &dev_attr_num_writes.attr, + &dev_attr_failed_reads.attr, + &dev_attr_failed_writes.attr, + &dev_attr_compact.attr, + &dev_attr_invalid_io.attr, + &dev_attr_notify_free.attr, + &dev_attr_zero_pages.attr, + &dev_attr_orig_data_size.attr, + &dev_attr_compr_data_size.attr, + &dev_attr_mem_used_total.attr, + &dev_attr_mem_limit.attr, + &dev_attr_mem_used_max.attr, + &dev_attr_max_comp_streams.attr, + &dev_attr_comp_algorithm.attr, + &dev_attr_io_stat.attr, + &dev_attr_mm_stat.attr, + NULL, +}; + +static struct attribute_group zram_disk_attr_group = { + .attrs = zram_disk_attrs, +}; + +static int create_device(struct zram *zram, int device_id) +{ + struct request_queue *queue; + int ret = -ENOMEM; + + init_rwsem(&zram->init_lock); + + queue = blk_alloc_queue(GFP_KERNEL); + if (!queue) { + pr_err("Error allocating disk queue for device %d\n", + device_id); + goto out; + } + + blk_queue_make_request(queue, zram_make_request); + + /* gendisk structure */ + zram->disk = alloc_disk(1); + if (!zram->disk) { + pr_warn("Error allocating disk structure for device %d\n", + device_id); + ret = -ENOMEM; + goto out_free_queue; + } + + zram->disk->major = zram_major; + zram->disk->first_minor = device_id; + zram->disk->fops = &zram_devops; + zram->disk->queue = queue; + zram->disk->queue->queuedata = zram; + zram->disk->private_data = zram; + snprintf(zram->disk->disk_name, 16, "zram%d", device_id); + + /* Actual capacity set using syfs (/sys/block/zram/disksize */ + set_capacity(zram->disk, 0); + /* zram devices sort of resembles non-rotational disks */ + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); + /* + * To ensure that we always get PAGE_SIZE aligned + * and n*PAGE_SIZED sized I/O requests. + */ + blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); + blk_queue_logical_block_size(zram->disk->queue, + ZRAM_LOGICAL_BLOCK_SIZE); + blk_queue_io_min(zram->disk->queue, PAGE_SIZE); + blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); + zram->disk->queue->limits.discard_granularity = PAGE_SIZE; + zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + /* + * zram_bio_discard() will clear all logical blocks if logical block + * size is identical with physical block size(PAGE_SIZE). But if it is + * different, we will skip discarding some parts of logical blocks in + * the part of the request range which isn't aligned to physical block + * size. So we can't ensure that all discarded logical blocks are + * zeroed. + */ + if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) + zram->disk->queue->limits.discard_zeroes_data = 1; + else + zram->disk->queue->limits.discard_zeroes_data = 0; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue); + + add_disk(zram->disk); + + ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + if (ret < 0) { + pr_warn("Error creating sysfs group"); + goto out_free_disk; + } + strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram->meta = NULL; + zram->max_comp_streams = 1; + return 0; + +out_free_disk: + del_gendisk(zram->disk); + put_disk(zram->disk); +out_free_queue: + blk_cleanup_queue(queue); +out: + return ret; +} + +static void destroy_devices(unsigned int nr) +{ + struct zram *zram; + unsigned int i; + + for (i = 0; i < nr; i++) { + zram = &zram_devices[i]; + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices + */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + zram_reset_device(zram); + + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + } + + kfree(zram_devices); + unregister_blkdev(zram_major, "zram"); + pr_info("Destroyed %u device(s)\n", nr); +} + +static int __init zram_init(void) +{ + int ret, dev_id; + + if (num_devices > max_num_devices) { + pr_warn("Invalid value for num_devices: %u\n", + num_devices); + return -EINVAL; + } + + zram_major = register_blkdev(0, "zram"); + if (zram_major <= 0) { + pr_warn("Unable to get major number\n"); + return -EBUSY; + } + + /* Allocate the device array and initialize each one */ + zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); + if (!zram_devices) { + unregister_blkdev(zram_major, "zram"); + return -ENOMEM; + } + + for (dev_id = 0; dev_id < num_devices; dev_id++) { + ret = create_device(&zram_devices[dev_id], dev_id); + if (ret) + goto out_error; + } + + pr_info("Created %u device(s)\n", num_devices); + return 0; + +out_error: + destroy_devices(dev_id); + return ret; +} + +static void __exit zram_exit(void) +{ + destroy_devices(num_devices); +} + +module_init(zram_init); +module_exit(zram_exit); + +module_param(num_devices, uint, 0); +MODULE_PARM_DESC(num_devices, "Number of zram devices"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("Nitin Gupta "); +MODULE_DESCRIPTION("Compressed RAM Block Device"); diff --git a/kernel/drivers/block/zram/zram_drv.h b/kernel/drivers/block/zram/zram_drv.h new file mode 100644 index 000000000..570c598f4 --- /dev/null +++ b/kernel/drivers/block/zram/zram_drv.h @@ -0,0 +1,125 @@ +/* + * Compressed RAM block device + * + * Copyright (C) 2008, 2009, 2010 Nitin Gupta + * 2012, 2013 Minchan Kim + * + * This code is released using a dual license strategy: BSD/GPL + * You can choose the licence that better fits your requirements. + * + * Released under the terms of 3-clause BSD License + * Released under the terms of GNU General Public License Version 2.0 + * + */ + +#ifndef _ZRAM_DRV_H_ +#define _ZRAM_DRV_H_ + +#include +#include + +#include "zcomp.h" + +/* + * Some arbitrary value. This is just to catch + * invalid value for num_devices module parameter. + */ +static const unsigned max_num_devices = 32; + +/*-- Configurable parameters */ + +/* + * Pages that compress to size greater than this are stored + * uncompressed in memory. + */ +static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; + +/* + * NOTE: max_zpage_size must be less than or equal to: + * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would + * always return failure. + */ + +/*-- End of configurable params */ + +#define SECTOR_SHIFT 9 +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) +#define ZRAM_LOGICAL_BLOCK_SHIFT 12 +#define ZRAM_LOGICAL_BLOCK_SIZE (1 << ZRAM_LOGICAL_BLOCK_SHIFT) +#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ + (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) + + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size (excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. + * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), + * the higher bits is for zram_pageflags. + */ +#define ZRAM_FLAG_SHIFT 24 + +/* Flags for zram pages (table[page_no].value) */ +enum zram_pageflags { + /* Page consists entirely of zeros */ + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ + + __NR_ZRAM_PAGEFLAGS, +}; + +/*-- Data structures */ + +/* Allocated for each disk page */ +struct zram_table_entry { + unsigned long handle; + unsigned long value; +}; + +struct zram_stats { + atomic64_t compr_data_size; /* compressed size of pages stored */ + atomic64_t num_reads; /* failed + successful */ + atomic64_t num_writes; /* --do-- */ + atomic64_t num_migrated; /* no. of migrated object */ + atomic64_t failed_reads; /* can happen when memory is too low */ + atomic64_t failed_writes; /* can happen when memory is too low */ + atomic64_t invalid_io; /* non-page-aligned I/O requests */ + atomic64_t notify_free; /* no. of swap slot free notifications */ + atomic64_t zero_pages; /* no. of zero filled pages */ + atomic64_t pages_stored; /* no. of pages currently stored */ + atomic_long_t max_used_pages; /* no. of maximum pages stored */ +}; + +struct zram_meta { + struct zram_table_entry *table; + struct zs_pool *mem_pool; +}; + +struct zram { + struct zram_meta *meta; + struct zcomp *comp; + struct gendisk *disk; + /* Prevent concurrent execution of device init */ + struct rw_semaphore init_lock; + /* + * the number of pages zram can consume for storing compressed data + */ + unsigned long limit_pages; + int max_comp_streams; + + struct zram_stats stats; + atomic_t refcount; /* refcount for zram_meta */ + /* wait all IO under all of cpu are done */ + wait_queue_head_t io_done; + /* + * This is the limit on amount of *uncompressed* worth of data + * we can store in a disk. + */ + u64 disksize; /* bytes */ + char compressor[10]; +}; +#endif -- cgit 1.2.3-korg