From e09b41010ba33a20a87472ee821fa407a5b8da36 Mon Sep 17 00:00:00 2001
From: José Pekkarinen
Date: Mon, 11 Apr 2016 10:41:07 +0300
Subject: These changes are the raw update to linux-4.4.6-rt14. Kernel sources
 are taken from kernel.org, and the rt patch from the rt wiki download page.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
The colliding hunks have been dropped because their logic is already present
in the source.

Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen
---
 kernel/drivers/block/Kconfig               |   23 -
 kernel/drivers/block/Makefile              |    3 -
 kernel/drivers/block/aoe/aoeblk.c          |    2 +-
 kernel/drivers/block/aoe/aoecmd.c          |   10 +-
 kernel/drivers/block/aoe/aoedev.c          |    2 +-
 kernel/drivers/block/brd.c                 |   29 +-
 kernel/drivers/block/cciss.c               |   27 +-
 kernel/drivers/block/cciss_scsi.c          |    1 -
 kernel/drivers/block/drbd/drbd_actlog.c    |    4 +-
 kernel/drivers/block/drbd/drbd_bitmap.c    |   23 +-
 kernel/drivers/block/drbd/drbd_debugfs.c   |   10 +-
 kernel/drivers/block/drbd/drbd_int.h       |   15 +-
 kernel/drivers/block/drbd/drbd_main.c      |   11 +-
 kernel/drivers/block/drbd/drbd_nl.c        |    4 +-
 kernel/drivers/block/drbd/drbd_receiver.c  |    7 +-
 kernel/drivers/block/drbd/drbd_req.c       |   50 +-
 kernel/drivers/block/drbd/drbd_worker.c    |   44 +-
 kernel/drivers/block/floppy.c              |    7 +-
 kernel/drivers/block/loop.c                |  341 ++-
 kernel/drivers/block/loop.h                |   15 +-
 kernel/drivers/block/mtip32xx/mtip32xx.c   |  244 +--
 kernel/drivers/block/mtip32xx/mtip32xx.h   |   10 +-
 kernel/drivers/block/nbd.c                 |  433 +++-
 kernel/drivers/block/null_blk.c            |  351 ++-
 kernel/drivers/block/nvme-core.c           | 3178 ----------------------------
 kernel/drivers/block/nvme-scsi.c           | 3070 ---------------------------
 kernel/drivers/block/osdblk.c              |    2 +-
 kernel/drivers/block/paride/paride.c       |   57 +-
 kernel/drivers/block/paride/paride.h       |    2 +
 kernel/drivers/block/paride/pcd.c          |    9 +
 kernel/drivers/block/paride/pd.c           |   18 +-
 kernel/drivers/block/paride/pf.c           |    7 +
 kernel/drivers/block/paride/pg.c           |    8 +
 kernel/drivers/block/paride/pt.c           |    8 +
 kernel/drivers/block/pktcdvd.c             |   76 +-
 kernel/drivers/block/pmem.c                |  262 ---
 kernel/drivers/block/ps3vram.c             |   45 +-
 kernel/drivers/block/rbd.c                 |  350 +--
 kernel/drivers/block/rsxx/dev.c            |   16 +-
 kernel/drivers/block/skd_main.c            |    2 +-
 kernel/drivers/block/sx8.c                 |    4 +-
 kernel/drivers/block/umem.c                |   10 +-
 kernel/drivers/block/virtio_blk.c          |   14 +-
 kernel/drivers/block/xen-blkback/blkback.c |   45 +-
 kernel/drivers/block/xen-blkback/common.h  |   31 +-
 kernel/drivers/block/xen-blkback/xenbus.c  |  206 +-
 kernel/drivers/block/xen-blkfront.c        |  962 +++++----
 kernel/drivers/block/zram/Kconfig          |   10 +-
 kernel/drivers/block/zram/zcomp.c          |   11 +-
 kernel/drivers/block/zram/zcomp.h          |    1 +
 kernel/drivers/block/zram/zcomp_lz4.c      |   23 +-
 kernel/drivers/block/zram/zcomp_lzo.c      |   23 +-
 kernel/drivers/block/zram/zram_drv.c       | 1116 +++++-----
 kernel/drivers/block/zram/zram_drv.h       |   52 +-
 54 files changed, 2902 insertions(+), 8382 deletions(-)
 delete mode 100644 kernel/drivers/block/nvme-core.c
 delete mode 100644 kernel/drivers/block/nvme-scsi.c
 delete mode 100644 kernel/drivers/block/pmem.c

(limited to 'kernel/drivers/block')

diff --git a/kernel/drivers/block/Kconfig b/kernel/drivers/block/Kconfig
index 3ccef9eba..29819e719 100644
--- a/kernel/drivers/block/Kconfig
+++ b/kernel/drivers/block/Kconfig
@@ -310,17 +310,6 @@ config BLK_DEV_NBD
 
 	  If unsure, say N.
 
-config BLK_DEV_NVME - tristate "NVM Express block device" - depends on PCI - ---help--- - The NVM Express driver is for solid state drives directly - connected to the PCI or PCI Express bus. If you know you - don't have one of these, it is safe to answer N. - - To compile this driver as a module, choose M here: the - module will be called nvme. - config BLK_DEV_SKD tristate "STEC S1120 Block Driver" depends on PCI @@ -404,18 +393,6 @@ config BLK_DEV_RAM_DAX and will prevent RAM block device backing store memory from being allocated from highmem (only a problem for highmem systems). -config BLK_DEV_PMEM - tristate "Persistent memory block device support" - depends on HAS_IOMEM - help - Saying Y here will allow you to use a contiguous range of reserved - memory as one or more persistent block devices. - - To compile this driver as a module, choose M here: the module will be - called 'pmem'. - - If unsure, say N. - config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media" depends on !UML diff --git a/kernel/drivers/block/Makefile b/kernel/drivers/block/Makefile index 9cc6c18a1..671329023 100644 --- a/kernel/drivers/block/Makefile +++ b/kernel/drivers/block/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_PS3_VRAM) += ps3vram.o obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o obj-$(CONFIG_BLK_DEV_RAM) += brd.o -obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o obj-$(CONFIG_BLK_DEV_LOOP) += loop.o obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o @@ -23,7 +22,6 @@ obj-$(CONFIG_XILINX_SYSACE) += xsysace.o obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o obj-$(CONFIG_MG_DISK) += mg_disk.o obj-$(CONFIG_SUNVDC) += sunvdc.o -obj-$(CONFIG_BLK_DEV_NVME) += nvme.o obj-$(CONFIG_BLK_DEV_SKD) += skd.o obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o @@ -45,6 +43,5 @@ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o obj-$(CONFIG_ZRAM) += zram/ -nvme-y := nvme-core.o nvme-scsi.o skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/kernel/drivers/block/aoe/aoeblk.c b/kernel/drivers/block/aoe/aoeblk.c index 46c282fff..dd73e1ff1 100644 --- a/kernel/drivers/block/aoe/aoeblk.c +++ b/kernel/drivers/block/aoe/aoeblk.c @@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - blk_queue_max_hw_sectors(q, 1024); + blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); q->backing_dev_info.name = "aoe"; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; d->bufpool = mp; diff --git a/kernel/drivers/block/aoe/aoecmd.c b/kernel/drivers/block/aoe/aoecmd.c index 422b7d84f..ad80c85e0 100644 --- a/kernel/drivers/block/aoe/aoecmd.c +++ b/kernel/drivers/block/aoe/aoecmd.c @@ -1110,7 +1110,7 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail) d->ip.rq = NULL; do { bio = rq->bio; - bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags); + bok = !fastfail && !bio->bi_error; } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size)); /* cf. 
http://lkml.org/lkml/2006/10/31/28 */ @@ -1172,7 +1172,7 @@ ktiocomplete(struct frame *f) ahout->cmdstat, ahin->cmdstat, d->aoemajor, d->aoeminor); noskb: if (buf) - clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + buf->bio->bi_error = -EIO; goto out; } @@ -1185,7 +1185,7 @@ noskb: if (buf) "aoe: runt data size in read from", (long) d->aoemajor, d->aoeminor, skb->len, n); - clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + buf->bio->bi_error = -EIO; break; } if (n > f->iter.bi_size) { @@ -1193,7 +1193,7 @@ noskb: if (buf) "aoe: too-large data size in read from", (long) d->aoemajor, d->aoeminor, n, f->iter.bi_size); - clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + buf->bio->bi_error = -EIO; break; } bvcpy(skb, f->buf->bio, f->iter, n); @@ -1695,7 +1695,7 @@ aoe_failbuf(struct aoedev *d, struct buf *buf) if (buf == NULL) return; buf->iter.bi_size = 0; - clear_bit(BIO_UPTODATE, &buf->bio->bi_flags); + buf->bio->bi_error = -EIO; if (buf->nframesout == 0) aoe_end_buf(d, buf); } diff --git a/kernel/drivers/block/aoe/aoedev.c b/kernel/drivers/block/aoe/aoedev.c index e774c50b6..ffd194750 100644 --- a/kernel/drivers/block/aoe/aoedev.c +++ b/kernel/drivers/block/aoe/aoedev.c @@ -170,7 +170,7 @@ aoe_failip(struct aoedev *d) if (rq == NULL) return; while ((bio = d->ip.nxbio)) { - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; d->ip.nxbio = bio->bi_next; n = (unsigned long) rq->special; rq->special = (void *) --n; diff --git a/kernel/drivers/block/brd.c b/kernel/drivers/block/brd.c index 64ab4951e..a5880f4ab 100644 --- a/kernel/drivers/block/brd.c +++ b/kernel/drivers/block/brd.c @@ -323,7 +323,7 @@ out: return err; } -static void brd_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct brd_device *brd = bdev->bd_disk->private_data; @@ -331,14 +331,15 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) struct bio_vec bvec; sector_t sector; struct bvec_iter iter; - int err = -EIO; sector = bio->bi_iter.bi_sector; if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) - goto out; + goto io_error; if (unlikely(bio->bi_rw & REQ_DISCARD)) { - err = 0; + if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) || + bio->bi_iter.bi_size & PAGE_MASK) + goto io_error; discard_from_brd(brd, sector, bio->bi_iter.bi_size); goto out; } @@ -349,15 +350,21 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; + int err; + err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, rw, sector); if (err) - break; + goto io_error; sector += len >> SECTOR_SHIFT; } out: - bio_endio(bio, err); + bio_endio(bio); + return BLK_QC_T_NONE; +io_error: + bio_io_error(bio); + return BLK_QC_T_NONE; } static int brd_rw_page(struct block_device *bdev, sector_t sector, @@ -371,7 +378,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, #ifdef CONFIG_BLK_DEV_RAM_DAX static long brd_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn) { struct brd_device *brd = bdev->bd_disk->private_data; struct page *page; @@ -381,13 +388,9 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, page = brd_insert_page(brd, sector); if (!page) return -ENOSPC; - *kaddr = page_address(page); + *kaddr = (void __pmem *)page_address(page); *pfn = page_to_pfn(page); - /* - * TODO: 
If size > PAGE_SIZE, we could look to see if the next page in - * the file happens to be mapped to the next page of physical RAM. - */ return PAGE_SIZE; } #else @@ -500,7 +503,7 @@ static struct brd_device *brd_alloc(int i) blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); brd->brd_queue->limits.discard_granularity = PAGE_SIZE; - brd->brd_queue->limits.max_discard_sectors = UINT_MAX; + blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX); brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); diff --git a/kernel/drivers/block/cciss.c b/kernel/drivers/block/cciss.c index ff20f192b..0422c4726 100644 --- a/kernel/drivers/block/cciss.c +++ b/kernel/drivers/block/cciss.c @@ -139,8 +139,6 @@ static struct board_type products[] = { {0x3214103C, "Smart Array E200i", &SA5_access}, {0x3215103C, "Smart Array E200i", &SA5_access}, {0x3237103C, "Smart Array E500", &SA5_access}, - {0x3223103C, "Smart Array P800", &SA5_access}, - {0x3234103C, "Smart Array P400", &SA5_access}, {0x323D103C, "Smart Array P700m", &SA5_access}, }; @@ -574,8 +572,6 @@ static void cciss_procinit(ctlr_info_t *h) /* List of controllers which cannot be hard reset on kexec with reset_devices */ static u32 unresettable_controller[] = { - 0x324a103C, /* Smart Array P712m */ - 0x324b103C, /* SmartArray P711m */ 0x3223103C, /* Smart Array P800 */ 0x3234103C, /* Smart Array P400 */ 0x3235103C, /* Smart Array P400i */ @@ -586,12 +582,32 @@ static u32 unresettable_controller[] = { 0x3215103C, /* Smart Array E200i */ 0x3237103C, /* Smart Array E500 */ 0x323D103C, /* Smart Array P700m */ + 0x40800E11, /* Smart Array 5i */ 0x409C0E11, /* Smart Array 6400 */ 0x409D0E11, /* Smart Array 6400 EM */ + 0x40700E11, /* Smart Array 5300 */ + 0x40820E11, /* Smart Array 532 */ + 0x40830E11, /* Smart Array 5312 */ + 0x409A0E11, /* Smart Array 641 */ + 0x409B0E11, /* Smart Array 642 */ + 0x40910E11, /* Smart Array 6i */ }; /* List of controllers which cannot even be soft reset */ static u32 soft_unresettable_controller[] = { + 0x40800E11, /* Smart Array 5i */ + 0x40700E11, /* Smart Array 5300 */ + 0x40820E11, /* Smart Array 532 */ + 0x40830E11, /* Smart Array 5312 */ + 0x409A0E11, /* Smart Array 641 */ + 0x409B0E11, /* Smart Array 642 */ + 0x40910E11, /* Smart Array 6i */ + /* Exclude 640x boards. These are two pci devices in one slot + * which share a battery backed cache module. One controls the + * cache, the other accesses the cache through the one that controls + * it. If we reset the one controlling the cache, the other will + * likely not be happy. Just forbid resetting this conjoined mess. 
+ */ 0x409C0E11, /* Smart Array 6400 */ 0x409D0E11, /* Smart Array 6400 EM */ }; @@ -4667,8 +4683,7 @@ static int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) */ cciss_lookup_board_id(pdev, &board_id); if (!ctlr_is_resettable(board_id)) { - dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " - "due to shared cache module."); + dev_warn(&pdev->dev, "Controller not resettable\n"); return -ENODEV; } diff --git a/kernel/drivers/block/cciss_scsi.c b/kernel/drivers/block/cciss_scsi.c index ecd845cd2..1537302e5 100644 --- a/kernel/drivers/block/cciss_scsi.c +++ b/kernel/drivers/block/cciss_scsi.c @@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = { .show_info = cciss_scsi_show_info, .queuecommand = cciss_scsi_queue_command, .this_id = 7, - .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */ .eh_device_reset_handler= cciss_eh_device_reset_handler, diff --git a/kernel/drivers/block/drbd/drbd_actlog.c b/kernel/drivers/block/drbd/drbd_actlog.c index 1318e3217..b3868e7a1 100644 --- a/kernel/drivers/block/drbd/drbd_actlog.c +++ b/kernel/drivers/block/drbd/drbd_actlog.c @@ -175,11 +175,11 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ device->md_io.submit_jif = jiffies; if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) - bio_endio(bio, -EIO); + bio_io_error(bio); else submit_bio(rw, bio); wait_until_done_or_force_detached(device, bdev, &device->md_io.done); - if (bio_flagged(bio, BIO_UPTODATE)) + if (!bio->bi_error) err = device->md_io.error; out: diff --git a/kernel/drivers/block/drbd/drbd_bitmap.c b/kernel/drivers/block/drbd/drbd_bitmap.c index 434c77dcc..9462d2752 100644 --- a/kernel/drivers/block/drbd/drbd_bitmap.c +++ b/kernel/drivers/block/drbd/drbd_bitmap.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include "drbd_int.h" @@ -941,36 +941,27 @@ static void drbd_bm_aio_ctx_destroy(struct kref *kref) } /* bv_page may be a copy, or may be the original */ -static void drbd_bm_endio(struct bio *bio, int error) +static void drbd_bm_endio(struct bio *bio) { struct drbd_bm_aio_ctx *ctx = bio->bi_private; struct drbd_device *device = ctx->device; struct drbd_bitmap *b = device->bitmap; unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page); - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! - * do we want to WARN() on this? */ - if (!error && !uptodate) - error = -EIO; if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 && !bm_test_page_unchanged(b->bm_pages[idx])) drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx); - if (error) { + if (bio->bi_error) { /* ctx error will hold the completed-last non-zero error code, * in case error codes differ. */ - ctx->error = error; + ctx->error = bio->bi_error; bm_set_page_io_err(b->bm_pages[idx]); /* Not identical to on disk version of it. * Is BM_PAGE_IO_ERROR enough? 
*/ if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "IO ERROR %d on bitmap page idx %u\n", - error, idx); + bio->bi_error, idx); } else { bm_clear_page_io_err(b->bm_pages[idx]); dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx); @@ -1016,7 +1007,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_RECLAIM); copy_highpage(page, b->bm_pages[page_nr]); bm_store_page_idx(page, page_nr); } else @@ -1031,7 +1022,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { bio->bi_rw |= rw; - bio_endio(bio, -EIO); + bio_io_error(bio); } else { submit_bio(rw, bio); /* this should not count as user activity and cause the diff --git a/kernel/drivers/block/drbd/drbd_debugfs.c b/kernel/drivers/block/drbd/drbd_debugfs.c index a6ee3d750..6b88a35fb 100644 --- a/kernel/drivers/block/drbd/drbd_debugfs.c +++ b/kernel/drivers/block/drbd/drbd_debugfs.c @@ -419,14 +419,6 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) return 0; } -/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), - * but neither is "reachable" from here. - * So we have our own inline version of it above. :-( */ -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - /* make sure at *open* time that the respective object won't go away. */ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *), void *data, struct kref *kref, @@ -444,7 +436,7 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo /* serialize with d_delete() */ mutex_lock(&d_inode(parent)->i_mutex); /* Make sure the object is still alive */ - if (debugfs_positive(file->f_path.dentry) + if (simple_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; mutex_unlock(&d_inode(parent)->i_mutex); diff --git a/kernel/drivers/block/drbd/drbd_int.h b/kernel/drivers/block/drbd/drbd_int.h index b905e9888..e66d453a5 100644 --- a/kernel/drivers/block/drbd/drbd_int.h +++ b/kernel/drivers/block/drbd/drbd_int.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -1447,9 +1448,8 @@ extern int proc_details; /* drbd_req */ extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); -extern void drbd_make_request(struct request_queue *q, struct bio *bio); +extern blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio); extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); -extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec); extern int is_valid_ar_handle(struct drbd_request *, sector_t); @@ -1480,9 +1480,9 @@ extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ /* bi_end_io handlers */ -extern void drbd_md_endio(struct bio *bio, int error); -extern void drbd_peer_request_endio(struct bio *bio, int error); -extern void drbd_request_endio(struct bio *bio, int error); +extern void drbd_md_endio(struct bio *bio); +extern void drbd_peer_request_endio(struct bio *bio); +extern void drbd_request_endio(struct bio 
*bio); extern int drbd_worker(struct drbd_thread *thi); enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor); void drbd_resync_after_changed(struct drbd_device *device); @@ -1603,12 +1603,13 @@ static inline void drbd_generic_make_request(struct drbd_device *device, __release(local); if (!bio->bi_bdev) { drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n"); - bio_endio(bio, -ENODEV); + bio->bi_error = -ENODEV; + bio_endio(bio); return; } if (drbd_insert_fault(device, fault_type)) - bio_endio(bio, -EIO); + bio_io_error(bio); else generic_make_request(bio); } diff --git a/kernel/drivers/block/drbd/drbd_main.c b/kernel/drivers/block/drbd/drbd_main.c index 81fde9ef7..74d97f4ba 100644 --- a/kernel/drivers/block/drbd/drbd_main.c +++ b/kernel/drivers/block/drbd/drbd_main.c @@ -2359,7 +2359,7 @@ static void drbd_cleanup(void) * @congested_data: User data * @bdi_bits: Bits the BDI flusher thread is currently interested in * - * Returns 1<connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); /* Without good local data, we would need to read from remote, * and that would need the worker thread as well, which is * currently blocked waiting for that usermode helper to * finish. */ if (!get_ldev_if_state(device, D_UP_TO_DATE)) - r |= (1 << BDI_sync_congested); + r |= (1 << WB_sync_congested); else put_ldev(device); r &= bdi_bits; @@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits) reason = 'b'; } - if (bdi_bits & (1 << BDI_async_congested) && + if (bdi_bits & (1 << WB_async_congested) && test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) { - r |= (1 << BDI_async_congested); + r |= (1 << WB_async_congested); reason = reason == 'b' ? 'a' : 'n'; } @@ -2774,7 +2774,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); - blk_queue_merge_bvec(q, drbd_merge_bvec); q->queue_lock = &resource->req_lock; device->md_io.page = alloc_page(GFP_KERNEL); diff --git a/kernel/drivers/block/drbd/drbd_nl.c b/kernel/drivers/block/drbd/drbd_nl.c index 74df8cfad..e80cbefbc 100644 --- a/kernel/drivers/block/drbd/drbd_nl.c +++ b/kernel/drivers/block/drbd/drbd_nl.c @@ -1156,14 +1156,14 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi /* For now, don't allow more than one activity log extent worth of data * to be discarded in one go. We may need to rework drbd_al_begin_io() * to allow for even larger discard ranges */ - q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS; + blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); /* REALLY? Is stacking secdiscard "legal"? 
*/ if (blk_queue_secdiscard(b)) queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); } else { - q->limits.max_discard_sectors = 0; + blk_queue_max_discard_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); } diff --git a/kernel/drivers/block/drbd/drbd_receiver.c b/kernel/drivers/block/drbd/drbd_receiver.c index cee20354a..b4b5680ac 100644 --- a/kernel/drivers/block/drbd/drbd_receiver.c +++ b/kernel/drivers/block/drbd/drbd_receiver.c @@ -357,7 +357,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto } if (has_payload && data_size) { - page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT)); + page = drbd_alloc_pages(peer_device, nr_pages, + gfpflags_allow_blocking(gfp_mask)); if (!page) goto fail; } @@ -598,7 +599,7 @@ static struct socket *drbd_try_connect(struct drbd_connection *connection) memcpy(&peer_in6, &connection->peer_addr, peer_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family, SOCK_STREAM, IPPROTO_TCP, &sock); if (err < 0) { sock = NULL; @@ -693,7 +694,7 @@ static int prepare_listen_socket(struct drbd_connection *connection, struct acce memcpy(&my_addr, &connection->my_addr, my_addr_len); what = "sock_create_kern"; - err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family, + err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family, SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { s_listen = NULL; diff --git a/kernel/drivers/block/drbd/drbd_req.c b/kernel/drivers/block/drbd/drbd_req.c index 3907202fb..3ae2c0086 100644 --- a/kernel/drivers/block/drbd/drbd_req.c +++ b/kernel/drivers/block/drbd/drbd_req.c @@ -201,7 +201,8 @@ void start_new_tl_epoch(struct drbd_connection *connection) void complete_master_bio(struct drbd_device *device, struct bio_and_error *m) { - bio_endio(m->bio, m->error); + m->bio->bi_error = m->error; + bio_endio(m->bio); dec_ap_bio(device); } @@ -1153,12 +1154,12 @@ drbd_submit_req_private_bio(struct drbd_request *req) rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? DRBD_FAULT_DT_RD : DRBD_FAULT_DT_RA)) - bio_endio(bio, -EIO); + bio_io_error(bio); else generic_make_request(bio); put_ldev(device); } else - bio_endio(bio, -EIO); + bio_io_error(bio); } static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req) @@ -1191,7 +1192,8 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long /* only pass the error to the upper layers. * if user cannot handle io errors, that's not our business. */ drbd_err(device, "could not kmalloc() req\n"); - bio_endio(bio, -ENOMEM); + bio->bi_error = -ENOMEM; + bio_endio(bio); return ERR_PTR(-ENOMEM); } req->start_jif = start_jif; @@ -1492,11 +1494,13 @@ void do_submit(struct work_struct *ws) } } -void drbd_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) { struct drbd_device *device = (struct drbd_device *) q->queuedata; unsigned long start_jif; + blk_queue_split(q, &bio, q->bio_split); + start_jif = jiffies; /* @@ -1506,41 +1510,7 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) inc_ap_bio(device); __drbd_make_request(device, bio, start_jif); -} - -/* This is called by bio_add_page(). - * - * q->max_hw_sectors and other global limits are already enforced there. 
- * - * We need to call down to our lower level device, - * in case it has special restrictions. - * - * We also may need to enforce configured max-bio-bvecs limits. - * - * As long as the BIO is empty we have to allow at least one bvec, - * regardless of size and offset, so no need to ask lower levels. - */ -int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) -{ - struct drbd_device *device = (struct drbd_device *) q->queuedata; - unsigned int bio_size = bvm->bi_size; - int limit = DRBD_MAX_BIO_SIZE; - int backing_limit; - - if (bio_size && get_ldev(device)) { - unsigned int max_hw_sectors = queue_max_hw_sectors(q); - struct request_queue * const b = - device->ldev->backing_bdev->bd_disk->queue; - if (b->merge_bvec_fn) { - bvm->bi_bdev = device->ldev->backing_bdev; - backing_limit = b->merge_bvec_fn(b, bvm, bvec); - limit = min(limit, backing_limit); - } - put_ldev(device); - if ((limit >> 9) > max_hw_sectors) - limit = max_hw_sectors << 9; - } - return limit; + return BLK_QC_T_NONE; } void request_timer_fn(unsigned long data) diff --git a/kernel/drivers/block/drbd/drbd_worker.c b/kernel/drivers/block/drbd/drbd_worker.c index d0fae55d8..5578c1477 100644 --- a/kernel/drivers/block/drbd/drbd_worker.c +++ b/kernel/drivers/block/drbd/drbd_worker.c @@ -65,12 +65,12 @@ rwlock_t global_state_lock; /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ -void drbd_md_endio(struct bio *bio, int error) +void drbd_md_endio(struct bio *bio) { struct drbd_device *device; device = bio->bi_private; - device->md_io.error = error; + device->md_io.error = bio->bi_error; /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able * to timeout on the lower level device, and eventually detach from it. @@ -170,31 +170,20 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver. */ -void drbd_peer_request_endio(struct bio *bio, int error) +void drbd_peer_request_endio(struct bio *bio) { struct drbd_peer_request *peer_req = bio->bi_private; struct drbd_device *device = peer_req->peer_device->device; - int uptodate = bio_flagged(bio, BIO_UPTODATE); int is_write = bio_data_dir(bio) == WRITE; int is_discard = !!(bio->bi_rw & REQ_DISCARD); - if (error && __ratelimit(&drbd_ratelimit_state)) + if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", is_write ? (is_discard ? "discard" : "write") - : "read", error, + : "read", bio->bi_error, (unsigned long long)peer_req->i.sector); - if (!error && !uptodate) { - if (__ratelimit(&drbd_ratelimit_state)) - drbd_warn(device, "%s: setting error to -EIO s=%llus\n", - is_write ? "write" : "read", - (unsigned long long)peer_req->i.sector); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! 
*/ - error = -EIO; - } - if (error) + if (bio->bi_error) set_bit(__EE_WAS_ERROR, &peer_req->flags); bio_put(bio); /* no need for the bio anymore */ @@ -208,24 +197,13 @@ void drbd_peer_request_endio(struct bio *bio, int error) /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ -void drbd_request_endio(struct bio *bio, int error) +void drbd_request_endio(struct bio *bio) { unsigned long flags; struct drbd_request *req = bio->bi_private; struct drbd_device *device = req->device; struct bio_and_error m; enum drbd_req_event what; - int uptodate = bio_flagged(bio, BIO_UPTODATE); - - if (!error && !uptodate) { - drbd_warn(device, "p %s: setting error to -EIO\n", - bio_data_dir(bio) == WRITE ? "write" : "read"); - /* strange behavior of some lower level drivers... - * fail the request by clearing the uptodate flag, - * but do not return any error?! */ - error = -EIO; - } - /* If this request was aborted locally before, * but now was completed "successfully", @@ -259,14 +237,14 @@ void drbd_request_endio(struct bio *bio, int error) if (__ratelimit(&drbd_ratelimit_state)) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); - if (!error) + if (!bio->bi_error) panic("possible random memory corruption caused by delayed completion of aborted local request\n"); } /* to avoid recursion in __req_mod */ - if (unlikely(error)) { + if (unlikely(bio->bi_error)) { if (bio->bi_rw & REQ_DISCARD) - what = (error == -EOPNOTSUPP) + what = (bio->bi_error == -EOPNOTSUPP) ? DISCARD_COMPLETED_NOTSUPP : DISCARD_COMPLETED_WITH_ERROR; else @@ -279,7 +257,7 @@ void drbd_request_endio(struct bio *bio, int error) what = COMPLETED_OK; bio_put(req->private_bio); - req->private_bio = ERR_PTR(error); + req->private_bio = ERR_PTR(bio->bi_error); /* not req_mod(), we need irqsave here! 
*/ spin_lock_irqsave(&device->resource->req_lock, flags); diff --git a/kernel/drivers/block/floppy.c b/kernel/drivers/block/floppy.c index a08cda955..331363e7d 100644 --- a/kernel/drivers/block/floppy.c +++ b/kernel/drivers/block/floppy.c @@ -3771,13 +3771,14 @@ struct rb0_cbdata { struct completion complete; }; -static void floppy_rb0_cb(struct bio *bio, int err) +static void floppy_rb0_cb(struct bio *bio) { struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private; int drive = cbdata->drive; - if (err) { - pr_info("floppy: error %d while reading block 0\n", err); + if (bio->bi_error) { + pr_info("floppy: error %d while reading block 0\n", + bio->bi_error); set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags); } complete(&cbdata->complete); diff --git a/kernel/drivers/block/loop.c b/kernel/drivers/block/loop.c index cef6fa83a..423f4ca7d 100644 --- a/kernel/drivers/block/loop.c +++ b/kernel/drivers/block/loop.c @@ -164,6 +164,62 @@ static loff_t get_loop_size(struct loop_device *lo, struct file *file) return get_size(lo->lo_offset, lo->lo_sizelimit, file); } +static void __loop_update_dio(struct loop_device *lo, bool dio) +{ + struct file *file = lo->lo_backing_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned short sb_bsize = 0; + unsigned dio_align = 0; + bool use_dio; + + if (inode->i_sb->s_bdev) { + sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); + dio_align = sb_bsize - 1; + } + + /* + * We support direct I/O only if lo_offset is aligned with the + * logical I/O size of backing device, and the logical block + * size of loop is bigger than the backing device's and the loop + * needn't transform transfer. + * + * TODO: the above condition may be loosed in the future, and + * direct I/O may be switched runtime at that time because most + * of requests in sane appplications should be PAGE_SIZE algined + */ + if (dio) { + if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && + !(lo->lo_offset & dio_align) && + mapping->a_ops->direct_IO && + !lo->transfer) + use_dio = true; + else + use_dio = false; + } else { + use_dio = false; + } + + if (lo->use_dio == use_dio) + return; + + /* flush dirty pages before changing direct IO */ + vfs_fsync(file, 0); + + /* + * The flag of LO_FLAGS_DIRECT_IO is handled similarly with + * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup + * will get updated by ioctl(LOOP_GET_STATUS) + */ + blk_mq_freeze_queue(lo->lo_queue); + lo->use_dio = use_dio; + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue); +} + static int figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { @@ -389,6 +445,89 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq) return ret; } +static inline void handle_partial_read(struct loop_cmd *cmd, long bytes) +{ + if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE)) + return; + + if (unlikely(bytes < blk_rq_bytes(cmd->rq))) { + struct bio *bio = cmd->rq->bio; + + bio_advance(bio, bytes); + zero_fill_bio(bio); + } +} + +static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + struct request *rq = cmd->rq; + + handle_partial_read(cmd, ret); + + if (ret > 0) + ret = 0; + else if (ret < 0) + ret = -EIO; + + blk_mq_complete_request(rq, ret); +} + +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + loff_t pos, bool rw) +{ + struct iov_iter iter; + 
struct bio_vec *bvec; + struct bio *bio = cmd->rq->bio; + struct file *file = lo->lo_backing_file; + int ret; + + /* nomerge for loop request queue */ + WARN_ON(cmd->rq->bio != cmd->rq->biotail); + + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, + bio_segments(bio), blk_rq_bytes(cmd->rq)); + + cmd->iocb.ki_pos = pos; + cmd->iocb.ki_filp = file; + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + + if (rw == WRITE) + ret = file->f_op->write_iter(&cmd->iocb, &iter); + else + ret = file->f_op->read_iter(&cmd->iocb, &iter); + + if (ret != -EIOCBQUEUED) + cmd->iocb.ki_complete(&cmd->iocb, ret, 0); + return 0; +} + + +static inline int lo_rw_simple(struct loop_device *lo, + struct request *rq, loff_t pos, bool rw) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, rw); + + /* + * lo_write_simple and lo_read_simple should have been covered + * by io submit style function like lo_rw_aio(), one blocker + * is that lo_read_simple() need to call flush_dcache_page after + * the page is written from kernel, and it isn't easy to handle + * this in io submit style function which submits all segments + * of the req at one time. And direct read IO doesn't need to + * run flush_dcache_page(). + */ + if (rw == WRITE) + return lo_write_simple(lo, rq, pos); + else + return lo_read_simple(lo, rq, pos); +} + static int do_req_filebacked(struct loop_device *lo, struct request *rq) { loff_t pos; @@ -404,13 +543,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) else if (lo->transfer) ret = lo_write_transfer(lo, rq, pos); else - ret = lo_write_simple(lo, rq, pos); + ret = lo_rw_simple(lo, rq, pos, WRITE); } else { if (lo->transfer) ret = lo_read_transfer(lo, rq, pos); else - ret = lo_read_simple(lo, rq, pos); + ret = lo_rw_simple(lo, rq, pos, READ); } return ret; @@ -421,6 +560,12 @@ struct switch_request { struct completion wait; }; +static inline void loop_update_dio(struct loop_device *lo) +{ + __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | + lo->use_dio); +} + /* * Do the actual switch; called from the BIO completion routine */ @@ -441,6 +586,7 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p) mapping->host->i_bdev->bd_block_size : PAGE_SIZE; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_update_dio(lo); } /* @@ -474,6 +620,28 @@ static int loop_flush(struct loop_device *lo) return loop_switch(lo, NULL); } +static void loop_reread_partitions(struct loop_device *lo, + struct block_device *bdev) +{ + int rc; + + /* + * bd_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. + * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ + if (!atomic_read(&lo->lo_refcnt)) + rc = __blkdev_reread_part(bdev); + else + rc = blkdev_reread_part(bdev); + if (rc) + pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", + __func__, lo->lo_number, lo->lo_file_name, rc); +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. 
This is useful for operating system installers to free up @@ -522,7 +690,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); return 0; out_putf: @@ -566,7 +734,7 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) - p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) @@ -605,11 +773,19 @@ static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) return sprintf(buf, "%s\n", partscan ? "1" : "0"); } +static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) +{ + int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); + + return sprintf(buf, "%s\n", dio ? "1" : "0"); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); +LOOP_ATTR_RO(dio); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, @@ -617,6 +793,7 @@ static struct attribute *loop_attrs[] = { &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, &loop_attr_partscan.attr, + &loop_attr_dio.attr, NULL, }; @@ -653,7 +830,7 @@ static void loop_config_discard(struct loop_device *lo) lo->lo_encrypt_key_size) { q->limits.discard_granularity = 0; q->limits.discard_alignment = 0; - q->limits.max_discard_sectors = 0; + blk_queue_max_discard_sectors(q, 0); q->limits.discard_zeroes_data = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); return; @@ -661,11 +838,28 @@ static void loop_config_discard(struct loop_device *lo) q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - q->limits.max_discard_sectors = UINT_MAX >> 9; + blk_queue_max_discard_sectors(q, UINT_MAX >> 9); q->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } +static void loop_unprepare_queue(struct loop_device *lo) +{ + flush_kthread_worker(&lo->worker); + kthread_stop(lo->worker_task); +} + +static int loop_prepare_queue(struct loop_device *lo) +{ + init_kthread_worker(&lo->worker); + lo->worker_task = kthread_run(kthread_worker_fn, + &lo->worker, "loop%d", lo->lo_number); + if (IS_ERR(lo->worker_task)) + return -ENOMEM; + set_user_nice(lo->worker_task, MIN_NICE); + return 0; +} + static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { @@ -723,17 +917,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) goto out_putf; - error = -ENOMEM; - lo->wq = alloc_workqueue("kloopd%d", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16, - lo->lo_number); - if (!lo->wq) + error = loop_prepare_queue(lo); + if (error) goto out_putf; error = 0; set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + lo->use_dio = false; lo->lo_blocksize = lo_blocksize; lo->lo_device = bdev; lo->lo_flags = lo_flags; @@ -747,6 +939,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); + loop_update_dio(lo); set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); loop_sysfs_init(lo); @@ -759,7 +952,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if 
(lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); /* Grab the block_device to prevent its destruction after we * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). @@ -831,7 +1024,7 @@ static int loop_clr_fd(struct loop_device *lo) * /do something like mkfs/losetup -d causing the losetup -d * command to fail with EBUSY. */ - if (lo->lo_refcnt > 1) { + if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; mutex_unlock(&lo->lo_ctl_mutex); return 0; @@ -840,6 +1033,9 @@ static int loop_clr_fd(struct loop_device *lo) if (filp == NULL) return -EINVAL; + /* freeze request queue during the transition */ + blk_mq_freeze_queue(lo->lo_queue); + spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; lo->lo_backing_file = NULL; @@ -871,13 +1067,14 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); + blk_mq_unfreeze_queue(lo->lo_queue); + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; - destroy_workqueue(lo->wq); - lo->wq = NULL; + loop_unprepare_queue(lo); mutex_unlock(&lo->lo_ctl_mutex); /* * Need not hold lo_ctl_mutex to fput backing file. @@ -949,7 +1146,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_flags |= LO_FLAGS_PARTSCAN; lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + loop_reread_partitions(lo, lo->lo_device); } lo->lo_encrypt_key_size = info->lo_encrypt_key_size; @@ -961,6 +1158,9 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) lo->lo_key_owner = uid; } + /* update dio if lo_offset or transfer is changed */ + __loop_update_dio(lo, lo->use_dio); + return 0; } @@ -1111,6 +1311,20 @@ static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); } +static int loop_set_dio(struct loop_device *lo, unsigned long arg) +{ + int error = -ENXIO; + if (lo->lo_state != Lo_bound) + goto out; + + __loop_update_dio(lo, !!arg); + if (lo->use_dio == !!arg) + return 0; + error = -EINVAL; + out: + return error; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1154,6 +1368,11 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_capacity(lo, bdev); break; + case LOOP_SET_DIRECT_IO: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_dio(lo, arg); + break; default: err = lo->ioctl ? 
lo->ioctl(lo, cmd, arg) : -EINVAL; } @@ -1330,9 +1549,7 @@ static int lo_open(struct block_device *bdev, fmode_t mode) goto out; } - mutex_lock(&lo->lo_ctl_mutex); - lo->lo_refcnt++; - mutex_unlock(&lo->lo_ctl_mutex); + atomic_inc(&lo->lo_refcnt); out: mutex_unlock(&loop_index_mutex); return err; @@ -1343,11 +1560,10 @@ static void lo_release(struct gendisk *disk, fmode_t mode) struct loop_device *lo = disk->private_data; int err; - mutex_lock(&lo->lo_ctl_mutex); - - if (--lo->lo_refcnt) - goto out; + if (atomic_dec_return(&lo->lo_refcnt)) + return; + mutex_lock(&lo->lo_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { /* * In autoclear mode, stop the loop thread @@ -1364,7 +1580,6 @@ static void lo_release(struct gendisk *disk, fmode_t mode) loop_flush(lo); } -out: mutex_unlock(&lo->lo_ctl_mutex); } @@ -1438,23 +1653,13 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (lo->lo_state != Lo_bound) return -EIO; - if (cmd->rq->cmd_flags & REQ_WRITE) { - struct loop_device *lo = cmd->rq->q->queuedata; - bool need_sched = true; - - spin_lock_irq(&lo->lo_lock); - if (lo->write_started) - need_sched = false; - else - lo->write_started = true; - list_add_tail(&cmd->list, &lo->write_cmd_head); - spin_unlock_irq(&lo->lo_lock); + if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH | + REQ_DISCARD))) + cmd->use_aio = true; + else + cmd->use_aio = false; - if (need_sched) - queue_work(lo->wq, &lo->write_work); - } else { - queue_work(lo->wq, &cmd->read_work); - } + queue_kthread_work(&lo->worker, &cmd->work); return BLK_MQ_RQ_QUEUE_OK; } @@ -1463,48 +1668,24 @@ static void loop_handle_cmd(struct loop_cmd *cmd) { const bool write = cmd->rq->cmd_flags & REQ_WRITE; struct loop_device *lo = cmd->rq->q->queuedata; - int ret = -EIO; + int ret = 0; - if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { + ret = -EIO; goto failed; + } ret = do_req_filebacked(lo, cmd->rq); - failed: - if (ret) - cmd->rq->errors = -EIO; - blk_mq_complete_request(cmd->rq); -} - -static void loop_queue_write_work(struct work_struct *work) -{ - struct loop_device *lo = - container_of(work, struct loop_device, write_work); - LIST_HEAD(cmd_list); - - spin_lock_irq(&lo->lo_lock); - repeat: - list_splice_init(&lo->write_cmd_head, &cmd_list); - spin_unlock_irq(&lo->lo_lock); - - while (!list_empty(&cmd_list)) { - struct loop_cmd *cmd = list_first_entry(&cmd_list, - struct loop_cmd, list); - list_del_init(&cmd->list); - loop_handle_cmd(cmd); - } - - spin_lock_irq(&lo->lo_lock); - if (!list_empty(&lo->write_cmd_head)) - goto repeat; - lo->write_started = false; - spin_unlock_irq(&lo->lo_lock); + /* complete non-aio request */ + if (!cmd->use_aio || ret) + blk_mq_complete_request(cmd->rq, ret ? 
-EIO : 0); } -static void loop_queue_read_work(struct work_struct *work) +static void loop_queue_work(struct kthread_work *work) { struct loop_cmd *cmd = - container_of(work, struct loop_cmd, read_work); + container_of(work, struct loop_cmd, work); loop_handle_cmd(cmd); } @@ -1516,7 +1697,7 @@ static int loop_init_request(void *data, struct request *rq, struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); cmd->rq = rq; - INIT_WORK(&cmd->read_work, loop_queue_read_work); + init_kthread_work(&cmd->work, loop_queue_work); return 0; } @@ -1572,8 +1753,11 @@ static int loop_add(struct loop_device **l, int i) } lo->lo_queue->queuedata = lo; - INIT_LIST_HEAD(&lo->write_cmd_head); - INIT_WORK(&lo->write_work, loop_queue_write_work); + /* + * It doesn't make sense to enable merge because the I/O + * submitted to backing file is handled page by page. + */ + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) @@ -1601,6 +1785,7 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); + atomic_set(&lo->lo_refcnt, 0); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; @@ -1718,7 +1903,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&lo->lo_ctl_mutex); break; } - if (lo->lo_refcnt > 0) { + if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; mutex_unlock(&lo->lo_ctl_mutex); break; diff --git a/kernel/drivers/block/loop.h b/kernel/drivers/block/loop.h index 49564edf5..fb2237c73 100644 --- a/kernel/drivers/block/loop.h +++ b/kernel/drivers/block/loop.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include /* Possible states of device */ @@ -28,7 +28,7 @@ struct loop_func_table; struct loop_device { int lo_number; - int lo_refcnt; + atomic_t lo_refcnt; loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; @@ -54,12 +54,11 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct workqueue_struct *wq; - struct list_head write_cmd_head; - struct work_struct write_work; - bool write_started; int lo_state; struct mutex lo_ctl_mutex; + struct kthread_worker worker; + struct task_struct *worker_task; + bool use_dio; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; @@ -67,9 +66,11 @@ struct loop_device { }; struct loop_cmd { - struct work_struct read_work; + struct kthread_work work; struct request *rq; struct list_head list; + bool use_aio; /* use AIO interface to handle I/O */ + struct kiocb iocb; }; /* Support for loadable transfer modules */ diff --git a/kernel/drivers/block/mtip32xx/mtip32xx.c b/kernel/drivers/block/mtip32xx/mtip32xx.c index 3bd7ca985..3457ac8c0 100644 --- a/kernel/drivers/block/mtip32xx/mtip32xx.c +++ b/kernel/drivers/block/mtip32xx/mtip32xx.c @@ -163,12 +163,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev) else dev_warn(&dd->pdev->dev, "%s: dd->queue is NULL\n", __func__); - if (dd->port) { - set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - } else - dev_warn(&dd->pdev->dev, - "%s: dd->port is NULL\n", __func__); return true; /* device removed */ } @@ -179,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd) { struct request *rq; - rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true); + rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true); return blk_mq_rq_to_pdu(rq); } @@ -269,8 +263,11 @@ static int 
mtip_hba_reset(struct driver_data *dd) /* Flush */ readl(dd->mmio + HOST_CTL); - /* Spin for up to 2 seconds, waiting for reset acknowledgement */ - timeout = jiffies + msecs_to_jiffies(2000); + /* + * Spin for up to 10 seconds waiting for reset acknowledgement. Spec + * is 1 sec but in LUN failure conditions, up to 10 secs are required + */ + timeout = jiffies + msecs_to_jiffies(10000); do { mdelay(10); if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) @@ -623,8 +620,7 @@ static void mtip_handle_tfe(struct driver_data *dd) set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && - test_bit(MTIP_TAG_INTERNAL, port->allocated)) { + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL); dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); @@ -896,6 +892,10 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) /* Acknowledge the interrupt status on the port.*/ port_stat = readl(port->mmio + PORT_IRQ_STAT); + if (unlikely(port_stat == 0xFFFFFFFF)) { + mtip_check_surprise_removal(dd->pdev); + return IRQ_HANDLED; + } writel(port_stat, port->mmio + PORT_IRQ_STAT); /* Demux port status */ @@ -991,15 +991,10 @@ static bool mtip_pause_ncq(struct mtip_port *port, reply = port->rxfis + RX_FIS_D2H_REG; task_file_data = readl(port->mmio+PORT_TFDATA); - if (fis->command == ATA_CMD_SEC_ERASE_UNIT) - clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); - if ((task_file_data & 1)) return false; if (fis->command == ATA_CMD_SEC_ERASE_PREP) { - set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); - set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); port->ic_pause_timer = jiffies; return true; } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) && @@ -1011,8 +1006,10 @@ static bool mtip_pause_ncq(struct mtip_port *port, ((fis->command == 0xFC) && (fis->features == 0x27 || fis->features == 0x72 || fis->features == 0x62 || fis->features == 0x26))) { + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag); /* Com reset after secure erase or lowlevel format */ mtip_restart_port(port); + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); return false; } @@ -1112,9 +1109,10 @@ static int mtip_exec_internal_command(struct mtip_port *port, int_cmd = mtip_get_int_command(dd); set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); - port->ic_pause_timer = 0; - clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + if (fis->command == ATA_CMD_SEC_ERASE_PREP) + set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); if (atomic == GFP_KERNEL) { @@ -1251,11 +1249,11 @@ static int mtip_exec_internal_command(struct mtip_port *port, exec_ic_exit: /* Clear the allocated and active bits for the internal command. 
*/ mtip_put_int_command(dd, int_cmd); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); if (rv >= 0 && mtip_pause_ncq(port, fis)) { /* NCQ paused */ return rv; } - clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); return rv; @@ -2625,18 +2623,6 @@ static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, readl(dd->mmio + HOST_IRQ_STAT)); size += sprintf(&buf[size], "\n"); - size += sprintf(&buf[size], "L/ Allocated : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) { - if (sizeof(long) > sizeof(u32)) - group_allocated = - dd->port->allocated[n/2] >> (32*(n&1)); - else - group_allocated = dd->port->allocated[n]; - size += sprintf(&buf[size], "%08X ", group_allocated); - } - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "L/ Commands in Q : [ 0x"); for (n = dd->slot_groups-1; n >= 0; n--) { @@ -2780,48 +2766,6 @@ static void mtip_hw_debugfs_exit(struct driver_data *dd) debugfs_remove_recursive(dd->dfs_node); } -static int mtip_free_orphan(struct driver_data *dd) -{ - struct kobject *kobj; - - if (dd->bdev) { - if (dd->bdev->bd_holders >= 1) - return -2; - - bdput(dd->bdev); - dd->bdev = NULL; - } - - mtip_hw_debugfs_exit(dd); - - spin_lock(&rssd_index_lock); - ida_remove(&rssd_index_ida, dd->index); - spin_unlock(&rssd_index_lock); - - if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) && - test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) { - put_disk(dd->disk); - } else { - if (dd->disk) { - kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); - if (kobj) { - mtip_hw_sysfs_exit(dd, kobj); - kobject_put(kobj); - } - del_gendisk(dd->disk); - dd->disk = NULL; - } - if (dd->queue) { - dd->queue->queuedata = NULL; - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - dd->queue = NULL; - } - } - kfree(dd); - return 0; -} - /* * Perform any init/resume time hardware setup * @@ -2944,7 +2888,6 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) mtip_block_initialize(dd); return 0; } - ssleep(10); } while (time_before(jiffies, timeout)); /* Check for timeout */ @@ -2969,7 +2912,6 @@ static int mtip_service_thread(void *data) unsigned long slot, slot_start, slot_wrap; unsigned int num_cmd_slots = dd->slot_groups * 32; struct mtip_port *port = dd->port; - int ret; while (1) { if (kthread_should_stop() || @@ -2990,10 +2932,6 @@ static int mtip_service_thread(void *data) test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) goto st_out; - /* If I am an orphan, start self cleanup */ - if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags)) - break; - if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) goto st_out; @@ -3047,26 +2985,6 @@ restart_eh: } } - /* wait for pci remove to exit */ - while (1) { - if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag)) - break; - msleep_interruptible(1000); - if (kthread_should_stop()) - goto st_out; - } - - while (1) { - ret = mtip_free_orphan(dd); - if (!ret) { - /* NOTE: All data structures are invalid, do not - * access any here */ - return 0; - } - msleep_interruptible(1000); - if (kthread_should_stop()) - goto st_out; - } st_out: return 0; } @@ -3394,6 +3312,7 @@ static int mtip_hw_exit(struct driver_data *dd) /* Release the IRQ. 
*/ irq_set_affinity_hint(dd->pdev->irq, NULL); devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd); + msleep(1000); /* Free dma regions */ mtip_dma_free(dd); @@ -3699,6 +3618,26 @@ static const struct block_device_operations mtip_block_ops = { .owner = THIS_MODULE }; +static inline bool is_se_active(struct driver_data *dd) +{ + if (unlikely(test_bit(MTIP_PF_SE_ACTIVE_BIT, &dd->port->flags))) { + if (dd->port->ic_pause_timer) { + unsigned long to = dd->port->ic_pause_timer + + msecs_to_jiffies(1000); + if (time_after(jiffies, to)) { + clear_bit(MTIP_PF_SE_ACTIVE_BIT, + &dd->port->flags); + clear_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag); + dd->port->ic_pause_timer = 0; + wake_up_interruptible(&dd->port->svc_wait); + return false; + } + } + return true; + } + return false; +} + /* * Block layer make request function. * @@ -3716,6 +3655,9 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); unsigned int nents; + if (is_se_active(dd)) + return -ENODATA; + if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { @@ -3814,6 +3756,14 @@ static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx, struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64; + /* + * For flush requests, request_idx starts at the end of the + * tag space. Since we don't support FLUSH/FUA, simply return + * 0 as there's nothing to be done. + */ + if (request_idx >= MTIP_MAX_COMMAND_SLOTS) + return 0; + cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, &cmd->command_dma, GFP_KERNEL); if (!cmd->command) @@ -3860,7 +3810,6 @@ static int mtip_block_initialize(struct driver_data *dd) sector_t capacity; unsigned int index = 0; struct kobject *kobj; - unsigned char thd_name[16]; if (dd->disk) goto skip_create_disk; /* hw init done, before rebuild */ @@ -3900,7 +3849,8 @@ static int mtip_block_initialize(struct driver_data *dd) dd->disk->driverfs_dev = &dd->pdev->dev; dd->disk->major = dd->major; - dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS; + dd->disk->first_minor = index * MTIP_MAX_MINORS; + dd->disk->minors = MTIP_MAX_MINORS; dd->disk->fops = &mtip_block_ops; dd->disk->private_data = dd; dd->index = index; @@ -4007,10 +3957,9 @@ skip_create_disk: } start_service_thread: - sprintf(thd_name, "mtip_svc_thd_%02d", index); dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread, - dd, dd->numa_node, "%s", - thd_name); + dd, dd->numa_node, + "mtip_svc_thd_%02d", index); if (IS_ERR(dd->mtip_svc_handler)) { dev_err(&dd->pdev->dev, "service thread failed to start\n"); @@ -4066,52 +4015,51 @@ static int mtip_block_remove(struct driver_data *dd) { struct kobject *kobj; - if (!dd->sr) { - mtip_hw_debugfs_exit(dd); + mtip_hw_debugfs_exit(dd); - if (dd->mtip_svc_handler) { - set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - kthread_stop(dd->mtip_svc_handler); - } + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } - /* Clean up the sysfs attributes, if created */ - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { - kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); - if (kobj) { - mtip_hw_sysfs_exit(dd, kobj); - kobject_put(kobj); - } + /* Clean up the sysfs attributes, if created */ + if (test_bit(MTIP_DDF_INIT_DONE_BIT, 
&dd->dd_flag)) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); } + } + if (!dd->sr) mtip_standby_drive(dd); - - /* - * Delete our gendisk structure. This also removes the device - * from /dev - */ - if (dd->bdev) { - bdput(dd->bdev); - dd->bdev = NULL; - } - if (dd->disk) { - if (dd->disk->queue) { - del_gendisk(dd->disk); - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - dd->queue = NULL; - } else - put_disk(dd->disk); - } - dd->disk = NULL; - - spin_lock(&rssd_index_lock); - ida_remove(&rssd_index_ida, dd->index); - spin_unlock(&rssd_index_lock); - } else { + else dev_info(&dd->pdev->dev, "device %s surprise removal\n", dd->disk->disk_name); + + /* + * Delete our gendisk structure. This also removes the device + * from /dev + */ + if (dd->bdev) { + bdput(dd->bdev); + dd->bdev = NULL; + } + if (dd->disk) { + del_gendisk(dd->disk); + if (dd->disk->queue) { + blk_cleanup_queue(dd->queue); + blk_mq_free_tag_set(&dd->tags); + dd->queue = NULL; + } + put_disk(dd->disk); } + dd->disk = NULL; + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); /* De-initialize the protocol layer. */ mtip_hw_exit(dd); @@ -4140,12 +4088,12 @@ static int mtip_block_shutdown(struct driver_data *dd) dev_info(&dd->pdev->dev, "Shutting down %s ...\n", dd->disk->disk_name); + del_gendisk(dd->disk); if (dd->disk->queue) { - del_gendisk(dd->disk); blk_cleanup_queue(dd->queue); blk_mq_free_tag_set(&dd->tags); - } else - put_disk(dd->disk); + } + put_disk(dd->disk); dd->disk = NULL; dd->queue = NULL; } @@ -4507,6 +4455,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } + blk_mq_stop_hw_queues(dd->queue); /* Clean up the block layer. */ mtip_block_remove(dd); @@ -4524,10 +4473,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) list_del_init(&dd->remove_list); spin_unlock_irqrestore(&dev_lock, flags); - if (!dd->sr) - kfree(dd); - else - set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag); + kfree(dd); pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); diff --git a/kernel/drivers/block/mtip32xx/mtip32xx.h b/kernel/drivers/block/mtip32xx/mtip32xx.h index ba1b31ee2..327478400 100644 --- a/kernel/drivers/block/mtip32xx/mtip32xx.h +++ b/kernel/drivers/block/mtip32xx/mtip32xx.h @@ -142,7 +142,6 @@ enum { MTIP_PF_SVC_THD_ACTIVE_BIT = 4, MTIP_PF_ISSUE_CMDS_BIT = 5, MTIP_PF_REBUILD_BIT = 6, - MTIP_PF_SR_CLEANUP_BIT = 7, MTIP_PF_SVC_THD_STOP_BIT = 8, /* below are bit numbers in 'dd_flag' defined in driver_data */ @@ -150,7 +149,6 @@ enum { MTIP_DDF_REMOVE_PENDING_BIT = 1, MTIP_DDF_OVER_TEMP_BIT = 2, MTIP_DDF_WRITE_PROTECT_BIT = 3, - MTIP_DDF_REMOVE_DONE_BIT = 4, MTIP_DDF_CLEANUP_BIT = 5, MTIP_DDF_RESUME_BIT = 6, MTIP_DDF_INIT_DONE_BIT = 7, @@ -412,19 +410,13 @@ struct mtip_port { * by the DMA when the driver issues internal commands. */ dma_addr_t sector_buffer_dma; - /* - * Bit significant, used to determine if a command slot has - * been allocated. i.e. the slot is in use. Bits are cleared - * when the command slot and all associated data structures - * are no longer needed. 
- */ + u16 *log_buf; dma_addr_t log_buf_dma; u8 *smart_buf; dma_addr_t smart_buf_dma; - unsigned long allocated[SLOTBITS_IN_LONGS]; /* * used to queue commands when an internal command is in progress * or error handling is active diff --git a/kernel/drivers/block/nbd.c b/kernel/drivers/block/nbd.c index 39e5f7fae..93b3f99b6 100644 --- a/kernel/drivers/block/nbd.c +++ b/kernel/drivers/block/nbd.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -40,8 +41,7 @@ #include struct nbd_device { - int flags; - int harderror; /* Code of hard error */ + u32 flags; struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; @@ -56,11 +56,25 @@ struct nbd_device { struct gendisk *disk; int blksize; loff_t bytesize; - pid_t pid; /* pid of nbd-client, if attached */ int xmit_timeout; - int disconnect; /* a disconnect has been requested by user */ + bool disconnect; /* a disconnect has been requested by user */ + + struct timer_list timeout_timer; + spinlock_t tasks_lock; + struct task_struct *task_recv; + struct task_struct *task_send; + +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *dbg_dir; +#endif }; +#if IS_ENABLED(CONFIG_DEBUG_FS) +static struct dentry *nbd_dbg_dir; +#endif + +#define nbd_name(nbd) ((nbd)->disk->disk_name) + #define NBD_MAGIC 0x68797548 static unsigned int nbds_max = 16; @@ -113,26 +127,38 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req) /* * Forcibly shutdown the socket causing all listeners to error */ -static void sock_shutdown(struct nbd_device *nbd, int lock) +static void sock_shutdown(struct nbd_device *nbd) { - if (lock) - mutex_lock(&nbd->tx_lock); - if (nbd->sock) { - dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); - nbd->sock = NULL; - } - if (lock) - mutex_unlock(&nbd->tx_lock); + if (!nbd->sock) + return; + + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; + del_timer_sync(&nbd->timeout_timer); } static void nbd_xmit_timeout(unsigned long arg) { - struct task_struct *task = (struct task_struct *)arg; + struct nbd_device *nbd = (struct nbd_device *)arg; + unsigned long flags; + + if (list_empty(&nbd->queue_head)) + return; + + nbd->disconnect = true; + + spin_lock_irqsave(&nbd->tasks_lock, flags); - printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n", - task->comm, task->pid); - force_sig(SIGKILL, task); + if (nbd->task_recv) + force_sig(SIGKILL, nbd->task_recv); + + if (nbd->task_send) + force_sig(SIGKILL, nbd->task_send); + + spin_unlock_irqrestore(&nbd->tasks_lock, flags); + + dev_err(nbd_to_dev(nbd), "Connection timed out, killed receiver and sender, shutting down connection\n"); } /* @@ -171,33 +197,12 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, msg.msg_controllen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; - if (send) { - struct timer_list ti; - - if (nbd->xmit_timeout) { - init_timer(&ti); - ti.function = nbd_xmit_timeout; - ti.data = (unsigned long)current; - ti.expires = jiffies + nbd->xmit_timeout; - add_timer(&ti); - } + if (send) result = kernel_sendmsg(sock, &msg, &iov, 1, size); - if (nbd->xmit_timeout) - del_timer_sync(&ti); - } else + else result = kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); - if (signal_pending(current)) { - siginfo_t info; - printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n", - task_pid_nr(current), current->comm, - dequeue_signal_lock(current, &current->blocked, &info)); - 
result = -EINTR; - sock_shutdown(nbd, !send); - break; - } - if (result <= 0) { if (result == 0) result = -EPIPE; /* short read */ @@ -210,6 +215,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, sigprocmask(SIG_SETMASK, &oldset, NULL); tsk_restore_flags(current, pflags, PF_MEMALLOC); + if (!send && nbd->xmit_timeout) + mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); + return result; } @@ -230,29 +238,40 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) int result, flags; struct nbd_request request; unsigned long size = blk_rq_bytes(req); + u32 type; + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + type = NBD_CMD_DISC; + else if (req->cmd_flags & REQ_DISCARD) + type = NBD_CMD_TRIM; + else if (req->cmd_flags & REQ_FLUSH) + type = NBD_CMD_FLUSH; + else if (rq_data_dir(req) == WRITE) + type = NBD_CMD_WRITE; + else + type = NBD_CMD_READ; memset(&request, 0, sizeof(request)); request.magic = htonl(NBD_REQUEST_MAGIC); - request.type = htonl(nbd_cmd(req)); - - if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) { + request.type = htonl(type); + if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) { request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } memcpy(request.handle, &req, sizeof(req)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", - req, nbdcmd_to_ascii(nbd_cmd(req)), + req, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, 1, &request, sizeof(request), - (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); + (type == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); return -EIO; } - if (nbd_cmd(req) == NBD_CMD_WRITE) { + if (type == NBD_CMD_WRITE) { struct req_iterator iter; struct bio_vec bvec; /* @@ -322,26 +341,24 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) if (result <= 0) { dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); - goto harderror; + return ERR_PTR(result); } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); - result = -EPROTO; - goto harderror; + return ERR_PTR(-EPROTO); } req = nbd_find_request(nbd, *(struct request **)reply.handle); if (IS_ERR(req)) { result = PTR_ERR(req); if (result != -ENOENT) - goto harderror; + return ERR_PTR(result); dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", reply.handle); - result = -EBADR; - goto harderror; + return ERR_PTR(-EBADR); } if (ntohl(reply.error)) { @@ -352,7 +369,7 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) } dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); - if (nbd_cmd(req) == NBD_CMD_READ) { + if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; @@ -369,18 +386,15 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) } } return req; -harderror: - nbd->harderror = result; - return NULL; } static ssize_t pid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); + struct nbd_device *nbd = (struct nbd_device *)disk->private_data; - return sprintf(buf, "%ld\n", - (long) ((struct nbd_device *)disk->private_data)->pid); + return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv)); } static struct device_attribute pid_attr = { @@ -388,28 +402,58 @@ static struct device_attribute pid_attr = { .show = 
pid_show, }; -static int nbd_do_it(struct nbd_device *nbd) +static int nbd_thread_recv(struct nbd_device *nbd) { struct request *req; int ret; + unsigned long flags; BUG_ON(nbd->magic != NBD_MAGIC); sk_set_memalloc(nbd->sock->sk); - nbd->pid = task_pid_nr(current); + + spin_lock_irqsave(&nbd->tasks_lock, flags); + nbd->task_recv = current; + spin_unlock_irqrestore(&nbd->tasks_lock, flags); + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); - nbd->pid = 0; + + spin_lock_irqsave(&nbd->tasks_lock, flags); + nbd->task_recv = NULL; + spin_unlock_irqrestore(&nbd->tasks_lock, flags); + return ret; } - while ((req = nbd_read_stat(nbd)) != NULL) + while (1) { + req = nbd_read_stat(nbd); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + break; + } + nbd_end_request(nbd, req); + } device_remove_file(disk_to_dev(nbd->disk), &pid_attr); - nbd->pid = 0; - return 0; + + spin_lock_irqsave(&nbd->tasks_lock, flags); + nbd->task_recv = NULL; + spin_unlock_irqrestore(&nbd->tasks_lock, flags); + + if (signal_pending(current)) { + ret = kernel_dequeue_signal(NULL); + dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", + task_pid_nr(current), current->comm, ret); + mutex_lock(&nbd->tx_lock); + sock_shutdown(nbd); + mutex_unlock(&nbd->tx_lock); + ret = -ETIMEDOUT; + } + + return ret; } static void nbd_clear_que(struct nbd_device *nbd) @@ -444,6 +488,7 @@ static void nbd_clear_que(struct nbd_device *nbd) req->errors++; nbd_end_request(nbd, req); } + dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } @@ -452,23 +497,11 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) if (req->cmd_type != REQ_TYPE_FS) goto error_out; - nbd_cmd(req) = NBD_CMD_READ; - if (rq_data_dir(req) == WRITE) { - if ((req->cmd_flags & REQ_DISCARD)) { - WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM)); - nbd_cmd(req) = NBD_CMD_TRIM; - } else - nbd_cmd(req) = NBD_CMD_WRITE; - if (nbd->flags & NBD_FLAG_READ_ONLY) { - dev_err(disk_to_dev(nbd->disk), - "Write on read-only\n"); - goto error_out; - } - } - - if (req->cmd_flags & REQ_FLUSH) { - BUG_ON(unlikely(blk_rq_sectors(req))); - nbd_cmd(req) = NBD_CMD_FLUSH; + if (rq_data_dir(req) == WRITE && + (nbd->flags & NBD_FLAG_READ_ONLY)) { + dev_err(disk_to_dev(nbd->disk), + "Write on read-only\n"); + goto error_out; } req->errors = 0; @@ -483,6 +516,9 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) nbd->active_req = req; + if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head)) + mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); + if (nbd_send_req(nbd, req) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; @@ -504,10 +540,15 @@ error_out: nbd_end_request(nbd, req); } -static int nbd_thread(void *data) +static int nbd_thread_send(void *data) { struct nbd_device *nbd = data; struct request *req; + unsigned long flags; + + spin_lock_irqsave(&nbd->tasks_lock, flags); + nbd->task_send = current; + spin_unlock_irqrestore(&nbd->tasks_lock, flags); set_user_nice(current, MIN_NICE); while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { @@ -516,6 +557,17 @@ static int nbd_thread(void *data) kthread_should_stop() || !list_empty(&nbd->waiting_queue)); + if (signal_pending(current)) { + int ret = kernel_dequeue_signal(NULL); + + dev_warn(nbd_to_dev(nbd), "pid %d, %s, got signal %d\n", + task_pid_nr(current), current->comm, ret); + mutex_lock(&nbd->tx_lock); + sock_shutdown(nbd); + mutex_unlock(&nbd->tx_lock); + break; 
+ } + /* extract request */ if (list_empty(&nbd->waiting_queue)) continue; @@ -529,6 +581,15 @@ static int nbd_thread(void *data) /* handle request */ nbd_handle_req(nbd, req); } + + spin_lock_irqsave(&nbd->tasks_lock, flags); + nbd->task_send = NULL; + spin_unlock_irqrestore(&nbd->tasks_lock, flags); + + /* Clear maybe pending signals */ + if (signal_pending(current)) + kernel_dequeue_signal(NULL); + return 0; } @@ -539,7 +600,7 @@ static int nbd_thread(void *data) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ -static void do_nbd_request(struct request_queue *q) +static void nbd_request_handler(struct request_queue *q) __releases(q->queue_lock) __acquires(q->queue_lock) { struct request *req; @@ -575,6 +636,9 @@ static void do_nbd_request(struct request_queue *q) } } +static int nbd_dev_dbg_init(struct nbd_device *nbd); +static void nbd_dev_dbg_close(struct nbd_device *nbd); + /* Must be called with tx_lock held */ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, @@ -592,14 +656,13 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_SPECIAL; - nbd_cmd(&sreq) = NBD_CMD_DISC; + sreq.cmd_type = REQ_TYPE_DRV_PRIV; /* Check again after getting mutex back. */ if (!nbd->sock) return -EINVAL; - nbd->disconnect = 1; + nbd->disconnect = true; nbd_send_req(nbd, &sreq); return 0; @@ -627,7 +690,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd->sock = sock; if (max_part > 0) bdev->bd_invalidated = 1; - nbd->disconnect = 0; /* we're connected now */ + nbd->disconnect = false; /* we're connected now */ return 0; } return -EINVAL; @@ -650,6 +713,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_SET_TIMEOUT: nbd->xmit_timeout = arg * HZ; + if (arg) + mod_timer(&nbd->timeout_timer, + jiffies + nbd->xmit_timeout); + else + del_timer_sync(&nbd->timeout_timer); + return 0; case NBD_SET_FLAGS: @@ -668,7 +737,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, struct socket *sock; int error; - if (nbd->pid) + if (nbd->task_recv) return -EBUSY; if (!nbd->sock) return -EINVAL; @@ -685,24 +754,24 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, else blk_queue_flush(nbd->disk->queue, 0); - thread = kthread_run(nbd_thread, nbd, "%s", - nbd->disk->disk_name); + thread = kthread_run(nbd_thread_send, nbd, "%s", + nbd_name(nbd)); if (IS_ERR(thread)) { mutex_lock(&nbd->tx_lock); return PTR_ERR(thread); } - error = nbd_do_it(nbd); + nbd_dev_dbg_init(nbd); + error = nbd_thread_recv(nbd); + nbd_dev_dbg_close(nbd); kthread_stop(thread); mutex_lock(&nbd->tx_lock); - if (error) - return error; - sock_shutdown(nbd, 0); + + sock_shutdown(nbd); sock = nbd->sock; nbd->sock = NULL; nbd_clear_que(nbd); - dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); set_device_ro(bdev, false); @@ -713,10 +782,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, bdev->bd_inode->i_size = 0; set_capacity(nbd->disk, 0); if (max_part > 0) - ioctl_by_bdev(bdev, BLKRRPART, 0); + blkdev_reread_part(bdev); if (nbd->disconnect) /* user requested, ignore socket errors */ return 0; - return nbd->harderror; + return error; } case NBD_CLEAR_QUE: @@ -760,6 +829,161 @@ static const struct block_device_operations nbd_fops = .ioctl = nbd_ioctl, }; +#if 
IS_ENABLED(CONFIG_DEBUG_FS) + +static int nbd_dbg_tasks_show(struct seq_file *s, void *unused) +{ + struct nbd_device *nbd = s->private; + + if (nbd->task_recv) + seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv)); + if (nbd->task_send) + seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send)); + + return 0; +} + +static int nbd_dbg_tasks_open(struct inode *inode, struct file *file) +{ + return single_open(file, nbd_dbg_tasks_show, inode->i_private); +} + +static const struct file_operations nbd_dbg_tasks_ops = { + .open = nbd_dbg_tasks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int nbd_dbg_flags_show(struct seq_file *s, void *unused) +{ + struct nbd_device *nbd = s->private; + u32 flags = nbd->flags; + + seq_printf(s, "Hex: 0x%08x\n\n", flags); + + seq_puts(s, "Known flags:\n"); + + if (flags & NBD_FLAG_HAS_FLAGS) + seq_puts(s, "NBD_FLAG_HAS_FLAGS\n"); + if (flags & NBD_FLAG_READ_ONLY) + seq_puts(s, "NBD_FLAG_READ_ONLY\n"); + if (flags & NBD_FLAG_SEND_FLUSH) + seq_puts(s, "NBD_FLAG_SEND_FLUSH\n"); + if (flags & NBD_FLAG_SEND_TRIM) + seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); + + return 0; +} + +static int nbd_dbg_flags_open(struct inode *inode, struct file *file) +{ + return single_open(file, nbd_dbg_flags_show, inode->i_private); +} + +static const struct file_operations nbd_dbg_flags_ops = { + .open = nbd_dbg_flags_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int nbd_dev_dbg_init(struct nbd_device *nbd) +{ + struct dentry *dir; + struct dentry *f; + + dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir); + if (IS_ERR_OR_NULL(dir)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s' (%ld)\n", + nbd_name(nbd), PTR_ERR(dir)); + return PTR_ERR(dir); + } + nbd->dbg_dir = dir; + + f = debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'tasks', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'size_bytes', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'timeout', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'blocksize', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + f = debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); + if (IS_ERR_OR_NULL(f)) { + dev_err(nbd_to_dev(nbd), "Failed to create debugfs file 'flags', %ld\n", + PTR_ERR(f)); + return PTR_ERR(f); + } + + return 0; +} + +static void nbd_dev_dbg_close(struct nbd_device *nbd) +{ + debugfs_remove_recursive(nbd->dbg_dir); +} + +static int nbd_dbg_init(void) +{ + struct dentry *dbg_dir; + + dbg_dir = debugfs_create_dir("nbd", NULL); + if (IS_ERR(dbg_dir)) + return PTR_ERR(dbg_dir); + + nbd_dbg_dir = dbg_dir; + + return 0; +} + +static void nbd_dbg_close(void) +{ + debugfs_remove_recursive(nbd_dbg_dir); +} + +#else /* IS_ENABLED(CONFIG_DEBUG_FS) */ + +static int nbd_dev_dbg_init(struct nbd_device *nbd) +{ + return 0; +} + +static void nbd_dev_dbg_close(struct nbd_device *nbd) +{ +} + +static int nbd_dbg_init(void) +{ + return 0; +} + +static void 
nbd_dbg_close(void) +{ +} + +#endif + /* * And here should be modules and kernel interface * (Just smiley confuses emacs :-) @@ -813,7 +1037,7 @@ static int __init nbd_init(void) * every gendisk to have its very own request_queue struct. * These structs are big so we dynamically allocate them. */ - disk->queue = blk_init_queue(do_nbd_request, &nbd_lock); + disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock); if (!disk->queue) { put_disk(disk); goto out; @@ -824,7 +1048,7 @@ static int __init nbd_init(void) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue); disk->queue->limits.discard_granularity = 512; - disk->queue->limits.max_discard_sectors = UINT_MAX; + blk_queue_max_discard_sectors(disk->queue, UINT_MAX); disk->queue->limits.discard_zeroes_data = 0; blk_queue_max_hw_sectors(disk->queue, 65536); disk->queue->limits.max_sectors = 256; @@ -837,13 +1061,19 @@ static int __init nbd_init(void) printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR); + nbd_dbg_init(); + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = NBD_MAGIC; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); + spin_lock_init(&nbd_dev[i].tasks_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); mutex_init(&nbd_dev[i].tx_lock); + init_timer(&nbd_dev[i].timeout_timer); + nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; + nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; init_waitqueue_head(&nbd_dev[i].active_wq); init_waitqueue_head(&nbd_dev[i].waiting_wq); nbd_dev[i].blksize = 1024; @@ -870,6 +1100,9 @@ out: static void __exit nbd_cleanup(void) { int i; + + nbd_dbg_close(); + for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = 0; diff --git a/kernel/drivers/block/null_blk.c b/kernel/drivers/block/null_blk.c index 65cd61a41..09e3c0d87 100644 --- a/kernel/drivers/block/null_blk.c +++ b/kernel/drivers/block/null_blk.c @@ -8,6 +8,7 @@ #include #include #include +#include struct nullb_cmd { struct list_head list; @@ -17,6 +18,7 @@ struct nullb_cmd { struct bio *bio; unsigned int tag; struct nullb_queue *nq; + struct hrtimer timer; }; struct nullb_queue { @@ -39,23 +41,14 @@ struct nullb { struct nullb_queue *queues; unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; }; static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; static int nullb_indexes; - -struct completion_queue { - struct llist_head list; - struct hrtimer timer; -}; - -/* - * These are per-cpu for now, they will need to be configured by the - * complete_queues parameter and appropriately mapped. 
- */ -static DEFINE_PER_CPU(struct completion_queue, completion_queues); +static struct kmem_cache *ppa_cache; enum { NULL_IRQ_NONE = 0, @@ -99,7 +92,7 @@ static int null_set_queue_mode(const char *str, const struct kernel_param *kp) return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); } -static struct kernel_param_ops null_queue_mode_param_ops = { +static const struct kernel_param_ops null_queue_mode_param_ops = { .set = null_set_queue_mode, .get = param_get_int, }; @@ -119,6 +112,10 @@ static int nr_devices = 2; module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); +static bool use_lightnvm; +module_param(use_lightnvm, bool, S_IRUGO); +MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device"); + static int irqmode = NULL_IRQ_SOFTIRQ; static int null_set_irqmode(const char *str, const struct kernel_param *kp) @@ -127,7 +124,7 @@ static int null_set_irqmode(const char *str, const struct kernel_param *kp) NULL_IRQ_TIMER); } -static struct kernel_param_ops null_irqmode_param_ops = { +static const struct kernel_param_ops null_irqmode_param_ops = { .set = null_set_irqmode, .get = param_get_int, }; @@ -135,8 +132,8 @@ static struct kernel_param_ops null_irqmode_param_ops = { device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer"); -static int completion_nsec = 10000; -module_param(completion_nsec, int, S_IRUGO); +static unsigned long completion_nsec = 10000; +module_param(completion_nsec, ulong, S_IRUGO); MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns"); static int hw_queue_depth = 64; @@ -173,6 +170,8 @@ static void free_cmd(struct nullb_cmd *cmd) put_tag(cmd->nq, cmd->tag); } +static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); + static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) { struct nullb_cmd *cmd; @@ -183,6 +182,11 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) cmd = &nq->cmds[tag]; cmd->tag = tag; cmd->nq = nq; + if (irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } return cmd; } @@ -213,6 +217,11 @@ static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) static void end_cmd(struct nullb_cmd *cmd) { + struct request_queue *q = NULL; + + if (cmd->rq) + q = cmd->rq->q; + switch (queue_mode) { case NULL_Q_MQ: blk_mq_end_request(cmd->rq, 0); @@ -222,45 +231,34 @@ static void end_cmd(struct nullb_cmd *cmd) blk_end_request_all(cmd->rq, 0); break; case NULL_Q_BIO: - bio_endio(cmd->bio, 0); + bio_endio(cmd->bio); break; } free_cmd(cmd); + + /* Restart queue if needed, as we are freeing a tag */ + if (queue_mode == NULL_Q_RQ && blk_queue_stopped(q)) { + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + blk_start_queue_async(q); + spin_unlock_irqrestore(q->queue_lock, flags); + } } static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) { - struct completion_queue *cq; - struct llist_node *entry; - struct nullb_cmd *cmd; - - cq = &per_cpu(completion_queues, smp_processor_id()); - - while ((entry = llist_del_all(&cq->list)) != NULL) { - entry = llist_reverse_order(entry); - do { - cmd = container_of(entry, struct nullb_cmd, ll_list); - entry = entry->next; - end_cmd(cmd); - } while (entry); - } + end_cmd(container_of(timer, struct nullb_cmd, timer)); return HRTIMER_NORESTART; } static 
void null_cmd_end_timer(struct nullb_cmd *cmd) { - struct completion_queue *cq = &per_cpu(completion_queues, get_cpu()); - - cmd->ll_list.next = NULL; - if (llist_add(&cmd->ll_list, &cq->list)) { - ktime_t kt = ktime_set(0, completion_nsec); + ktime_t kt = ktime_set(0, completion_nsec); - hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL); - } - - put_cpu(); + hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL); } static void null_softirq_done_fn(struct request *rq) @@ -278,7 +276,7 @@ static inline void null_handle_cmd(struct nullb_cmd *cmd) case NULL_IRQ_SOFTIRQ: switch (queue_mode) { case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq); + blk_mq_complete_request(cmd->rq, cmd->rq->errors); break; case NULL_Q_RQ: blk_complete_request(cmd->rq); @@ -310,7 +308,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb) return &nullb->queues[index]; } -static void null_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) { struct nullb *nullb = q->queuedata; struct nullb_queue *nq = nullb_to_queue(nullb); @@ -320,6 +318,7 @@ static void null_queue_bio(struct request_queue *q, struct bio *bio) cmd->bio = bio; null_handle_cmd(cmd); + return BLK_QC_T_NONE; } static int null_rq_prep_fn(struct request_queue *q, struct request *req) @@ -334,6 +333,7 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req) req->special = cmd; return BLKPREP_OK; } + blk_stop_queue(q); return BLKPREP_DEFER; } @@ -356,6 +356,10 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, { struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + if (irqmode == NULL_IRQ_TIMER) { + hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cmd->timer.function = null_cmd_timer_expired; + } cmd->rq = bd->rq; cmd->nq = hctx->driver_data; @@ -394,18 +398,177 @@ static struct blk_mq_ops null_mq_ops = { .complete = null_softirq_done_fn, }; +static void cleanup_queue(struct nullb_queue *nq) +{ + kfree(nq->tag_map); + kfree(nq->cmds); +} + +static void cleanup_queues(struct nullb *nullb) +{ + int i; + + for (i = 0; i < nullb->nr_queues; i++) + cleanup_queue(&nullb->queues[i]); + + kfree(nullb->queues); +} + static void null_del_dev(struct nullb *nullb) { list_del_init(&nullb->list); - del_gendisk(nullb->disk); + if (use_lightnvm) + nvm_unregister(nullb->disk_name); + else + del_gendisk(nullb->disk); blk_cleanup_queue(nullb->q); if (queue_mode == NULL_Q_MQ) blk_mq_free_tag_set(&nullb->tag_set); - put_disk(nullb->disk); + if (!use_lightnvm) + put_disk(nullb->disk); + cleanup_queues(nullb); kfree(nullb); } +#ifdef CONFIG_NVM + +static void null_lnvm_end_io(struct request *rq, int error) +{ + struct nvm_rq *rqd = rq->end_io_data; + struct nvm_dev *dev = rqd->dev; + + dev->mt->end_io(rqd, error); + + blk_put_request(rq); +} + +static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct request *rq; + struct bio *bio = rqd->bio; + + rq = blk_mq_alloc_request(q, bio_rw(bio), GFP_KERNEL, 0); + if (IS_ERR(rq)) + return -ENOMEM; + + rq->cmd_type = REQ_TYPE_DRV_PRIV; + rq->__sector = bio->bi_iter.bi_sector; + rq->ioprio = bio_prio(bio); + + if (bio_has_data(bio)) + rq->nr_phys_segments = bio_phys_segments(q, bio); + + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + + rq->end_io_data = rqd; + + blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io); + + return 0; +} + +static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) +{ + sector_t size = gb * 1024 * 1024 * 
1024ULL; + sector_t blksize; + struct nvm_id_group *grp; + + id->ver_id = 0x1; + id->vmnt = 0; + id->cgrps = 1; + id->cap = 0x3; + id->dom = 0x1; + + id->ppaf.blk_offset = 0; + id->ppaf.blk_len = 16; + id->ppaf.pg_offset = 16; + id->ppaf.pg_len = 16; + id->ppaf.sect_offset = 32; + id->ppaf.sect_len = 8; + id->ppaf.pln_offset = 40; + id->ppaf.pln_len = 8; + id->ppaf.lun_offset = 48; + id->ppaf.lun_len = 8; + id->ppaf.ch_offset = 56; + id->ppaf.ch_len = 8; + + do_div(size, bs); /* convert size to pages */ + do_div(size, 256); /* concert size to pgs pr blk */ + grp = &id->groups[0]; + grp->mtype = 0; + grp->fmtype = 0; + grp->num_ch = 1; + grp->num_pg = 256; + blksize = size; + do_div(size, (1 << 16)); + grp->num_lun = size + 1; + do_div(blksize, grp->num_lun); + grp->num_blk = blksize; + grp->num_pln = 1; + + grp->fpg_sz = bs; + grp->csecs = bs; + grp->trdt = 25000; + grp->trdm = 25000; + grp->tprt = 500000; + grp->tprm = 500000; + grp->tbet = 1500000; + grp->tbem = 1500000; + grp->mpos = 0x010101; /* single plane rwe */ + grp->cpar = hw_queue_depth; + + return 0; +} + +static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) +{ + mempool_t *virtmem_pool; + + virtmem_pool = mempool_create_slab_pool(64, ppa_cache); + if (!virtmem_pool) { + pr_err("null_blk: Unable to create virtual memory pool\n"); + return NULL; + } + + return virtmem_pool; +} + +static void null_lnvm_destroy_dma_pool(void *pool) +{ + mempool_destroy(pool); +} + +static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, + gfp_t mem_flags, dma_addr_t *dma_handler) +{ + return mempool_alloc(pool, mem_flags); +} + +static void null_lnvm_dev_dma_free(void *pool, void *entry, + dma_addr_t dma_handler) +{ + mempool_free(entry, pool); +} + +static struct nvm_dev_ops null_lnvm_dev_ops = { + .identity = null_lnvm_id, + .submit_io = null_lnvm_submit_io, + + .create_dma_pool = null_lnvm_create_dma_pool, + .destroy_dma_pool = null_lnvm_destroy_dma_pool, + .dev_dma_alloc = null_lnvm_dev_dma_alloc, + .dev_dma_free = null_lnvm_dev_dma_free, + + /* Simulate nvme protocol restriction */ + .max_phys_sect = 64, +}; +#else +static struct nvm_dev_ops null_lnvm_dev_ops; +#endif /* CONFIG_NVM */ + static int null_open(struct block_device *bdev, fmode_t mode) { return 0; @@ -447,22 +610,6 @@ static int setup_commands(struct nullb_queue *nq) return 0; } -static void cleanup_queue(struct nullb_queue *nq) -{ - kfree(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - static int setup_queues(struct nullb *nullb) { nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue), @@ -561,11 +708,6 @@ static int null_add_dev(void) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q); - disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { - rv = -ENOMEM; - goto out_cleanup_blk_queue; - } mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -575,9 +717,23 @@ static int null_add_dev(void) blk_queue_logical_block_size(nullb->q, bs); blk_queue_physical_block_size(nullb->q, bs); + sprintf(nullb->disk_name, "nullb%d", nullb->index); + + if (use_lightnvm) { + rv = nvm_register(nullb->q, nullb->disk_name, + &null_lnvm_dev_ops); + if (rv) + goto out_cleanup_blk_queue; + goto done; + } + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) { + rv = -ENOMEM; + goto out_cleanup_lightnvm; 
+ } size = gb * 1024 * 1024 * 1024ULL; - sector_div(size, bs); - set_capacity(disk, size); + set_capacity(disk, size >> 9); disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; @@ -585,10 +741,15 @@ static int null_add_dev(void) disk->fops = &null_fops; disk->private_data = nullb; disk->queue = nullb->q; - sprintf(disk->disk_name, "nullb%d", nullb->index); + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + add_disk(disk); +done: return 0; +out_cleanup_lightnvm: + if (use_lightnvm) + nvm_unregister(nullb->disk_name); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: @@ -604,7 +765,9 @@ out: static int __init null_init(void) { + int ret = 0; unsigned int i; + struct nullb *nullb; if (bs > PAGE_SIZE) { pr_warn("null_blk: invalid block size\n"); @@ -612,6 +775,18 @@ static int __init null_init(void) bs = PAGE_SIZE; } + if (use_lightnvm && bs != 4096) { + pr_warn("null_blk: LightNVM only supports 4k block size\n"); + pr_warn("null_blk: defaults block size to 4k\n"); + bs = 4096; + } + + if (use_lightnvm && queue_mode != NULL_Q_MQ) { + pr_warn("null_blk: LightNVM only supported for blk-mq\n"); + pr_warn("null_blk: defaults queue mode to blk-mq\n"); + queue_mode = NULL_Q_MQ; + } + if (queue_mode == NULL_Q_MQ && use_per_node_hctx) { if (submit_queues < nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.", @@ -625,32 +800,38 @@ static int __init null_init(void) mutex_init(&lock); - /* Initialize a separate list for each CPU for issuing softirqs */ - for_each_possible_cpu(i) { - struct completion_queue *cq = &per_cpu(completion_queues, i); - - init_llist_head(&cq->list); - - if (irqmode != NULL_IRQ_TIMER) - continue; - - hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cq->timer.function = null_cmd_timer_expired; - } - null_major = register_blkdev(0, "nullb"); if (null_major < 0) return null_major; - for (i = 0; i < nr_devices; i++) { - if (null_add_dev()) { - unregister_blkdev(null_major, "nullb"); - return -EINVAL; + if (use_lightnvm) { + ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64), + 0, 0, NULL); + if (!ppa_cache) { + pr_err("null_blk: unable to create ppa cache\n"); + ret = -ENOMEM; + goto err_ppa; } } + for (i = 0; i < nr_devices; i++) { + ret = null_add_dev(); + if (ret) + goto err_dev; + } + pr_info("null: module loaded\n"); return 0; + +err_dev: + while (!list_empty(&nullb_list)) { + nullb = list_entry(nullb_list.next, struct nullb, list); + null_del_dev(nullb); + } + kmem_cache_destroy(ppa_cache); +err_ppa: + unregister_blkdev(null_major, "nullb"); + return ret; } static void __exit null_exit(void) @@ -665,6 +846,8 @@ static void __exit null_exit(void) null_del_dev(nullb); } mutex_unlock(&lock); + + kmem_cache_destroy(ppa_cache); } module_init(null_init); diff --git a/kernel/drivers/block/nvme-core.c b/kernel/drivers/block/nvme-core.c deleted file mode 100644 index 683dff272..000000000 --- a/kernel/drivers/block/nvme-core.c +++ /dev/null @@ -1,3178 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NVME_MINORS (1U << MINORBITS) -#define NVME_Q_DEPTH 1024 -#define NVME_AQ_DEPTH 256 -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (admin_timeout * HZ) -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) - -static unsigned char admin_timeout = 60; -module_param(admin_timeout, byte, 0644); -MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); - -unsigned char nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, byte, 0644); -MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); - -static unsigned char shutdown_timeout = 5; -module_param(shutdown_timeout, byte, 0644); -MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); - -static int nvme_major; -module_param(nvme_major, int, 0); - -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - -static int use_threaded_interrupts; -module_param(use_threaded_interrupts, int, 0); - -static DEFINE_SPINLOCK(dev_list_lock); -static LIST_HEAD(dev_list); -static struct task_struct *nvme_thread; -static struct workqueue_struct *nvme_workq; -static wait_queue_head_t nvme_kthread_wait; - -static struct class *nvme_class; - -static void nvme_reset_failed_dev(struct work_struct *ws); -static int nvme_process_cq(struct nvme_queue *nvmeq); - -struct async_cmd_info { - struct kthread_work work; - struct kthread_worker *worker; - struct request *req; - u32 result; - int status; - void *ctx; -}; - -/* - * An NVM Express queue. Each device has at least two (one for admin - * commands and one for I/O commands). 
- */ -struct nvme_queue { - struct device *q_dmadev; - struct nvme_dev *dev; - char irqname[24]; /* nvme4294967295-65535\0 */ - spinlock_t q_lock; - struct nvme_command *sq_cmds; - volatile struct nvme_completion *cqes; - dma_addr_t sq_dma_addr; - dma_addr_t cq_dma_addr; - u32 __iomem *q_db; - u16 q_depth; - s16 cq_vector; - u16 sq_head; - u16 sq_tail; - u16 cq_head; - u16 qid; - u8 cq_phase; - u8 cqe_seen; - struct async_cmd_info cmdinfo; - struct blk_mq_hw_ctx *hctx; -}; - -/* - * Check we didin't inadvertently grow the command struct - */ -static inline void _nvme_check_size(void) -{ - BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); - BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); - BUILD_BUG_ON(sizeof(struct nvme_features) != 64); - BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); - BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64); - BUILD_BUG_ON(sizeof(struct nvme_command) != 64); - BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); - BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64); - BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); -} - -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - -struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; - int aborted; - struct nvme_queue *nvmeq; - struct nvme_iod iod[0]; -}; - -/* - * Max size of iod being embedded in the request payload - */ -#define NVME_INT_PAGES 2 -#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) -#define NVME_INT_MASK 0x01 - -/* - * Will slightly overestimate the number of pages needed. This is OK - * as it only leads to a small amount of wasted memory for the lifetime of - * the I/O. - */ -static int nvme_npages(unsigned size, struct nvme_dev *dev) -{ - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); -} - -static unsigned int nvme_cmd_size(struct nvme_dev *dev) -{ - unsigned int ret = sizeof(struct nvme_cmd_info); - - ret += sizeof(struct nvme_iod); - ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); - ret += sizeof(struct scatterlist) * NVME_INT_PAGES; - - return ret; -} - -static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) -{ - struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[0]; - - WARN_ON(nvmeq->hctx); - nvmeq->hctx = hctx; - hctx->driver_data = nvmeq; - return 0; -} - -static int nvme_admin_init_request(void *data, struct request *req, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) -{ - struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[0]; - - BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; - return 0; -} - -static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - - nvmeq->hctx = NULL; -} - -static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int hctx_idx) -{ - struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[ - (hctx_idx % dev->queue_count) + 1]; - - if (!nvmeq->hctx) - nvmeq->hctx = hctx; - - /* nvmeq queues are shared between namespaces. We assume here that - * blk-mq map the tags so they match up with the nvme queue tags. 
*/ - WARN_ON(nvmeq->hctx->tags != hctx->tags); - - hctx->driver_data = nvmeq; - return 0; -} - -static int nvme_init_request(void *data, struct request *req, - unsigned int hctx_idx, unsigned int rq_idx, - unsigned int numa_node) -{ - struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; - - BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; - return 0; -} - -static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, - nvme_completion_fn handler) -{ - cmd->fn = handler; - cmd->ctx = ctx; - cmd->aborted = 0; - blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); -} - -static void *iod_get_private(struct nvme_iod *iod) -{ - return (void *) (iod->private & ~0x1UL); -} - -/* - * If bit 0 is set, the iod is embedded in the request payload. - */ -static bool iod_should_kfree(struct nvme_iod *iod) -{ - return (iod->private & NVME_INT_MASK) == 0; -} - -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) - -static void special_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); -} - -static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) -{ - void *ctx; - - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_CANCELLED; - return ctx; -} - -static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; - if (status == NVME_SC_SUCCESS) - dev_warn(nvmeq->q_dmadev, - "async event result %08x\n", result); -} - -static void abort_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct request *req = ctx; - - u16 status = le16_to_cpup(&cqe->status) >> 1; - u32 result = le32_to_cpup(&cqe->result); - - blk_mq_free_hctx_request(nvmeq->hctx, req); - - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - ++nvmeq->dev->abort_limit; -} - -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); -} - -static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, - unsigned int tag) -{ - struct blk_mq_hw_ctx *hctx = nvmeq->hctx; - struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); - - return blk_mq_rq_to_pdu(req); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, - nvme_completion_fn *fn) -{ - struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); - void *ctx; - if (tag >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_COMPLETED; - return ctx; -} - -/** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell - * @nvmeq: The queue to use - * @cmd: The command to send - * - * Safe to use from interrupt context - */ -static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - u16 tail = nvmeq->sq_tail; - - memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); - if (++tail == nvmeq->q_depth) - tail = 0; - writel(tail, nvmeq->q_db); - nvmeq->sq_tail = tail; - - return 0; -} - -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - int ret; - spin_lock_irqsave(&nvmeq->q_lock, flags); - ret = __nvme_submit_cmd(nvmeq, cmd); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return ret; -} - -static __le64 **iod_list(struct nvme_iod *iod) -{ - return ((void *)iod) + iod->offset; -} - -static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, - unsigned nseg, unsigned long private) -{ - iod->private = private; - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; -} - -static struct nvme_iod * -__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, - unsigned long priv, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(bytes, dev) + - sizeof(struct scatterlist) * nseg, gfp); - - if (iod) - iod_init(iod, bytes, nseg, priv); - - return iod; -} - -static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, - gfp_t gfp) -{ - unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : - sizeof(struct nvme_dsm_range); - struct nvme_iod *iod; - - if (rq->nr_phys_segments <= NVME_INT_PAGES && - size <= NVME_INT_BYTES(dev)) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); - - iod = cmd->iod; - iod_init(iod, size, rq->nr_phys_segments, - (unsigned long) rq | NVME_INT_MASK); - return iod; - } - - return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, - (unsigned long) rq, gfp); -} - -void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) -{ - const int last_prp = dev->page_size / 8 - 1; - int i; - __le64 **list = iod_list(iod); - dma_addr_t prp_dma = iod->first_dma; - - if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, list[0], prp_dma); - for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; - } - - if (iod_should_kfree(iod)) - kfree(iod); -} - -static int nvme_error_status(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_SUCCESS: - return 0; - case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - } -} - -#ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) -{ - if (be32_to_cpu(pi->ref_tag) == v) - pi->ref_tag = cpu_to_be32(p); -} - -static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) -{ - if (be32_to_cpu(pi->ref_tag) == p) - pi->ref_tag = cpu_to_be32(v); -} - -/** - * nvme_dif_remap - remaps ref tags to bip seed and physical lba - * - * The virtual start sector is the one that was originally submitted by the - * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical - * start sector may be different. Remap protection information to match the - * physical LBA on writes, and back to the original seed on reads. - * - * Type 0 and 3 do not have a ref tag, so no remapping required. 
- */ -static void nvme_dif_remap(struct request *req, - void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) -{ - struct nvme_ns *ns = req->rq_disk->private_data; - struct bio_integrity_payload *bip; - struct t10_pi_tuple *pi; - void *p, *pmap; - u32 i, nlb, ts, phys, virt; - - if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3) - return; - - bip = bio_integrity(req->bio); - if (!bip) - return; - - pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - - p = pmap; - virt = bip_get_seed(bip); - phys = nvme_block_nr(ns, blk_rq_pos(req)); - nlb = (blk_rq_bytes(req) >> ns->lba_shift); - ts = ns->disk->integrity->tuple_size; - - for (i = 0; i < nlb; i++, virt++, phys++) { - pi = (struct t10_pi_tuple *)p; - dif_swap(phys, virt, pi); - p += ts; - } - kunmap_atomic(pmap); -} - -static int nvme_noop_verify(struct blk_integrity_iter *iter) -{ - return 0; -} - -static int nvme_noop_generate(struct blk_integrity_iter *iter) -{ - return 0; -} - -struct blk_integrity nvme_meta_noop = { - .name = "NVME_META_NOOP", - .generate_fn = nvme_noop_generate, - .verify_fn = nvme_noop_verify, -}; - -static void nvme_init_integrity(struct nvme_ns *ns) -{ - struct blk_integrity integrity; - - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - integrity = t10_pi_type3_crc; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - integrity = t10_pi_type1_crc; - break; - default: - integrity = nvme_meta_noop; - break; - } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); -} -#else /* CONFIG_BLK_DEV_INTEGRITY */ -static void nvme_dif_remap(struct request *req, - void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) -{ -} -static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) -{ -} -static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) -{ -} -static void nvme_init_integrity(struct nvme_ns *ns) -{ -} -#endif - -static void req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct nvme_iod *iod = ctx; - struct request *req = iod_get_private(iod); - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - - u16 status = le16_to_cpup(&cqe->status) >> 1; - - if (unlikely(status)) { - if (!(status & NVME_SC_DNR || blk_noretry_request(req)) - && (jiffies - req->start_time) < req->timeout) { - unsigned long flags; - - blk_mq_requeue_request(req); - spin_lock_irqsave(req->q->queue_lock, flags); - if (!blk_queue_stopped(req->q)) - blk_mq_kick_requeue_list(req->q); - spin_unlock_irqrestore(req->q->queue_lock, flags); - return; - } - req->errors = nvme_error_status(status); - } else - req->errors = 0; - - if (cmd_rq->aborted) - dev_warn(&nvmeq->dev->pci_dev->dev, - "completing aborted command with status:%04x\n", - status); - - if (iod->nents) { - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - } - } - nvme_free_iod(nvmeq->dev, iod); - - blk_mq_complete_request(req); -} - -/* length is in bytes. gfp flags indicates whether we may sleep. 
*/ -int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, - gfp_t gfp) -{ - struct dma_pool *pool; - int length = total_len; - struct scatterlist *sg = iod->sg; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->page_size; - int offset = dma_addr & (page_size - 1); - __le64 *prp_list; - __le64 **list = iod_list(iod); - dma_addr_t prp_dma; - int nprps, i; - - length -= (page_size - offset); - if (length <= 0) - return total_len; - - dma_len -= (page_size - offset); - if (dma_len) { - dma_addr += (page_size - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - if (length <= page_size) { - iod->first_dma = dma_addr; - return total_len; - } - - nprps = DIV_ROUND_UP(length, page_size); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->npages = 0; - } else { - pool = dev->prp_page_pool; - iod->npages = 1; - } - - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) { - iod->first_dma = dma_addr; - iod->npages = -1; - return (total_len - length) + page_size; - } - list[0] = prp_list; - iod->first_dma = prp_dma; - i = 0; - for (;;) { - if (i == page_size >> 3) { - __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); - if (!prp_list) - return total_len - length; - list[iod->npages++] = prp_list; - prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); - i = 1; - } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= page_size; - dma_addr += page_size; - length -= page_size; - if (length <= 0) - break; - if (dma_len > 0) - continue; - BUG_ON(dma_len < 0); - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - return total_len; -} - -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. 
- */ -static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_iod *iod) -{ - struct nvme_dsm_range *range = - (struct nvme_dsm_range *)iod_list(iod)[0]; - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = req->tag; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd->dsm.nr = 0; - cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); -} - -static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) -{ - struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - - memset(cmnd, 0, sizeof(*cmnd)); - cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.command_id = cmdid; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); -} - -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct nvme_ns *ns) -{ - struct request *req = iod_get_private(iod); - struct nvme_command *cmnd; - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(cmnd, 0, sizeof(*cmnd)); - - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.command_id = req->tag; - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); - cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (blk_integrity_rq(req)) { - cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg)); - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd->rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; - } - } else if (ns->ms) - control |= NVME_RW_PRINFO_PRACT; - - cmnd->rw.control = cpu_to_le16(control); - cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - -static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - struct nvme_ns *ns = hctx->queue->queuedata; - struct nvme_queue *nvmeq = hctx->driver_data; - struct request *req = bd->rq; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_iod *iod; - enum dma_data_direction dma_dir; - - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. 
- */ - if (ns->ms && !blk_integrity_rq(req)) { - if (!(ns->pi_type && ns->ms == 8)) { - req->errors = -EFAULT; - blk_mq_complete_request(req); - return BLK_MQ_RQ_QUEUE_OK; - } - } - - iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); - if (!iod) - return BLK_MQ_RQ_QUEUE_BUSY; - - if (req->cmd_flags & REQ_DISCARD) { - void *range; - /* - * We reuse the small pool to allocate the 16-byte range here - * as it is not worth having a special pool for these or - * additional cases to handle freeing the iod. - */ - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, - GFP_ATOMIC, - &iod->first_dma); - if (!range) - goto retry_cmd; - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - } else if (req->nr_phys_segments) { - dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; - - sg_init_table(iod->sg, req->nr_phys_segments); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) - goto error_cmd; - - if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) - goto retry_cmd; - - if (blk_rq_bytes(req) != - nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, - iod->nents, dma_dir); - goto retry_cmd; - } - if (blk_integrity_rq(req)) { - if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) - goto error_cmd; - - sg_init_table(iod->meta_sg, 1); - if (blk_rq_map_integrity_sg( - req->q, req->bio, iod->meta_sg) != 1) - goto error_cmd; - - if (rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_prep); - - if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) - goto error_cmd; - } - } - - nvme_set_info(cmd, iod, req_completion); - spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_flags & REQ_DISCARD) - nvme_submit_discard(nvmeq, ns, req, iod); - else if (req->cmd_flags & REQ_FLUSH) - nvme_submit_flush(nvmeq, ns, req->tag); - else - nvme_submit_iod(nvmeq, iod, ns); - - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; - - error_cmd: - nvme_free_iod(nvmeq->dev, iod); - return BLK_MQ_RQ_QUEUE_ERROR; - retry_cmd: - nvme_free_iod(nvmeq->dev, iod); - return BLK_MQ_RQ_QUEUE_BUSY; -} - -static int nvme_process_cq(struct nvme_queue *nvmeq) -{ - u16 head, phase; - - head = nvmeq->cq_head; - phase = nvmeq->cq_phase; - - for (;;) { - void *ctx; - nvme_completion_fn fn; - struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) - break; - nvmeq->sq_head = le16_to_cpu(cqe.sq_head); - if (++head == nvmeq->q_depth) { - head = 0; - phase = !phase; - } - ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); - } - - /* If the controller ignores the cq head doorbell and continuously - * writes to the queue, it is theoretically possible to wrap around - * the queue twice and mistakenly return IRQ_NONE. Linux only - * requires that 0.1% of your interrupts are handled, so this isn't - * a big problem. - */ - if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) - return 0; - - writel(head, nvmeq->q_db + nvmeq->dev->db_stride); - nvmeq->cq_head = head; - nvmeq->cq_phase = phase; - - nvmeq->cqe_seen = 1; - return 1; -} - -/* Admin queue isn't initialized as a request queue. 
If at some point this - * happens anyway, make sure to notify the user */ -static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; -} - -static irqreturn_t nvme_irq(int irq, void *data) -{ - irqreturn_t result; - struct nvme_queue *nvmeq = data; - spin_lock(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE; - nvmeq->cqe_seen = 0; - spin_unlock(&nvmeq->q_lock); - return result; -} - -static irqreturn_t nvme_irq_check(int irq, void *data) -{ - struct nvme_queue *nvmeq = data; - struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head]; - if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase) - return IRQ_NONE; - return IRQ_WAKE_THREAD; -} - -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - -/* - * Returns 0 on success. If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, - u32 *result, unsigned timeout) -{ - struct sync_cmd_info cmdinfo; - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; - - cmdinfo.task = current; - cmdinfo.status = -EINTR; - - cmd->common.command_id = req->tag; - - nvme_set_info(cmd_rq, &cmdinfo, sync_completion); - - set_current_state(TASK_UNINTERRUPTIBLE); - nvme_submit_cmd(nvmeq, cmd); - schedule(); - - if (result) - *result = cmdinfo.result; - return cmdinfo.status; -} - -static int nvme_submit_async_admin_req(struct nvme_dev *dev) -{ - struct nvme_queue *nvmeq = dev->queues[0]; - struct nvme_command c; - struct nvme_cmd_info *cmd_info; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_flags |= REQ_NO_TIMEOUT; - cmd_info = blk_mq_rq_to_pdu(req); - nvme_set_info(cmd_info, NULL, async_req_completion); - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_async_event; - c.common.command_id = req->tag; - - blk_mq_free_hctx_request(nvmeq->hctx, req); - return __nvme_submit_cmd(nvmeq, &c); -} - -static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, - struct nvme_command *cmd, - struct async_cmd_info *cmdinfo, unsigned timeout) -{ - struct nvme_queue *nvmeq = dev->queues[0]; - struct request *req; - struct nvme_cmd_info *cmd_rq; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout = timeout; - cmd_rq = blk_mq_rq_to_pdu(req); - cmdinfo->req = req; - nvme_set_info(cmd_rq, cmdinfo, async_completion); - cmdinfo->status = -EINTR; - - cmd->common.command_id = req->tag; - - return nvme_submit_cmd(nvmeq, cmd); -} - -static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result, unsigned timeout) -{ - int res; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (IS_ERR(req)) - return PTR_ERR(req); - res = nvme_submit_sync_cmd(req, cmd, result, timeout); - blk_mq_free_request(req); - return res; -} - -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 
*result) -{ - return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); -} - -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_command *cmd, u32 *result) -{ - int res; - struct request *req; - - req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), - false); - if (IS_ERR(req)) - return PTR_ERR(req); - res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); - blk_mq_free_request(req); - return res; -} - -static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(id); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - -static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; - - memset(&c, 0, sizeof(c)); - c.create_cq.opcode = nvme_admin_create_cq; - c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); - c.create_cq.cqid = cpu_to_le16(qid); - c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_cq.cq_flags = cpu_to_le16(flags); - c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - -static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, - struct nvme_queue *nvmeq) -{ - struct nvme_command c; - int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; - - memset(&c, 0, sizeof(c)); - c.create_sq.opcode = nvme_admin_create_sq; - c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); - c.create_sq.sqid = cpu_to_le16(qid); - c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1); - c.create_sq.sq_flags = cpu_to_le16(flags); - c.create_sq.cqid = cpu_to_le16(qid); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - -static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid); -} - -static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) -{ - return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); -} - -int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, - dma_addr_t dma_addr) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.prp1 = cpu_to_le64(dma_addr); - c.identify.cns = cpu_to_le32(cns); - - return nvme_submit_admin_cmd(dev, &c, NULL); -} - -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return nvme_submit_admin_cmd(dev, &c, result); -} - -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return nvme_submit_admin_cmd(dev, &c, result); -} - -/** - * nvme_abort_req - Attempt aborting a request - * - * Schedule controller reset if the command was already aborted once before and - * still hasn't been returned to the driver, or if this is the admin queue. 
- */ -static void nvme_abort_req(struct request *req) -{ - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; - struct nvme_dev *dev = nvmeq->dev; - struct request *abort_req; - struct nvme_cmd_info *abort_cmd; - struct nvme_command cmd; - - if (!nvmeq->qid || cmd_rq->aborted) { - unsigned long flags; - - spin_lock_irqsave(&dev_list_lock, flags); - if (work_busy(&dev->reset_work)) - goto out; - list_del_init(&dev->node); - dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); - dev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &dev->reset_work); - out: - spin_unlock_irqrestore(&dev_list_lock, flags); - return; - } - - if (!dev->abort_limit) - return; - - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, - false); - if (IS_ERR(abort_req)) - return; - - abort_cmd = blk_mq_rq_to_pdu(abort_req); - nvme_set_info(abort_cmd, abort_req, abort_completion); - - memset(&cmd, 0, sizeof(cmd)); - cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = req->tag; - cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = abort_req->tag; - - --dev->abort_limit; - cmd_rq->aborted = 1; - - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, - nvmeq->qid); - if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { - dev_warn(nvmeq->q_dmadev, - "Could not abort I/O %d QID %d", - req->tag, nvmeq->qid); - blk_mq_free_request(abort_req); - } -} - -static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, - struct request *req, void *data, bool reserved) -{ - struct nvme_queue *nvmeq = data; - void *ctx; - nvme_completion_fn fn; - struct nvme_cmd_info *cmd; - struct nvme_completion cqe; - - if (!blk_mq_request_started(req)) - return; - - cmd = blk_mq_rq_to_pdu(req); - - if (cmd->ctx == CMD_CTX_CANCELLED) - return; - - if (blk_queue_dying(req->q)) - cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); - else - cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - - - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", - req->tag, nvmeq->qid); - ctx = cancel_cmd_info(cmd, &fn); - fn(nvmeq, ctx, &cqe); -} - -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) -{ - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd->nvmeq; - - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, - nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); - nvme_abort_req(req); - spin_unlock_irq(&nvmeq->q_lock); - - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. 
- */ - return BLK_EH_RESET_TIMER; -} - -static void nvme_free_queue(struct nvme_queue *nvmeq) -{ - dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), - (void *)nvmeq->cqes, nvmeq->cq_dma_addr); - dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), - nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); -} - -static void nvme_free_queues(struct nvme_dev *dev, int lowest) -{ - int i; - - for (i = dev->queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - dev->queue_count--; - dev->queues[i] = NULL; - nvme_free_queue(nvmeq); - } -} - -/** - * nvme_suspend_queue - put queue into suspended state - * @nvmeq - queue to suspend - */ -static int nvme_suspend_queue(struct nvme_queue *nvmeq) -{ - int vector; - - spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->cq_vector == -1) { - spin_unlock_irq(&nvmeq->q_lock); - return 1; - } - vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; - nvmeq->dev->online_queues--; - nvmeq->cq_vector = -1; - spin_unlock_irq(&nvmeq->q_lock); - - if (!nvmeq->qid && nvmeq->dev->admin_q) - blk_mq_freeze_queue_start(nvmeq->dev->admin_q); - - irq_set_affinity_hint(vector, NULL); - free_irq(vector, nvmeq); - - return 0; -} - -static void nvme_clear_queue(struct nvme_queue *nvmeq) -{ - struct blk_mq_hw_ctx *hctx = nvmeq->hctx; - - spin_lock_irq(&nvmeq->q_lock); - if (hctx && hctx->tags) - blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - -static void nvme_disable_queue(struct nvme_dev *dev, int qid) -{ - struct nvme_queue *nvmeq = dev->queues[qid]; - - if (!nvmeq) - return; - if (nvme_suspend_queue(nvmeq)) - return; - - /* Don't tell the adapter to delete the admin queue. - * Don't tell a removed adapter to delete IO queues. */ - if (qid && readl(&dev->bar->csts) != -1) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } - - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth) -{ - struct device *dmadev = &dev->pci_dev->dev; - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); - if (!nvmeq) - return NULL; - - nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); - if (!nvmeq->cqes) - goto free_nvmeq; - - nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) - goto free_cqdma; - - nvmeq->q_dmadev = dmadev; - nvmeq->dev = dev; - snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", - dev->instance, qid); - spin_lock_init(&nvmeq->q_lock); - nvmeq->cq_head = 0; - nvmeq->cq_phase = 1; - nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - nvmeq->q_depth = depth; - nvmeq->qid = qid; - dev->queue_count++; - dev->queues[qid] = nvmeq; - - return nvmeq; - - free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); - free_nvmeq: - kfree(nvmeq); - return NULL; -} - -static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, - const char *name) -{ - if (use_threaded_interrupts) - return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector, - nvme_irq_check, nvme_irq, IRQF_SHARED, - name, nvmeq); - return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq, - IRQF_SHARED, name, nvmeq); -} - -static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) -{ - struct nvme_dev *dev = nvmeq->dev; - - spin_lock_irq(&nvmeq->q_lock); - nvmeq->sq_tail = 0; - nvmeq->cq_head 
= 0; - nvmeq->cq_phase = 1; - nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); - dev->online_queues++; - spin_unlock_irq(&nvmeq->q_lock); -} - -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) -{ - struct nvme_dev *dev = nvmeq->dev; - int result; - - nvmeq->cq_vector = qid - 1; - result = adapter_alloc_cq(dev, qid, nvmeq); - if (result < 0) - return result; - - result = adapter_alloc_sq(dev, qid, nvmeq); - if (result < 0) - goto release_cq; - - result = queue_request_irq(dev, nvmeq, nvmeq->irqname); - if (result < 0) - goto release_sq; - - nvme_init_queue(nvmeq, qid); - return result; - - release_sq: - adapter_delete_sq(dev, qid); - release_cq: - adapter_delete_cq(dev, qid); - return result; -} - -static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) -{ - unsigned long timeout; - u32 bit = enabled ? NVME_CSTS_RDY : 0; - - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - - while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); - return -ENODEV; - } - } - - return 0; -} - -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! - */ -static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config &= ~NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, false); -} - -static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, true); -} - -static int nvme_shutdown_ctrl(struct nvme_dev *dev) -{ - unsigned long timeout; - - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_SHN_NORMAL; - - writel(dev->ctrl_config, &dev->bar->cc); - - timeout = SHUTDOWN_TIMEOUT + jiffies; - while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != - NVME_CSTS_SHST_CMPLT) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return 0; -} - -static struct blk_mq_ops nvme_mq_admin_ops = { - .queue_rq = nvme_admin_queue_rq, - .map_queue = blk_mq_map_queue, - .init_hctx = nvme_admin_init_hctx, - .exit_hctx = nvme_exit_hctx, - .init_request = nvme_admin_init_request, - .timeout = nvme_timeout, -}; - -static struct blk_mq_ops nvme_mq_ops = { - .queue_rq = nvme_queue_rq, - .map_queue = blk_mq_map_queue, - .init_hctx = nvme_init_hctx, - .exit_hctx = nvme_exit_hctx, - .init_request = nvme_init_request, - .timeout = nvme_timeout, -}; - -static void nvme_dev_remove_admin(struct nvme_dev *dev) -{ - if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { - blk_cleanup_queue(dev->admin_q); - blk_mq_free_tag_set(&dev->admin_tagset); - } -} - -static int nvme_alloc_admin_tags(struct nvme_dev *dev) -{ - if (!dev->admin_q) { - dev->admin_tagset.ops = &nvme_mq_admin_ops; - dev->admin_tagset.nr_hw_queues = 1; - 
dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; - dev->admin_tagset.reserved_tags = 1; - dev->admin_tagset.timeout = ADMIN_TIMEOUT; - dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->admin_tagset.cmd_size = nvme_cmd_size(dev); - dev->admin_tagset.driver_data = dev; - - if (blk_mq_alloc_tag_set(&dev->admin_tagset)) - return -ENOMEM; - - dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); - if (IS_ERR(dev->admin_q)) { - blk_mq_free_tag_set(&dev->admin_tagset); - return -ENOMEM; - } - if (!blk_get_queue(dev->admin_q)) { - nvme_dev_remove_admin(dev); - return -ENODEV; - } - } else - blk_mq_unfreeze_queue(dev->admin_q); - - return 0; -} - -static int nvme_configure_admin_queue(struct nvme_dev *dev) -{ - int result; - u32 aqa; - u64 cap = readq(&dev->bar->cap); - struct nvme_queue *nvmeq; - unsigned page_shift = PAGE_SHIFT; - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; - unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; - - if (page_shift < dev_page_min) { - dev_err(&dev->pci_dev->dev, - "Minimum device page size (%u) too large for " - "host (%u)\n", 1 << dev_page_min, - 1 << page_shift); - return -ENODEV; - } - if (page_shift > dev_page_max) { - dev_info(&dev->pci_dev->dev, - "Device maximum page size (%u) smaller than " - "host (%u); enabling work-around\n", - 1 << dev_page_max, 1 << page_shift); - page_shift = dev_page_max; - } - - result = nvme_disable_ctrl(dev, cap); - if (result < 0) - return result; - - nvmeq = dev->queues[0]; - if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); - if (!nvmeq) - return -ENOMEM; - } - - aqa = nvmeq->q_depth - 1; - aqa |= aqa << 16; - - dev->page_size = 1 << page_shift; - - dev->ctrl_config = NVME_CC_CSS_NVM; - dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - - writel(aqa, &dev->bar->aqa); - writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - - result = nvme_enable_ctrl(dev, cap); - if (result) - goto free_nvmeq; - - nvmeq->cq_vector = 0; - result = queue_request_irq(dev, nvmeq, nvmeq->irqname); - if (result) - goto free_nvmeq; - - return result; - - free_nvmeq: - nvme_free_queues(dev, 0); - return result; -} - -struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length) -{ - int i, err, count, nents, offset; - struct scatterlist *sg; - struct page **pages; - struct nvme_iod *iod; - - if (addr & 3) - return ERR_PTR(-EINVAL); - if (!length || length > INT_MAX - PAGE_SIZE) - return ERR_PTR(-EINVAL); - - offset = offset_in_page(addr); - count = DIV_ROUND_UP(offset + length, PAGE_SIZE); - pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - err = get_user_pages_fast(addr, count, 1, pages); - if (err < count) { - count = err; - err = -EFAULT; - goto put_pages; - } - - err = -ENOMEM; - iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); - if (!iod) - goto put_pages; - - sg = iod->sg; - sg_init_table(sg, count); - for (i = 0; i < count; i++) { - sg_set_page(&sg[i], pages[i], - min_t(unsigned, length, PAGE_SIZE - offset), - offset); - length -= (PAGE_SIZE - offset); - offset = 0; - } - sg_mark_end(&sg[i - 1]); - iod->nents = count; - - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, - write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (!nents) - goto free_iod; - - kfree(pages); - return iod; - - free_iod: - kfree(iod); - put_pages: - for (i = 0; i < count; i++) - put_page(pages[i]); - kfree(pages); - return ERR_PTR(err); -} - -void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod) -{ - int i; - - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - for (i = 0; i < iod->nents; i++) - put_page(sg_page(&iod->sg[i])); -} - -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len, prp_len; - int status, write; - struct nvme_iod *iod; - dma_addr_t meta_dma = 0; - void *meta = NULL; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - - if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext) - return -EINVAL; - else if (meta_len && ns->ext) { - length += meta_len; - meta_len = 0; - } - - metadata = (void __user *)(unsigned long)io.metadata; - - write = io.opcode & 1; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - iod = nvme_map_user_pages(dev, write, io.addr, length); - break; - default: - return -EINVAL; - } - - if (IS_ERR(iod)) - return PTR_ERR(iod); - - prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL); - if (length != prp_len) { - status = -ENOMEM; - goto unmap; - } - if (meta_len) { - meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, - &meta_dma, GFP_KERNEL); - - if (!meta) { - status = -ENOMEM; - goto unmap; - } - if (write) { - if (copy_from_user(meta, metadata, meta_len)) { - status = -EFAULT; - goto unmap; - } - } - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.rw.prp2 = cpu_to_le64(iod->first_dma); - c.rw.metadata = cpu_to_le64(meta_dma); - status = nvme_submit_io_cmd(dev, ns, &c, NULL); - unmap: - nvme_unmap_user_pages(dev, write, iod); - nvme_free_iod(dev, iod); - if (meta) { - if (status == NVME_SC_SUCCESS && !write) { - if (copy_to_user(metadata, meta, meta_len)) - status = -EFAULT; - } - dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma); - } - return status; -} - -static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - int status, length; - struct nvme_iod *uninitialized_var(iod); - unsigned timeout; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = 
cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - length = cmd.data_len; - if (cmd.data_len) { - iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, - length); - if (IS_ERR(iod)) - return PTR_ERR(iod); - length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); - c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.common.prp2 = cpu_to_le64(iod->first_dma); - } - - timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : - ADMIN_TIMEOUT; - - if (length != cmd.data_len) - status = -ENOMEM; - else if (ns) { - struct request *req; - - req = blk_mq_alloc_request(ns->queue, WRITE, - (GFP_KERNEL|__GFP_WAIT), false); - if (IS_ERR(req)) - status = PTR_ERR(req); - else { - status = nvme_submit_sync_cmd(req, &c, &cmd.result, - timeout); - blk_mq_free_request(req); - } - } else - status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); - - if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); - nvme_free_iod(dev, iod); - } - - if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, - sizeof(cmd.result))) - status = -EFAULT; - - return status; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - int ret = 0; - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = bdev->bd_disk->private_data; - if (!ns) - ret = -ENXIO; - else if (!kref_get_unless_zero(&ns->dev->kref)) - ret = -ENXIO; - spin_unlock(&dev_list_lock); - - return ret; -} - -static void nvme_free_dev(struct kref *kref); - -static void nvme_release(struct gendisk *disk, fmode_t mode) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = ns->dev; - - kref_put(&dev->kref, nvme_free_dev); -} - -static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) -{ - /* some standard values */ - geo->heads = 1 << 6; - geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; - return 0; -} - -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - ns->queue->limits.max_discard_sectors = 0xffffffff; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = ns->dev; - struct nvme_id_ns *id; - dma_addr_t dma_addr; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - id = 
dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
- GFP_KERNEL);
- if (!id) {
- dev_warn(&dev->pci_dev->dev, "%s: Memory allocation failure\n",
- __func__);
- return 0;
- }
- if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
- dev_warn(&dev->pci_dev->dev,
- "identify failed ns:%d, setting capacity to 0\n",
- ns->ns_id);
- memset(id, 0, sizeof(*id));
- }
-
- old_ms = ns->ms;
- lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
- ns->lba_shift = id->lbaf[lbaf].ds;
- ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
-
- /*
- * If identify namespace failed, use default 512 byte block size so
- * block layer can use before failing read/write for 0 capacity.
- */
- if (ns->lba_shift == 0)
- ns->lba_shift = 9;
- bs = 1 << ns->lba_shift;
-
- /* XXX: PI implementation requires metadata equal t10 pi tuple size */
- pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
- id->dps & NVME_NS_DPS_PI_MASK : 0;
-
- if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
- ns->ms != old_ms ||
- bs != queue_logical_block_size(disk->queue) ||
- (ns->ms && ns->ext)))
- blk_integrity_unregister(disk);
-
- ns->pi_type = pi_type;
- blk_queue_logical_block_size(ns->queue, bs);
-
- if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
- !ns->ext)
- nvme_init_integrity(ns);
-
- if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk)))
- set_capacity(disk, 0);
- else
- set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-
- if (dev->oncs & NVME_CTRL_ONCS_DSM)
- nvme_config_discard(ns);
-
- dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
- return 0;
-}
-
-static const struct block_device_operations nvme_fops = {
- .owner = THIS_MODULE,
- .ioctl = nvme_ioctl,
- .compat_ioctl = nvme_compat_ioctl,
- .open = nvme_open,
- .release = nvme_release,
- .getgeo = nvme_getgeo,
- .revalidate_disk= nvme_revalidate_disk,
-};
-
-static int nvme_kthread(void *data)
-{
- struct nvme_dev *dev, *next;
-
- while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
- spin_lock(&dev_list_lock);
- list_for_each_entry_safe(dev, next, &dev_list, node) {
- int i;
- if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
- if (work_busy(&dev->reset_work))
- continue;
- list_del_init(&dev->node);
- dev_warn(&dev->pci_dev->dev,
- "Failed status: %x, reset controller\n",
- readl(&dev->bar->csts));
- dev->reset_workfn = nvme_reset_failed_dev;
- queue_work(nvme_workq, &dev->reset_work);
- continue;
- }
- for (i = 0; i < dev->queue_count; i++) {
- struct nvme_queue *nvmeq = dev->queues[i];
- if (!nvmeq)
- continue;
- spin_lock_irq(&nvmeq->q_lock);
- nvme_process_cq(nvmeq);
-
- while ((i == 0) && (dev->event_limit > 0)) {
- if (nvme_submit_async_admin_req(dev))
- break;
- dev->event_limit--;
- }
- spin_unlock_irq(&nvmeq->q_lock);
- }
- }
- spin_unlock(&dev_list_lock);
- schedule_timeout(round_jiffies_relative(HZ));
- }
- return 0;
-}
-
-static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
-{
- struct nvme_ns *ns;
- struct gendisk *disk;
- int node = dev_to_node(&dev->pci_dev->dev);
-
- ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
- if (!ns)
- return;
-
- ns->queue = blk_mq_init_queue(&dev->tagset);
- if (IS_ERR(ns->queue))
- goto out_free_ns;
- queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
- queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
- queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
- ns->dev = dev;
- ns->queue->queuedata = ns;
-
- disk = alloc_disk_node(0, node);
- if (!disk)
- goto out_free_queue;
-
- 
ns->ns_id = nsid;
- ns->disk = disk;
- ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
- list_add_tail(&ns->list, &dev->namespaces);
-
- blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- if (dev->max_hw_sectors)
- blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
- if (dev->stripe_size)
- blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
- if (dev->vwc & NVME_CTRL_VWC_PRESENT)
- blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
-
- disk->major = nvme_major;
- disk->first_minor = 0;
- disk->fops = &nvme_fops;
- disk->private_data = ns;
- disk->queue = ns->queue;
- disk->driverfs_dev = dev->device;
- disk->flags = GENHD_FL_EXT_DEVT;
- sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
-
- /*
- * Initialize capacity to 0 until we establish the namespace format and
- * setup integrity extensions if necessary. The revalidate_disk after
- * add_disk allows the driver to register with integrity if the format
- * requires it.
- */
- set_capacity(disk, 0);
- nvme_revalidate_disk(ns->disk);
- add_disk(ns->disk);
- if (ns->ms)
- revalidate_disk(ns->disk);
- return;
- out_free_queue:
- blk_cleanup_queue(ns->queue);
- out_free_ns:
- kfree(ns);
-}
-
-static void nvme_create_io_queues(struct nvme_dev *dev)
-{
- unsigned i;
-
- for (i = dev->queue_count; i <= dev->max_qid; i++)
- if (!nvme_alloc_queue(dev, i, dev->q_depth))
- break;
-
- for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
- if (nvme_create_queue(dev->queues[i], i))
- break;
-}
-
-static int set_queue_count(struct nvme_dev *dev, int count)
-{
- int status;
- u32 result;
- u32 q_count = (count - 1) | ((count - 1) << 16);
-
- status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
- &result);
- if (status < 0)
- return status;
- if (status > 0) {
- dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
- status);
- return 0;
- }
- return min(result & 0xffff, result >> 16) + 1;
-}
-
-static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
-{
- return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
-}
-
-static int nvme_setup_io_queues(struct nvme_dev *dev)
-{
- struct nvme_queue *adminq = dev->queues[0];
- struct pci_dev *pdev = dev->pci_dev;
- int result, i, vecs, nr_io_queues, size;
-
- nr_io_queues = num_possible_cpus();
- result = set_queue_count(dev, nr_io_queues);
- if (result <= 0)
- return result;
- if (result < nr_io_queues)
- nr_io_queues = result;
-
- size = db_bar_size(dev, nr_io_queues);
- if (size > 8192) {
- iounmap(dev->bar);
- do {
- dev->bar = ioremap(pci_resource_start(pdev, 0), size);
- if (dev->bar)
- break;
- if (!--nr_io_queues)
- return -ENOMEM;
- size = db_bar_size(dev, nr_io_queues);
- } while (1);
- dev->dbs = ((void __iomem *)dev->bar) + 4096;
- adminq->q_db = dev->dbs;
- }
-
- /* Deregister the admin queue's interrupt */
- free_irq(dev->entry[0].vector, adminq);
-
- /*
- * If we enable msix early due to not intx, disable it again before
- * setting up the full range we need.
- */ - if (!pdev->irq) - pci_disable_msix(pdev); - - for (i = 0; i < nr_io_queues; i++) - dev->entry[i].entry = i; - vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); - if (vecs < 0) { - vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32)); - if (vecs < 0) { - vecs = 1; - } else { - for (i = 0; i < vecs; i++) - dev->entry[i].vector = i + pdev->irq; - } - } - - /* - * Should investigate if there's a performance win from allocating - * more queues than interrupt vectors; it might allow the submission - * path to scale better, even if the receive path is limited by the - * number of interrupts. - */ - nr_io_queues = vecs; - dev->max_qid = nr_io_queues; - - result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) - goto free_queues; - - /* Free previously allocated queues that are no longer usable */ - nvme_free_queues(dev, nr_io_queues + 1); - nvme_create_io_queues(dev); - - return 0; - - free_queues: - nvme_free_queues(dev, 1); - return result; -} - -/* - * Return: error value if an error occurred setting up the queues or calling - * Identify Device. 0 if these succeeded, even if adding some of the - * namespaces failed. At the moment, these failures are silent. TBD which - * failures should be reported. - */ -static int nvme_dev_add(struct nvme_dev *dev) -{ - struct pci_dev *pdev = dev->pci_dev; - int res; - unsigned nn, i; - struct nvme_id_ctrl *ctrl; - void *mem; - dma_addr_t dma_addr; - int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - - mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); - if (!mem) - return -ENOMEM; - - res = nvme_identify(dev, 0, 1, dma_addr); - if (res) { - dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); - return -EIO; - } - - ctrl = mem; - nn = le32_to_cpup(&ctrl->nn); - dev->oncs = le16_to_cpup(&ctrl->oncs); - dev->abort_limit = ctrl->acl + 1; - dev->vwc = ctrl->vwc; - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) { - unsigned int max_hw_sectors; - - dev->stripe_size = 1 << (ctrl->vs[3] + shift); - max_hw_sectors = dev->stripe_size >> (shift - 9); - if (dev->max_hw_sectors) { - dev->max_hw_sectors = min(max_hw_sectors, - dev->max_hw_sectors); - } else - dev->max_hw_sectors = max_hw_sectors; - } - dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); - - dev->tagset.ops = &nvme_mq_ops; - dev->tagset.nr_hw_queues = dev->online_queues - 1; - dev->tagset.timeout = NVME_IO_TIMEOUT; - dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); - dev->tagset.queue_depth = - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = nvme_cmd_size(dev); - dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; - dev->tagset.driver_data = dev; - - if (blk_mq_alloc_tag_set(&dev->tagset)) - return 0; - - for (i = 1; i <= nn; i++) - nvme_alloc_ns(dev, i); - - return 0; -} - -static int nvme_dev_map(struct nvme_dev *dev) -{ - u64 cap; - int bars, result = -ENOMEM; - struct pci_dev *pdev = dev->pci_dev; - - if (pci_enable_device_mem(pdev)) - return result; - - dev->entry[0].vector = pdev->irq; - pci_set_master(pdev); - bars = pci_select_bars(pdev, IORESOURCE_MEM); - if (!bars) - goto disable_pci; - - if (pci_request_selected_regions(pdev, bars, "nvme")) - goto disable_pci; 
-
- if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
- dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
- goto disable;
-
- dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
- if (!dev->bar)
- goto disable;
-
- if (readl(&dev->bar->csts) == -1) {
- result = -ENODEV;
- goto unmap;
- }
-
- /*
- * Some devices don't advertise INTx interrupts, pre-enable a single
- * MSIX vec for setup. We'll adjust this later.
- */
- if (!pdev->irq) {
- result = pci_enable_msix(pdev, dev->entry, 1);
- if (result < 0)
- goto unmap;
- }
-
- cap = readq(&dev->bar->cap);
- dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
- dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
- dev->dbs = ((void __iomem *)dev->bar) + 4096;
-
- return 0;
-
- unmap:
- iounmap(dev->bar);
- dev->bar = NULL;
- disable:
- pci_release_regions(pdev);
- disable_pci:
- pci_disable_device(pdev);
- return result;
-}
-
-static void nvme_dev_unmap(struct nvme_dev *dev)
-{
- if (dev->pci_dev->msi_enabled)
- pci_disable_msi(dev->pci_dev);
- else if (dev->pci_dev->msix_enabled)
- pci_disable_msix(dev->pci_dev);
-
- if (dev->bar) {
- iounmap(dev->bar);
- dev->bar = NULL;
- pci_release_regions(dev->pci_dev);
- }
-
- if (pci_is_enabled(dev->pci_dev))
- pci_disable_device(dev->pci_dev);
-}
-
-struct nvme_delq_ctx {
- struct task_struct *waiter;
- struct kthread_worker *worker;
- atomic_t refcount;
-};
-
-static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
-{
- dq->waiter = current;
- mb();
-
- for (;;) {
- set_current_state(TASK_KILLABLE);
- if (!atomic_read(&dq->refcount))
- break;
- if (!schedule_timeout(ADMIN_TIMEOUT) ||
- fatal_signal_pending(current)) {
- /*
- * Disable the controller first since we can't trust it
- * at this point, but leave the admin queue enabled
- * until all queue deletion requests are flushed.
- * FIXME: This may take a while if there are more h/w
- * queues than admin tags.
- */ - set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, readq(&dev->bar->cap)); - nvme_clear_queue(dev->queues[0]); - flush_kthread_worker(dq->worker); - nvme_disable_queue(dev, 0); - return; - } - } - set_current_state(TASK_RUNNING); -} - -static void nvme_put_dq(struct nvme_delq_ctx *dq) -{ - atomic_dec(&dq->refcount); - if (dq->waiter) - wake_up_process(dq->waiter); -} - -static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) -{ - atomic_inc(&dq->refcount); - return dq; -} - -static void nvme_del_queue_end(struct nvme_queue *nvmeq) -{ - struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - nvme_put_dq(dq); -} - -static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, - kthread_work_func_t fn) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(nvmeq->qid); - - init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, - ADMIN_TIMEOUT); -} - -static void nvme_del_cq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_cq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, - nvme_del_cq_work_handler); -} - -static void nvme_del_sq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - int status = nvmeq->cmdinfo.status; - - if (!status) - status = nvme_delete_cq(nvmeq); - if (status) - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_sq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, - nvme_del_sq_work_handler); -} - -static void nvme_del_queue_start(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - if (nvme_delete_sq(nvmeq)) - nvme_del_queue_end(nvmeq); -} - -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - int i; - DEFINE_KTHREAD_WORKER_ONSTACK(worker); - struct nvme_delq_ctx dq; - struct task_struct *kworker_task = kthread_run(kthread_worker_fn, - &worker, "nvme%d", dev->instance); - - if (IS_ERR(kworker_task)) { - dev_err(&dev->pci_dev->dev, - "Failed to create queue del task\n"); - for (i = dev->queue_count - 1; i > 0; i--) - nvme_disable_queue(dev, i); - return; - } - - dq.waiter = NULL; - atomic_set(&dq.refcount, 0); - dq.worker = &worker; - for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - - if (nvme_suspend_queue(nvmeq)) - continue; - nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); - nvmeq->cmdinfo.worker = dq.worker; - init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); - queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); - } - nvme_wait_dq(&dq, dev); - kthread_stop(kworker_task); -} - -/* -* Remove the node from the device list and check -* for whether or not we need to stop the nvme_thread. 
-*/ -static void nvme_dev_list_remove(struct nvme_dev *dev) -{ - struct task_struct *tmp = NULL; - - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) { - tmp = nvme_thread; - nvme_thread = NULL; - } - spin_unlock(&dev_list_lock); - - if (tmp) - kthread_stop(tmp); -} - -static void nvme_freeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - blk_mq_freeze_queue_start(ns->queue); - - spin_lock(ns->queue->queue_lock); - queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock(ns->queue->queue_lock); - - blk_mq_cancel_requeue_work(ns->queue); - blk_mq_stop_hw_queues(ns->queue); - } -} - -static void nvme_unfreeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); - blk_mq_unfreeze_queue(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); - blk_mq_kick_requeue_list(ns->queue); - } -} - -static void nvme_dev_shutdown(struct nvme_dev *dev) -{ - int i; - u32 csts = -1; - - nvme_dev_list_remove(dev); - - if (dev->bar) { - nvme_freeze_queues(dev); - csts = readl(&dev->bar->csts); - } - if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { - for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; - nvme_suspend_queue(nvmeq); - } - } else { - nvme_disable_io_queues(dev); - nvme_shutdown_ctrl(dev); - nvme_disable_queue(dev, 0); - } - nvme_dev_unmap(dev); - - for (i = dev->queue_count - 1; i >= 0; i--) - nvme_clear_queue(dev->queues[i]); -} - -static void nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) { - if (blk_get_integrity(ns->disk)) - blk_integrity_unregister(ns->disk); - del_gendisk(ns->disk); - } - if (!blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - } -} - -static int nvme_setup_prp_pools(struct nvme_dev *dev) -{ - struct device *dmadev = &dev->pci_dev->dev; - dev->prp_page_pool = dma_pool_create("prp list page", dmadev, - PAGE_SIZE, PAGE_SIZE, 0); - if (!dev->prp_page_pool) - return -ENOMEM; - - /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, - 256, 256, 0); - if (!dev->prp_small_pool) { - dma_pool_destroy(dev->prp_page_pool); - return -ENOMEM; - } - return 0; -} - -static void nvme_release_prp_pools(struct nvme_dev *dev) -{ - dma_pool_destroy(dev->prp_page_pool); - dma_pool_destroy(dev->prp_small_pool); -} - -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); - spin_unlock(&dev_list_lock); -} - -static void nvme_free_namespaces(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - 
put_disk(ns->disk); - kfree(ns); - } -} - -static void nvme_free_dev(struct kref *kref) -{ - struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); - - pci_dev_put(dev->pci_dev); - put_device(dev->device); - nvme_free_namespaces(dev); - nvme_release_instance(dev); - blk_mq_free_tag_set(&dev->tagset); - blk_put_queue(dev->admin_q); - kfree(dev->queues); - kfree(dev->entry); - kfree(dev); -} - -static int nvme_dev_open(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - if (dev->instance == instance) { - if (!dev->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kref_get_unless_zero(&dev->kref)) - break; - f->private_data = dev; - ret = 0; - break; - } - } - spin_unlock(&dev_list_lock); - - return ret; -} - -static int nvme_dev_release(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev = f->private_data; - kref_put(&dev->kref, nvme_free_dev); - return 0; -} - -static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - struct nvme_dev *dev = f->private_data; - struct nvme_ns *ns; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - if (list_empty(&dev->namespaces)) - return -ENOTTY; - ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); - return nvme_user_cmd(dev, ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -static const struct file_operations nvme_dev_fops = { - .owner = THIS_MODULE, - .open = nvme_dev_open, - .release = nvme_dev_release, - .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = nvme_dev_ioctl, -}; - -static void nvme_set_irq_hints(struct nvme_dev *dev) -{ - struct nvme_queue *nvmeq; - int i; - - for (i = 0; i < dev->online_queues; i++) { - nvmeq = dev->queues[i]; - - if (!nvmeq->hctx) - continue; - - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->hctx->cpumask); - } -} - -static int nvme_dev_start(struct nvme_dev *dev) -{ - int result; - bool start_thread = false; - - result = nvme_dev_map(dev); - if (result) - return result; - - result = nvme_configure_admin_queue(dev); - if (result) - goto unmap; - - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) { - result = nvme_thread ? 
PTR_ERR(nvme_thread) : -EINTR; - goto disable; - } - - nvme_init_queue(dev->queues[0], 0); - result = nvme_alloc_admin_tags(dev); - if (result) - goto disable; - - result = nvme_setup_io_queues(dev); - if (result) - goto free_tags; - - nvme_set_irq_hints(dev); - - dev->event_limit = 1; - return result; - - free_tags: - nvme_dev_remove_admin(dev); - disable: - nvme_disable_queue(dev, 0); - nvme_dev_list_remove(dev); - unmap: - nvme_dev_unmap(dev); - return result; -} - -static int nvme_remove_dead_ctrl(void *arg) -{ - struct nvme_dev *dev = (struct nvme_dev *)arg; - struct pci_dev *pdev = dev->pci_dev; - - if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device_locked(pdev); - kref_put(&dev->kref, nvme_free_dev); - return 0; -} - -static void nvme_remove_disks(struct work_struct *ws) -{ - struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - - nvme_free_queues(dev, 1); - nvme_dev_remove(dev); -} - -static int nvme_dev_resume(struct nvme_dev *dev) -{ - int ret; - - ret = nvme_dev_start(dev); - if (ret) - return ret; - if (dev->online_queues < 2) { - spin_lock(&dev_list_lock); - dev->reset_workfn = nvme_remove_disks; - queue_work(nvme_workq, &dev->reset_work); - spin_unlock(&dev_list_lock); - } else { - nvme_unfreeze_queues(dev); - nvme_set_irq_hints(dev); - } - return 0; -} - -static void nvme_dev_reset(struct nvme_dev *dev) -{ - nvme_dev_shutdown(dev); - if (nvme_dev_resume(dev)) { - dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { - dev_err(&dev->pci_dev->dev, - "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); - } - } -} - -static void nvme_reset_failed_dev(struct work_struct *ws) -{ - struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - nvme_dev_reset(dev); -} - -static void nvme_reset_workfn(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); - dev->reset_workfn(work); -} - -static void nvme_async_probe(struct work_struct *work); -static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) -{ - int node, result = -ENOMEM; - struct nvme_dev *dev; - - node = dev_to_node(&pdev->dev); - if (node == NUMA_NO_NODE) - set_dev_node(&pdev->dev, 0); - - dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); - if (!dev) - return -ENOMEM; - dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), - GFP_KERNEL, node); - if (!dev->entry) - goto free; - dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), - GFP_KERNEL, node); - if (!dev->queues) - goto free; - - INIT_LIST_HEAD(&dev->namespaces); - dev->reset_workfn = nvme_reset_failed_dev; - INIT_WORK(&dev->reset_work, nvme_reset_workfn); - dev->pci_dev = pci_dev_get(pdev); - pci_set_drvdata(pdev, dev); - result = nvme_set_instance(dev); - if (result) - goto put_pci; - - result = nvme_setup_prp_pools(dev); - if (result) - goto release; - - kref_init(&dev->kref); - dev->device = device_create(nvme_class, &pdev->dev, - MKDEV(nvme_char_major, dev->instance), - dev, "nvme%d", dev->instance); - if (IS_ERR(dev->device)) { - result = PTR_ERR(dev->device); - goto release_pools; - } - get_device(dev->device); - - INIT_LIST_HEAD(&dev->node); - INIT_WORK(&dev->probe_work, nvme_async_probe); - schedule_work(&dev->probe_work); - return 0; - - release_pools: - nvme_release_prp_pools(dev); - release: - nvme_release_instance(dev); - put_pci: - pci_dev_put(dev->pci_dev); - free: - 
kfree(dev->queues); - kfree(dev->entry); - kfree(dev); - return result; -} - -static void nvme_async_probe(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - int result; - - result = nvme_dev_start(dev); - if (result) - goto reset; - - if (dev->online_queues > 1) - result = nvme_dev_add(dev); - if (result) - goto reset; - - nvme_set_irq_hints(dev); - return; - reset: - if (!work_busy(&dev->reset_work)) { - dev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &dev->reset_work); - } -} - -static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - if (prepare) - nvme_dev_shutdown(dev); - else - nvme_dev_resume(dev); -} - -static void nvme_shutdown(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_shutdown(dev); -} - -static void nvme_remove(struct pci_dev *pdev) -{ - struct nvme_dev *dev = pci_get_drvdata(pdev); - - spin_lock(&dev_list_lock); - list_del_init(&dev->node); - spin_unlock(&dev_list_lock); - - pci_set_drvdata(pdev, NULL); - flush_work(&dev->probe_work); - flush_work(&dev->reset_work); - nvme_dev_shutdown(dev); - nvme_dev_remove(dev); - nvme_dev_remove_admin(dev); - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); - nvme_free_queues(dev, 0); - nvme_release_prp_pools(dev); - kref_put(&dev->kref, nvme_free_dev); -} - -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL - -#ifdef CONFIG_PM_SLEEP -static int nvme_suspend(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct nvme_dev *ndev = pci_get_drvdata(pdev); - - nvme_dev_shutdown(ndev); - return 0; -} - -static int nvme_resume(struct device *dev) -{ - struct pci_dev *pdev = to_pci_dev(dev); - struct nvme_dev *ndev = pci_get_drvdata(pdev); - - if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) { - ndev->reset_workfn = nvme_reset_failed_dev; - queue_work(nvme_workq, &ndev->reset_work); - } - return 0; -} -#endif - -static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); - -static const struct pci_error_handlers nvme_err_handler = { - .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, - .slot_reset = nvme_slot_reset, - .resume = nvme_error_resume, - .reset_notify = nvme_reset_notify, -}; - -/* Move to pci_ids.h later */ -#define PCI_CLASS_STORAGE_EXPRESS 0x010802 - -static const struct pci_device_id nvme_id_table[] = { - { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, - { 0, } -}; -MODULE_DEVICE_TABLE(pci, nvme_id_table); - -static struct pci_driver nvme_driver = { - .name = "nvme", - .id_table = nvme_id_table, - .probe = nvme_probe, - .remove = nvme_remove, - .shutdown = nvme_shutdown, - .driver = { - .pm = &nvme_dev_pm_ops, - }, - .err_handler = &nvme_err_handler, -}; - -static int __init nvme_init(void) -{ - int result; - - init_waitqueue_head(&nvme_kthread_wait); - - nvme_workq = create_singlethread_workqueue("nvme"); - if (!nvme_workq) - return -ENOMEM; - - result = register_blkdev(nvme_major, "nvme"); - if (result < 0) - goto kill_workq; - else if (result > 0) - nvme_major = result; - - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); - if (result < 0) - goto unregister_blkdev; - else if (result > 0) - nvme_char_major = result; - - nvme_class = 
class_create(THIS_MODULE, "nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); - goto unregister_chrdev; - } - - result = pci_register_driver(&nvme_driver); - if (result) - goto destroy_class; - return 0; - - destroy_class: - class_destroy(nvme_class); - unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); - kill_workq: - destroy_workqueue(nvme_workq); - return result; -} - -static void __exit nvme_exit(void) -{ - pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); - destroy_workqueue(nvme_workq); - class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); - _nvme_check_size(); -} - -MODULE_AUTHOR("Matthew Wilcox "); -MODULE_LICENSE("GPL"); -MODULE_VERSION("1.0"); -module_init(nvme_init); -module_exit(nvme_exit); diff --git a/kernel/drivers/block/nvme-scsi.c b/kernel/drivers/block/nvme-scsi.c deleted file mode 100644 index 44f2514fb..000000000 --- a/kernel/drivers/block/nvme-scsi.c +++ /dev/null @@ -1,3070 +0,0 @@ -/* - * NVM Express device driver - * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -/* - * Refer to the SCSI-NVMe Translation spec for details on how - * each command is translated. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static int sg_version_num = 30534; /* 2 digits for each component */ - -#define SNTI_TRANSLATION_SUCCESS 0 -#define SNTI_INTERNAL_ERROR 1 - -/* VPD Page Codes */ -#define VPD_SUPPORTED_PAGES 0x00 -#define VPD_SERIAL_NUMBER 0x80 -#define VPD_DEVICE_IDENTIFIERS 0x83 -#define VPD_EXTENDED_INQUIRY 0x86 -#define VPD_BLOCK_LIMITS 0xB0 -#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 - -/* CDB offsets */ -#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6 -#define REPORT_LUNS_SR_OFFSET 2 -#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10 -#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4 -#define REQUEST_SENSE_DESC_OFFSET 1 -#define REQUEST_SENSE_DESC_MASK 0x01 -#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1 -#define INQUIRY_EVPD_BYTE_OFFSET 1 -#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2 -#define INQUIRY_EVPD_BIT_MASK 1 -#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3 -#define START_STOP_UNIT_CDB_IMMED_OFFSET 1 -#define START_STOP_UNIT_CDB_IMMED_MASK 0x1 -#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3 -#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF -#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4 -#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0 -#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4 -#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4 -#define START_STOP_UNIT_CDB_START_OFFSET 4 -#define START_STOP_UNIT_CDB_START_MASK 0x1 -#define WRITE_BUFFER_CDB_MODE_OFFSET 1 -#define WRITE_BUFFER_CDB_MODE_MASK 0x1F -#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2 -#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3 -#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6 -#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1 -#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0 -#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6 -#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1 -#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20 -#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1 -#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10 -#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 -#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 -#define FORMAT_UNIT_PROT_INT_OFFSET 3 -#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 -#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 -#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET 7 - -/* Misc. 
defines */ -#define NIBBLE_SHIFT 4 -#define FIXED_SENSE_DATA 0x70 -#define DESC_FORMAT_SENSE_DATA 0x72 -#define FIXED_SENSE_DATA_ADD_LENGTH 10 -#define LUN_ENTRY_SIZE 8 -#define LUN_DATA_HEADER_SIZE 8 -#define ALL_LUNS_RETURNED 0x02 -#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 -#define RESTRICTED_LUNS_RETURNED 0x00 -#define NVME_POWER_STATE_START_VALID 0x00 -#define NVME_POWER_STATE_ACTIVE 0x01 -#define NVME_POWER_STATE_IDLE 0x02 -#define NVME_POWER_STATE_STANDBY 0x03 -#define NVME_POWER_STATE_LU_CONTROL 0x07 -#define POWER_STATE_0 0 -#define POWER_STATE_1 1 -#define POWER_STATE_2 2 -#define POWER_STATE_3 3 -#define DOWNLOAD_SAVE_ACTIVATE 0x05 -#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E -#define ACTIVATE_DEFERRED_MICROCODE 0x0F -#define FORMAT_UNIT_IMMED_MASK 0x2 -#define FORMAT_UNIT_IMMED_OFFSET 1 -#define KELVIN_TEMP_FACTOR 273 -#define FIXED_FMT_SENSE_DATA_SIZE 18 -#define DESC_FMT_SENSE_DATA_SIZE 8 - -/* SCSI/NVMe defines and bit masks */ -#define INQ_STANDARD_INQUIRY_PAGE 0x00 -#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 -#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 -#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 -#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 -#define INQ_BDEV_LIMITS_PAGE 0xB0 -#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 -#define INQ_SERIAL_NUMBER_LENGTH 0x14 -#define INQ_NUM_SUPPORTED_VPD_PAGES 6 -#define VERSION_SPC_4 0x06 -#define ACA_UNSUPPORTED 0 -#define STANDARD_INQUIRY_LENGTH 36 -#define ADDITIONAL_STD_INQ_LENGTH 31 -#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C -#define RESERVED_FIELD 0 - -/* SCSI READ/WRITE Defines */ -#define IO_CDB_WP_MASK 0xE0 -#define IO_CDB_WP_SHIFT 5 -#define IO_CDB_FUA_MASK 0x8 -#define IO_6_CDB_LBA_OFFSET 0 -#define IO_6_CDB_LBA_MASK 0x001FFFFF -#define IO_6_CDB_TX_LEN_OFFSET 4 -#define IO_6_DEFAULT_TX_LEN 256 -#define IO_10_CDB_LBA_OFFSET 2 -#define IO_10_CDB_TX_LEN_OFFSET 7 -#define IO_10_CDB_WP_OFFSET 1 -#define IO_10_CDB_FUA_OFFSET 1 -#define IO_12_CDB_LBA_OFFSET 2 -#define IO_12_CDB_TX_LEN_OFFSET 6 -#define IO_12_CDB_WP_OFFSET 1 -#define IO_12_CDB_FUA_OFFSET 1 -#define IO_16_CDB_FUA_OFFSET 1 -#define IO_16_CDB_WP_OFFSET 1 -#define IO_16_CDB_LBA_OFFSET 2 -#define IO_16_CDB_TX_LEN_OFFSET 10 - -/* Mode Sense/Select defines */ -#define MODE_PAGE_INFO_EXCEP 0x1C -#define MODE_PAGE_CACHING 0x08 -#define MODE_PAGE_CONTROL 0x0A -#define MODE_PAGE_POWER_CONDITION 0x1A -#define MODE_PAGE_RETURN_ALL 0x3F -#define MODE_PAGE_BLK_DES_LEN 0x08 -#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 -#define MODE_PAGE_CACHING_LEN 0x14 -#define MODE_PAGE_CONTROL_LEN 0x0C -#define MODE_PAGE_POW_CND_LEN 0x28 -#define MODE_PAGE_INF_EXC_LEN 0x0C -#define MODE_PAGE_ALL_LEN 0x54 -#define MODE_SENSE6_MPH_SIZE 4 -#define MODE_SENSE6_ALLOC_LEN_OFFSET 4 -#define MODE_SENSE_PAGE_CONTROL_OFFSET 2 -#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 -#define MODE_SENSE_PAGE_CODE_OFFSET 2 -#define MODE_SENSE_PAGE_CODE_MASK 0x3F -#define MODE_SENSE_LLBAA_OFFSET 1 -#define MODE_SENSE_LLBAA_MASK 0x10 -#define MODE_SENSE_LLBAA_SHIFT 4 -#define MODE_SENSE_DBD_OFFSET 1 -#define MODE_SENSE_DBD_MASK 8 -#define MODE_SENSE_DBD_SHIFT 3 -#define MODE_SENSE10_MPH_SIZE 8 -#define MODE_SENSE10_ALLOC_LEN_OFFSET 7 -#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1 -#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1 -#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4 -#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7 -#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 -#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 -#define MODE_SELECT_6_BD_OFFSET 3 -#define MODE_SELECT_10_BD_OFFSET 6 -#define MODE_SELECT_10_LLBAA_OFFSET 4 
-#define MODE_SELECT_10_LLBAA_MASK 1 -#define MODE_SELECT_6_MPH_SIZE 4 -#define MODE_SELECT_10_MPH_SIZE 8 -#define CACHING_MODE_PAGE_WCE_MASK 0x04 -#define MODE_SENSE_BLK_DESC_ENABLED 0 -#define MODE_SENSE_BLK_DESC_COUNT 1 -#define MODE_SELECT_PAGE_CODE_MASK 0x3F -#define SHORT_DESC_BLOCK 8 -#define LONG_DESC_BLOCK 16 -#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 -#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A -#define MODE_PAGE_CACHING_LEN_FIELD 0x12 -#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A -#define MODE_SENSE_PC_CURRENT_VALUES 0 - -/* Log Sense defines */ -#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 -#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 -#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F -#define LOG_PAGE_TEMPERATURE_PAGE 0x0D -#define LOG_SENSE_CDB_SP_OFFSET 1 -#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 -#define LOG_SENSE_CDB_PC_OFFSET 2 -#define LOG_SENSE_CDB_PC_MASK 0xC0 -#define LOG_SENSE_CDB_PC_SHIFT 6 -#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 -#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F -#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7 -#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 -#define LOG_INFO_EXCP_PAGE_LENGTH 0xC -#define REMAINING_TEMP_PAGE_LENGTH 0xC -#define LOG_TEMP_PAGE_LENGTH 0x10 -#define LOG_TEMP_UNKNOWN 0xFF -#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 - -/* Read Capacity defines */ -#define READ_CAP_10_RESP_SIZE 8 -#define READ_CAP_16_RESP_SIZE 32 - -/* NVMe Namespace and Command Defines */ -#define BYTES_TO_DWORDS 4 -#define NVME_MAX_FIRMWARE_SLOT 7 - -/* Report LUNs defines */ -#define REPORT_LUNS_FIRST_LUN_OFFSET 8 - -/* SCSI ADDITIONAL SENSE Codes */ - -#define SCSI_ASC_NO_SENSE 0x00 -#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 -#define SCSI_ASC_LUN_NOT_READY 0x04 -#define SCSI_ASC_WARNING 0x0B -#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 -#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 -#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D -#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 -#define SCSI_ASC_ILLEGAL_COMMAND 0x20 -#define SCSI_ASC_ILLEGAL_BLOCK 0x21 -#define SCSI_ASC_INVALID_CDB 0x24 -#define SCSI_ASC_INVALID_LUN 0x25 -#define SCSI_ASC_INVALID_PARAMETER 0x26 -#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 -#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 - -/* SCSI ADDITIONAL SENSE Code Qualifiers */ - -#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 -#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 -#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 -#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 -#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 -#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 -#define SCSI_ASCQ_INVALID_LUN_ID 0x09 - -/** - * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to - * enable DPOFUA support type 0x10 value. 
- */ -#define DEVICE_SPECIFIC_PARAMETER 0 -#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR) - -/* MACROs to extract information from CDBs */ - -#define GET_OPCODE(cdb) cdb[0] - -#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0) - -#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0)) - -#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \ -(cdb[index + 1] << 8) | \ -(cdb[index + 2] << 0)) - -#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \ -(cdb[index + 1] << 16) | \ -(cdb[index + 2] << 8) | \ -(cdb[index + 3] << 0)) - -#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \ -(((u64)cdb[index + 1]) << 48) | \ -(((u64)cdb[index + 2]) << 40) | \ -(((u64)cdb[index + 3]) << 32) | \ -(((u64)cdb[index + 4]) << 24) | \ -(((u64)cdb[index + 5]) << 16) | \ -(((u64)cdb[index + 6]) << 8) | \ -(((u64)cdb[index + 7]) << 0)) - -/* Inquiry Helper Macros */ -#define GET_INQ_EVPD_BIT(cdb) \ -((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \ -INQUIRY_EVPD_BIT_MASK) ? 1 : 0) - -#define GET_INQ_PAGE_CODE(cdb) \ -(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET)) - -#define GET_INQ_ALLOC_LENGTH(cdb) \ -(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET)) - -/* Report LUNs Helper Macros */ -#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \ -(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET)) - -/* Read Capacity Helper Macros */ -#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \ -(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) - -#define IS_READ_CAP_16(cdb) \ -((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) - -/* Request Sense Helper Macros */ -#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ -(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET)) - -/* Mode Sense Helper Macros */ -#define GET_MODE_SENSE_DBD(cdb) \ -((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \ -MODE_SENSE_DBD_SHIFT) - -#define GET_MODE_SENSE_LLBAA(cdb) \ -((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \ -MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT) - -#define GET_MODE_SENSE_MPH_SIZE(cdb10) \ -(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE) - - -/* Struct to gather data that needs to be extracted from a SCSI CDB. - Not conforming to any particular CDB variant, but compatible with all. 
*/ - -struct nvme_trans_io_cdb { - u8 fua; - u8 prot_info; - u64 lba; - u32 xfer_len; -}; - - -/* Internal Helper Functions */ - - -/* Copy data to userspace memory */ - -static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, - unsigned long n) -{ - int res = SNTI_TRANSLATION_SUCCESS; - unsigned long not_copied; - int i; - void *index = from; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - not_copied = copy_to_user(sgl.iov_base, index, - xfer_len); - if (not_copied) { - res = -EFAULT; - break; - } - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return res; - } - not_copied = copy_to_user(hdr->dxferp, from, n); - if (not_copied) - res = -EFAULT; - return res; -} - -/* Copy data from userspace memory */ - -static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, - unsigned long n) -{ - int res = SNTI_TRANSLATION_SUCCESS; - unsigned long not_copied; - int i; - void *index = to; - size_t remaining = n; - size_t xfer_len; - - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) - return -EFAULT; - xfer_len = min(remaining, sgl.iov_len); - not_copied = copy_from_user(index, sgl.iov_base, - xfer_len); - if (not_copied) { - res = -EFAULT; - break; - } - index += xfer_len; - remaining -= xfer_len; - if (remaining == 0) - break; - } - return res; - } - - not_copied = copy_from_user(to, hdr->dxferp, n); - if (not_copied) - res = -EFAULT; - return res; -} - -/* Status/Sense Buffer Writeback */ - -static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, - u8 asc, u8 ascq) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 xfer_len; - u8 resp[DESC_FMT_SENSE_DATA_SIZE]; - - if (scsi_status_is_good(status)) { - hdr->status = SAM_STAT_GOOD; - hdr->masked_status = GOOD; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - hdr->sb_len_wr = 0; - } else { - hdr->status = status; - hdr->masked_status = status >> 1; - hdr->host_status = DID_OK; - hdr->driver_status = DRIVER_OK; - - memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); - resp[0] = DESC_FORMAT_SENSE_DATA; - resp[1] = sense_key; - resp[2] = asc; - resp[3] = ascq; - - xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); - hdr->sb_len_wr = xfer_len; - if (copy_to_user(hdr->sbp, resp, xfer_len) > 0) - res = -EFAULT; - } - - return res; -} - -static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) -{ - u8 status, sense_key, asc, ascq; - int res = SNTI_TRANSLATION_SUCCESS; - - /* For non-nvme (Linux) errors, simply return the error code */ - if (nvme_sc < 0) - return nvme_sc; - - /* Mask DNR, More, and reserved fields */ - nvme_sc &= 0x7FF; - - switch (nvme_sc) { - /* Generic Command Status */ - case NVME_SC_SUCCESS: - status = SAM_STAT_GOOD; - sense_key = NO_SENSE; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_OPCODE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_COMMAND; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_FIELD: - status = SAM_STAT_CHECK_CONDITION; - sense_key = 
ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_DATA_XFER_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_POWER_LOSS: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_WARNING; - ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; - break; - case NVME_SC_INTERNAL: - status = SAM_STAT_CHECK_CONDITION; - sense_key = HARDWARE_ERROR; - asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_REQ: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ABORT_QUEUE: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_FAIL: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_FUSED_MISSING: - status = SAM_STAT_TASK_ABORTED; - sense_key = ABORTED_COMMAND; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_INVALID_NS: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - case NVME_SC_LBA_RANGE: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ILLEGAL_BLOCK; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_CAP_EXCEEDED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_NS_NOT_READY: - status = SAM_STAT_CHECK_CONDITION; - sense_key = NOT_READY; - asc = SCSI_ASC_LUN_NOT_READY; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Command Specific Status */ - case NVME_SC_INVALID_FORMAT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_FORMAT_COMMAND_FAILED; - ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; - break; - case NVME_SC_BAD_ATTRIBUTES: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_INVALID_CDB; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - - /* Media Errors */ - case NVME_SC_WRITE_FAULT: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_READ_ERROR: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_UNRECOVERED_READ_ERROR; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_GUARD_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; - break; - case NVME_SC_APPTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; - break; - case NVME_SC_REFTAG_CHECK: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MEDIUM_ERROR; - asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; - ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; - break; - case NVME_SC_COMPARE_FAILED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = MISCOMPARE; - asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; - ascq = 
SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - case NVME_SC_ACCESS_DENIED: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; - ascq = SCSI_ASCQ_INVALID_LUN_ID; - break; - - /* Unspecified/Default */ - case NVME_SC_CMDID_CONFLICT: - case NVME_SC_CMD_SEQ_ERROR: - case NVME_SC_CQ_INVALID: - case NVME_SC_QID_INVALID: - case NVME_SC_QUEUE_SIZE: - case NVME_SC_ABORT_LIMIT: - case NVME_SC_ABORT_MISSING: - case NVME_SC_ASYNC_LIMIT: - case NVME_SC_FIRMWARE_SLOT: - case NVME_SC_FIRMWARE_IMAGE: - case NVME_SC_INVALID_VECTOR: - case NVME_SC_INVALID_LOG_PAGE: - default: - status = SAM_STAT_CHECK_CONDITION; - sense_key = ILLEGAL_REQUEST; - asc = SCSI_ASC_NO_SENSE; - ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - break; - } - - res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); - - return res; -} - -/* INQUIRY Helper Functions */ - -static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ns *id_ns; - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - int xfer_len; - u8 resp_data_format = 0x02; - u8 protect; - u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(dev->firmware_rev); - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - - /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - /* - * If nvme_sc was -ve, res will be -ve here. - * If nvme_sc was +ve, the status would bace been translated, and res - * can only be 0 or -ve. - * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc - * - If -ve, return because its a Linux error. - */ - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ns = mem; - (id_ns->dps) ? 
(protect = 0x01) : (protect = 0); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[2] = VERSION_SPC_4; - inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ - inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; - inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ - inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ - strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], dev->model, 16); - - while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) - fw_offset--; - fw_offset -= 4; - strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out_dma: - return res; -} - -static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ - inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ - inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; - inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; - inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; - inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; - inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; - inq_response[9] = INQ_BDEV_LIMITS_PAGE; - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - return res; -} - -static int nvme_trans_unit_serial_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *inq_response, - int alloc_len) -{ - struct nvme_dev *dev = ns->dev; - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ - inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); - - xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - return res; -} - -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - int xfer_len; - __be32 tmp_id = cpu_to_be32(ns->ns_id); - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { - struct nvme_id_ns *id_ns = mem; - void *eui = id_ns->eui64; - int len = sizeof(id_ns->eui64); - - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } - if (bitmap_empty(eui, len * 8)) - goto scsi_string; - - inq_response[3] = 4 + len; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | 
Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - } else { - scsi_string: - if (alloc_len < 72) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_free; - } - inq_response[3] = 0x48; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor); - memcpy(&inq_response[12], dev->model, sizeof(dev->model)); - sprintf(&inq_response[52], "%04x", tmp_id); - memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); - } - xfer_len = alloc_len; - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out_dma: - return res; -} - -static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ctrl *id_ctrl; - struct nvme_id_ns *id_ns; - int xfer_len; - u8 microcode = 0x80; - u8 spt; - u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; - u8 grd_chk, app_chk, ref_chk, protect; - u8 uask_sup = 0x20; - u8 v_sup; - u8 luiclr = 0x01; - - inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ns = mem; - spt = spt_lut[(id_ns->dpc) & 0x07] << 3; - (id_ns->dps) ? 
(protect = 0x01) : (protect = 0); - grd_chk = protect << 2; - app_chk = protect << 1; - ref_chk = protect; - - /* nvme controller identify */ - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_free; - if (nvme_sc) { - res = nvme_sc; - goto out_free; - } - id_ctrl = mem; - v_sup = id_ctrl->vwc; - - memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; - inq_response[5] = uask_sup; - inq_response[6] = v_sup; - inq_response[7] = luiclr; - inq_response[8] = 0; - inq_response[9] = 0; - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - out_free: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out_dma: - kfree(inq_response); - out_mem: - return res; -} - -static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) -{ - __be32 max_sectors = cpu_to_be32( - nvme_block_nr(ns, queue_max_hw_sectors(ns->queue))); - __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors); - __be32 discard_desc_count = cpu_to_be32(0x100); - - memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); - inq_response[1] = VPD_BLOCK_LIMITS; - inq_response[3] = 0x3c; /* Page Length */ - memcpy(&inq_response[8], &max_sectors, sizeof(u32)); - memcpy(&inq_response[20], &max_discard, sizeof(u32)); - - if (max_discard) - memcpy(&inq_response[24], &discard_desc_count, sizeof(u32)); - - return nvme_trans_copy_to_user(hdr, inq_response, 0x3c); -} - -static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - u8 *inq_response; - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - - inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ - inq_response[2] = 0x00; /* Page Length MSB */ - inq_response[3] = 0x3C; /* Page Length LSB */ - inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ - inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ - inq_response[6] = 0x00; /* Form Factor */ - - xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); - - kfree(inq_response); - out_mem: - return res; -} - -/* LOG SENSE Helper Functions */ - -static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - u8 *log_response; - - log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; - log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; - log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; - - xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - kfree(log_response); - out_mem: - return res; -} - -static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, - 
struct sg_io_hdr *hdr, int alloc_len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - u8 *log_response; - struct nvme_command c; - struct nvme_dev *dev = ns->dev; - struct nvme_smart_log *smart_log; - dma_addr_t dma_addr; - void *mem; - u8 temp_c; - u16 temp_k; - - log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_smart_log), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - - /* Get SMART Log Page */ - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xFFFFFFFF); - c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); - res = nvme_submit_admin_cmd(dev, &c, NULL); - if (res != NVME_SC_SUCCESS) { - temp_c = LOG_TEMP_UNKNOWN; - } else { - smart_log = mem; - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c = temp_k - KELVIN_TEMP_FACTOR; - } - - log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; - /* Informational Exceptions Log Parameter 1 Start */ - /* Parameter Code=0x0000 bytes 4,5 */ - log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ - log_response[7] = 0x04; /* PARAMETER LENGTH */ - /* Add sense Code and qualifier = 0x00 each */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[10] = temp_c; - - xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), - mem, dma_addr); - out_dma: - kfree(log_response); - out_mem: - return res; -} - -static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, - int alloc_len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - u8 *log_response; - struct nvme_command c; - struct nvme_dev *dev = ns->dev; - struct nvme_smart_log *smart_log; - dma_addr_t dma_addr; - void *mem; - u32 feature_resp; - u8 temp_c_cur, temp_c_thresh; - u16 temp_k; - - log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); - if (log_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_smart_log), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out_dma; - } - - /* Get SMART Log Page */ - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(0xFFFFFFFF); - c.common.prp1 = cpu_to_le64(dma_addr); - c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) / - BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART); - res = nvme_submit_admin_cmd(dev, &c, NULL); - if (res != NVME_SC_SUCCESS) { - temp_c_cur = LOG_TEMP_UNKNOWN; - } else { - smart_log = mem; - temp_k = (smart_log->temperature[1] << 8) + - (smart_log->temperature[0]); - temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; - } - - /* Get Features for Temp Threshold */ - res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, - &feature_resp); - if (res != NVME_SC_SUCCESS) - temp_c_thresh = LOG_TEMP_UNKNOWN; - else - temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; - - log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; - /* Subpage=0x00, Page Length MSB=0 */ - log_response[3] = 
REMAINING_TEMP_PAGE_LENGTH; - /* Temperature Log Parameter 1 (Temperature) Start */ - /* Parameter Code = 0x0000 */ - log_response[6] = 0x01; /* Format and Linking = 01b */ - log_response[7] = 0x02; /* Parameter Length */ - /* Use Temperature from NVMe Get Log Page, convert to C from K */ - log_response[9] = temp_c_cur; - /* Temperature Log Parameter 2 (Reference Temperature) Start */ - log_response[11] = 0x01; /* Parameter Code = 0x0001 */ - log_response[12] = 0x01; /* Format and Linking = 01b */ - log_response[13] = 0x02; /* Parameter Length */ - /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ - log_response[15] = temp_c_thresh; - - xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); - res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); - - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), - mem, dma_addr); - out_dma: - kfree(log_response); - out_mem: - return res; -} - -/* MODE SENSE Helper Functions */ - -static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, - u16 mode_data_length, u16 blk_desc_len) -{ - /* Quick check to make sure I don't stomp on my own memory... */ - if ((cdb10 && len < 8) || (!cdb10 && len < 4)) - return SNTI_INTERNAL_ERROR; - - if (cdb10) { - resp[0] = (mode_data_length & 0xFF00) >> 8; - resp[1] = (mode_data_length & 0x00FF); - /* resp[2] and [3] are zero */ - resp[4] = llbaa; - resp[5] = RESERVED_FIELD; - resp[6] = (blk_desc_len & 0xFF00) >> 8; - resp[7] = (blk_desc_len & 0x00FF); - } else { - resp[0] = (mode_data_length & 0x00FF); - /* resp[1] and [2] are zero */ - resp[3] = (blk_desc_len & 0x00FF); - } - - return SNTI_TRANSLATION_SUCCESS; -} - -static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len, u8 llbaa) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ns *id_ns; - u8 flbas; - u32 lba_length; - - if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) - return SNTI_INTERNAL_ERROR; - else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) - return SNTI_INTERNAL_ERROR; - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - - if (llbaa == 0) { - __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap)); - /* Byte 4 is reserved */ - __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF); - - memcpy(resp, &tmp_cap, sizeof(u32)); - memcpy(&resp[4], &tmp_len, sizeof(u32)); - } else { - __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap)); - __be32 tmp_len = cpu_to_be32(lba_length); - - memcpy(resp, &tmp_cap, sizeof(u64)); - /* Bytes 8, 9, 10, 11 are reserved */ - memcpy(&resp[12], &tmp_len, sizeof(u32)); - } - - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out: - return res; -} - -static int nvme_trans_fill_control_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - if (len < MODE_PAGE_CONTROL_LEN) - return SNTI_INTERNAL_ERROR; - - resp[0] = MODE_PAGE_CONTROL; - resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; - resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, - * D_SENSE=1, GLTSD=1, RLEC=0 */ - 
resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ - /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ - resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ - /* resp[6] and [7] are obsolete, thus zero */ - resp[8] = 0xFF; /* Busy timeout period = 0xffff */ - resp[9] = 0xFF; - /* Bytes 10,11: Extended selftest completion time = 0x0000 */ - - return SNTI_TRANSLATION_SUCCESS; -} - -static int nvme_trans_fill_caching_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - u32 feature_resp; - u8 vwc; - - if (len < MODE_PAGE_CACHING_LEN) - return SNTI_INTERNAL_ERROR; - - nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, - &feature_resp); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out; - if (nvme_sc) { - res = nvme_sc; - goto out; - } - vwc = feature_resp & 0x00000001; - - resp[0] = MODE_PAGE_CACHING; - resp[1] = MODE_PAGE_CACHING_LEN_FIELD; - resp[2] = vwc << 2; - - out: - return res; -} - -static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - - if (len < MODE_PAGE_POW_CND_LEN) - return SNTI_INTERNAL_ERROR; - - resp[0] = MODE_PAGE_POWER_CONDITION; - resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; - /* All other bytes are zero */ - - return res; -} - -static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *resp, - int len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - - if (len < MODE_PAGE_INF_EXC_LEN) - return SNTI_INTERNAL_ERROR; - - resp[0] = MODE_PAGE_INFO_EXCEP; - resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; - resp[2] = 0x88; - /* All other bytes are zero */ - - return res; -} - -static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *resp, int len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u16 mode_pages_offset_1 = 0; - u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; - - mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; - mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; - mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; - - res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], - MODE_PAGE_CACHING_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], - MODE_PAGE_CONTROL_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], - MODE_PAGE_POW_CND_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], - MODE_PAGE_INF_EXC_LEN); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - - out: - return res; -} - -static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) -{ - if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { - /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ - return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; - } else { - return 0; - } -} - -static int nvme_trans_mode_page_create(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *cmd, - u16 alloc_len, u8 cdb10, - int (*mode_page_fill_func) - (struct nvme_ns *, - struct sg_io_hdr *hdr, u8 *, int), - u16 mode_pages_tot_len) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int xfer_len; - u8 *response; - u8 dbd, llbaa; - u16 resp_size; - int mph_size; - u16 mode_pages_offset_1; - u16 blk_desc_len, blk_desc_offset, 
mode_data_length; - - dbd = GET_MODE_SENSE_DBD(cmd); - llbaa = GET_MODE_SENSE_LLBAA(cmd); - mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10); - blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); - - resp_size = mph_size + blk_desc_len + mode_pages_tot_len; - /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ - mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; - - blk_desc_offset = mph_size; - mode_pages_offset_1 = blk_desc_offset + blk_desc_len; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, - llbaa, mode_data_length, blk_desc_len); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out_free; - if (blk_desc_len > 0) { - res = nvme_trans_fill_blk_desc(ns, hdr, - &response[blk_desc_offset], - blk_desc_len, llbaa); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out_free; - } - res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], - mode_pages_tot_len); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out_free; - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - out_free: - kfree(response); - out_mem: - return res; -} - -/* Read Capacity Helper Functions */ - -static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, - u8 cdb16) -{ - u8 flbas; - u32 lba_length; - u64 rlba; - u8 prot_en; - u8 p_type_lut[4] = {0, 0, 1, 2}; - __be64 tmp_rlba; - __be32 tmp_rlba_32; - __be32 tmp_len; - - flbas = (id_ns->flbas) & 0x0F; - lba_length = (1 << (id_ns->lbaf[flbas].ds)); - rlba = le64_to_cpup(&id_ns->nsze) - 1; - (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); - - if (!cdb16) { - if (rlba > 0xFFFFFFFF) - rlba = 0xFFFFFFFF; - tmp_rlba_32 = cpu_to_be32(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba_32, sizeof(u32)); - memcpy(&response[4], &tmp_len, sizeof(u32)); - } else { - tmp_rlba = cpu_to_be64(rlba); - tmp_len = cpu_to_be32(lba_length); - memcpy(response, &tmp_rlba, sizeof(u64)); - memcpy(&response[8], &tmp_len, sizeof(u32)); - response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; - /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ - /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ - /* Bytes 16-31 - Reserved */ - } -} - -/* Start Stop Unit Helper Functions */ - -static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 pc, u8 pcmod, u8 start) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ctrl *id_ctrl; - int lowest_pow_st; /* max npss = lowest power consumption */ - unsigned ps_desired = 0; - - /* NVMe Controller Identify */ - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_id_ctrl), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ctrl = mem; - lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); - - switch (pc) { - case NVME_POWER_STATE_START_VALID: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0 && start == 0x1) - ps_desired = POWER_STATE_0; - if (pcmod == 0 && start == 0x0) - ps_desired = lowest_pow_st; - break; - case NVME_POWER_STATE_ACTIVE: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0) - ps_desired = POWER_STATE_0; - 
break; - case NVME_POWER_STATE_IDLE: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ - if (pcmod == 0x0) - ps_desired = POWER_STATE_1; - else if (pcmod == 0x1) - ps_desired = POWER_STATE_2; - else if (pcmod == 0x2) - ps_desired = POWER_STATE_3; - break; - case NVME_POWER_STATE_STANDBY: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ - if (pcmod == 0x0) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); - else if (pcmod == 0x1) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); - break; - case NVME_POWER_STATE_LU_CONTROL: - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, - NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) - res = nvme_sc; - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, - dma_addr); - out: - return res; -} - -/* Write Buffer Helper Functions */ -/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */ - -static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 opcode, u32 tot_len, u32 offset, - u8 buffer_id) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - struct nvme_command c; - struct nvme_iod *iod = NULL; - unsigned length; - - memset(&c, 0, sizeof(c)); - c.common.opcode = opcode; - if (opcode == nvme_admin_download_fw) { - if (hdr->iovec_count > 0) { - /* Assuming SGL is not allowed for this command */ - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - iod = nvme_map_user_pages(dev, DMA_TO_DEVICE, - (unsigned long)hdr->dxferp, tot_len); - if (IS_ERR(iod)) { - res = PTR_ERR(iod); - goto out; - } - length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL); - if (length != tot_len) { - res = -ENOMEM; - goto out_unmap; - } - - c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.dlfw.prp2 = cpu_to_le64(iod->first_dma); - c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); - c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - } else if (opcode == nvme_admin_activate_fw) { - u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV; - c.common.cdw10[0] = cpu_to_le32(cdw10); - } - - nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_unmap; - if (nvme_sc) - res = nvme_sc; - - out_unmap: - if (opcode == nvme_admin_download_fw) { - nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod); - nvme_free_iod(dev, iod); - } - out: - return res; -} - -/* Mode Select Helper Functions */ - -static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10, - u16 *bd_len, u8 *llbaa) -{ - if (cdb10) { - /* 10 Byte CDB */ - *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) + - parm_list[MODE_SELECT_10_BD_OFFSET + 1]; - *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] & - MODE_SELECT_10_LLBAA_MASK; - } else { - /* 6 Byte CDB */ - *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET]; - } -} - -static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list, - u16 idx, u16 bd_len, u8 llbaa) -{ - u16 bd_num; - - bd_num = bd_len / ((llbaa == 0) ? - SHORT_DESC_BLOCK : LONG_DESC_BLOCK); - /* Store block descriptor info if a FORMAT UNIT comes later */ - /* TODO Saving 1st BD info; what to do if multiple BD received? 
*/ - if (llbaa == 0) { - /* Standard Block Descriptor - spc4r34 7.5.5.1 */ - ns->mode_select_num_blocks = - (parm_list[idx + 1] << 16) + - (parm_list[idx + 2] << 8) + - (parm_list[idx + 3]); - - ns->mode_select_block_len = - (parm_list[idx + 5] << 16) + - (parm_list[idx + 6] << 8) + - (parm_list[idx + 7]); - } else { - /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */ - ns->mode_select_num_blocks = - (((u64)parm_list[idx + 0]) << 56) + - (((u64)parm_list[idx + 1]) << 48) + - (((u64)parm_list[idx + 2]) << 40) + - (((u64)parm_list[idx + 3]) << 32) + - (((u64)parm_list[idx + 4]) << 24) + - (((u64)parm_list[idx + 5]) << 16) + - (((u64)parm_list[idx + 6]) << 8) + - ((u64)parm_list[idx + 7]); - - ns->mode_select_block_len = - (parm_list[idx + 12] << 24) + - (parm_list[idx + 13] << 16) + - (parm_list[idx + 14] << 8) + - (parm_list[idx + 15]); - } -} - -static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *mode_page, u8 page_code) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - unsigned dword11; - - switch (page_code) { - case MODE_PAGE_CACHING: - dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); - nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, - 0, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - break; - if (nvme_sc) { - res = nvme_sc; - break; - } - break; - case MODE_PAGE_CONTROL: - break; - case MODE_PAGE_POWER_CONDITION: - /* Verify the OS is not trying to set timers */ - if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - if (!res) - res = SNTI_INTERNAL_ERROR; - break; - } - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - if (!res) - res = SNTI_INTERNAL_ERROR; - break; - } - - return res; -} - -static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd, u16 parm_list_len, u8 pf, - u8 sp, u8 cdb10) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 *parm_list; - u16 bd_len; - u8 llbaa = 0; - u16 index, saved_index; - u8 page_code; - u16 mp_size; - - /* Get parm list from data-in/out buffer */ - parm_list = kmalloc(parm_list_len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - - res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out_mem; - - nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); - index = (cdb10) ? 
(MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); - - if (bd_len != 0) { - /* Block Descriptors present, parse */ - nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); - index += bd_len; - } - saved_index = index; - - /* Multiple mode pages may be present; iterate through all */ - /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - if ((page_code != MODE_PAGE_CACHING) && - (page_code != MODE_PAGE_CONTROL) && - (page_code != MODE_PAGE_POWER_CONDITION)) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - index += mp_size; - } while (index < parm_list_len); - - /* In 2nd Iteration, do the NVME Commands */ - index = saved_index; - do { - page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; - mp_size = parm_list[index + 1] + 2; - res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], - page_code); - if (res != SNTI_TRANSLATION_SUCCESS) - break; - index += mp_size; - } while (index < parm_list_len); - - out_mem: - kfree(parm_list); - out: - return res; -} - -/* Format Unit Helper Functions */ - -static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, - struct sg_io_hdr *hdr) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ns *id_ns; - u8 flbas; - - /* - * SCSI Expects a MODE SELECT would have been issued prior to - * a FORMAT UNIT, and the block size and number would be used - * from the block descriptor in it. If a MODE SELECT had not - * been issued, FORMAT shall use the current values for both. 
- */ - - if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; - - if (ns->mode_select_num_blocks == 0) - ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap); - if (ns->mode_select_block_len == 0) { - flbas = (id_ns->flbas) & 0x0F; - ns->mode_select_block_len = - (1 << (id_ns->lbaf[flbas].ds)); - } - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - mem, dma_addr); - } - out: - return res; -} - -static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, - u8 format_prot_info, u8 *nvme_pf_code) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 *parm_list; - u8 pf_usage, pf_code; - - parm_list = kmalloc(len, GFP_KERNEL); - if (parm_list == NULL) { - res = -ENOMEM; - goto out; - } - res = nvme_trans_copy_from_user(hdr, parm_list, len); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out_mem; - - if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & - FORMAT_UNIT_IMMED_MASK) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - - if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && - (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_mem; - } - pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & - FORMAT_UNIT_PROT_FIELD_USAGE_MASK; - pf_code = (pf_usage << 2) | format_prot_info; - switch (pf_code) { - case 0: - *nvme_pf_code = 0; - break; - case 2: - *nvme_pf_code = 1; - break; - case 3: - *nvme_pf_code = 2; - break; - case 7: - *nvme_pf_code = 3; - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out_mem: - kfree(parm_list); - out: - return res; -} - -static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 prot_info) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ns *id_ns; - u8 i; - u8 flbas, nlbaf; - u8 selected_lbaf = 0xFF; - u32 cdw10 = 0; - struct nvme_command c; - - /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; - flbas = (id_ns->flbas) & 0x0F; - nlbaf = id_ns->nlbaf; - - for (i = 0; i < nlbaf; i++) { - if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { - selected_lbaf = i; - break; - } - } - if (selected_lbaf > 0x0F) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) { - res = nvme_trans_completion(hdr, 
SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - - cdw10 |= prot_info << 5; - cdw10 |= selected_lbaf & 0x0F; - memset(&c, 0, sizeof(c)); - c.format.opcode = nvme_admin_format_nvm; - c.format.nsid = cpu_to_le32(ns->ns_id); - c.format.cdw10 = cpu_to_le32(cdw10); - - nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) - res = nvme_sc; - - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out: - return res; -} - -/* Read/Write Helper Functions */ - -static inline void nvme_trans_get_io_cdb6(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = 0; - cdb_info->prot_info = 0; - cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) & - IO_6_CDB_LBA_MASK; - cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET); - - /* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */ - if (cdb_info->xfer_len == 0) - cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN; -} - -static inline void nvme_trans_get_io_cdb10(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) & - IO_CDB_FUA_MASK; - cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) & - IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; - cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET); - cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET); -} - -static inline void nvme_trans_get_io_cdb12(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) & - IO_CDB_FUA_MASK; - cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) & - IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; - cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET); - cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET); -} - -static inline void nvme_trans_get_io_cdb16(u8 *cmd, - struct nvme_trans_io_cdb *cdb_info) -{ - cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) & - IO_CDB_FUA_MASK; - cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) & - IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT; - cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET); - cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET); -} - -static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, - u32 max_blocks) -{ - /* If using iovecs, send one nvme command per vector */ - if (hdr->iovec_count > 0) - return hdr->iovec_count; - else if (cdb_info->xfer_len > max_blocks) - return ((cdb_info->xfer_len - 1) / max_blocks) + 1; - else - return 1; -} - -static u16 nvme_trans_io_get_control(struct nvme_ns *ns, - struct nvme_trans_io_cdb *cdb_info) -{ - u16 control = 0; - - /* When Protection information support is added, implement here */ - - if (cdb_info->fua > 0) - control |= NVME_RW_FUA; - - return control; -} - -static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, - struct nvme_trans_io_cdb *cdb_info, u8 is_write) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_dev *dev = ns->dev; - u32 num_cmds; - struct nvme_iod *iod; - u64 unit_len; - u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */ - u32 retcode; - u32 i = 0; - u64 nvme_offset = 0; - void __user *next_mapping_addr; - struct nvme_command c; - u8 opcode = (is_write ? 
nvme_cmd_write : nvme_cmd_read); - u16 control; - u32 max_blocks = queue_max_hw_sectors(ns->queue); - - num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); - - /* - * This loop handles two cases. - * First, when an SGL is used in the form of an iovec list: - * - Use iov_base as the next mapping address for the nvme command_id - * - Use iov_len as the data transfer length for the command. - * Second, when we have a single buffer - * - If larger than max_blocks, split into chunks, offset - * each nvme command accordingly. - */ - for (i = 0; i < num_cmds; i++) { - memset(&c, 0, sizeof(c)); - if (hdr->iovec_count > 0) { - struct sg_iovec sgl; - - retcode = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (retcode) - return -EFAULT; - unit_len = sgl.iov_len; - unit_num_blocks = unit_len >> ns->lba_shift; - next_mapping_addr = sgl.iov_base; - } else { - unit_num_blocks = min((u64)max_blocks, - (cdb_info->xfer_len - nvme_offset)); - unit_len = unit_num_blocks << ns->lba_shift; - next_mapping_addr = hdr->dxferp + - ((1 << ns->lba_shift) * nvme_offset); - } - - c.rw.opcode = opcode; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); - c.rw.length = cpu_to_le16(unit_num_blocks - 1); - control = nvme_trans_io_get_control(ns, cdb_info); - c.rw.control = cpu_to_le16(control); - - iod = nvme_map_user_pages(dev, - (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - (unsigned long)next_mapping_addr, unit_len); - if (IS_ERR(iod)) { - res = PTR_ERR(iod); - goto out; - } - retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL); - if (retcode != unit_len) { - nvme_unmap_user_pages(dev, - (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - iod); - nvme_free_iod(dev, iod); - res = -ENOMEM; - goto out; - } - c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.rw.prp2 = cpu_to_le64(iod->first_dma); - - nvme_offset += unit_num_blocks; - - nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); - if (nvme_sc != NVME_SC_SUCCESS) { - nvme_unmap_user_pages(dev, - (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - iod); - nvme_free_iod(dev, iod); - res = nvme_trans_status_code(hdr, nvme_sc); - goto out; - } - nvme_unmap_user_pages(dev, - (is_write) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, - iod); - nvme_free_iod(dev, iod); - } - res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); - - out: - return res; -} - - -/* SCSI Command Translation Functions */ - -static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - struct nvme_trans_io_cdb cdb_info; - u8 opcode = cmd[0]; - u64 xfer_bytes; - u64 sum_iov_len = 0; - struct sg_iovec sgl; - int i; - size_t not_copied; - - /* Extract Fields from CDB */ - switch (opcode) { - case WRITE_6: - case READ_6: - nvme_trans_get_io_cdb6(cmd, &cdb_info); - break; - case WRITE_10: - case READ_10: - nvme_trans_get_io_cdb10(cmd, &cdb_info); - break; - case WRITE_12: - case READ_12: - nvme_trans_get_io_cdb12(cmd, &cdb_info); - break; - case WRITE_16: - case READ_16: - nvme_trans_get_io_cdb16(cmd, &cdb_info); - break; - default: - /* Will never really reach here */ - res = SNTI_INTERNAL_ERROR; - goto out; - } - - /* Calculate total length of transfer (in bytes) */ - if (hdr->iovec_count > 0) { - for (i = 0; i < hdr->iovec_count; i++) { - not_copied = copy_from_user(&sgl, hdr->dxferp + - i * sizeof(struct sg_iovec), - sizeof(struct sg_iovec)); - if (not_copied) - return -EFAULT; - sum_iov_len += sgl.iov_len; - /* IO vector sizes should be multiples of block size */ - if (sgl.iov_len % (1 << ns->lba_shift) != 0) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_PARAMETER, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - } - } else { - sum_iov_len = hdr->dxfer_len; - } - - /* As Per sg ioctl howto, if the lengths differ, use the lower one */ - xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); - - /* If block count and actual data buffer size dont match, error out */ - if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { - res = -EINVAL; - goto out; - } - - /* Check for 0 length transfer - it is not illegal */ - if (cdb_info.xfer_len == 0) - goto out; - - /* Send NVMe IO Command(s) */ - res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - - out: - return res; -} - -static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 evpd; - u8 page_code; - int alloc_len; - u8 *inq_response; - - evpd = GET_INQ_EVPD_BIT(cmd); - page_code = GET_INQ_PAGE_CODE(cmd); - alloc_len = GET_INQ_ALLOC_LENGTH(cmd); - - inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH), - GFP_KERNEL); - if (inq_response == NULL) { - res = -ENOMEM; - goto out_mem; - } - - if (evpd == 0) { - if (page_code == INQ_STANDARD_INQUIRY_PAGE) { - res = nvme_trans_standard_inquiry_page(ns, hdr, - inq_response, alloc_len); - } else { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - } else { - switch (page_code) { - case VPD_SUPPORTED_PAGES: - res = nvme_trans_supported_vpd_pages(ns, hdr, - inq_response, alloc_len); - break; - case VPD_SERIAL_NUMBER: - res = nvme_trans_unit_serial_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_DEVICE_IDENTIFIERS: - res = nvme_trans_device_id_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_EXTENDED_INQUIRY: - res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); - break; - case VPD_BLOCK_LIMITS: - res = nvme_trans_bdev_limits_page(ns, hdr, inq_response, - alloc_len); - break; - case VPD_BLOCK_DEV_CHARACTERISTICS: - res = 
nvme_trans_bdev_char_page(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, - SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - } - kfree(inq_response); - out_mem: - return res; -} - -static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u16 alloc_len; - u8 sp; - u8 pc; - u8 page_code; - - sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET); - if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET); - page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK; - pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT; - if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET); - switch (page_code) { - case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE: - res = nvme_trans_log_supp_pages(ns, hdr, alloc_len); - break; - case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE: - res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len); - break; - case LOG_PAGE_TEMPERATURE_PAGE: - res = nvme_trans_log_temperature(ns, hdr, alloc_len); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 cdb10 = 0; - u16 parm_list_len; - u8 page_format; - u8 save_pages; - - page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET); - page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK; - - save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET); - save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK; - - if (GET_OPCODE(cmd) == MODE_SELECT) { - parm_list_len = GET_U8_FROM_CDB(cmd, - MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET); - } else { - parm_list_len = GET_U16_FROM_CDB(cmd, - MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET); - cdb10 = 1; - } - - if (parm_list_len != 0) { - /* - * According to SPC-4 r24, a paramter list length field of 0 - * shall not be considered an error - */ - res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len, - page_format, save_pages, cdb10); - } - - return res; -} - -static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u16 alloc_len; - u8 cdb10 = 0; - u8 page_code; - u8 pc; - - if (GET_OPCODE(cmd) == MODE_SENSE) { - alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET); - } else { - alloc_len = GET_U16_FROM_CDB(cmd, - MODE_SENSE10_ALLOC_LEN_OFFSET); - cdb10 = 1; - } - - pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) & - MODE_SENSE_PAGE_CONTROL_MASK; - if (pc != MODE_SENSE_PC_CURRENT_VALUES) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) & - MODE_SENSE_PAGE_CODE_MASK; - switch (page_code) { - case MODE_PAGE_CACHING: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - 
&nvme_trans_fill_caching_page, - MODE_PAGE_CACHING_LEN); - break; - case MODE_PAGE_CONTROL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_control_page, - MODE_PAGE_CONTROL_LEN); - break; - case MODE_PAGE_POWER_CONDITION: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_pow_cnd_page, - MODE_PAGE_POW_CND_LEN); - break; - case MODE_PAGE_INFO_EXCEP: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_inf_exc_page, - MODE_PAGE_INF_EXC_LEN); - break; - case MODE_PAGE_RETURN_ALL: - res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, - cdb10, - &nvme_trans_fill_all_pages, - MODE_PAGE_ALL_LEN); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - u32 alloc_len = READ_CAP_10_RESP_SIZE; - u32 resp_size = READ_CAP_10_RESP_SIZE; - u32 xfer_len; - u8 cdb16; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ns *id_ns; - u8 *response; - - cdb16 = IS_READ_CAP_16(cmd); - if (cdb16) { - alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd); - resp_size = READ_CAP_16_RESP_SIZE; - } - - mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - /* nvme ns identify */ - nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ns = mem; - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_dma; - } - nvme_trans_fill_read_cap(response, id_ns, cdb16); - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, - dma_addr); - out: - return res; -} - -static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - u32 alloc_len, xfer_len, resp_size; - u8 select_report; - u8 *response; - struct nvme_dev *dev = ns->dev; - dma_addr_t dma_addr; - void *mem; - struct nvme_id_ctrl *id_ctrl; - u32 ll_length, lun_id; - u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; - __be32 tmp_len; - - alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); - select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET); - - if ((select_report != ALL_LUNS_RETURNED) && - (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) && - (select_report != RESTRICTED_LUNS_RETURNED)) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } else { - /* NVMe Controller Identify */ - mem = dma_alloc_coherent(&dev->pci_dev->dev, - sizeof(struct nvme_id_ctrl), - &dma_addr, GFP_KERNEL); - if (mem == NULL) { - res = -ENOMEM; - goto out; - } - nvme_sc = nvme_identify(dev, 0, 1, dma_addr); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out_dma; - if (nvme_sc) { - res = nvme_sc; - goto out_dma; - } - id_ctrl = mem; - ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE; - resp_size = ll_length + LUN_DATA_HEADER_SIZE; - - if 
(alloc_len < resp_size) { - res = nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out_dma; - } - - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out_dma; - } - - /* The first LUN ID will always be 0 per the SAM spec */ - for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) { - /* - * Set the LUN Id and then increment to the next LUN - * location in the parameter data. - */ - __be64 tmp_id = cpu_to_be64(lun_id); - memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); - lun_id_offset += LUN_ENTRY_SIZE; - } - tmp_len = cpu_to_be32(ll_length); - memcpy(response, &tmp_len, sizeof(u32)); - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out_dma: - dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, - dma_addr); - out: - return res; -} - -static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 alloc_len, xfer_len, resp_size; - u8 desc_format; - u8 *response; - - alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd); - desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET); - desc_format &= REQUEST_SENSE_DESC_MASK; - - resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : - (FIXED_FMT_SENSE_DATA_SIZE)); - response = kzalloc(resp_size, GFP_KERNEL); - if (response == NULL) { - res = -ENOMEM; - goto out; - } - - if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { - /* Descriptor Format Sense Data */ - response[0] = DESC_FORMAT_SENSE_DATA; - response[1] = NO_SENSE; - /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ - response[2] = SCSI_ASC_NO_SENSE; - response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ - } else { - /* Fixed Format Sense Data */ - response[0] = FIXED_SENSE_DATA; - /* Byte 1 = Obsolete */ - response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ - /* Bytes 3-6 - Information - set to zero */ - response[7] = FIXED_SENSE_DATA_ADD_LENGTH; - /* Bytes 8-11 - Cmd Specific Information - set to zero */ - response[12] = SCSI_ASC_NO_SENSE; - response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; - /* Byte 14 = Field Replaceable Unit Code = 0 */ - /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ - } - - xfer_len = min(alloc_len, resp_size); - res = nvme_trans_copy_to_user(hdr, response, xfer_len); - - kfree(response); - out: - return res; -} - -static int nvme_trans_security_protocol(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *cmd) -{ - return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); -} - -static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_command c; - u8 immed, pcmod, pc, no_flush, start; - - immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); - pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET); - pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET); - no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET); - start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET); - - immed &= START_STOP_UNIT_CDB_IMMED_MASK; - pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK; - pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT; - no_flush &= 
START_STOP_UNIT_CDB_NO_FLUSH_MASK; - start &= START_STOP_UNIT_CDB_START_MASK; - - if (immed != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } else { - if (no_flush == 0) { - /* Issue NVME FLUSH command prior to START STOP UNIT */ - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out; - if (nvme_sc) { - res = nvme_sc; - goto out; - } - } - /* Setup the expected power state transition */ - res = nvme_trans_power_state(ns, hdr, pc, pcmod, start); - } - - out: - return res; -} - -static int nvme_trans_synchronize_cache(struct nvme_ns *ns, - struct sg_io_hdr *hdr, u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - int nvme_sc; - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_cmd_flush; - c.common.nsid = cpu_to_le32(ns->ns_id); - - nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); - - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - goto out; - if (nvme_sc) - res = nvme_sc; - - out: - return res; -} - -static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u8 parm_hdr_len = 0; - u8 nvme_pf_code = 0; - u8 format_prot_info, long_list, format_data; - - format_prot_info = GET_U8_FROM_CDB(cmd, - FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET); - long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET); - format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET); - - format_prot_info = (format_prot_info & - FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >> - FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT; - long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK; - format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK; - - if (format_data != 0) { - if (format_prot_info != 0) { - if (long_list == 0) - parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; - else - parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; - } - } else if (format_data == 0 && format_prot_info != 0) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - - /* Get parm header from data-in/out buffer */ - /* - * According to the translation spec, the only fields in the parameter - * list we are concerned with are in the header. So allocate only that. 
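nvme_trans_power_state(), invoked at the end of the START STOP UNIT path above, is not visible in this hunk; in rough terms it issues an NVMe Set Features (Power Management) command for the requested state. A sketch under that assumption, with ps_from_power_condition() as a hypothetical mapping of the SCSI POWER CONDITION nibble to an NVMe power state:

    struct nvme_command c;

    memset(&c, 0, sizeof(c));
    c.features.opcode  = nvme_admin_set_features;
    c.features.fid     = cpu_to_le32(NVME_FEAT_POWER_MGMT);        /* feature id 0x02 */
    c.features.dword11 = cpu_to_le32(ps_from_power_condition(pc, pcmod, start));

    nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
    res = nvme_trans_status_code(hdr, nvme_sc);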
- */ - if (parm_hdr_len > 0) { - res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, - format_prot_info, &nvme_pf_code); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - } - - /* Attempt to activate any previously downloaded firmware image */ - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0); - - /* Determine Block size and count and send format command */ - res = nvme_trans_fmt_set_blk_size_count(ns, hdr); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - - res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); - - out: - return res; -} - -static int nvme_trans_test_unit_ready(struct nvme_ns *ns, - struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - struct nvme_dev *dev = ns->dev; - - if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - NOT_READY, SCSI_ASC_LUN_NOT_READY, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - else - res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); - - return res; -} - -static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - int res = SNTI_TRANSLATION_SUCCESS; - u32 buffer_offset, parm_list_length; - u8 buffer_id, mode; - - parm_list_length = - GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET); - if (parm_list_length % BYTES_TO_DWORDS != 0) { - /* NVMe expects Firmware file to be a whole number of DWORDS */ - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET); - if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - goto out; - } - mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) & - WRITE_BUFFER_CDB_MODE_MASK; - buffer_offset = - GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET); - - switch (mode) { - case DOWNLOAD_SAVE_ACTIVATE: - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, - parm_list_length, buffer_offset, - buffer_id); - break; - case DOWNLOAD_SAVE_DEFER_ACTIVATE: - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, - parm_list_length, buffer_offset, - buffer_id); - break; - case ACTIVATE_DEFERRED_MICROCODE: - res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, - parm_list_length, buffer_offset, - buffer_id); - break; - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - - out: - return res; -} - -struct scsi_unmap_blk_desc { - __be64 slba; - __be32 nlb; - u32 resv; -}; - -struct scsi_unmap_parm_list { - __be16 unmap_data_len; - __be16 unmap_blk_desc_data_len; - u32 resv; - struct scsi_unmap_blk_desc desc[0]; -}; - -static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *cmd) -{ - struct nvme_dev *dev = ns->dev; - struct scsi_unmap_parm_list *plist; - struct nvme_dsm_range *range; - struct nvme_command c; - int i, nvme_sc, res = -ENOMEM; - u16 ndesc, list_len; - dma_addr_t dma_addr; - - list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET); - if (!list_len) - return -EINVAL; - - plist = kmalloc(list_len, GFP_KERNEL); - if (!plist) - return 
-ENOMEM; - - res = nvme_trans_copy_from_user(hdr, plist, list_len); - if (res != SNTI_TRANSLATION_SUCCESS) - goto out; - - ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4; - if (!ndesc || ndesc > 256) { - res = -EINVAL; - goto out; - } - - range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), - &dma_addr, GFP_KERNEL); - if (!range) - goto out; - - for (i = 0; i < ndesc; i++) { - range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb)); - range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba)); - range[i].cattr = 0; - } - - memset(&c, 0, sizeof(c)); - c.dsm.opcode = nvme_cmd_dsm; - c.dsm.nsid = cpu_to_le32(ns->ns_id); - c.dsm.prp1 = cpu_to_le64(dma_addr); - c.dsm.nr = cpu_to_le32(ndesc - 1); - c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - - nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); - res = nvme_trans_status_code(hdr, nvme_sc); - - dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), - range, dma_addr); - out: - kfree(plist); - return res; -} - -static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) -{ - u8 cmd[BLK_MAX_CDB]; - int retcode; - unsigned int opcode; - - if (hdr->cmdp == NULL) - return -EMSGSIZE; - if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) - return -EFAULT; - - /* - * Prime the hdr with good status for scsi commands that don't require - * an nvme command for translation. - */ - retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); - if (retcode) - return retcode; - - opcode = cmd[0]; - - switch (opcode) { - case READ_6: - case READ_10: - case READ_12: - case READ_16: - retcode = nvme_trans_io(ns, hdr, 0, cmd); - break; - case WRITE_6: - case WRITE_10: - case WRITE_12: - case WRITE_16: - retcode = nvme_trans_io(ns, hdr, 1, cmd); - break; - case INQUIRY: - retcode = nvme_trans_inquiry(ns, hdr, cmd); - break; - case LOG_SENSE: - retcode = nvme_trans_log_sense(ns, hdr, cmd); - break; - case MODE_SELECT: - case MODE_SELECT_10: - retcode = nvme_trans_mode_select(ns, hdr, cmd); - break; - case MODE_SENSE: - case MODE_SENSE_10: - retcode = nvme_trans_mode_sense(ns, hdr, cmd); - break; - case READ_CAPACITY: - retcode = nvme_trans_read_capacity(ns, hdr, cmd); - break; - case SERVICE_ACTION_IN_16: - if (IS_READ_CAP_16(cmd)) - retcode = nvme_trans_read_capacity(ns, hdr, cmd); - else - goto out; - break; - case REPORT_LUNS: - retcode = nvme_trans_report_luns(ns, hdr, cmd); - break; - case REQUEST_SENSE: - retcode = nvme_trans_request_sense(ns, hdr, cmd); - break; - case SECURITY_PROTOCOL_IN: - case SECURITY_PROTOCOL_OUT: - retcode = nvme_trans_security_protocol(ns, hdr, cmd); - break; - case START_STOP: - retcode = nvme_trans_start_stop(ns, hdr, cmd); - break; - case SYNCHRONIZE_CACHE: - retcode = nvme_trans_synchronize_cache(ns, hdr, cmd); - break; - case FORMAT_UNIT: - retcode = nvme_trans_format_unit(ns, hdr, cmd); - break; - case TEST_UNIT_READY: - retcode = nvme_trans_test_unit_ready(ns, hdr, cmd); - break; - case WRITE_BUFFER: - retcode = nvme_trans_write_buffer(ns, hdr, cmd); - break; - case UNMAP: - retcode = nvme_trans_unmap(ns, hdr, cmd); - break; - default: - out: - retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - return retcode; -} - -int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) -{ - struct sg_io_hdr hdr; - int retcode; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&hdr, u_hdr, sizeof(hdr))) - return -EFAULT; - if (hdr.interface_id != 'S') - 
return -EINVAL; - if (hdr.cmd_len > BLK_MAX_CDB) - return -EINVAL; - - retcode = nvme_scsi_translate(ns, &hdr); - if (retcode < 0) - return retcode; - if (retcode > 0) - retcode = SNTI_TRANSLATION_SUCCESS; - if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0) - return -EFAULT; - - return retcode; -} - -int nvme_sg_get_version_num(int __user *ip) -{ - return put_user(sg_version_num, ip); -} diff --git a/kernel/drivers/block/osdblk.c b/kernel/drivers/block/osdblk.c index e22942596..1b709a4e3 100644 --- a/kernel/drivers/block/osdblk.c +++ b/kernel/drivers/block/osdblk.c @@ -271,7 +271,7 @@ static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) goto err_out; tmp->bi_bdev = NULL; - gfpmask &= ~__GFP_WAIT; + gfpmask &= ~__GFP_DIRECT_RECLAIM; tmp->bi_next = NULL; if (!new_chain) diff --git a/kernel/drivers/block/paride/paride.c b/kernel/drivers/block/paride/paride.c index 48c50f11f..0e287993b 100644 --- a/kernel/drivers/block/paride/paride.c +++ b/kernel/drivers/block/paride/paride.c @@ -30,6 +30,7 @@ #include #include /* TASK_* */ #include +#include #include "paride.h" @@ -244,17 +245,19 @@ void paride_unregister(PIP * pr) EXPORT_SYMBOL(paride_unregister); -static int pi_register_parport(PIA * pi, int verbose) +static int pi_register_parport(PIA *pi, int verbose, int unit) { struct parport *port; + struct pardev_cb par_cb; port = parport_find_base(pi->port); if (!port) return 0; - - pi->pardev = parport_register_device(port, - pi->device, NULL, - pi_wake_up, NULL, 0, (void *) pi); + memset(&par_cb, 0, sizeof(par_cb)); + par_cb.wakeup = pi_wake_up; + par_cb.private = (void *)pi; + pi->pardev = parport_register_dev_model(port, pi->device, &par_cb, + unit); parport_put_port(port); if (!pi->pardev) return 0; @@ -311,7 +314,7 @@ static int pi_probe_unit(PIA * pi, int unit, char *scratch, int verbose) e = pi->proto->max_units; } - if (!pi_register_parport(pi, verbose)) + if (!pi_register_parport(pi, verbose, s)) return 0; if (pi->proto->test_port) { @@ -432,3 +435,45 @@ int pi_init(PIA * pi, int autoprobe, int port, int mode, } EXPORT_SYMBOL(pi_init); + +static int pi_probe(struct pardevice *par_dev) +{ + struct device_driver *drv = par_dev->dev.driver; + int len = strlen(drv->name); + + if (strncmp(par_dev->name, drv->name, len)) + return -ENODEV; + + return 0; +} + +void *pi_register_driver(char *name) +{ + struct parport_driver *parp_drv; + int ret; + + parp_drv = kzalloc(sizeof(*parp_drv), GFP_KERNEL); + if (!parp_drv) + return NULL; + + parp_drv->name = name; + parp_drv->probe = pi_probe; + parp_drv->devmodel = true; + + ret = parport_register_driver(parp_drv); + if (ret) { + kfree(parp_drv); + return NULL; + } + return (void *)parp_drv; +} +EXPORT_SYMBOL(pi_register_driver); + +void pi_unregister_driver(void *_drv) +{ + struct parport_driver *drv = _drv; + + parport_unregister_driver(drv); + kfree(drv); +} +EXPORT_SYMBOL(pi_unregister_driver); diff --git a/kernel/drivers/block/paride/paride.h b/kernel/drivers/block/paride/paride.h index 2bddbf455..ddb9e589d 100644 --- a/kernel/drivers/block/paride/paride.h +++ b/kernel/drivers/block/paride/paride.h @@ -165,6 +165,8 @@ typedef struct pi_protocol PIP; extern int paride_register( PIP * ); extern void paride_unregister ( PIP * ); +void *pi_register_driver(char *); +void pi_unregister_driver(void *); #endif /* __DRIVERS_PARIDE_H__ */ /* end of paride.h */ diff --git a/kernel/drivers/block/paride/pcd.c b/kernel/drivers/block/paride/pcd.c index 3b7c9f1be..93362362a 100644 --- a/kernel/drivers/block/paride/pcd.c +++ 
b/kernel/drivers/block/paride/pcd.c @@ -221,6 +221,7 @@ static int pcd_busy; /* request being processed ? */ static int pcd_sector; /* address of next requested sector */ static int pcd_count; /* number of blocks still to do */ static char *pcd_buf; /* buffer for request in progress */ +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -690,6 +691,12 @@ static int pcd_detect(void) printk("%s: %s version %s, major %d, nice %d\n", name, name, PCD_VERSION, major, nice); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + k = 0; if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ cd = pcd; @@ -723,6 +730,7 @@ static int pcd_detect(void) printk("%s: No CD-ROM drive found\n", name); for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) put_disk(cd->disk); + pi_unregister_driver(par_drv); return -1; } @@ -984,6 +992,7 @@ static void __exit pcd_exit(void) } blk_cleanup_queue(pcd_queue); unregister_blkdev(major, name); + pi_unregister_driver(par_drv); } MODULE_LICENSE("GPL"); diff --git a/kernel/drivers/block/paride/pd.c b/kernel/drivers/block/paride/pd.c index d48715b28..562b5a4ca 100644 --- a/kernel/drivers/block/paride/pd.c +++ b/kernel/drivers/block/paride/pd.c @@ -247,6 +247,8 @@ static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR", "IDNF", "MC", "UNC", "???", "TMO" }; +static void *par_drv; /* reference of parport driver */ + static inline int status_reg(struct pd_unit *disk) { return pi_read_regr(disk->pi, 1, 6); @@ -442,7 +444,7 @@ static char *pd_buf; /* buffer for request in progress */ static enum action do_pd_io_start(void) { - if (pd_req->cmd_type == REQ_TYPE_SPECIAL) { + if (pd_req->cmd_type == REQ_TYPE_DRV_PRIV) { phase = pd_special; return pd_special(); } @@ -721,11 +723,11 @@ static int pd_special_command(struct pd_unit *disk, struct request *rq; int err = 0; - rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT); + rq = blk_get_request(disk->gd->queue, READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); - rq->cmd_type = REQ_TYPE_SPECIAL; + rq->cmd_type = REQ_TYPE_DRV_PRIV; rq->special = func; err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0); @@ -872,6 +874,12 @@ static int pd_detect(void) pd_drive_count++; } + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */ disk = pd; if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch, @@ -902,8 +910,10 @@ static int pd_detect(void) found = 1; } } - if (!found) + if (!found) { printk("%s: no valid drive found\n", name); + pi_unregister_driver(par_drv); + } return found; } diff --git a/kernel/drivers/block/paride/pf.c b/kernel/drivers/block/paride/pf.c index 9a15fd3c9..7a7d977a7 100644 --- a/kernel/drivers/block/paride/pf.c +++ b/kernel/drivers/block/paride/pf.c @@ -264,6 +264,7 @@ static int pf_cmd; /* current command READ/WRITE */ static struct pf_unit *pf_current;/* unit of current request */ static int pf_mask; /* stopper for pseudo-int */ static char *pf_buf; /* buffer for request in progress */ +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -703,6 +704,11 @@ static int pf_detect(void) printk("%s: %s version %s, major %d, cluster %d, nice %d\n", name, name, PF_VERSION, major, cluster, nice); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", 
name); + return -1; + } k = 0; if (pf_drive_count == 0) { if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF, @@ -735,6 +741,7 @@ static int pf_detect(void) printk("%s: No ATAPI disk detected\n", name); for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) put_disk(pf->disk); + pi_unregister_driver(par_drv); return -1; } diff --git a/kernel/drivers/block/paride/pg.c b/kernel/drivers/block/paride/pg.c index 876d0c3ea..bfbd4c852 100644 --- a/kernel/drivers/block/paride/pg.c +++ b/kernel/drivers/block/paride/pg.c @@ -227,6 +227,7 @@ static int pg_identify(struct pg *dev, int log); static char pg_scratch[512]; /* scratch block buffer */ static struct class *pg_class; +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -481,6 +482,12 @@ static int pg_detect(void) printk("%s: %s version %s, major %d\n", name, name, PG_VERSION, major); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + k = 0; if (pg_drive_count == 0) { if (pi_init(dev->pi, 1, -1, -1, -1, -1, -1, pg_scratch, @@ -511,6 +518,7 @@ static int pg_detect(void) if (k) return 0; + pi_unregister_driver(par_drv); printk("%s: No ATAPI device detected\n", name); return -1; } diff --git a/kernel/drivers/block/paride/pt.c b/kernel/drivers/block/paride/pt.c index 2596042eb..1740d75e8 100644 --- a/kernel/drivers/block/paride/pt.c +++ b/kernel/drivers/block/paride/pt.c @@ -232,6 +232,7 @@ static int pt_identify(struct pt_unit *tape); static struct pt_unit pt[PT_UNITS]; static char pt_scratch[512]; /* scratch block buffer */ +static void *par_drv; /* reference of parport driver */ /* kernel glue structures */ @@ -605,6 +606,12 @@ static int pt_detect(void) printk("%s: %s version %s, major %d\n", name, name, PT_VERSION, major); + par_drv = pi_register_driver(name); + if (!par_drv) { + pr_err("failed to register %s driver\n", name); + return -1; + } + specified = 0; for (unit = 0; unit < PT_UNITS; unit++) { struct pt_unit *tape = &pt[unit]; @@ -644,6 +651,7 @@ static int pt_detect(void) if (found) return 0; + pi_unregister_driver(par_drv); printk("%s: No ATAPI tape drive detected\n", name); return -1; } diff --git a/kernel/drivers/block/pktcdvd.c b/kernel/drivers/block/pktcdvd.c index 09e628daf..d06c62ecc 100644 --- a/kernel/drivers/block/pktcdvd.c +++ b/kernel/drivers/block/pktcdvd.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include @@ -703,14 +704,14 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command * int ret = 0; rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ? 
- WRITE : READ, __GFP_WAIT); + WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); if (cgc->buflen) { ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen, - __GFP_WAIT); + __GFP_RECLAIM); if (ret) goto out; } @@ -976,7 +977,7 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec) } } -static void pkt_end_io_read(struct bio *bio, int err) +static void pkt_end_io_read(struct bio *bio) { struct packet_data *pkt = bio->bi_private; struct pktcdvd_device *pd = pkt->pd; @@ -984,9 +985,9 @@ static void pkt_end_io_read(struct bio *bio, int err) pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, - (unsigned long long)bio->bi_iter.bi_sector, err); + (unsigned long long)bio->bi_iter.bi_sector, bio->bi_error); - if (err) + if (bio->bi_error) atomic_inc(&pkt->io_errors); if (atomic_dec_and_test(&pkt->io_wait)) { atomic_inc(&pkt->run_sm); @@ -995,13 +996,13 @@ static void pkt_end_io_read(struct bio *bio, int err) pkt_bio_finished(pd); } -static void pkt_end_io_packet_write(struct bio *bio, int err) +static void pkt_end_io_packet_write(struct bio *bio) { struct packet_data *pkt = bio->bi_private; struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err); + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_error); pd->stats.pkt_ended++; @@ -1339,22 +1340,22 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_queue_bio(pd, pkt->w_bio); } -static void pkt_finish_packet(struct packet_data *pkt, int uptodate) +static void pkt_finish_packet(struct packet_data *pkt, int error) { struct bio *bio; - if (!uptodate) + if (error) pkt->cache_valid = 0; /* Finish all bios corresponding to this packet */ - while ((bio = bio_list_pop(&pkt->orig_bios))) - bio_endio(bio, uptodate ? 
0 : -EIO); + while ((bio = bio_list_pop(&pkt->orig_bios))) { + bio->bi_error = error; + bio_endio(bio); + } } static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt) { - int uptodate; - pkt_dbg(2, pd, "pkt %d\n", pkt->id); for (;;) { @@ -1383,7 +1384,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (atomic_read(&pkt->io_wait) > 0) return; - if (test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags)) { + if (!pkt->w_bio->bi_error) { pkt_set_state(pkt, PACKET_FINISHED_STATE); } else { pkt_set_state(pkt, PACKET_RECOVERY_STATE); @@ -1400,8 +1401,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data break; case PACKET_FINISHED_STATE: - uptodate = test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags); - pkt_finish_packet(pkt, uptodate); + pkt_finish_packet(pkt, pkt->w_bio->bi_error); return; default: @@ -2331,13 +2331,14 @@ static void pkt_close(struct gendisk *disk, fmode_t mode) } -static void pkt_end_io_read_cloned(struct bio *bio, int err) +static void pkt_end_io_read_cloned(struct bio *bio) { struct packet_stacked_data *psd = bio->bi_private; struct pktcdvd_device *pd = psd->pd; + psd->bio->bi_error = bio->bi_error; bio_put(bio); - bio_endio(psd->bio, err); + bio_endio(psd->bio); mempool_free(psd, psd_pool); pkt_bio_finished(pd); } @@ -2440,12 +2441,16 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) } } -static void pkt_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) { struct pktcdvd_device *pd; char b[BDEVNAME_SIZE]; struct bio *split; + blk_queue_bounce(q, &bio); + + blk_queue_split(q, &bio, q->bio_split); + pd = q->queuedata; if (!pd) { pr_err("%s incorrect request queue\n", @@ -2462,7 +2467,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) */ if (bio_data_dir(bio) == READ) { pkt_make_request_read(pd, bio); - return; + return BLK_QC_T_NONE; } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { @@ -2476,8 +2481,6 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) goto end_io; } - blk_queue_bounce(q, &bio); - do { sector_t zone = get_zone(bio->bi_iter.bi_sector, pd); sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd); @@ -2496,31 +2499,10 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pkt_make_request_write(q, split); } while (split != bio); - return; + return BLK_QC_T_NONE; end_io: bio_io_error(bio); -} - - - -static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, - struct bio_vec *bvec) -{ - struct pktcdvd_device *pd = q->queuedata; - sector_t zone = get_zone(bmd->bi_sector, pd); - int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; - int remaining = (pd->settings.size << 9) - used; - int remaining2; - - /* - * A bio <= PAGE_SIZE must be allowed. If it crosses a packet - * boundary, pkt_make_request() will split the bio. 
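For context, the pktcdvd hunks above track the block-layer change in which a bio's completion status moved into bio->bi_error and bio_endio() lost its error argument. A minimal sketch of the new pattern, using hypothetical example_* functions:

    #include <linux/bio.h>

    /* Completion handlers now read status from the bio itself. */
    static void example_end_io(struct bio *bio)
    {
            if (bio->bi_error)
                    pr_debug("example: bio failed: %d\n", bio->bi_error);
            bio_put(bio);
    }

    /* Completing a bio on a caller's behalf: set bi_error, then bio_endio(). */
    static void example_complete(struct bio *bio, int error)
    {
            bio->bi_error = error;
            bio_endio(bio);
    }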
- */ - remaining2 = PAGE_SIZE - bmd->bi_size; - remaining = max(remaining, remaining2); - - BUG_ON(remaining < 0); - return remaining; + return BLK_QC_T_NONE; } static void pkt_init_queue(struct pktcdvd_device *pd) @@ -2530,7 +2512,6 @@ static void pkt_init_queue(struct pktcdvd_device *pd) blk_queue_make_request(q, pkt_make_request); blk_queue_logical_block_size(q, CD_FRAMESIZE); blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); - blk_queue_merge_bvec(q, pkt_merge_bvec); q->queuedata = pd; } @@ -2821,8 +2802,7 @@ out_new_dev: out_mem2: put_disk(disk); out_mem: - if (pd->rb_pool) - mempool_destroy(pd->rb_pool); + mempool_destroy(pd->rb_pool); kfree(pd); out_mutex: mutex_unlock(&ctl_mutex); diff --git a/kernel/drivers/block/pmem.c b/kernel/drivers/block/pmem.c deleted file mode 100644 index eabf4a8d0..000000000 --- a/kernel/drivers/block/pmem.c +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Persistent Memory Driver - * - * Copyright (c) 2014, Intel Corporation. - * Copyright (c) 2015, Christoph Hellwig . - * Copyright (c) 2015, Boaz Harrosh . - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define PMEM_MINORS 16 - -struct pmem_device { - struct request_queue *pmem_queue; - struct gendisk *pmem_disk; - - /* One contiguous memory region per device */ - phys_addr_t phys_addr; - void *virt_addr; - size_t size; -}; - -static int pmem_major; -static atomic_t pmem_index; - -static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, - unsigned int len, unsigned int off, int rw, - sector_t sector) -{ - void *mem = kmap_atomic(page); - size_t pmem_off = sector << 9; - - if (rw == READ) { - memcpy(mem + off, pmem->virt_addr + pmem_off, len); - flush_dcache_page(page); - } else { - flush_dcache_page(page); - memcpy(pmem->virt_addr + pmem_off, mem + off, len); - } - - kunmap_atomic(mem); -} - -static void pmem_make_request(struct request_queue *q, struct bio *bio) -{ - struct block_device *bdev = bio->bi_bdev; - struct pmem_device *pmem = bdev->bd_disk->private_data; - int rw; - struct bio_vec bvec; - sector_t sector; - struct bvec_iter iter; - int err = 0; - - if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) { - err = -EIO; - goto out; - } - - BUG_ON(bio->bi_rw & REQ_DISCARD); - - rw = bio_data_dir(bio); - sector = bio->bi_iter.bi_sector; - bio_for_each_segment(bvec, bio, iter) { - pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, - rw, sector); - sector += bvec.bv_len >> 9; - } - -out: - bio_endio(bio, err); -} - -static int pmem_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, int rw) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - - pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); - page_endio(page, rw & WRITE, 0); - - return 0; -} - -static long pmem_direct_access(struct block_device *bdev, sector_t sector, - void **kaddr, unsigned long *pfn, long size) -{ - struct pmem_device *pmem = bdev->bd_disk->private_data; - size_t offset = sector << 9; - - if (!pmem) - return -ENODEV; - - *kaddr = pmem->virt_addr + offset; - *pfn = (pmem->phys_addr 
+ offset) >> PAGE_SHIFT; - - return pmem->size - offset; -} - -static const struct block_device_operations pmem_fops = { - .owner = THIS_MODULE, - .rw_page = pmem_rw_page, - .direct_access = pmem_direct_access, -}; - -static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res) -{ - struct pmem_device *pmem; - struct gendisk *disk; - int idx, err; - - err = -ENOMEM; - pmem = kzalloc(sizeof(*pmem), GFP_KERNEL); - if (!pmem) - goto out; - - pmem->phys_addr = res->start; - pmem->size = resource_size(res); - - err = -EINVAL; - if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) { - dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size); - goto out_free_dev; - } - - /* - * Map the memory as non-cachable, as we can't write back the contents - * of the CPU caches in case of a crash. - */ - err = -ENOMEM; - pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size); - if (!pmem->virt_addr) - goto out_release_region; - - pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL); - if (!pmem->pmem_queue) - goto out_unmap; - - blk_queue_make_request(pmem->pmem_queue, pmem_make_request); - blk_queue_max_hw_sectors(pmem->pmem_queue, 1024); - blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); - - disk = alloc_disk(PMEM_MINORS); - if (!disk) - goto out_free_queue; - - idx = atomic_inc_return(&pmem_index) - 1; - - disk->major = pmem_major; - disk->first_minor = PMEM_MINORS * idx; - disk->fops = &pmem_fops; - disk->private_data = pmem; - disk->queue = pmem->pmem_queue; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "pmem%d", idx); - disk->driverfs_dev = dev; - set_capacity(disk, pmem->size >> 9); - pmem->pmem_disk = disk; - - add_disk(disk); - - return pmem; - -out_free_queue: - blk_cleanup_queue(pmem->pmem_queue); -out_unmap: - iounmap(pmem->virt_addr); -out_release_region: - release_mem_region(pmem->phys_addr, pmem->size); -out_free_dev: - kfree(pmem); -out: - return ERR_PTR(err); -} - -static void pmem_free(struct pmem_device *pmem) -{ - del_gendisk(pmem->pmem_disk); - put_disk(pmem->pmem_disk); - blk_cleanup_queue(pmem->pmem_queue); - iounmap(pmem->virt_addr); - release_mem_region(pmem->phys_addr, pmem->size); - kfree(pmem); -} - -static int pmem_probe(struct platform_device *pdev) -{ - struct pmem_device *pmem; - struct resource *res; - - if (WARN_ON(pdev->num_resources > 1)) - return -ENXIO; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; - - pmem = pmem_alloc(&pdev->dev, res); - if (IS_ERR(pmem)) - return PTR_ERR(pmem); - - platform_set_drvdata(pdev, pmem); - - return 0; -} - -static int pmem_remove(struct platform_device *pdev) -{ - struct pmem_device *pmem = platform_get_drvdata(pdev); - - pmem_free(pmem); - return 0; -} - -static struct platform_driver pmem_driver = { - .probe = pmem_probe, - .remove = pmem_remove, - .driver = { - .owner = THIS_MODULE, - .name = "pmem", - }, -}; - -static int __init pmem_init(void) -{ - int error; - - pmem_major = register_blkdev(0, "pmem"); - if (pmem_major < 0) - return pmem_major; - - error = platform_driver_register(&pmem_driver); - if (error) - unregister_blkdev(pmem_major, "pmem"); - return error; -} -module_init(pmem_init); - -static void pmem_exit(void) -{ - platform_driver_unregister(&pmem_driver); - unregister_blkdev(pmem_major, "pmem"); -} -module_exit(pmem_exit); - -MODULE_AUTHOR("Ross Zwisler "); -MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/block/ps3vram.c b/kernel/drivers/block/ps3vram.c index ef45cfb98..56847fcda 100644 --- 
a/kernel/drivers/block/ps3vram.c +++ b/kernel/drivers/block/ps3vram.c @@ -1,5 +1,5 @@ /* - * ps3vram - Use extra PS3 video ram as MTD block device. + * ps3vram - Use extra PS3 video ram as block device. * * Copyright 2009 Sony Corporation * @@ -73,8 +73,8 @@ struct ps3vram_priv { u64 memory_handle; u64 context_handle; - u32 *ctrl; - void *reports; + u32 __iomem *ctrl; + void __iomem *reports; u8 *xdr_buf; u32 *fifo_base; @@ -104,7 +104,7 @@ static char *size = "256M"; module_param(size, charp, 0); MODULE_PARM_DESC(size, "memory size"); -static u32 *ps3vram_get_notifier(void *reports, int notifier) +static u32 __iomem *ps3vram_get_notifier(void __iomem *reports, int notifier) { return reports + DMA_NOTIFIER_OFFSET_BASE + DMA_NOTIFIER_SIZE * notifier; @@ -113,22 +113,22 @@ static u32 *ps3vram_get_notifier(void *reports, int notifier) static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); - u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); + u32 __iomem *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); int i; for (i = 0; i < 4; i++) - notify[i] = 0xffffffff; + iowrite32be(0xffffffff, notify + i); } static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, unsigned int timeout_ms) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); - u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); + u32 __iomem *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); unsigned long timeout; for (timeout = 20; timeout; timeout--) { - if (!notify[3]) + if (!ioread32be(notify + 3)) return 0; udelay(10); } @@ -136,7 +136,7 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, timeout = jiffies + msecs_to_jiffies(timeout_ms); do { - if (!notify[3]) + if (!ioread32be(notify + 3)) return 0; msleep(1); } while (time_before(jiffies, timeout)); @@ -148,8 +148,8 @@ static void ps3vram_init_ring(struct ps3_system_bus_device *dev) { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); - priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET; - priv->ctrl[CTRL_GET] = FIFO_BASE + FIFO_OFFSET; + iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_PUT); + iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_GET); } static int ps3vram_wait_ring(struct ps3_system_bus_device *dev, @@ -159,14 +159,14 @@ static int ps3vram_wait_ring(struct ps3_system_bus_device *dev, unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); do { - if (priv->ctrl[CTRL_PUT] == priv->ctrl[CTRL_GET]) + if (ioread32be(priv->ctrl + CTRL_PUT) == ioread32be(priv->ctrl + CTRL_GET)) return 0; msleep(1); } while (time_before(jiffies, timeout)); dev_warn(&dev->core, "FIFO timeout (%08x/%08x/%08x)\n", - priv->ctrl[CTRL_PUT], priv->ctrl[CTRL_GET], - priv->ctrl[CTRL_TOP]); + ioread32be(priv->ctrl + CTRL_PUT), ioread32be(priv->ctrl + CTRL_GET), + ioread32be(priv->ctrl + CTRL_TOP)); return -ETIMEDOUT; } @@ -189,7 +189,7 @@ static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev) ps3vram_out_ring(priv, 0x20000000 | (FIFO_BASE + FIFO_OFFSET)); - priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET; + iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_PUT); /* asking the HV for a blit will kick the FIFO */ status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0); @@ -207,8 +207,8 @@ static void ps3vram_fire_ring(struct ps3_system_bus_device *dev) mutex_lock(&ps3_gpu_mutex); - priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET + - (priv->fifo_ptr - priv->fifo_base) * sizeof(u32); + iowrite32be(FIFO_BASE + 
FIFO_OFFSET + (priv->fifo_ptr - priv->fifo_base) + * sizeof(u32), priv->ctrl + CTRL_PUT); /* asking the HV for a blit will kick the FIFO */ status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0); @@ -593,11 +593,12 @@ out: next = bio_list_peek(&priv->list); spin_unlock_irq(&priv->lock); - bio_endio(bio, error); + bio->bi_error = error; + bio_endio(bio); return next; } -static void ps3vram_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) { struct ps3_system_bus_device *dev = q->queuedata; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -605,17 +606,21 @@ static void ps3vram_make_request(struct request_queue *q, struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); + blk_queue_split(q, &bio, q->bio_split); + spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); bio_list_add(&priv->list, bio); spin_unlock_irq(&priv->lock); if (busy) - return; + return BLK_QC_T_NONE; do { bio = ps3vram_do_bio(dev, bio); } while (bio); + + return BLK_QC_T_NONE; } static int ps3vram_probe(struct ps3_system_bus_device *dev) diff --git a/kernel/drivers/block/rbd.c b/kernel/drivers/block/rbd.c index 010ce0b1f..81ea69fee 100644 --- a/kernel/drivers/block/rbd.c +++ b/kernel/drivers/block/rbd.c @@ -96,6 +96,8 @@ static int atomic_dec_return_safe(atomic_t *v) #define RBD_MINORS_PER_MAJOR 256 #define RBD_SINGLE_MAJOR_PART_SHIFT 4 +#define RBD_MAX_PARENT_CHAIN_LEN 16 + #define RBD_SNAP_DEV_NAME_PREFIX "snap_" #define RBD_MAX_SNAP_NAME_LEN \ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1)) @@ -346,6 +348,7 @@ struct rbd_device { struct rbd_image_header header; unsigned long flags; /* possibly lock protected */ struct rbd_spec *spec; + struct rbd_options *opts; char *header_name; @@ -415,8 +418,6 @@ MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (d static int rbd_img_request_submit(struct rbd_img_request *img_request); -static void rbd_dev_device_release(struct device *dev); - static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove(struct bus_type *bus, const char *buf, @@ -425,7 +426,7 @@ static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf, size_t count); static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, size_t count); -static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping); +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); static void rbd_spec_put(struct rbd_spec *spec); static int rbd_dev_id_to_minor(int dev_id) @@ -725,34 +726,36 @@ static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts) } /* - * mount options + * (Per device) rbd map options */ enum { + Opt_queue_depth, Opt_last_int, /* int args above */ Opt_last_string, /* string args above */ Opt_read_only, Opt_read_write, - /* Boolean args above */ - Opt_last_bool, + Opt_err }; static match_table_t rbd_opts_tokens = { + {Opt_queue_depth, "queue_depth=%d"}, /* int args above */ /* string args above */ {Opt_read_only, "read_only"}, {Opt_read_only, "ro"}, /* Alternate spelling */ {Opt_read_write, "read_write"}, {Opt_read_write, "rw"}, /* Alternate spelling */ - /* Boolean args above */ - {-1, NULL} + {Opt_err, NULL} }; struct rbd_options { + int queue_depth; bool read_only; }; +#define RBD_QUEUE_DEPTH_DEFAULT BLKDEV_MAX_RQ #define RBD_READ_ONLY_DEFAULT false static int parse_rbd_opts_token(char *c, void *private) @@ -762,27 +765,27 @@ static int 
parse_rbd_opts_token(char *c, void *private) int token, intval, ret; token = match_token(c, rbd_opts_tokens, argstr); - if (token < 0) - return -EINVAL; - if (token < Opt_last_int) { ret = match_int(&argstr[0], &intval); if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); + pr_err("bad mount option arg (not int) at '%s'\n", c); return ret; } dout("got int token %d val %d\n", token, intval); } else if (token > Opt_last_int && token < Opt_last_string) { - dout("got string token %d val %s\n", token, - argstr[0].from); - } else if (token > Opt_last_string && token < Opt_last_bool) { - dout("got Boolean token %d\n", token); + dout("got string token %d val %s\n", token, argstr[0].from); } else { dout("got token %d\n", token); } switch (token) { + case Opt_queue_depth: + if (intval < 1) { + pr_err("queue_depth out of range\n"); + return -EINVAL; + } + rbd_opts->queue_depth = intval; + break; case Opt_read_only: rbd_opts->read_only = true; break; @@ -790,9 +793,10 @@ static int parse_rbd_opts_token(char *c, void *private) rbd_opts->read_only = false; break; default: - rbd_assert(false); - break; + /* libceph prints "bad option" msg */ + return -EINVAL; } + return 0; } @@ -1564,22 +1568,39 @@ static void rbd_obj_request_end(struct rbd_obj_request *obj_request) /* * Wait for an object request to complete. If interrupted, cancel the * underlying osd request. + * + * @timeout: in jiffies, 0 means "wait forever" */ -static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) +static int __rbd_obj_request_wait(struct rbd_obj_request *obj_request, + unsigned long timeout) { - int ret; + long ret; dout("%s %p\n", __func__, obj_request); - - ret = wait_for_completion_interruptible(&obj_request->completion); - if (ret < 0) { - dout("%s %p interrupted\n", __func__, obj_request); + ret = wait_for_completion_interruptible_timeout( + &obj_request->completion, + ceph_timeout_jiffies(timeout)); + if (ret <= 0) { + if (ret == 0) + ret = -ETIMEDOUT; rbd_obj_request_end(obj_request); - return ret; + } else { + ret = 0; } - dout("%s %p done\n", __func__, obj_request); - return 0; + dout("%s %p ret %d\n", __func__, obj_request, (int)ret); + return ret; +} + +static int rbd_obj_request_wait(struct rbd_obj_request *obj_request) +{ + return __rbd_obj_request_wait(obj_request, 0); +} + +static int rbd_obj_request_wait_timeout(struct rbd_obj_request *obj_request, + unsigned long timeout) +{ + return __rbd_obj_request_wait(obj_request, timeout); } static void rbd_img_request_complete(struct rbd_img_request *img_request) @@ -1842,9 +1863,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, rbd_osd_read_callback(obj_request); break; case CEPH_OSD_OP_SETALLOCHINT: - rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE); + rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || + osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); /* fall through */ case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: rbd_osd_write_callback(obj_request); break; case CEPH_OSD_OP_STAT: @@ -2380,7 +2403,10 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, opcode = CEPH_OSD_OP_ZERO; } } else if (op_type == OBJ_OP_WRITE) { - opcode = CEPH_OSD_OP_WRITE; + if (!offset && length == object_size) + opcode = CEPH_OSD_OP_WRITEFULL; + else + opcode = CEPH_OSD_OP_WRITE; osd_req_op_alloc_hint_init(osd_request, num_ops, object_size, object_size); num_ops++; @@ -2389,7 +2415,7 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, } if (opcode == 
CEPH_OSD_OP_DELETE) - osd_req_op_init(osd_request, num_ops, opcode); + osd_req_op_init(osd_request, num_ops, opcode, 0); else osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length, 0, 0); @@ -2860,7 +2886,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) goto out; stat_request->callback = rbd_img_obj_exists_callback; - osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT); + osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, false, false); rbd_osd_req_format_read(stat_request); @@ -3134,6 +3160,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( bool watch) { struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_options *opts = osdc->client->options; struct rbd_obj_request *obj_request; int ret; @@ -3160,7 +3187,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( if (ret) goto out; - ret = rbd_obj_request_wait(obj_request); + ret = rbd_obj_request_wait_timeout(obj_request, opts->mount_timeout); if (ret) goto out; @@ -3415,6 +3442,7 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } img_request->rq = rq; + snapc = NULL; /* img_request consumes a ref */ if (op_type == OBJ_OP_DISCARD) result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, @@ -3452,52 +3480,6 @@ static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_OK; } -/* - * a queue callback. Makes sure that we don't create a bio that spans across - * multiple osd objects. One exception would be with a single page bios, - * which we handle later at bio_chain_clone_range() - */ -static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, - struct bio_vec *bvec) -{ - struct rbd_device *rbd_dev = q->queuedata; - sector_t sector_offset; - sector_t sectors_per_obj; - sector_t obj_sector_offset; - int ret; - - /* - * Find how far into its rbd object the partition-relative - * bio start sector is to offset relative to the enclosing - * device. - */ - sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector; - sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT); - obj_sector_offset = sector_offset & (sectors_per_obj - 1); - - /* - * Compute the number of bytes from that offset to the end - * of the object. Account for what's already used by the bio. - */ - ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT; - if (ret > bmd->bi_size) - ret -= bmd->bi_size; - else - ret = 0; - - /* - * Don't send back more than was asked for. And if the bio - * was empty, let the whole thing through because: "Note - * that a block device *must* allow a single page to be - * added to an empty bio." 
- */ - rbd_assert(bvec->bv_len <= PAGE_SIZE); - if (ret > (int) bvec->bv_len || !bmd->bi_size) - ret = (int) bvec->bv_len; - - return ret; -} - static void rbd_free_disk(struct rbd_device *rbd_dev) { struct gendisk *disk = rbd_dev->disk; @@ -3762,10 +3744,9 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); rbd_dev->tag_set.ops = &rbd_mq_ops; - rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ; + rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; rbd_dev->tag_set.numa_node = NUMA_NO_NODE; - rbd_dev->tag_set.flags = - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; rbd_dev->tag_set.nr_hw_queues = 1; rbd_dev->tag_set.cmd_size = sizeof(struct work_struct); @@ -3785,6 +3766,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) /* set io sizes to object size */ segment_size = rbd_obj_bytes(&rbd_dev->header); blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); + q->limits.max_sectors = queue_max_hw_sectors(q); + blk_queue_max_segments(q, segment_size / SECTOR_SIZE); blk_queue_max_segment_size(q, segment_size); blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); @@ -3793,10 +3776,12 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); q->limits.discard_granularity = segment_size; q->limits.discard_alignment = segment_size; - q->limits.max_discard_sectors = segment_size / SECTOR_SIZE; + blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); q->limits.discard_zeroes_data = 1; - blk_queue_merge_bvec(q, rbd_merge_bvec); + if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) + q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; + disk->queue = q; q->queuedata = rbd_dev; @@ -4005,14 +3990,12 @@ static const struct attribute_group *rbd_attr_groups[] = { NULL }; -static void rbd_sysfs_dev_release(struct device *dev) -{ -} +static void rbd_dev_release(struct device *dev); static struct device_type rbd_device_type = { .name = "rbd", .groups = rbd_attr_groups, - .release = rbd_sysfs_dev_release, + .release = rbd_dev_release, }; static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec) @@ -4055,8 +4038,28 @@ static void rbd_spec_free(struct kref *kref) kfree(spec); } +static void rbd_dev_release(struct device *dev) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + bool need_put = !!rbd_dev->opts; + + rbd_put_client(rbd_dev->rbd_client); + rbd_spec_put(rbd_dev->spec); + kfree(rbd_dev->opts); + kfree(rbd_dev); + + /* + * This is racy, but way better than putting module outside of + * the release callback. The race window is pretty small, so + * doing something similar to dm (dm-builtin.c) is overkill. 
+ */ + if (need_put) + module_put(THIS_MODULE); +} + static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, - struct rbd_spec *spec) + struct rbd_spec *spec, + struct rbd_options *opts) { struct rbd_device *rbd_dev; @@ -4070,8 +4073,14 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, INIT_LIST_HEAD(&rbd_dev->node); init_rwsem(&rbd_dev->header_rwsem); - rbd_dev->spec = spec; + rbd_dev->dev.bus = &rbd_bus_type; + rbd_dev->dev.type = &rbd_device_type; + rbd_dev->dev.parent = &rbd_root_dev; + device_initialize(&rbd_dev->dev); + rbd_dev->rbd_client = rbdc; + rbd_dev->spec = spec; + rbd_dev->opts = opts; /* Initialize the layout used for all rbd requests */ @@ -4080,14 +4089,21 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + /* + * If this is a mapping rbd_dev (as opposed to a parent one), + * pin our module. We have a ref from do_rbd_add(), so use + * __module_get(). + */ + if (rbd_dev->opts) + __module_get(THIS_MODULE); + return rbd_dev; } static void rbd_dev_destroy(struct rbd_device *rbd_dev) { - rbd_put_client(rbd_dev->rbd_client); - rbd_spec_put(rbd_dev->spec); - kfree(rbd_dev); + if (rbd_dev) + put_device(&rbd_dev->dev); } /* @@ -4695,7 +4711,10 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev) } ret = rbd_dev_v2_snap_context(rbd_dev); - dout("rbd_dev_v2_snap_context returned %d\n", ret); + if (ret && first_time) { + kfree(rbd_dev->header.object_prefix); + rbd_dev->header.object_prefix = NULL; + } return ret; } @@ -4710,27 +4729,6 @@ static int rbd_dev_header_info(struct rbd_device *rbd_dev) return rbd_dev_v2_header_info(rbd_dev); } -static int rbd_bus_add_dev(struct rbd_device *rbd_dev) -{ - struct device *dev; - int ret; - - dev = &rbd_dev->dev; - dev->bus = &rbd_bus_type; - dev->type = &rbd_device_type; - dev->parent = &rbd_root_dev; - dev->release = rbd_dev_device_release; - dev_set_name(dev, "%d", rbd_dev->dev_id); - ret = device_register(dev); - - return ret; -} - -static void rbd_bus_del_dev(struct rbd_device *rbd_dev) -{ - device_unregister(&rbd_dev->dev); -} - /* * Get a unique rbd identifier for the given new rbd_dev, and add * the rbd_dev to the global list. @@ -4945,6 +4943,7 @@ static int rbd_add_parse_args(const char *buf, goto out_mem; rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; + rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; copts = ceph_parse_options(options, mon_addrs, mon_addrs + mon_addrs_size - 1, @@ -4975,8 +4974,8 @@ out_err: */ static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) { + struct ceph_options *opts = rbdc->client->options; u64 newest_epoch; - unsigned long timeout = rbdc->client->options->mount_timeout * HZ; int tries = 0; int ret; @@ -4991,7 +4990,8 @@ again: if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { ceph_monc_request_next_osdmap(&rbdc->client->monc); (void) ceph_monc_wait_osdmap(&rbdc->client->monc, - newest_epoch, timeout); + newest_epoch, + opts->mount_timeout); goto again; } else { /* the osdmap we have is new enough */ @@ -5142,45 +5142,51 @@ out_err: return ret; } -static int rbd_dev_probe_parent(struct rbd_device *rbd_dev) +/* + * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() -> + * rbd_dev_image_probe() recursion depth, which means it's also the + * length of the already discovered part of the parent chain. 
+ */ +static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) { struct rbd_device *parent = NULL; - struct rbd_spec *parent_spec; - struct rbd_client *rbdc; int ret; if (!rbd_dev->parent_spec) return 0; - /* - * We need to pass a reference to the client and the parent - * spec when creating the parent rbd_dev. Images related by - * parent/child relationships always share both. - */ - parent_spec = rbd_spec_get(rbd_dev->parent_spec); - rbdc = __rbd_get_client(rbd_dev->rbd_client); - ret = -ENOMEM; - parent = rbd_dev_create(rbdc, parent_spec); - if (!parent) + if (++depth > RBD_MAX_PARENT_CHAIN_LEN) { + pr_info("parent chain is too long (%d)\n", depth); + ret = -EINVAL; + goto out_err; + } + + parent = rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec, + NULL); + if (!parent) { + ret = -ENOMEM; goto out_err; + } + + /* + * Images related by parent/child relationships always share + * rbd_client and spec/parent_spec, so bump their refcounts. + */ + __rbd_get_client(rbd_dev->rbd_client); + rbd_spec_get(rbd_dev->parent_spec); - ret = rbd_dev_image_probe(parent, false); + ret = rbd_dev_image_probe(parent, depth); if (ret < 0) goto out_err; + rbd_dev->parent = parent; atomic_set(&rbd_dev->parent_ref, 1); - return 0; + out_err: - if (parent) { - rbd_dev_unparent(rbd_dev); - kfree(rbd_dev->header_name); + rbd_dev_unparent(rbd_dev); + if (parent) rbd_dev_destroy(parent); - } else { - rbd_put_client(rbdc); - rbd_spec_put(parent_spec); - } - return ret; } @@ -5225,7 +5231,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); - ret = rbd_bus_add_dev(rbd_dev); + dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); + ret = device_add(&rbd_dev->dev); if (ret) goto err_out_mapping; @@ -5248,8 +5255,6 @@ err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); err_out_id: rbd_dev_id_put(rbd_dev); - rbd_dev_mapping_clear(rbd_dev); - return ret; } @@ -5298,7 +5303,7 @@ static void rbd_dev_image_release(struct rbd_device *rbd_dev) * parent), initiate a watch on its header object before using that * object to get detailed information about the rbd image. */ -static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) +static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) { int ret; @@ -5316,7 +5321,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) if (ret) goto err_out_format; - if (mapping) { + if (!depth) { ret = rbd_dev_header_watch_sync(rbd_dev); if (ret) { if (ret == -ENOENT) @@ -5337,7 +5342,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) * Otherwise this is a parent image, identified by pool, image * and snap ids - need to fill in names for those ids. */ - if (mapping) + if (!depth) ret = rbd_spec_fill_snap_id(rbd_dev); else ret = rbd_spec_fill_names(rbd_dev); @@ -5359,12 +5364,12 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) * Need to warn users if this image is the one being * mapped and has a parent. 
*/ - if (mapping && rbd_dev->parent_spec) + if (!depth && rbd_dev->parent_spec) rbd_warn(rbd_dev, "WARNING: kernel layering is EXPERIMENTAL!"); } - ret = rbd_dev_probe_parent(rbd_dev); + ret = rbd_dev_probe_parent(rbd_dev, depth); if (ret) goto err_out_probe; @@ -5375,7 +5380,7 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping) err_out_probe: rbd_dev_unprobe(rbd_dev); err_out_watch: - if (mapping) + if (!depth) rbd_dev_header_unwatch_sync(rbd_dev); out_header_name: kfree(rbd_dev->header_name); @@ -5397,7 +5402,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, struct rbd_spec *spec = NULL; struct rbd_client *rbdc; bool read_only; - int rc = -ENOMEM; + int rc; if (!try_module_get(THIS_MODULE)) return -ENODEV; @@ -5405,10 +5410,7 @@ static ssize_t do_rbd_add(struct bus_type *bus, /* parse add command */ rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec); if (rc < 0) - goto err_out_module; - read_only = rbd_opts->read_only; - kfree(rbd_opts); - rbd_opts = NULL; /* done with this */ + goto out; rbdc = rbd_get_client(ceph_opts); if (IS_ERR(rbdc)) { @@ -5434,18 +5436,22 @@ static ssize_t do_rbd_add(struct bus_type *bus, goto err_out_client; } - rbd_dev = rbd_dev_create(rbdc, spec); - if (!rbd_dev) + rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts); + if (!rbd_dev) { + rc = -ENOMEM; goto err_out_client; + } rbdc = NULL; /* rbd_dev now owns this */ spec = NULL; /* rbd_dev now owns this */ + rbd_opts = NULL; /* rbd_dev now owns this */ - rc = rbd_dev_image_probe(rbd_dev, true); + rc = rbd_dev_image_probe(rbd_dev, 0); if (rc < 0) goto err_out_rbd_dev; /* If we are mapping a snapshot it must be marked read-only */ + read_only = rbd_dev->opts->read_only; if (rbd_dev->spec->snap_id != CEPH_NOSNAP) read_only = true; rbd_dev->mapping.read_only = read_only; @@ -5459,10 +5465,13 @@ static ssize_t do_rbd_add(struct bus_type *bus, */ rbd_dev_header_unwatch_sync(rbd_dev); rbd_dev_image_release(rbd_dev); - goto err_out_module; + goto out; } - return count; + rc = count; +out: + module_put(THIS_MODULE); + return rc; err_out_rbd_dev: rbd_dev_destroy(rbd_dev); @@ -5470,12 +5479,8 @@ err_out_client: rbd_put_client(rbdc); err_out_args: rbd_spec_put(spec); -err_out_module: - module_put(THIS_MODULE); - - dout("Error adding device %s\n", buf); - - return (ssize_t)rc; + kfree(rbd_opts); + goto out; } static ssize_t rbd_add(struct bus_type *bus, @@ -5495,17 +5500,15 @@ static ssize_t rbd_add_single_major(struct bus_type *bus, return do_rbd_add(bus, buf, count); } -static void rbd_dev_device_release(struct device *dev) +static void rbd_dev_device_release(struct rbd_device *rbd_dev) { - struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - rbd_free_disk(rbd_dev); clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags); + device_del(&rbd_dev->dev); rbd_dev_mapping_clear(rbd_dev); if (!single_major) unregister_blkdev(rbd_dev->major, rbd_dev->name); rbd_dev_id_put(rbd_dev); - rbd_dev_mapping_clear(rbd_dev); } static void rbd_dev_remove_parent(struct rbd_device *rbd_dev) @@ -5590,9 +5593,8 @@ static ssize_t do_rbd_remove(struct bus_type *bus, * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting * in a potential use after free of rbd_dev->disk or rbd_dev. 
*/ - rbd_bus_del_dev(rbd_dev); + rbd_dev_device_release(rbd_dev); rbd_dev_image_release(rbd_dev); - module_put(THIS_MODULE); return count; } @@ -5663,10 +5665,8 @@ static int rbd_slab_init(void) if (rbd_segment_name_cache) return 0; out_err: - if (rbd_obj_request_cache) { - kmem_cache_destroy(rbd_obj_request_cache); - rbd_obj_request_cache = NULL; - } + kmem_cache_destroy(rbd_obj_request_cache); + rbd_obj_request_cache = NULL; kmem_cache_destroy(rbd_img_request_cache); rbd_img_request_cache = NULL; diff --git a/kernel/drivers/block/rsxx/dev.c b/kernel/drivers/block/rsxx/dev.c index ac8c62cb4..e1b8b7061 100644 --- a/kernel/drivers/block/rsxx/dev.c +++ b/kernel/drivers/block/rsxx/dev.c @@ -137,17 +137,22 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, if (!card->eeh_state && card->gendisk) disk_stats_complete(card, meta->bio, meta->start_time); - bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0); + if (atomic_read(&meta->error)) + bio_io_error(meta->bio); + else + bio_endio(meta->bio); kmem_cache_free(bio_meta_pool, meta); } } -static void rsxx_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) { struct rsxx_cardinfo *card = q->queuedata; struct rsxx_bio_meta *bio_meta; int st = -EINVAL; + blk_queue_split(q, &bio, q->bio_split); + might_sleep(); if (!card) @@ -194,12 +199,15 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio) if (st) goto queue_err; - return; + return BLK_QC_T_NONE; queue_err: kmem_cache_free(bio_meta_pool, bio_meta); req_err: - bio_endio(bio, st); + if (st) + bio->bi_error = st; + bio_endio(bio); + return BLK_QC_T_NONE; } /*----------------- Device Setup -------------------*/ diff --git a/kernel/drivers/block/skd_main.c b/kernel/drivers/block/skd_main.c index 1e46eb230..586f9168f 100644 --- a/kernel/drivers/block/skd_main.c +++ b/kernel/drivers/block/skd_main.c @@ -4422,7 +4422,7 @@ static int skd_cons_disk(struct skd_device *skdev) /* DISCARD Flag initialization. 
*/ q->limits.discard_granularity = 8192; q->limits.discard_alignment = 0; - q->limits.max_discard_sectors = UINT_MAX >> 9; + blk_queue_max_discard_sectors(q, UINT_MAX >> 9); q->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); diff --git a/kernel/drivers/block/sx8.c b/kernel/drivers/block/sx8.c index 5d552857d..59c91d49b 100644 --- a/kernel/drivers/block/sx8.c +++ b/kernel/drivers/block/sx8.c @@ -620,7 +620,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx) spin_unlock_irq(&host->lock); DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); @@ -661,7 +661,7 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) crq->msg_bucket = (u32) rc; DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx); - crq->rq->cmd_type = REQ_TYPE_SPECIAL; + crq->rq->cmd_type = REQ_TYPE_DRV_PRIV; crq->rq->special = crq; blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL); diff --git a/kernel/drivers/block/umem.c b/kernel/drivers/block/umem.c index 4cf81b5bf..7939b9f87 100644 --- a/kernel/drivers/block/umem.c +++ b/kernel/drivers/block/umem.c @@ -456,7 +456,7 @@ static void process_page(unsigned long data) PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE); if (control & DMASCR_HARD_ERROR) { /* error */ - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; dev_printk(KERN_WARNING, &card->dev->dev, "I/O error on sector %d/%d\n", le32_to_cpu(desc->local_addr)>>9, @@ -505,7 +505,7 @@ static void process_page(unsigned long data) return_bio = bio->bi_next; bio->bi_next = NULL; - bio_endio(bio, 0); + bio_endio(bio); } } @@ -524,13 +524,15 @@ static int mm_check_plugged(struct cardinfo *card) return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); } -static void mm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) { struct cardinfo *card = q->queuedata; pr_debug("mm_make_request %llu %u\n", (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); + blk_queue_split(q, &bio, q->bio_split); + spin_lock_irq(&card->lock); *card->biotail = bio; bio->bi_next = NULL; @@ -539,7 +541,7 @@ static void mm_make_request(struct request_queue *q, struct bio *bio) activate(card); spin_unlock_irq(&card->lock); - return; + return BLK_QC_T_NONE; } static irqreturn_t mm_interrupt(int irq, void *__card) diff --git a/kernel/drivers/block/virtio_blk.c b/kernel/drivers/block/virtio_blk.c index 5ea2f0bbb..6ca35495a 100644 --- a/kernel/drivers/block/virtio_blk.c +++ b/kernel/drivers/block/virtio_blk.c @@ -124,7 +124,7 @@ static inline void virtblk_request_done(struct request *req) req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); - } else if (req->cmd_type == REQ_TYPE_SPECIAL) { + } else if (req->cmd_type == REQ_TYPE_DRV_PRIV) { req->errors = (error != 0); } @@ -144,7 +144,7 @@ static void virtblk_done(struct virtqueue *vq) do { virtqueue_disable_cb(vq); while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { - blk_mq_complete_request(vbr->req); + blk_mq_complete_request(vbr->req, vbr->req->errors); req_done = true; } if (unlikely(virtqueue_is_broken(vq))) @@ -188,7 +188,7 @@ static int 
virtio_queue_rq(struct blk_mq_hw_ctx *hctx, vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; - case REQ_TYPE_SPECIAL: + case REQ_TYPE_DRV_PRIV: vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); @@ -251,7 +251,7 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str) return PTR_ERR(req); } - req->cmd_type = REQ_TYPE_SPECIAL; + req->cmd_type = REQ_TYPE_DRV_PRIV; err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); blk_put_request(req); @@ -478,8 +478,7 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) struct virtio_blk_config, wce, &writeback); if (err) - writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) || - virtio_has_feature(vdev, VIRTIO_F_VERSION_1); + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); return writeback; } @@ -657,6 +656,7 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; vblk->disk->driverfs_dev = &vdev->dev; + vblk->disk->flags |= GENHD_FL_EXT_DEVT; vblk->index = index; /* configure queue flush support */ @@ -840,7 +840,7 @@ static unsigned int features_legacy[] = { static unsigned int features[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, - VIRTIO_BLK_F_TOPOLOGY, + VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, }; diff --git a/kernel/drivers/block/xen-blkback/blkback.c b/kernel/drivers/block/xen-blkback/blkback.c index 3e9ec9523..41fb1a917 100644 --- a/kernel/drivers/block/xen-blkback/blkback.c +++ b/kernel/drivers/block/xen-blkback/blkback.c @@ -83,6 +83,13 @@ module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644); MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); +/* + * Maximum order of pages to be used for the shared ring between front and + * backend, 4KB page granularity is used. + */ +unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); /* * The LRU mechanism to clean the lists of persistent grants needs to * be executed periodically. 
The time interval between consecutive executions @@ -729,7 +736,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_pages, + invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -915,7 +922,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) int rc; rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, - pending_req->nr_pages, + pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); return rc; @@ -931,7 +938,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; - nseg = pending_req->nr_pages; + nseg = pending_req->nr_segs; indirect_grefs = INDIRECT_PAGES(nseg); BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -943,6 +950,8 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, goto unmap; for (n = 0, i = 0; n < nseg; n++) { + uint8_t first_sect, last_sect; + if ((n % SEGS_PER_INDIRECT_FRAME) == 0) { /* Map indirect segments */ if (segments) @@ -950,15 +959,18 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page); } i = n % SEGS_PER_INDIRECT_FRAME; + pending_req->segments[n]->gref = segments[i].gref; - seg[n].nsec = segments[i].last_sect - - segments[i].first_sect + 1; - seg[n].offset = (segments[i].first_sect << 9); - if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) || - (segments[i].last_sect < segments[i].first_sect)) { + + first_sect = READ_ONCE(segments[i].first_sect); + last_sect = READ_ONCE(segments[i].last_sect); + if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) { rc = -EINVAL; goto unmap; } + + seg[n].nsec = last_sect - first_sect + 1; + seg[n].offset = first_sect << 9; preq->nr_sects += seg[n].nsec; } @@ -1071,9 +1083,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) /* * bio callback. */ -static void end_block_io_op(struct bio *bio, int error) +static void end_block_io_op(struct bio *bio) { - __end_block_io_op(bio->bi_private, error); + __end_block_io_op(bio->bi_private, bio->bi_error); bio_put(bio); } @@ -1203,6 +1215,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, req_operation = req->operation == BLKIF_OP_INDIRECT ? 
req->u.indirect.indirect_op : req->operation; + if ((req->operation == BLKIF_OP_INDIRECT) && (req_operation != BLKIF_OP_READ) && (req_operation != BLKIF_OP_WRITE)) { @@ -1251,7 +1264,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; + pending_req->nr_segs = nseg; if (req->operation != BLKIF_OP_INDIRECT) { preq.dev = req->u.rw.handle; @@ -1261,7 +1274,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, seg[i].nsec = req->u.rw.seg[i].last_sect - req->u.rw.seg[i].first_sect + 1; seg[i].offset = (req->u.rw.seg[i].first_sect << 9); - if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) || (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) goto fail_response; @@ -1372,7 +1385,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, fail_flush: xen_blkbk_unmap(blkif, pending_req->segments, - pending_req->nr_pages); + pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); @@ -1438,6 +1451,12 @@ static int __init xen_blkif_init(void) if (!xen_domain()) return -ENODEV; + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", + xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; + } + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/kernel/drivers/block/xen-blkback/common.h b/kernel/drivers/block/xen-blkback/common.h index f620b5d3f..c929ae227 100644 --- a/kernel/drivers/block/xen-blkback/common.h +++ b/kernel/drivers/block/xen-blkback/common.h @@ -39,23 +39,33 @@ #include #include #include +#include #include #include #include #include +extern unsigned int xen_blkif_max_ring_order; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. */ #define MAX_INDIRECT_SEGMENTS 256 -#define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment)) +/* + * Xen use 4K pages. The guest may use different page size (4K or 64K) + * Number of Xen pages per segment + */ +#define XEN_PAGES_PER_SEGMENT (PAGE_SIZE / XEN_PAGE_SIZE) + +#define XEN_PAGES_PER_INDIRECT_FRAME \ + (XEN_PAGE_SIZE/sizeof(struct blkif_request_segment)) +#define SEGS_PER_INDIRECT_FRAME \ + (XEN_PAGES_PER_INDIRECT_FRAME / XEN_PAGES_PER_SEGMENT) + #define MAX_INDIRECT_PAGES \ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) -#define INDIRECT_PAGES(_segs) \ - ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +#define INDIRECT_PAGES(_segs) DIV_ROUND_UP(_segs, XEN_PAGES_PER_INDIRECT_FRAME) /* Not a real protocol. Used to generate ring structs which contain * the elements common to all protocols only. This way we get a @@ -248,7 +258,7 @@ struct backend_info; #define PERSISTENT_GNT_WAS_ACTIVE 1 /* Number of requests that we can fit in a ring */ -#define XEN_BLKIF_REQS 32 +#define XEN_BLKIF_REQS_PER_PAGE 32 struct persistent_gnt { struct page *page; @@ -320,6 +330,7 @@ struct xen_blkif { struct work_struct free_work; /* Thread shutdown wait queue. 
*/ wait_queue_head_t shutdown_wq; + unsigned int nr_ring_pages; }; struct seg_buf { @@ -343,7 +354,7 @@ struct grant_page { struct pending_req { struct xen_blkif *blkif; u64 id; - int nr_pages; + int nr_segs; atomic_t pendcnt; unsigned short operation; int status; @@ -397,8 +408,8 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, struct blkif_x86_32_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; - dst->operation = src->operation; - switch (src->operation) { + dst->operation = READ_ONCE(src->operation); + switch (dst->operation) { case BLKIF_OP_READ: case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: @@ -445,8 +456,8 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, struct blkif_x86_64_request *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j; - dst->operation = src->operation; - switch (src->operation) { + dst->operation = READ_ONCE(src->operation); + switch (dst->operation) { case BLKIF_OP_READ: case BLKIF_OP_WRITE: case BLKIF_OP_WRITE_BARRIER: diff --git a/kernel/drivers/block/xen-blkback/xenbus.c b/kernel/drivers/block/xen-blkback/xenbus.c index 6ab69ad61..f53cff42f 100644 --- a/kernel/drivers/block/xen-blkback/xenbus.c +++ b/kernel/drivers/block/xen-blkback/xenbus.c @@ -25,6 +25,7 @@ /* Enlarge the array size in order to fully show blkback name. */ #define BLKBACK_NAME_LEN (20) +#define RINGREF_NAME_LEN (20) struct backend_info { struct xenbus_device *dev; @@ -124,8 +125,6 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) static struct xen_blkif *xen_blkif_alloc(domid_t domid) { struct xen_blkif *blkif; - struct pending_req *req, *n; - int i, j; BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST); @@ -151,55 +150,15 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, xen_blkif_deferred_free); - - for (i = 0; i < XEN_BLKIF_REQS; i++) { - req = kzalloc(sizeof(*req), GFP_KERNEL); - if (!req) - goto fail; - list_add_tail(&req->free_list, - &blkif->pending_free); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { - req->segments[j] = kzalloc(sizeof(*req->segments[0]), - GFP_KERNEL); - if (!req->segments[j]) - goto fail; - } - for (j = 0; j < MAX_INDIRECT_PAGES; j++) { - req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), - GFP_KERNEL); - if (!req->indirect_pages[j]) - goto fail; - } - } spin_lock_init(&blkif->pending_free_lock); init_waitqueue_head(&blkif->pending_free_wq); init_waitqueue_head(&blkif->shutdown_wq); return blkif; - -fail: - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { - if (!req->segments[j]) - break; - kfree(req->segments[j]); - } - for (j = 0; j < MAX_INDIRECT_PAGES; j++) { - if (!req->indirect_pages[j]) - break; - kfree(req->indirect_pages[j]); - } - kfree(req); - } - - kmem_cache_free(xen_blkif_cachep, blkif); - - return ERR_PTR(-ENOMEM); } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, - unsigned int evtchn) +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, + unsigned int nr_grefs, unsigned int evtchn) { int err; @@ -207,7 +166,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, if (blkif->irq) return 0; - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1, + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, &blkif->blk_ring); if (err < 0) return err; @@ -217,21 +176,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, 
grant_ref_t gref, { struct blkif_sring *sring; sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.native, sring, + XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + XEN_PAGE_SIZE * nr_grefs); break; } default: @@ -253,6 +215,9 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, static int xen_blkif_disconnect(struct xen_blkif *blkif) { + struct pending_req *req, *n; + int i = 0, j; + if (blkif->xenblkd) { kthread_stop(blkif->xenblkd); wake_up(&blkif->shutdown_wq); @@ -279,13 +244,28 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif) /* Remove all persistent grants and the cache of ballooned pages. */ xen_blkbk_free_caches(blkif); + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { + list_del(&req->free_list); + + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); + + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } + + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + blkif->nr_ring_pages = 0; + return 0; } static void xen_blkif_free(struct xen_blkif *blkif) { - struct pending_req *req, *n; - int i = 0, j; xen_blkif_disconnect(blkif); xen_vbd_free(&blkif->vbd); @@ -298,22 +278,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) BUG_ON(!list_empty(&blkif->free_pages)); BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); - - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) - kfree(req->segments[j]); - - for (j = 0; j < MAX_INDIRECT_PAGES; j++) - kfree(req->indirect_pages[j]); - - kfree(req); - i++; - } - - WARN_ON(i != XEN_BLKIF_REQS); - kmem_cache_free(xen_blkif_cachep, blkif); } @@ -597,6 +561,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev, if (err) goto fail; + err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u", + xen_blkif_max_ring_order); + if (err) + pr_warn("%s write out 'max-ring-page-order' failed\n", __func__); + err = xenbus_switch_state(dev, XenbusStateInitWait); if (err) goto fail; @@ -860,22 +829,66 @@ again: static int connect_ring(struct backend_info *be) { struct xenbus_device *dev = be->dev; - unsigned long ring_ref; - unsigned int evtchn; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, nr_grefs, ring_page_order; unsigned int pers_grants; char protocol[64] = ""; - int err; + struct pending_req *req, *n; + int err, i, j; pr_debug("%s %s\n", __func__, dev->otherend); - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", - &ring_ref, "event-channel", "%u", &evtchn, NULL); - if (err) { - xenbus_dev_fatal(dev, err, - "reading %s/ring-ref and event-channel", + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + &evtchn); + if (err != 1) { + err = -EINVAL; + 
xenbus_dev_fatal(dev, err, "reading %s/event-channel", dev->otherend); return err; } + pr_info("event-channel %u\n", evtchn); + + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", + &ring_page_order); + if (err != 1) { + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", + "%u", &ring_ref[0]); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", + dev->otherend); + return err; + } + nr_grefs = 1; + pr_info("%s:using single page: ring-ref %d\n", dev->otherend, + ring_ref[0]); + } else { + unsigned int i; + + if (ring_page_order > xen_blkif_max_ring_order) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", + dev->otherend, ring_page_order, + xen_blkif_max_ring_order); + return err; + } + + nr_grefs = 1 << ring_page_order; + for (i = 0; i < nr_grefs; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + "%u", &ring_ref[i]); + if (err != 1) { + err = -EINVAL; + xenbus_dev_fatal(dev, err, "reading %s/%s", + dev->otherend, ring_ref_name); + return err; + } + pr_info("ring-ref%u: %u\n", i, ring_ref[i]); + } + } be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", @@ -900,20 +913,55 @@ static int connect_ring(struct backend_info *be) be->blkif->vbd.feature_gnt_persistent = pers_grants; be->blkif->vbd.overflow_max_grants = 0; + be->blkif->nr_ring_pages = nr_grefs; - pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", - ring_ref, evtchn, be->blkif->blk_protocol, protocol, + pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", + nr_grefs, evtchn, be->blkif->blk_protocol, protocol, pers_grants ? "persistent grants" : ""); + for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + goto fail; + list_add_tail(&req->free_list, &be->blkif->pending_free); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); + if (!req->segments[j]) + goto fail; + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]), + GFP_KERNEL); + if (!req->indirect_pages[j]) + goto fail; + } + } + /* Map the shared frame, irq etc. 
*/ - err = xen_blkif_map(be->blkif, ring_ref, evtchn); + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); if (err) { - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", - ring_ref, evtchn); + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; } return 0; + +fail: + list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_del(&req->free_list); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { + if (!req->segments[j]) + break; + kfree(req->segments[j]); + } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) { + if (!req->indirect_pages[j]) + break; + kfree(req->indirect_pages[j]); + } + kfree(req); + } + return -ENOMEM; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/kernel/drivers/block/xen-blkfront.c b/kernel/drivers/block/xen-blkfront.c index 89c7371ab..2fee2eef9 100644 --- a/kernel/drivers/block/xen-blkfront.c +++ b/kernel/drivers/block/xen-blkfront.c @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -67,7 +68,7 @@ enum blkif_state { struct grant { grant_ref_t gref; - unsigned long pfn; + struct page *page; struct list_head node; }; @@ -77,12 +78,12 @@ struct blk_shadow { struct grant **grants_used; struct grant **indirect_grants; struct scatterlist *sg; + unsigned int num_sg; }; struct split_bio { struct bio *bio; atomic_t pending; - int err; }; static DEFINE_MUTEX(blkfront_mutex); @@ -98,7 +99,25 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) +/* + * Maximum order of pages to be used for the shared ring between front and + * backend, 4KB page granularity is used. + */ +static unsigned int xen_blkif_max_ring_order; +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO); +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring"); + +#define BLK_RING_SIZE(info) \ + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages) + +#define BLK_MAX_RING_SIZE \ + __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) + +/* + * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consist with backend. + */ +#define RINGREF_NAME_LEN (20) /* * We have one of these per vbd, whether ide, scsi or 'other'. 
They @@ -114,13 +133,14 @@ struct blkfront_info int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref; + int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int nr_ring_pages; struct blkif_front_ring ring; unsigned int evtchn, irq; struct request_queue *rq; struct work_struct work; struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_RING_SIZE]; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; struct list_head grants; struct list_head indirect_pages; unsigned int persistent_gnts_c; @@ -131,16 +151,16 @@ struct blkfront_info unsigned int discard_granularity; unsigned int discard_alignment; unsigned int feature_persistent:1; + /* Number of 4KB segments handled */ unsigned int max_indirect_segments; int is_ready; + struct blk_mq_tag_set tag_set; }; static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) #define GRANT_INVALID_REF 0 #define PARTS_PER_DISK 16 @@ -160,17 +180,31 @@ static DEFINE_SPINLOCK(minor_lock); #define DEV_NAME "xvd" /* name in /dev */ -#define SEGS_PER_INDIRECT_FRAME \ - (PAGE_SIZE/sizeof(struct blkif_request_segment)) -#define INDIRECT_GREFS(_segs) \ - ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME) +/* + * Grants are always the same size as a Xen page (i.e 4KB). + * A physical segment is always the same size as a Linux page. + * Number of grants per physical segment + */ +#define GRANTS_PER_PSEG (PAGE_SIZE / XEN_PAGE_SIZE) + +#define GRANTS_PER_INDIRECT_FRAME \ + (XEN_PAGE_SIZE / sizeof(struct blkif_request_segment)) + +#define PSEGS_PER_INDIRECT_FRAME \ + (GRANTS_INDIRECT_FRAME / GRANTS_PSEGS) + +#define INDIRECT_GREFS(_grants) \ + DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME) + +#define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) static int blkfront_setup_indirect(struct blkfront_info *info); +static int blkfront_gather_backend_features(struct blkfront_info *info); static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE); + BUG_ON(free >= BLK_RING_SIZE(info)); info->shadow_free = info->shadow[free].req.u.rw.id; info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; @@ -206,7 +240,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) kfree(gnt_list_entry); goto out_of_memory; } - gnt_list_entry->pfn = page_to_pfn(granted_page); + gnt_list_entry->page = granted_page; } gnt_list_entry->gref = GRANT_INVALID_REF; @@ -221,7 +255,7 @@ out_of_memory: &info->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) - __free_page(pfn_to_page(gnt_list_entry->pfn)); + __free_page(gnt_list_entry->page); kfree(gnt_list_entry); i--; } @@ -229,34 +263,77 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_grant(grant_ref_t *gref_head, - unsigned long pfn, - struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_info *info) { struct grant *gnt_list_entry; - unsigned long buffer_mfn; BUG_ON(list_empty(&info->grants)); gnt_list_entry = list_first_entry(&info->grants, struct grant, - node); + node); list_del(&gnt_list_entry->node); - if (gnt_list_entry->gref != GRANT_INVALID_REF) { + if (gnt_list_entry->gref != GRANT_INVALID_REF) info->persistent_gnts_c--; + + return gnt_list_entry; +} + +static inline void grant_foreign_access(const struct grant *gnt_list_entry, + const struct blkfront_info *info) +{ + 
gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref, + info->xbdev->otherend_id, + gnt_list_entry->page, + 0); +} + +static struct grant *get_grant(grant_ref_t *gref_head, + unsigned long gfn, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry = get_free_grant(info); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; + + /* Assign a gref to this page */ + gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); + BUG_ON(gnt_list_entry->gref == -ENOSPC); + if (info->feature_persistent) + grant_foreign_access(gnt_list_entry, info); + else { + /* Grant access to the GFN passed by the caller */ + gnttab_grant_foreign_access_ref(gnt_list_entry->gref, + info->xbdev->otherend_id, + gfn, 0); } + return gnt_list_entry; +} + +static struct grant *get_indirect_grant(grant_ref_t *gref_head, + struct blkfront_info *info) +{ + struct grant *gnt_list_entry = get_free_grant(info); + + if (gnt_list_entry->gref != GRANT_INVALID_REF) + return gnt_list_entry; + /* Assign a gref to this page */ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head); BUG_ON(gnt_list_entry->gref == -ENOSPC); if (!info->feature_persistent) { - BUG_ON(!pfn); - gnt_list_entry->pfn = pfn; + struct page *indirect_page; + + /* Fetch a pre-allocated page to use for indirect grefs */ + BUG_ON(list_empty(&info->indirect_pages)); + indirect_page = list_first_entry(&info->indirect_pages, + struct page, lru); + list_del(&indirect_page->lru); + gnt_list_entry->page = indirect_page; } - buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn); - gnttab_grant_foreign_access_ref(gnt_list_entry->gref, - info->xbdev->otherend_id, - buffer_mfn, 0); + grant_foreign_access(gnt_list_entry, info); + return gnt_list_entry; } @@ -379,20 +456,128 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -/* - * Generate a Xen blkfront IO request from a blk layer request. Reads - * and writes are handled as expected. - * - * @req: a request struct - */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_discard_req(struct request *req) { struct blkfront_info *info = req->rq_disk->private_data; struct blkif_request *ring_req; unsigned long id; + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); + id = get_id_from_freelist(info); + info->shadow[id].request = req; + + ring_req->operation = BLKIF_OP_DISCARD; + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + ring_req->u.discard.id = id; + ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); + if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; + else + ring_req->u.discard.flag = 0; + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. 
*/ + info->shadow[id].req = *ring_req; + + return 0; +} + +struct setup_rw_req { + unsigned int grant_idx; + struct blkif_request_segment *segments; + struct blkfront_info *info; + struct blkif_request *ring_req; + grant_ref_t gref_head; + unsigned int id; + /* Only used when persistent grant is used and it's a read request */ + bool need_copy; + unsigned int bvec_off; + char *bvec_data; +}; + +static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct setup_rw_req *setup = data; + int n, ref; + struct grant *gnt_list_entry; unsigned int fsect, lsect; - int i, ref, n; - struct blkif_request_segment *segments = NULL; + /* Convenient aliases */ + unsigned int grant_idx = setup->grant_idx; + struct blkif_request *ring_req = setup->ring_req; + struct blkfront_info *info = setup->info; + struct blk_shadow *shadow = &info->shadow[setup->id]; + + if ((ring_req->operation == BLKIF_OP_INDIRECT) && + (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { + if (setup->segments) + kunmap_atomic(setup->segments); + + n = grant_idx / GRANTS_PER_INDIRECT_FRAME; + gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + shadow->indirect_grants[n] = gnt_list_entry; + setup->segments = kmap_atomic(gnt_list_entry->page); + ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; + } + + gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + ref = gnt_list_entry->gref; + shadow->grants_used[grant_idx] = gnt_list_entry; + + if (setup->need_copy) { + void *shared_data; + + shared_data = kmap_atomic(gnt_list_entry->page); + /* + * this does not wipe data stored outside the + * range sg->offset..sg->offset+sg->length. + * Therefore, blkback *could* see data from + * previous requests. This is OK as long as + * persistent grants are shared with just one + * domain. It may need refactoring if this + * changes + */ + memcpy(shared_data + offset, + setup->bvec_data + setup->bvec_off, + len); + + kunmap_atomic(shared_data); + setup->bvec_off += len; + } + + fsect = offset >> 9; + lsect = fsect + (len >> 9) - 1; + if (ring_req->operation != BLKIF_OP_INDIRECT) { + ring_req->u.rw.seg[grant_idx] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } else { + setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } + + (setup->grant_idx)++; +} + +static int blkif_queue_rw_req(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + struct blkif_request *ring_req; + unsigned long id; + int i; + struct setup_rw_req setup = { + .grant_idx = 0, + .segments = NULL, + .info = info, + .need_copy = rq_data_dir(req) && info->feature_persistent, + }; /* * Used to store if we are able to queue the request by just using @@ -400,28 +585,23 @@ static int blkif_queue_request(struct request *req) * as there are not sufficiently many free. */ bool new_persistent_gnts; - grant_ref_t gref_head; - struct grant *gnt_list_entry = NULL; struct scatterlist *sg; - int nseg, max_grefs; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) - return 1; + int num_sg, max_grefs, num_grant; - max_grefs = req->nr_phys_segments; + max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG; if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST) /* * If we are using indirect segments we need to account * for the indirect grefs used in the request. 
*/ - max_grefs += INDIRECT_GREFS(req->nr_phys_segments); + max_grefs += INDIRECT_GREFS(max_grefs); /* Check if we have enough grants to allocate a requests */ if (info->persistent_gnts_c < max_grefs) { new_persistent_gnts = 1; if (gnttab_alloc_grant_references( max_grefs - info->persistent_gnts_c, - &gref_head) < 0) { + &setup.gref_head) < 0) { gnttab_request_free_callback( &info->callback, blkif_restart_queue_callback, @@ -437,139 +617,82 @@ static int blkif_queue_request(struct request *req) id = get_id_from_freelist(info); info->shadow[id].request = req; - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) { - ring_req->operation = BLKIF_OP_DISCARD; - ring_req->u.discard.nr_sectors = blk_rq_sectors(req); - ring_req->u.discard.id = id; - ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); - if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) - ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; - else - ring_req->u.discard.flag = 0; + BUG_ON(info->max_indirect_segments == 0 && + GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); + BUG_ON(info->max_indirect_segments && + GREFS(req->nr_phys_segments) > info->max_indirect_segments); + + num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_grant = 0; + /* Calculate the number of grant used */ + for_each_sg(info->shadow[id].sg, sg, num_sg, i) + num_grant += gnttab_count_grant(sg->offset, sg->length); + + ring_req->u.rw.id = id; + info->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + /* + * The indirect operation can only be a BLKIF_OP_READ or + * BLKIF_OP_WRITE + */ + BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + ring_req->operation = BLKIF_OP_INDIRECT; + ring_req->u.indirect.indirect_op = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.indirect.handle = info->handle; + ring_req->u.indirect.nr_segments = num_grant; } else { - BUG_ON(info->max_indirect_segments == 0 && - req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - req->nr_phys_segments > info->max_indirect_segments); - nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); - ring_req->u.rw.id = id; - if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); + ring_req->u.rw.handle = info->handle; + ring_req->operation = rq_data_dir(req) ? + BLKIF_OP_WRITE : BLKIF_OP_READ; + if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { /* - * The indirect operation can only be a BLKIF_OP_READ or - * BLKIF_OP_WRITE + * Ideally we can do an unordered flush-to-disk. + * In case the backend onlysupports barriers, use that. + * A barrier request a superset of FUA, so we can + * implement it the same way. (It's also a FLUSH+FUA, + * since it is guaranteed ordered WRT previous writes.) */ - BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); - ring_req->operation = BLKIF_OP_INDIRECT; - ring_req->u.indirect.indirect_op = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.indirect.handle = info->handle; - ring_req->u.indirect.nr_segments = nseg; - } else { - ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req); - ring_req->u.rw.handle = info->handle; - ring_req->operation = rq_data_dir(req) ? - BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { - /* - * Ideally we can do an unordered flush-to-disk. 
In case the - * backend onlysupports barriers, use that. A barrier request - * a superset of FUA, so we can implement it the same - * way. (It's also a FLUSH+FUA, since it is - * guaranteed ordered WRT previous writes.) - */ - switch (info->feature_flush & - ((REQ_FLUSH|REQ_FUA))) { - case REQ_FLUSH|REQ_FUA: - ring_req->operation = - BLKIF_OP_WRITE_BARRIER; - break; - case REQ_FLUSH: - ring_req->operation = - BLKIF_OP_FLUSH_DISKCACHE; - break; - default: - ring_req->operation = 0; - } + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; } - ring_req->u.rw.nr_segments = nseg; } - for_each_sg(info->shadow[id].sg, sg, nseg, i) { - fsect = sg->offset >> 9; - lsect = fsect + (sg->length >> 9) - 1; - - if ((ring_req->operation == BLKIF_OP_INDIRECT) && - (i % SEGS_PER_INDIRECT_FRAME == 0)) { - unsigned long uninitialized_var(pfn); - - if (segments) - kunmap_atomic(segments); - - n = i / SEGS_PER_INDIRECT_FRAME; - if (!info->feature_persistent) { - struct page *indirect_page; - - /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, - struct page, lru); - list_del(&indirect_page->lru); - pfn = page_to_pfn(indirect_page); - } - gnt_list_entry = get_grant(&gref_head, pfn, info); - info->shadow[id].indirect_grants[n] = gnt_list_entry; - segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); - ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; - } - - gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info); - ref = gnt_list_entry->gref; - - info->shadow[id].grants_used[i] = gnt_list_entry; - - if (rq_data_dir(req) && info->feature_persistent) { - char *bvec_data; - void *shared_data; + ring_req->u.rw.nr_segments = num_grant; + } - BUG_ON(sg->offset + sg->length > PAGE_SIZE); + setup.ring_req = ring_req; + setup.id = id; + for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn)); - bvec_data = kmap_atomic(sg_page(sg)); + if (setup.need_copy) { + setup.bvec_off = sg->offset; + setup.bvec_data = kmap_atomic(sg_page(sg)); + } - /* - * this does not wipe data stored outside the - * range sg->offset..sg->offset+sg->length. - * Therefore, blkback *could* see data from - * previous requests. This is OK as long as - * persistent grants are shared with just one - * domain. 
It may need refactoring if this - * changes - */ - memcpy(shared_data + sg->offset, - bvec_data + sg->offset, - sg->length); + gnttab_foreach_grant_in_range(sg_page(sg), + sg->offset, + sg->length, + blkif_setup_rw_req_grant, + &setup); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); - } - if (ring_req->operation != BLKIF_OP_INDIRECT) { - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } else { - n = i % SEGS_PER_INDIRECT_FRAME; - segments[n] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; - } - } - if (segments) - kunmap_atomic(segments); + if (setup.need_copy) + kunmap_atomic(setup.bvec_data); } + if (setup.segments) + kunmap_atomic(setup.segments); info->ring.req_prod_pvt++; @@ -577,11 +700,29 @@ static int blkif_queue_request(struct request *req) info->shadow[id].req = *ring_req; if (new_persistent_gnts) - gnttab_free_grant_references(gref_head); + gnttab_free_grant_references(setup.gref_head); return 0; } +/* + * Generate a Xen blkfront IO request from a blk layer request. Reads + * and writes are handled as expected. + * + * @req: a request struct + */ +static int blkif_queue_request(struct request *req) +{ + struct blkfront_info *info = req->rq_disk->private_data; + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) + return blkif_queue_discard_req(req); + else + return blkif_queue_rw_req(req); +} static inline void flush_requests(struct blkfront_info *info) { @@ -603,54 +744,41 @@ static inline bool blkif_request_flush_invalid(struct request *req, !(info->feature_flush & REQ_FUA))); } -/* - * do_blkif_request - * read a block; request is in a request queue - */ -static void do_blkif_request(struct request_queue *rq) +static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = NULL; - struct request *req; - int queued; - - pr_debug("Entered do_blkif_request\n"); + struct blkfront_info *info = qd->rq->rq_disk->private_data; - queued = 0; - - while ((req = blk_peek_request(rq)) != NULL) { - info = req->rq_disk->private_data; - - if (RING_FULL(&info->ring)) - goto wait; + blk_mq_start_request(qd->rq); + spin_lock_irq(&info->io_lock); + if (RING_FULL(&info->ring)) + goto out_busy; - blk_start_request(req); + if (blkif_request_flush_invalid(qd->rq, info)) + goto out_err; - if (blkif_request_flush_invalid(req, info)) { - __blk_end_request_all(req, -EOPNOTSUPP); - continue; - } + if (blkif_queue_request(qd->rq)) + goto out_busy; - pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%u) [%s]\n", - req, req->cmd, (unsigned long)blk_rq_pos(req), - blk_rq_cur_sectors(req), blk_rq_sectors(req), - rq_data_dir(req) ? "write" : "read"); - - if (blkif_queue_request(req)) { - blk_requeue_request(rq, req); -wait: - /* Avoid pointless unplugs. 
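
[Editor's note] The rewritten blkif_queue_rw_req() no longer emits one ring segment per scatterlist entry; it hands each sg element to gnttab_foreach_grant_in_range(), which walks the buffer in XEN_PAGE_SIZE chunks and calls blkif_setup_rw_req_grant() once per chunk. That is what lets a guest built with 64 KiB pages keep handing the backend 4 KiB grants. The standalone model below (all names are mine, not the kernel API) only reproduces the chunking arithmetic, so the grant counts that gnttab_count_grant() would report can be checked in userspace.

#include <stddef.h>
#include <stdio.h>

#define XEN_PAGE_SIZE 4096UL   /* grant granularity, fixed by the ABI */

typedef void (*grant_fn)(size_t gfn_idx, size_t off_in_gfn,
                         size_t len, void *data);

/* Walk [offset, offset+len) of a buffer in XEN_PAGE_SIZE chunks,
 * calling fn once per chunk -- the same shape as the
 * gnttab_foreach_grant_in_range() calls above. */
static void foreach_grant_in_range(size_t offset, size_t len,
                                   grant_fn fn, void *data)
{
        while (len) {
                size_t off_in_gfn = offset & (XEN_PAGE_SIZE - 1);
                size_t chunk = XEN_PAGE_SIZE - off_in_gfn;

                if (chunk > len)
                        chunk = len;
                fn(offset / XEN_PAGE_SIZE, off_in_gfn, chunk, data);
                offset += chunk;
                len -= chunk;
        }
}

static void count_grant(size_t gfn_idx, size_t off, size_t len, void *data)
{
        (void)gfn_idx; (void)off; (void)len;
        (*(unsigned int *)data)++;
}

int main(void)
{
        unsigned int grants = 0;

        /* A 64 KiB segment starting 512 bytes into a grant: 17 grants. */
        foreach_grant_in_range(512, 64 * 1024, count_grant, &grants);
        printf("%u grants\n", grants);
        return 0;
}
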
*/ - blk_stop_queue(rq); - break; - } + flush_requests(info); + spin_unlock_irq(&info->io_lock); + return BLK_MQ_RQ_QUEUE_OK; - queued++; - } +out_err: + spin_unlock_irq(&info->io_lock); + return BLK_MQ_RQ_QUEUE_ERROR; - if (queued != 0) - flush_requests(info); +out_busy: + spin_unlock_irq(&info->io_lock); + blk_mq_stop_hw_queue(hctx); + return BLK_MQ_RQ_QUEUE_BUSY; } +static struct blk_mq_ops blkfront_mq_ops = { + .queue_rq = blkif_queue_rq, + .map_queue = blk_mq_map_queue, +}; + static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, unsigned int physical_sector_size, unsigned int segments) @@ -658,9 +786,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, struct request_queue *rq; struct blkfront_info *info = gd->private_data; - rq = blk_init_queue(do_blkif_request, &info->io_lock); - if (rq == NULL) + memset(&info->tag_set, 0, sizeof(info->tag_set)); + info->tag_set.ops = &blkfront_mq_ops; + info->tag_set.nr_hw_queues = 1; + info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.numa_node = NUMA_NO_NODE; + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + info->tag_set.cmd_size = 0; + info->tag_set.driver_data = info; + + if (blk_mq_alloc_tag_set(&info->tag_set)) + return -1; + rq = blk_mq_init_queue(&info->tag_set); + if (IS_ERR(rq)) { + blk_mq_free_tag_set(&info->tag_set); return -1; + } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -676,14 +817,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); blk_queue_physical_block_size(rq, physical_sector_size); - blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512); + blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); blk_queue_max_segment_size(rq, PAGE_SIZE); /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, segments); + blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); /* Make sure buffer addresses are sector-aligned. */ blk_queue_dma_alignment(rq, 511); @@ -888,19 +1029,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { unsigned int minor, nr_minors; - unsigned long flags; if (info->rq == NULL) return; - spin_lock_irqsave(&info->io_lock, flags); - /* No more blkif_request(). */ - blk_stop_queue(info->rq); + blk_mq_stop_hw_queues(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irqrestore(&info->io_lock, flags); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work(&info->work); @@ -912,20 +1049,18 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) xlbd_release_minors(minor, nr_minors); blk_cleanup_queue(info->rq); + blk_mq_free_tag_set(&info->tag_set); info->rq = NULL; put_disk(info->gd); info->gd = NULL; } +/* Must be called with io_lock holded */ static void kick_pending_request_queues(struct blkfront_info *info) { - if (!RING_FULL(&info->ring)) { - /* Re-enable calldowns. */ - blk_start_queue(info->rq); - /* Kick things off immediately. 
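
[Editor's note] With the queue limits in xlvbd_init_blk_queue() now expressed in grant units, the two settings scale differently: max_hw_sectors follows the number of grants (each covering XEN_PAGE_SIZE bytes), while max_segments shrinks by GRANTS_PER_PSEG, because one physical segment of up to PAGE_SIZE may consume several grants. Assuming GRANTS_PER_PSEG is PAGE_SIZE / XEN_PAGE_SIZE (its definition lies outside this hunk), the throwaway program below shows the effect for an example budget of 32 grants.

#include <stdio.h>

#define XEN_PAGE_SIZE 4096UL

static void show(unsigned long page_size, unsigned long grants)
{
        unsigned long grants_per_pseg = page_size / XEN_PAGE_SIZE;

        printf("PAGE_SIZE=%lu: max_hw_sectors=%lu, max_segments=%lu\n",
               page_size,
               grants * XEN_PAGE_SIZE / 512,   /* one grant = one 4K chunk */
               grants / grants_per_pseg);      /* a segment may span grants */
}

int main(void)
{
        show(4096, 32);    /* x86/4K pages:   256 sectors, 32 segments */
        show(65536, 32);   /* arm64/64K pages: 256 sectors,  2 segments */
        return 0;
}
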
*/ - do_blkif_request(info->rq); - } + if (!RING_FULL(&info->ring)) + blk_mq_start_stopped_hw_queues(info->rq, true); } static void blkif_restart_queue(struct work_struct *work) @@ -950,7 +1085,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ if (info->rq) - blk_stop_queue(info->rq); + blk_mq_stop_hw_queues(info->rq); /* Remove all persistent grants */ if (!list_empty(&info->grants)) { @@ -963,7 +1098,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) info->persistent_gnts_c--; } if (info->feature_persistent) - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } } @@ -983,7 +1118,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) } } - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring @@ -998,7 +1133,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) persistent_gnt = info->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1012,7 +1147,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) for (j = 0; j < INDIRECT_GREFS(segs); j++) { persistent_gnt = info->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); - __free_page(pfn_to_page(persistent_gnt->pfn)); + __free_page(persistent_gnt->page); kfree(persistent_gnt); } @@ -1033,51 +1168,80 @@ free_shadow: flush_work(&info->work); /* Free resources associated with old device channel. */ - if (info->ring_ref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref, 0, - (unsigned long)info->ring.sring); - info->ring_ref = GRANT_INVALID_REF; - info->ring.sring = NULL; + for (i = 0; i < info->nr_ring_pages; i++) { + if (info->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(info->ring_ref[i], 0, 0); + info->ring_ref[i] = GRANT_INVALID_REF; + } } + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + info->ring.sring = NULL; + if (info->irq) unbind_from_irqhandler(info->irq, info); info->evtchn = info->irq = 0; } +struct copy_from_grant { + const struct blk_shadow *s; + unsigned int grant_idx; + unsigned int bvec_offset; + char *bvec_data; +}; + +static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, + unsigned int len, void *data) +{ + struct copy_from_grant *info = data; + char *shared_data; + /* Convenient aliases */ + const struct blk_shadow *s = info->s; + + shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page); + + memcpy(info->bvec_data + info->bvec_offset, + shared_data + offset, len); + + info->bvec_offset += len; + info->grant_idx++; + + kunmap_atomic(shared_data); +} + static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; - char *bvec_data; - void *shared_data; - int nseg; + int num_sg, num_grant; + struct copy_from_grant data = { + .s = s, + .grant_idx = 0, + }; - nseg = s->req.operation == BLKIF_OP_INDIRECT ? + num_grant = s->req.operation == BLKIF_OP_INDIRECT ? 
s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { - /* - * Copy the data received from the backend into the bvec. - * Since bv_offset can be different than 0, and bv_len different - * than PAGE_SIZE, we have to keep track of the current offset, - * to be sure we are copying the data from the right shared page. - */ - for_each_sg(s->sg, sg, nseg, i) { + for_each_sg(s->sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); - shared_data = kmap_atomic( - pfn_to_page(s->grants_used[i]->pfn)); - bvec_data = kmap_atomic(sg_page(sg)); - memcpy(bvec_data + sg->offset, - shared_data + sg->offset, - sg->length); - kunmap_atomic(bvec_data); - kunmap_atomic(shared_data); + + data.bvec_offset = sg->offset; + data.bvec_data = kmap_atomic(sg_page(sg)); + + gnttab_foreach_grant_in_range(sg_page(sg), + sg->offset, + sg->length, + blkif_copy_from_grant, + &data); + + kunmap_atomic(data.bvec_data); } } /* Add the persistent grant into the list of free grants */ - for (i = 0; i < nseg; i++) { + for (i = 0; i < num_grant; i++) { if (gnttab_query_foreign_access(s->grants_used[i]->gref)) { /* * If the grant is still mapped by the backend (the @@ -1103,7 +1267,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, } } if (s->req.operation == BLKIF_OP_INDIRECT) { - for (i = 0; i < INDIRECT_GREFS(nseg); i++) { + for (i = 0; i < INDIRECT_GREFS(num_grant); i++) { if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) { if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", @@ -1119,7 +1283,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, * available pages for indirect grefs. */ if (!info->feature_persistent) { - indirect_page = pfn_to_page(s->indirect_grants[i]->pfn); + indirect_page = s->indirect_grants[i]->page; list_add(&indirect_page->lru, &info->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; @@ -1159,7 +1323,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * never have given to it (we stamp it up to BLK_RING_SIZE - * look in get_id_from_freelist. 
*/ - if (id >= BLK_RING_SIZE) { + if (id >= BLK_RING_SIZE(info)) { WARN(1, "%s: response to %s has incorrect id (%ld)\n", info->gd->disk_name, op_name(bret->operation), id); /* We can't safely get the 'struct request' as @@ -1190,7 +1354,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) queue_flag_clear(QUEUE_FLAG_DISCARD, rq); queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); } - __blk_end_request_all(req, error); + blk_mq_complete_request(req, error); break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: @@ -1218,7 +1382,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - __blk_end_request_all(req, error); + blk_mq_complete_request(req, error); break; default: BUG(); @@ -1247,26 +1411,30 @@ static int setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) { struct blkif_sring *sring; - grant_ref_t gref; - int err; + int err, i; + unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; + grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; - info->ring_ref = GRANT_INVALID_REF; + for (i = 0; i < info->nr_ring_pages; i++) + info->ring_ref[i] = GRANT_INVALID_REF; - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, + get_order(ring_size)); if (!sring) { xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + FRONT_RING_INIT(&info->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref); + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); if (err < 0) { - free_page((unsigned long)sring); + free_pages((unsigned long)sring, get_order(ring_size)); info->ring.sring = NULL; goto fail; } - info->ring_ref = gref; + for (i = 0; i < info->nr_ring_pages; i++) + info->ring_ref[i] = gref[i]; err = xenbus_alloc_evtchn(dev, &info->evtchn); if (err) @@ -1294,7 +1462,18 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err; + int err, i; + unsigned int max_page_order = 0; + unsigned int ring_page_order = 0; + + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "max-ring-page-order", "%u", &max_page_order); + if (err != 1) + info->nr_ring_pages = 1; + else { + ring_page_order = min(xen_blkif_max_ring_order, max_page_order); + info->nr_ring_pages = 1 << ring_page_order; + } /* Create shared ring, alloc event channel. 
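
[Editor's note] With multi-page rings, the shadow array, the free list and the blk-mq queue depth are all sized by BLK_RING_SIZE(info), which now depends on how many ring pages were negotiated in talk_to_blkback() (nr_ring_pages = 1 << min(xen_blkif_max_ring_order, the backend's max-ring-page-order)). The figures below assume the usual blkif ABI sizes — a 64-byte shared-ring header and a 112-byte request/response union — which are not visible in this hunk; under that assumption the classic single-page ring keeps its familiar 32 entries.

#include <stdio.h>

#define XEN_PAGE_SIZE 4096UL
#define RING_HDR      64UL     /* assumed shared-ring header size      */
#define RING_ENTRY    112UL    /* assumed sizeof(union blkif_sring_entry) */

static unsigned long ring_entries(unsigned int nr_ring_pages)
{
        unsigned long n = (nr_ring_pages * XEN_PAGE_SIZE - RING_HDR) / RING_ENTRY;
        unsigned long pow2 = 1;

        while (pow2 * 2 <= n)   /* __RING_SIZE rounds down to a power of two */
                pow2 *= 2;
        return pow2;
}

int main(void)
{
        printf("%lu %lu %lu\n",
               ring_entries(1),    /* 32  -> the old fixed depth */
               ring_entries(4),    /* 128                        */
               ring_entries(16));  /* 512                        */
        return 0;
}
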
*/ err = setup_blkring(dev, info); @@ -1308,11 +1487,32 @@ again: goto destroy_blkring; } - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref); - if (err) { - message = "writing ring-ref"; - goto abort_transaction; + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dev->nodename, + "ring-ref", "%u", info->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + err = xenbus_printf(xbt, dev->nodename, + "ring-page-order", "%u", ring_page_order); + if (err) { + message = "writing ring-page-order"; + goto abort_transaction; + } + + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dev->nodename, ring_ref_name, + "%u", info->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } } err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", info->evtchn); @@ -1340,6 +1540,9 @@ again: goto destroy_blkring; } + for (i = 0; i < BLK_RING_SIZE(info); i++) + info->shadow[i].req.u.rw.id = i+1; + info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1363,7 +1566,7 @@ again: static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { - int err, vdevice, i; + int err, vdevice; struct blkfront_info *info; /* FIXME: Use dynamic device id if this is not set. */ @@ -1424,34 +1627,21 @@ static int blkfront_probe(struct xenbus_device *dev, info->connected = BLKIF_STATE_DISCONNECTED; INIT_WORK(&info->work, blkif_restart_queue); - for (i = 0; i < BLK_RING_SIZE; i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; - /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); dev_set_drvdata(&dev->dev, info); - err = talk_to_blkback(dev, info); - if (err) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); - return err; - } - return 0; } -static void split_bio_end(struct bio *bio, int error) +static void split_bio_end(struct bio *bio) { struct split_bio *split_bio = bio->bi_private; - if (error) - split_bio->err = error; - if (atomic_dec_and_test(&split_bio->pending)) { split_bio->bio->bi_phys_segments = 0; - bio_endio(split_bio->bio, split_bio->err); + split_bio->bio->bi_error = bio->bi_error; + bio_endio(split_bio->bio); kfree(split_bio); } bio_put(bio); @@ -1478,12 +1668,12 @@ static int blkif_recover(struct blkfront_info *info) /* Stage 2: Set up free list. */ memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE; i++) + for (i = 0; i < BLK_RING_SIZE(info); i++) info->shadow[i].req.u.rw.id = i+1; info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; + info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - rc = blkfront_setup_indirect(info); + rc = blkfront_gather_backend_features(info); if (rc) { kfree(copy); return rc; @@ -1493,7 +1683,7 @@ static int blkif_recover(struct blkfront_info *info) blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* Not in use? 
*/ if (!copy[i].request) continue; @@ -1519,28 +1709,6 @@ static int blkif_recover(struct blkfront_info *info) kfree(copy); - /* - * Empty the queue, this is important because we might have - * requests in the queue with more segments than what we - * can handle now. - */ - spin_lock_irq(&info->io_lock); - while ((req = blk_fetch_request(info->rq)) != NULL) { - if (req->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { - list_add(&req->queuelist, &requests); - continue; - } - merge_bio.head = req->bio; - merge_bio.tail = req->biotail; - bio_list_merge(&bio_list, &merge_bio); - req->bio = NULL; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) - pr_alert("diskcache flush request found!\n"); - __blk_end_request_all(req, 0); - } - spin_unlock_irq(&info->io_lock); - xenbus_switch_state(info->xbdev, XenbusStateConnected); spin_lock_irq(&info->io_lock); @@ -1555,9 +1723,10 @@ static int blkif_recover(struct blkfront_info *info) /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); BUG_ON(req->nr_phys_segments > segs); - blk_requeue_request(info->rq, req); + blk_mq_requeue_request(req); } spin_unlock_irq(&info->io_lock); + blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ @@ -1572,8 +1741,8 @@ static int blkif_recover(struct blkfront_info *info) atomic_set(&split_bio->pending, pending); split_bio->bio = bio; for (i = 0; i < pending; i++) { - offset = (i * segs * PAGE_SIZE) >> 9; - size = min((unsigned int)(segs * PAGE_SIZE) >> 9, + offset = (i * segs * XEN_PAGE_SIZE) >> 9; + size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9, (unsigned int)bio_sectors(bio) - offset); cloned_bio = bio_clone(bio, GFP_NOIO); BUG_ON(cloned_bio == NULL); @@ -1684,22 +1853,17 @@ static void blkfront_setup_discard(struct blkfront_info *info) static int blkfront_setup_indirect(struct blkfront_info *info) { - unsigned int indirect_segments, segs; + unsigned int psegs, grants; int err, i; - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-max-indirect-segments", "%u", &indirect_segments, - NULL); - if (err) { - info->max_indirect_segments = 0; - segs = BLKIF_MAX_SEGMENTS_PER_REQUEST; - } else { - info->max_indirect_segments = min(indirect_segments, - xen_blkif_max_segments); - segs = info->max_indirect_segments; - } + if (info->max_indirect_segments == 0) + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + else + grants = info->max_indirect_segments; + psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE); + err = fill_grant_buffer(info, + (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1709,7 +1873,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info) * grants, we need to allocate a set of pages that can be * used for mapping indirect grefs */ - int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE; + int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); BUG_ON(!list_empty(&info->indirect_pages)); for (i = 0; i < num; i++) { @@ -1720,29 +1884,29 @@ static int blkfront_setup_indirect(struct blkfront_info *info) } } - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * segs, + sizeof(info->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO); + info->shadow[i].sg = 
kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) info->shadow[i].indirect_grants = kzalloc( sizeof(info->shadow[i].indirect_grants[0]) * - INDIRECT_GREFS(segs), + INDIRECT_GREFS(grants), GFP_NOIO); if ((info->shadow[i].grants_used == NULL) || (info->shadow[i].sg == NULL) || (info->max_indirect_segments && (info->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, segs); + sg_init_table(info->shadow[i].sg, psegs); } return 0; out_of_memory: - for (i = 0; i < BLK_RING_SIZE; i++) { + for (i = 0; i < BLK_RING_SIZE(info); i++) { kfree(info->shadow[i].grants_used); info->shadow[i].grants_used = NULL; kfree(info->shadow[i].sg); @@ -1760,6 +1924,68 @@ out_of_memory: return -ENOMEM; } +/* + * Gather all backend feature-* + */ +static int blkfront_gather_backend_features(struct blkfront_info *info) +{ + int err; + int barrier, flush, discard, persistent; + unsigned int indirect_segments; + + info->feature_flush = 0; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%d", &barrier, + NULL); + + /* + * If there's no "feature-barrier" defined, then it means + * we're dealing with a very old backend which writes + * synchronously; nothing to do. + * + * If there are barriers, then we use flush. + */ + if (!err && barrier) + info->feature_flush = REQ_FLUSH | REQ_FUA; + /* + * And if there is "feature-flush-cache" use that above + * barriers. + */ + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush, + NULL); + + if (!err && flush) + info->feature_flush = REQ_FLUSH; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard, + NULL); + + if (!err && discard) + blkfront_setup_discard(info); + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-persistent", "%u", &persistent, + NULL); + if (err) + info->feature_persistent = 0; + else + info->feature_persistent = persistent; + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", &indirect_segments, + NULL); + if (err) + info->max_indirect_segments = 0; + else + info->max_indirect_segments = min(indirect_segments, + xen_blkif_max_segments); + + return blkfront_setup_indirect(info); +} + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -1771,7 +1997,6 @@ static void blkfront_connect(struct blkfront_info *info) unsigned int physical_sector_size; unsigned int binfo; int err; - int barrier, flush, discard, persistent; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1828,48 +2053,7 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - info->feature_flush = 0; - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-barrier", "%d", &barrier, - NULL); - - /* - * If there's no "feature-barrier" defined, then it means - * we're dealing with a very old backend which writes - * synchronously; nothing to do. - * - * If there are barriers, then we use flush. - */ - if (!err && barrier) - info->feature_flush = REQ_FLUSH | REQ_FUA; - /* - * And if there is "feature-flush-cache" use that above - * barriers. 
- */ - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-flush-cache", "%d", &flush, - NULL); - - if (!err && flush) - info->feature_flush = REQ_FLUSH; - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-discard", "%d", &discard, - NULL); - - if (!err && discard) - blkfront_setup_discard(info); - - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-persistent", "%u", &persistent, - NULL); - if (err) - info->feature_persistent = 0; - else - info->feature_persistent = persistent; - - err = blkfront_setup_indirect(info); + err = blkfront_gather_backend_features(info); if (err) { xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", info->xbdev->otherend); @@ -1908,8 +2092,15 @@ static void blkback_changed(struct xenbus_device *dev, dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); switch (backend_state) { - case XenbusStateInitialising: case XenbusStateInitWait: + if (dev->state != XenbusStateInitialising) + break; + if (talk_to_blkback(dev, info)) { + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + break; + } + case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: case XenbusStateReconfigured: @@ -1925,7 +2116,8 @@ static void blkback_changed(struct xenbus_device *dev, break; /* Missed the backend's Closing state -- fallthrough */ case XenbusStateClosing: - blkfront_closing(info); + if (info) + blkfront_closing(info); break; } } @@ -2093,6 +2285,12 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", + xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); + xen_blkif_max_ring_order = 0; + } + if (!xen_has_pv_disk_devices()) return -ENODEV; diff --git a/kernel/drivers/block/zram/Kconfig b/kernel/drivers/block/zram/Kconfig index 6489c0fd0..386ba3d1a 100644 --- a/kernel/drivers/block/zram/Kconfig +++ b/kernel/drivers/block/zram/Kconfig @@ -23,12 +23,4 @@ config ZRAM_LZ4_COMPRESS default n help This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute. - -config ZRAM_DEBUG - bool "Compressed RAM block device debug support" - depends on ZRAM - default n - help - This option adds additional debugging code to the compressed - RAM block device driver. + algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file diff --git a/kernel/drivers/block/zram/zcomp.c b/kernel/drivers/block/zram/zcomp.c index 54d946a9e..c53617752 100644 --- a/kernel/drivers/block/zram/zcomp.c +++ b/kernel/drivers/block/zram/zcomp.c @@ -76,7 +76,7 @@ static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) */ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_NOIO); if (!zstrm) return NULL; @@ -85,7 +85,7 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) * allocate 2 pages. 
1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + zstrm->buffer = (void *)__get_free_pages(GFP_NOIO | __GFP_ZERO, 1); if (!zstrm->private || !zstrm->buffer) { zcomp_strm_free(comp, zstrm); zstrm = NULL; @@ -274,7 +274,7 @@ ssize_t zcomp_available_show(const char *comp, char *buf) int i = 0; while (backends[i]) { - if (sysfs_streq(comp, backends[i]->name)) + if (!strcmp(comp, backends[i]->name)) sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, "[%s] ", backends[i]->name); else @@ -286,6 +286,11 @@ ssize_t zcomp_available_show(const char *comp, char *buf) return sz; } +bool zcomp_available_algorithm(const char *comp) +{ + return find_backend(comp) != NULL; +} + bool zcomp_set_max_streams(struct zcomp *comp, int num_strm) { return comp->set_max_streams(comp, num_strm); diff --git a/kernel/drivers/block/zram/zcomp.h b/kernel/drivers/block/zram/zcomp.h index c59d1fca7..46e2b9f8f 100644 --- a/kernel/drivers/block/zram/zcomp.h +++ b/kernel/drivers/block/zram/zcomp.h @@ -51,6 +51,7 @@ struct zcomp { }; ssize_t zcomp_available_show(const char *comp, char *buf); +bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *comp, int max_strm); void zcomp_destroy(struct zcomp *comp); diff --git a/kernel/drivers/block/zram/zcomp_lz4.c b/kernel/drivers/block/zram/zcomp_lz4.c index f2afb7e98..dd6083124 100644 --- a/kernel/drivers/block/zram/zcomp_lz4.c +++ b/kernel/drivers/block/zram/zcomp_lz4.c @@ -10,17 +10,36 @@ #include #include #include +#include +#include #include "zcomp_lz4.h" static void *zcomp_lz4_create(void) { - return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. + */ + ret = kzalloc(LZ4_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); + if (!ret) + ret = __vmalloc(LZ4_MEM_COMPRESS, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | + __GFP_ZERO | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; } static void zcomp_lz4_destroy(void *private) { - kfree(private); + kvfree(private); } static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, diff --git a/kernel/drivers/block/zram/zcomp_lzo.c b/kernel/drivers/block/zram/zcomp_lzo.c index da1bc47d5..edc549920 100644 --- a/kernel/drivers/block/zram/zcomp_lzo.c +++ b/kernel/drivers/block/zram/zcomp_lzo.c @@ -10,17 +10,36 @@ #include #include #include +#include +#include #include "zcomp_lzo.h" static void *lzo_create(void) { - return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL); + void *ret; + + /* + * This function can be called in swapout/fs write path + * so we can't use GFP_FS|IO. And it assumes we already + * have at least one stream in zram initialization so we + * don't do best effort to allocate more stream in here. + * A default stream will work well without further multiple + * streams. That's why we use NORETRY | NOWARN. 
+ */ + ret = kzalloc(LZO1X_MEM_COMPRESS, GFP_NOIO | __GFP_NORETRY | + __GFP_NOWARN); + if (!ret) + ret = __vmalloc(LZO1X_MEM_COMPRESS, + GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN | + __GFP_ZERO | __GFP_HIGHMEM, + PAGE_KERNEL); + return ret; } static void lzo_destroy(void *private) { - kfree(private); + kvfree(private); } static int lzo_compress(const unsigned char *src, unsigned char *dst, diff --git a/kernel/drivers/block/zram/zram_drv.c b/kernel/drivers/block/zram/zram_drv.c index 6e134f475..65e0b375a 100644 --- a/kernel/drivers/block/zram/zram_drv.c +++ b/kernel/drivers/block/zram/zram_drv.c @@ -15,10 +15,6 @@ #define KMSG_COMPONENT "zram" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#ifdef CONFIG_ZRAM_DEBUG -#define DEBUG -#endif - #include #include #include @@ -32,12 +28,16 @@ #include #include #include +#include +#include #include "zram_drv.h" -/* Globals */ +static DEFINE_IDR(zram_index_idr); +/* idr index must be protected */ +static DEFINE_MUTEX(zram_index_mutex); + static int zram_major; -static struct zram *zram_devices; static const char *default_compressor = "lzo"; /* Module params (documentation at end) */ @@ -53,7 +53,7 @@ static inline void deprecated_attr_warn(const char *name) } #define ZRAM_ATTR_RO(name) \ -static ssize_t name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ @@ -74,33 +74,117 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static ssize_t compact_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +/* flag operations require table entry bit_spin_lock() being held */ +static int zram_test_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) { - unsigned long nr_migrated; - struct zram *zram = dev_to_zram(dev); - struct zram_meta *meta; + return meta->table[index].value & BIT(flag); +} - down_read(&zram->init_lock); - if (!init_done(zram)) { - up_read(&zram->init_lock); - return -EINVAL; - } +static void zram_set_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value |= BIT(flag); +} - meta = zram->meta; - nr_migrated = zs_compact(meta->mem_pool); - atomic64_add(nr_migrated, &zram->stats.num_migrated); - up_read(&zram->init_lock); +static void zram_clear_flag(struct zram_meta *meta, u32 index, + enum zram_pageflags flag) +{ + meta->table[index].value &= ~BIT(flag); +} - return len; +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); } -static ssize_t disksize_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) { - struct zram *zram = dev_to_zram(dev); + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; - return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; +} + +static inline bool is_partial_io(struct bio_vec *bvec) +{ + return bvec->bv_len != PAGE_SIZE; +} + +/* + * Check if request is within bounds and aligned on zram logical blocks. 
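
[Editor's note] zcomp_lz4_create() and lzo_create() now follow the same allocation pattern: the stream workspace can be requested from the swap-out/writeback path, so GFP_KERNEL is off the table, and a large physically contiguous chunk may simply not exist under pressure — hence the quiet, no-retry kmalloc followed by a vmalloc-space fallback, released with kvfree() on the destroy side. A condensed kernel-context sketch of that pattern (the helper name is mine; it mirrors the hunks above rather than defining a new API):

#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Try a contiguous allocation first, without reclaim retries or an
 * allocation-failure warning; fall back to vmalloc space when memory
 * is fragmented.  Either result is released with kvfree(). */
static void *zcomp_alloc_workmem(size_t size)
{
	gfp_t gfp = GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN;
	void *mem = kzalloc(size, gfp);

	if (!mem)
		mem = __vmalloc(size, gfp | __GFP_ZERO | __GFP_HIGHMEM,
				PAGE_KERNEL);
	return mem;
}
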
+ */ +static inline bool valid_io_request(struct zram *zram, + sector_t start, unsigned int size) +{ + u64 end, bound; + + /* unaligned request */ + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + return false; + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + return false; + + end = start + (size >> SECTOR_SHIFT); + bound = zram->disksize >> SECTOR_SHIFT; + /* out of range range */ + if (unlikely(start >= bound || end > bound || start > end)) + return false; + + /* I/O request is valid */ + return true; +} + +static void update_position(u32 *index, int *offset, struct bio_vec *bvec) +{ + if (*offset + bvec->bv_len >= PAGE_SIZE) + (*index)++; + *offset = (*offset + bvec->bv_len) % PAGE_SIZE; +} + +static inline void update_used_max(struct zram *zram, + const unsigned long pages) +{ + unsigned long old_max, cur_max; + + old_max = atomic_long_read(&zram->stats.max_used_pages); + + do { + cur_max = old_max; + if (pages > cur_max) + old_max = atomic_long_cmpxchg( + &zram->stats.max_used_pages, cur_max, pages); + } while (old_max != cur_max); +} + +static bool page_zero_filled(void *ptr) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + + for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos]) + return false; + } + + return true; +} + +static void handle_zero_page(struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + void *user_mem; + + user_mem = kmap_atomic(page); + if (is_partial_io(bvec)) + memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); + else + clear_page(user_mem); + kunmap_atomic(user_mem); + + flush_dcache_page(page); } static ssize_t initstate_show(struct device *dev, @@ -116,6 +200,14 @@ static ssize_t initstate_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%u\n", val); } +static ssize_t disksize_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize); +} + static ssize_t orig_data_size_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -143,19 +235,6 @@ static ssize_t mem_used_total_show(struct device *dev, return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); } -static ssize_t max_comp_streams_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - int val; - struct zram *zram = dev_to_zram(dev); - - down_read(&zram->init_lock); - val = zram->max_comp_streams; - up_read(&zram->init_lock); - - return scnprintf(buf, PAGE_SIZE, "%d\n", val); -} - static ssize_t mem_limit_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -225,6 +304,19 @@ static ssize_t mem_used_max_store(struct device *dev, return len; } +static ssize_t max_comp_streams_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int val; + struct zram *zram = dev_to_zram(dev); + + down_read(&zram->init_lock); + val = zram->max_comp_streams; + up_read(&zram->init_lock); + + return scnprintf(buf, PAGE_SIZE, "%d\n", val); +} + static ssize_t max_comp_streams_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -271,6 +363,11 @@ static ssize_t comp_algorithm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); + size_t sz; + + if (!zcomp_available_algorithm(buf)) + return -EINVAL; + down_write(&zram->init_lock); if (init_done(zram)) { up_write(&zram->init_lock); @@ -278,95 +375,133 @@ static ssize_t 
comp_algorithm_store(struct device *dev, return -EBUSY; } strlcpy(zram->compressor, buf, sizeof(zram->compressor)); + + /* ignore trailing newline */ + sz = strlen(zram->compressor); + if (sz > 0 && zram->compressor[sz - 1] == '\n') + zram->compressor[sz - 1] = 0x00; + up_write(&zram->init_lock); return len; } -/* flag operations needs meta->tb_lock */ -static int zram_test_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) +static ssize_t compact_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - return meta->table[index].value & BIT(flag); -} + struct zram *zram = dev_to_zram(dev); + struct zram_meta *meta; -static void zram_set_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value |= BIT(flag); -} + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } -static void zram_clear_flag(struct zram_meta *meta, u32 index, - enum zram_pageflags flag) -{ - meta->table[index].value &= ~BIT(flag); -} + meta = zram->meta; + zs_compact(meta->mem_pool); + up_read(&zram->init_lock); -static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) -{ - return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); + return len; } -static void zram_set_obj_size(struct zram_meta *meta, - u32 index, size_t size) +static ssize_t io_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + struct zram *zram = dev_to_zram(dev); + ssize_t ret; - meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; -} + down_read(&zram->init_lock); + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8llu\n", + (u64)atomic64_read(&zram->stats.failed_reads), + (u64)atomic64_read(&zram->stats.failed_writes), + (u64)atomic64_read(&zram->stats.invalid_io), + (u64)atomic64_read(&zram->stats.notify_free)); + up_read(&zram->init_lock); -static inline int is_partial_io(struct bio_vec *bvec) -{ - return bvec->bv_len != PAGE_SIZE; + return ret; } -/* - * Check if request is within bounds and aligned on zram logical blocks. 
- */ -static inline int valid_io_request(struct zram *zram, - sector_t start, unsigned int size) +static ssize_t mm_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { - u64 end, bound; - - /* unaligned request */ - if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) - return 0; - if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) - return 0; - - end = start + (size >> SECTOR_SHIFT); - bound = zram->disksize >> SECTOR_SHIFT; - /* out of range range */ - if (unlikely(start >= bound || end > bound || start > end)) - return 0; - - /* I/O request is valid */ - return 1; -} + struct zram *zram = dev_to_zram(dev); + struct zs_pool_stats pool_stats; + u64 orig_size, mem_used = 0; + long max_used; + ssize_t ret; -static void zram_meta_free(struct zram_meta *meta, u64 disksize) -{ - size_t num_pages = disksize >> PAGE_SHIFT; - size_t index; + memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats)); - /* Free all pages that are still in this zram device */ - for (index = 0; index < num_pages; index++) { - unsigned long handle = meta->table[index].handle; + down_read(&zram->init_lock); + if (init_done(zram)) { + mem_used = zs_get_total_pages(zram->meta->mem_pool); + zs_pool_stats(zram->meta->mem_pool, &pool_stats); + } - if (!handle) - continue; + orig_size = atomic64_read(&zram->stats.pages_stored); + max_used = atomic_long_read(&zram->stats.max_used_pages); - zs_free(meta->mem_pool, handle); - } + ret = scnprintf(buf, PAGE_SIZE, + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + orig_size << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.compr_data_size), + mem_used << PAGE_SHIFT, + zram->limit_pages << PAGE_SHIFT, + max_used << PAGE_SHIFT, + (u64)atomic64_read(&zram->stats.zero_pages), + pool_stats.pages_compacted); + up_read(&zram->init_lock); - zs_destroy_pool(meta->mem_pool); - vfree(meta->table); - kfree(meta); + return ret; } -static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) -{ +static DEVICE_ATTR_RO(io_stat); +static DEVICE_ATTR_RO(mm_stat); +ZRAM_ATTR_RO(num_reads); +ZRAM_ATTR_RO(num_writes); +ZRAM_ATTR_RO(failed_reads); +ZRAM_ATTR_RO(failed_writes); +ZRAM_ATTR_RO(invalid_io); +ZRAM_ATTR_RO(notify_free); +ZRAM_ATTR_RO(zero_pages); +ZRAM_ATTR_RO(compr_data_size); + +static inline bool zram_meta_get(struct zram *zram) +{ + if (atomic_inc_not_zero(&zram->refcount)) + return true; + return false; +} + +static inline void zram_meta_put(struct zram *zram) +{ + atomic_dec(&zram->refcount); +} + +static void zram_meta_free(struct zram_meta *meta, u64 disksize) +{ + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + /* Free all pages that are still in this zram device */ + for (index = 0; index < num_pages; index++) { + unsigned long handle = meta->table[index].handle; + + if (!handle) + continue; + + zs_free(meta->mem_pool, handle); + } + + zs_destroy_pool(meta->mem_pool); + vfree(meta->table); + kfree(meta); +} + +static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) +{ size_t num_pages; - char pool_name[8]; struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); if (!meta) @@ -379,13 +514,14 @@ static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) goto out_error; } - snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; } + zram_meta_init_table_locks(meta, disksize); + return meta; out_error: @@ -394,56 +530,6 @@ out_error: return NULL; } 
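
[Editor's note] The bounds check that moved up in the file (valid_io_request(), now returning bool) rejects anything that is not a run of whole, 4 KiB-aligned logical blocks inside the disk. A userspace copy of the test, handy for sanity-checking the arithmetic (constants mirror the driver: SECTOR_SHIFT 9, ZRAM_LOGICAL_BLOCK_SIZE 4096):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9
#define ZRAM_LOGICAL_BLOCK_SIZE (1 << 12)
#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \
	(ZRAM_LOGICAL_BLOCK_SIZE >> SECTOR_SHIFT)

/* Standalone copy of the test done by valid_io_request(). */
static bool valid_io(uint64_t disksize, uint64_t start, unsigned int bytes)
{
	uint64_t end, bound;

	if (start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))
		return false;		/* start not 4 KiB aligned      */
	if (bytes & (ZRAM_LOGICAL_BLOCK_SIZE - 1))
		return false;		/* length not a 4 KiB multiple  */

	end = start + (bytes >> SECTOR_SHIFT);
	bound = disksize >> SECTOR_SHIFT;
	return !(start >= bound || end > bound || start > end);
}

int main(void)
{
	uint64_t disksize = 64ULL << 20;  /* 64 MiB -> bound = 131072 sectors */

	printf("%d\n", valid_io(disksize, 16, 4096));     /* 1: in range        */
	printf("%d\n", valid_io(disksize, 17, 4096));     /* 0: start unaligned */
	printf("%d\n", valid_io(disksize, 131064, 8192)); /* 0: past the end    */
	return 0;
}
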
-static inline bool zram_meta_get(struct zram *zram) -{ - if (atomic_inc_not_zero(&zram->refcount)) - return true; - return false; -} - -static inline void zram_meta_put(struct zram *zram) -{ - atomic_dec(&zram->refcount); -} - -static void update_position(u32 *index, int *offset, struct bio_vec *bvec) -{ - if (*offset + bvec->bv_len >= PAGE_SIZE) - (*index)++; - *offset = (*offset + bvec->bv_len) % PAGE_SIZE; -} - -static int page_zero_filled(void *ptr) -{ - unsigned int pos; - unsigned long *page; - - page = (unsigned long *)ptr; - - for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) { - if (page[pos]) - return 0; - } - - return 1; -} - -static void handle_zero_page(struct bio_vec *bvec) -{ - struct page *page = bvec->bv_page; - void *user_mem; - - user_mem = kmap_atomic(page); - if (is_partial_io(bvec)) - memset(user_mem + bvec->bv_offset, 0, bvec->bv_len); - else - clear_page(user_mem); - kunmap_atomic(user_mem); - - flush_dcache_page(page); -} - - /* * To protect concurrent access to the same index entry, * caller should hold this table index entry's bit_spinlock to @@ -484,12 +570,12 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned long handle; size_t size; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_lock_table(&meta->table[index]); handle = meta->table[index].handle; size = zram_get_obj_size(meta, index); if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); clear_page(mem); return 0; } @@ -500,7 +586,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) else ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); /* Should NEVER happen. Return bio error if it does. 
*/ if (unlikely(ret)) { @@ -520,14 +606,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_lock_table(&meta->table[index]); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); handle_zero_page(bvec); return 0; } - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -538,7 +624,7 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, uncmem = user_mem; if (!uncmem) { - pr_info("Unable to allocate temp memory\n"); + pr_err("Unable to allocate temp memory\n"); ret = -ENOMEM; goto out_cleanup; } @@ -561,21 +647,6 @@ out_cleanup: return ret; } -static inline void update_used_max(struct zram *zram, - const unsigned long pages) -{ - unsigned long old_max, cur_max; - - old_max = atomic_long_read(&zram->stats.max_used_pages); - - do { - cur_max = old_max; - if (pages > cur_max) - old_max = atomic_long_cmpxchg( - &zram->stats.max_used_pages, cur_max, pages); - } while (old_max != cur_max); -} - static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { @@ -585,8 +656,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; struct zram_meta *meta = zram->meta; - struct zcomp_strm *zstrm; - bool locked = false; + struct zcomp_strm *zstrm = NULL; unsigned long alloced_pages; page = bvec->bv_page; @@ -606,7 +676,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } zstrm = zcomp_strm_find(zram->comp); - locked = true; user_mem = kmap_atomic(page); if (is_partial_io(bvec)) { @@ -622,10 +691,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (user_mem) kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_lock_table(&meta->table[index]); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_ZERO); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); atomic64_inc(&zram->stats.zero_pages); ret = 0; @@ -652,21 +721,21 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, handle = zs_malloc(meta->mem_pool, clen); if (!handle) { - pr_info("Error allocating memory for compressed page: %u, size=%zu\n", + pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); ret = -ENOMEM; goto out; } alloced_pages = zs_get_total_pages(meta->mem_pool); + update_used_max(zram, alloced_pages); + if (zram->limit_pages && alloced_pages > zram->limit_pages) { zs_free(meta->mem_pool, handle); ret = -ENOMEM; goto out; } - update_used_max(zram, alloced_pages); - cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { @@ -678,60 +747,31 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, } zcomp_strm_release(zram->comp, zstrm); - locked = false; + zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); /* * Free memory associated with this sector * before overwriting unused sectors. 
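
[Editor's note] Every bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value) pair in this file becomes zram_lock_table()/zram_unlock_table(), and zram_meta_alloc() gains a zram_meta_init_table_locks() call. Those helpers are not part of the mainline 4.4 driver; they come from the -rt series this tree merges, where a bit spinlock (which disables preemption) is unsuitable and each table entry is assumed to carry a real lock instead. A reconstructed, unverified sketch of the shape such helpers usually take:

/* Reconstructed sketch -- kernel context, field name 'lock' assumed. */
#ifdef CONFIG_PREEMPT_RT_BASE
static inline void zram_lock_table(struct zram_table_entry *table)
{
	spin_lock(&table->lock);	/* per-entry spinlock added on -rt */
}

static inline void zram_unlock_table(struct zram_table_entry *table)
{
	spin_unlock(&table->lock);
}
#else
static inline void zram_lock_table(struct zram_table_entry *table)
{
	bit_spin_lock(ZRAM_ACCESS, &table->value);	/* mainline behaviour */
}

static inline void zram_unlock_table(struct zram_table_entry *table)
{
	bit_spin_unlock(ZRAM_ACCESS, &table->value);
}
#endif
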
*/ - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_lock_table(&meta->table[index]); zram_free_page(zram, index); meta->table[index].handle = handle; zram_set_obj_size(meta, index, clen); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); /* Update stats */ atomic64_add(clen, &zram->stats.compr_data_size); atomic64_inc(&zram->stats.pages_stored); out: - if (locked) + if (zstrm) zcomp_strm_release(zram->comp, zstrm); if (is_partial_io(bvec)) kfree(uncmem); return ret; } -static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, int rw) -{ - unsigned long start_time = jiffies; - int ret; - - generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, - &zram->disk->part0); - - if (rw == READ) { - atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset); - } else { - atomic64_inc(&zram->stats.num_writes); - ret = zram_bvec_write(zram, bvec, index, offset); - } - - generic_end_io_acct(rw, &zram->disk->part0, start_time); - - if (unlikely(ret)) { - if (rw == READ) - atomic64_inc(&zram->stats.failed_reads); - else - atomic64_inc(&zram->stats.failed_writes); - } - - return ret; -} - /* * zram_bio_discard - handler on discard request * @index: physical block index in PAGE_SIZE units @@ -762,194 +802,75 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_lock_table(&meta->table[index]); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); atomic64_inc(&zram->stats.notify_free); index++; n -= PAGE_SIZE; } } -static void zram_reset_device(struct zram *zram) +static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, + int offset, int rw) { - struct zram_meta *meta; - struct zcomp *comp; - u64 disksize; - - down_write(&zram->init_lock); + unsigned long start_time = jiffies; + int ret; - zram->limit_pages = 0; + generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT, + &zram->disk->part0); - if (!init_done(zram)) { - up_write(&zram->init_lock); - return; + if (rw == READ) { + atomic64_inc(&zram->stats.num_reads); + ret = zram_bvec_read(zram, bvec, index, offset); + } else { + atomic64_inc(&zram->stats.num_writes); + ret = zram_bvec_write(zram, bvec, index, offset); } - meta = zram->meta; - comp = zram->comp; - disksize = zram->disksize; - /* - * Refcount will go down to 0 eventually and r/w handler - * cannot handle further I/O so it will bail out by - * check zram_meta_get. - */ - zram_meta_put(zram); - /* - * We want to free zram_meta in process context to avoid - * deadlock between reclaim path and any other locks. 
- */ - wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); - - /* Reset stats */ - memset(&zram->stats, 0, sizeof(zram->stats)); - zram->disksize = 0; - zram->max_comp_streams = 1; + generic_end_io_acct(rw, &zram->disk->part0, start_time); - set_capacity(zram->disk, 0); - part_stat_set_all(&zram->disk->part0, 0); + if (unlikely(ret)) { + if (rw == READ) + atomic64_inc(&zram->stats.failed_reads); + else + atomic64_inc(&zram->stats.failed_writes); + } - up_write(&zram->init_lock); - /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(meta, disksize); - zcomp_destroy(comp); + return ret; } -static ssize_t disksize_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) +static void __zram_make_request(struct zram *zram, struct bio *bio) { - u64 disksize; - struct zcomp *comp; - struct zram_meta *meta; - struct zram *zram = dev_to_zram(dev); - int err; - - disksize = memparse(buf, NULL); - if (!disksize) - return -EINVAL; + int offset, rw; + u32 index; + struct bio_vec bvec; + struct bvec_iter iter; - disksize = PAGE_ALIGN(disksize); - meta = zram_meta_alloc(zram->disk->first_minor, disksize); - if (!meta) - return -ENOMEM; + index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; + offset = (bio->bi_iter.bi_sector & + (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - comp = zcomp_create(zram->compressor, zram->max_comp_streams); - if (IS_ERR(comp)) { - pr_info("Cannot initialise %s compressing backend\n", - zram->compressor); - err = PTR_ERR(comp); - goto out_free_meta; + if (unlikely(bio->bi_rw & REQ_DISCARD)) { + zram_bio_discard(zram, index, offset, bio); + bio_endio(bio); + return; } - down_write(&zram->init_lock); - if (init_done(zram)) { - pr_info("Cannot change disksize for initialized device\n"); - err = -EBUSY; - goto out_destroy_comp; - } + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + int max_transfer_size = PAGE_SIZE - offset; - init_waitqueue_head(&zram->io_done); - atomic_set(&zram->refcount, 1); - zram->meta = meta; - zram->comp = comp; - zram->disksize = disksize; - set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - up_write(&zram->init_lock); + if (bvec.bv_len > max_transfer_size) { + /* + * zram_bvec_rw() can only make operation on a single + * zram page. Split the bio vector. + */ + struct bio_vec bv; - /* - * Revalidate disk out of the init_lock to avoid lockdep splat. - * It's okay because disk's capacity is protected by init_lock - * so that revalidate_disk always sees up-to-date capacity. - */ - revalidate_disk(zram->disk); - - return len; - -out_destroy_comp: - up_write(&zram->init_lock); - zcomp_destroy(comp); -out_free_meta: - zram_meta_free(meta, disksize); - return err; -} - -static ssize_t reset_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - int ret; - unsigned short do_reset; - struct zram *zram; - struct block_device *bdev; - - zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - - if (!bdev) - return -ENOMEM; - - mutex_lock(&bdev->bd_mutex); - /* Do not reset an active device! 
*/ - if (bdev->bd_openers) { - ret = -EBUSY; - goto out; - } - - ret = kstrtou16(buf, 10, &do_reset); - if (ret) - goto out; - - if (!do_reset) { - ret = -EINVAL; - goto out; - } - - /* Make sure all pending I/O is finished */ - fsync_bdev(bdev); - zram_reset_device(zram); - - mutex_unlock(&bdev->bd_mutex); - revalidate_disk(zram->disk); - bdput(bdev); - - return len; - -out: - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - return ret; -} - -static void __zram_make_request(struct zram *zram, struct bio *bio) -{ - int offset, rw; - u32 index; - struct bio_vec bvec; - struct bvec_iter iter; - - index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT; - offset = (bio->bi_iter.bi_sector & - (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - - if (unlikely(bio->bi_rw & REQ_DISCARD)) { - zram_bio_discard(zram, index, offset, bio); - bio_endio(bio, 0); - return; - } - - rw = bio_data_dir(bio); - bio_for_each_segment(bvec, bio, iter) { - int max_transfer_size = PAGE_SIZE - offset; - - if (bvec.bv_len > max_transfer_size) { - /* - * zram_bvec_rw() can only make operation on a single - * zram page. Split the bio vector. - */ - struct bio_vec bv; - - bv.bv_page = bvec.bv_page; - bv.bv_len = max_transfer_size; - bv.bv_offset = bvec.bv_offset; + bv.bv_page = bvec.bv_page; + bv.bv_len = max_transfer_size; + bv.bv_offset = bvec.bv_offset; if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) goto out; @@ -965,8 +886,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) update_position(&index, &offset, &bvec); } - set_bit(BIO_UPTODATE, &bio->bi_flags); - bio_endio(bio, 0); + bio_endio(bio); return; out: @@ -976,13 +896,15 @@ out: /* * Handler function for all zram I/O requests. */ -static void zram_make_request(struct request_queue *queue, struct bio *bio) +static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) { struct zram *zram = queue->queuedata; if (unlikely(!zram_meta_get(zram))) goto error; + blk_queue_split(queue, &bio, queue->bio_split); + if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); @@ -991,11 +913,12 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) __zram_make_request(zram, bio); zram_meta_put(zram); - return; + return BLK_QC_T_NONE; put_zram: zram_meta_put(zram); error: bio_io_error(bio); + return BLK_QC_T_NONE; } static void zram_slot_free_notify(struct block_device *bdev, @@ -1007,9 +930,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); + zram_lock_table(&meta->table[index]); zram_free_page(zram, index); - bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); + zram_unlock_table(&meta->table[index]); atomic64_inc(&zram->stats.notify_free); } @@ -1055,80 +978,185 @@ out: return err; } -static const struct block_device_operations zram_devops = { - .swap_slot_free_notify = zram_slot_free_notify, - .rw_page = zram_rw_page, - .owner = THIS_MODULE -}; +static void zram_reset_device(struct zram *zram) +{ + struct zram_meta *meta; + struct zcomp *comp; + u64 disksize; -static DEVICE_ATTR_WO(compact); -static DEVICE_ATTR_RW(disksize); -static DEVICE_ATTR_RO(initstate); -static DEVICE_ATTR_WO(reset); -static DEVICE_ATTR_RO(orig_data_size); -static DEVICE_ATTR_RO(mem_used_total); -static DEVICE_ATTR_RW(mem_limit); -static DEVICE_ATTR_RW(mem_used_max); -static DEVICE_ATTR_RW(max_comp_streams); -static DEVICE_ATTR_RW(comp_algorithm); + 
down_write(&zram->init_lock); -static ssize_t io_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) + zram->limit_pages = 0; + + if (!init_done(zram)) { + up_write(&zram->init_lock); + return; + } + + meta = zram->meta; + comp = zram->comp; + disksize = zram->disksize; + /* + * Refcount will go down to 0 eventually and r/w handler + * cannot handle further I/O so it will bail out by + * check zram_meta_get. + */ + zram_meta_put(zram); + /* + * We want to free zram_meta in process context to avoid + * deadlock between reclaim path and any other locks. + */ + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); + + /* Reset stats */ + memset(&zram->stats, 0, sizeof(zram->stats)); + zram->disksize = 0; + zram->max_comp_streams = 1; + + set_capacity(zram->disk, 0); + part_stat_set_all(&zram->disk->part0, 0); + + up_write(&zram->init_lock); + /* I/O operation under all of CPU are done so let's free */ + zram_meta_free(meta, disksize); + zcomp_destroy(comp); +} + +static ssize_t disksize_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { + u64 disksize; + struct zcomp *comp; + struct zram_meta *meta; struct zram *zram = dev_to_zram(dev); - ssize_t ret; + int err; - down_read(&zram->init_lock); - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8llu\n", - (u64)atomic64_read(&zram->stats.failed_reads), - (u64)atomic64_read(&zram->stats.failed_writes), - (u64)atomic64_read(&zram->stats.invalid_io), - (u64)atomic64_read(&zram->stats.notify_free)); - up_read(&zram->init_lock); + disksize = memparse(buf, NULL); + if (!disksize) + return -EINVAL; - return ret; + disksize = PAGE_ALIGN(disksize); + meta = zram_meta_alloc(zram->disk->disk_name, disksize); + if (!meta) + return -ENOMEM; + + comp = zcomp_create(zram->compressor, zram->max_comp_streams); + if (IS_ERR(comp)) { + pr_err("Cannot initialise %s compressing backend\n", + zram->compressor); + err = PTR_ERR(comp); + goto out_free_meta; + } + + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Cannot change disksize for initialized device\n"); + err = -EBUSY; + goto out_destroy_comp; + } + + init_waitqueue_head(&zram->io_done); + atomic_set(&zram->refcount, 1); + zram->meta = meta; + zram->comp = comp; + zram->disksize = disksize; + set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. 
+ */ + revalidate_disk(zram->disk); + + return len; + +out_destroy_comp: + up_write(&zram->init_lock); + zcomp_destroy(comp); +out_free_meta: + zram_meta_free(meta, disksize); + return err; } -static ssize_t mm_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t reset_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) { - struct zram *zram = dev_to_zram(dev); - u64 orig_size, mem_used = 0; - long max_used; - ssize_t ret; + int ret; + unsigned short do_reset; + struct zram *zram; + struct block_device *bdev; - down_read(&zram->init_lock); - if (init_done(zram)) - mem_used = zs_get_total_pages(zram->meta->mem_pool); + ret = kstrtou16(buf, 10, &do_reset); + if (ret) + return ret; - orig_size = atomic64_read(&zram->stats.pages_stored); - max_used = atomic_long_read(&zram->stats.max_used_pages); + if (!do_reset) + return -EINVAL; - ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n", - orig_size << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.compr_data_size), - mem_used << PAGE_SHIFT, - zram->limit_pages << PAGE_SHIFT, - max_used << PAGE_SHIFT, - (u64)atomic64_read(&zram->stats.zero_pages), - (u64)atomic64_read(&zram->stats.num_migrated)); - up_read(&zram->init_lock); + zram = dev_to_zram(dev); + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + /* Do not reset an active device or claimed device */ + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + /* From now on, anyone can't open /dev/zram[0-9] */ + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + revalidate_disk(zram->disk); + bdput(bdev); + + mutex_lock(&bdev->bd_mutex); + zram->claim = false; + mutex_unlock(&bdev->bd_mutex); + + return len; +} + +static int zram_open(struct block_device *bdev, fmode_t mode) +{ + int ret = 0; + struct zram *zram; + + WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + + zram = bdev->bd_disk->private_data; + /* zram was claimed to reset so open request fails */ + if (zram->claim) + ret = -EBUSY; return ret; } -static DEVICE_ATTR_RO(io_stat); -static DEVICE_ATTR_RO(mm_stat); -ZRAM_ATTR_RO(num_reads); -ZRAM_ATTR_RO(num_writes); -ZRAM_ATTR_RO(failed_reads); -ZRAM_ATTR_RO(failed_writes); -ZRAM_ATTR_RO(invalid_io); -ZRAM_ATTR_RO(notify_free); -ZRAM_ATTR_RO(zero_pages); -ZRAM_ATTR_RO(compr_data_size); +static const struct block_device_operations zram_devops = { + .open = zram_open, + .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, + .owner = THIS_MODULE +}; + +static DEVICE_ATTR_WO(compact); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -1158,10 +1186,24 @@ static struct attribute_group zram_disk_attr_group = { .attrs = zram_disk_attrs, }; -static int create_device(struct zram *zram, int device_id) +/* + * Allocate and initialize new zram device. the function returns + * '>= 0' device_id upon success, and negative value otherwise. 
+ */ +static int zram_add(void) { + struct zram *zram; struct request_queue *queue; - int ret = -ENOMEM; + int ret, device_id; + + zram = kzalloc(sizeof(struct zram), GFP_KERNEL); + if (!zram) + return -ENOMEM; + + ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_dev; + device_id = ret; init_rwsem(&zram->init_lock); @@ -1169,15 +1211,16 @@ static int create_device(struct zram *zram, int device_id) if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); - goto out; + ret = -ENOMEM; + goto out_free_idr; } blk_queue_make_request(queue, zram_make_request); - /* gendisk structure */ + /* gendisk structure */ zram->disk = alloc_disk(1); if (!zram->disk) { - pr_warn("Error allocating disk structure for device %d\n", + pr_err("Error allocating disk structure for device %d\n", device_id); ret = -ENOMEM; goto out_free_queue; @@ -1206,7 +1249,7 @@ static int create_device(struct zram *zram, int device_id) blk_queue_io_min(zram->disk->queue, PAGE_SIZE); blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); zram->disk->queue->limits.discard_granularity = PAGE_SIZE; - zram->disk->queue->limits.max_discard_sectors = UINT_MAX; + blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); /* * zram_bio_discard() will clear all logical blocks if logical block * size is identical with physical block size(PAGE_SIZE). But if it is @@ -1226,96 +1269,185 @@ static int create_device(struct zram *zram, int device_id) ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, &zram_disk_attr_group); if (ret < 0) { - pr_warn("Error creating sysfs group"); + pr_err("Error creating sysfs group for device %d\n", + device_id); goto out_free_disk; } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); zram->meta = NULL; zram->max_comp_streams = 1; - return 0; + + pr_info("Added device: %s\n", zram->disk->disk_name); + return device_id; out_free_disk: del_gendisk(zram->disk); put_disk(zram->disk); out_free_queue: blk_cleanup_queue(queue); -out: +out_free_idr: + idr_remove(&zram_index_idr, device_id); +out_free_dev: + kfree(zram); return ret; } -static void destroy_devices(unsigned int nr) +static int zram_remove(struct zram *zram) +{ + struct block_device *bdev; + + bdev = bdget_disk(zram->disk, 0); + if (!bdev) + return -ENOMEM; + + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers || zram->claim) { + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return -EBUSY; + } + + zram->claim = true; + mutex_unlock(&bdev->bd_mutex); + + /* + * Remove sysfs first, so no one will perform a disksize + * store while we destroy the devices. This also helps during + * hot_remove -- zram_reset_device() is the last holder of + * ->init_lock, no later/concurrent disksize_store() or any + * other sysfs handlers are possible. 
+ */ + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, + &zram_disk_attr_group); + + /* Make sure all the pending I/O are finished */ + fsync_bdev(bdev); + zram_reset_device(zram); + bdput(bdev); + + pr_info("Removed device: %s\n", zram->disk->disk_name); + + blk_cleanup_queue(zram->disk->queue); + del_gendisk(zram->disk); + put_disk(zram->disk); + kfree(zram); + return 0; +} + +/* zram-control sysfs attributes */ +static ssize_t hot_add_show(struct class *class, + struct class_attribute *attr, + char *buf) +{ + int ret; + + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + + if (ret < 0) + return ret; + return scnprintf(buf, PAGE_SIZE, "%d\n", ret); +} + +static ssize_t hot_remove_store(struct class *class, + struct class_attribute *attr, + const char *buf, + size_t count) { struct zram *zram; - unsigned int i; + int ret, dev_id; - for (i = 0; i < nr; i++) { - zram = &zram_devices[i]; - /* - * Remove sysfs first, so no one will perform a disksize - * store while we destroy the devices - */ - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, - &zram_disk_attr_group); + /* dev_id is gendisk->first_minor, which is `int' */ + ret = kstrtoint(buf, 10, &dev_id); + if (ret) + return ret; + if (dev_id < 0) + return -EINVAL; - zram_reset_device(zram); + mutex_lock(&zram_index_mutex); - blk_cleanup_queue(zram->disk->queue); - del_gendisk(zram->disk); - put_disk(zram->disk); + zram = idr_find(&zram_index_idr, dev_id); + if (zram) { + ret = zram_remove(zram); + idr_remove(&zram_index_idr, dev_id); + } else { + ret = -ENODEV; } - kfree(zram_devices); + mutex_unlock(&zram_index_mutex); + return ret ? ret : count; +} + +static struct class_attribute zram_control_class_attrs[] = { + __ATTR_RO(hot_add), + __ATTR_WO(hot_remove), + __ATTR_NULL, +}; + +static struct class zram_control_class = { + .name = "zram-control", + .owner = THIS_MODULE, + .class_attrs = zram_control_class_attrs, +}; + +static int zram_remove_cb(int id, void *ptr, void *data) +{ + zram_remove(ptr); + return 0; +} + +static void destroy_devices(void) +{ + class_unregister(&zram_control_class); + idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); - pr_info("Destroyed %u device(s)\n", nr); } static int __init zram_init(void) { - int ret, dev_id; + int ret; - if (num_devices > max_num_devices) { - pr_warn("Invalid value for num_devices: %u\n", - num_devices); - return -EINVAL; + ret = class_register(&zram_control_class); + if (ret) { + pr_err("Unable to register zram-control class\n"); + return ret; } zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { - pr_warn("Unable to get major number\n"); + pr_err("Unable to get major number\n"); + class_unregister(&zram_control_class); return -EBUSY; } - /* Allocate the device array and initialize each one */ - zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); - if (!zram_devices) { - unregister_blkdev(zram_major, "zram"); - return -ENOMEM; - } - - for (dev_id = 0; dev_id < num_devices; dev_id++) { - ret = create_device(&zram_devices[dev_id], dev_id); - if (ret) + while (num_devices != 0) { + mutex_lock(&zram_index_mutex); + ret = zram_add(); + mutex_unlock(&zram_index_mutex); + if (ret < 0) goto out_error; + num_devices--; } - pr_info("Created %u device(s)\n", num_devices); return 0; out_error: - destroy_devices(dev_id); + destroy_devices(); return ret; } static void __exit zram_exit(void) { - destroy_devices(num_devices); + destroy_devices(); } 
module_init(zram_init); module_exit(zram_exit); module_param(num_devices, uint, 0); -MODULE_PARM_DESC(num_devices, "Number of zram devices"); +MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta "); diff --git a/kernel/drivers/block/zram/zram_drv.h b/kernel/drivers/block/zram/zram_drv.h index 570c598f4..9e3e953d6 100644 --- a/kernel/drivers/block/zram/zram_drv.h +++ b/kernel/drivers/block/zram/zram_drv.h @@ -20,12 +20,6 @@ #include "zcomp.h" -/* - * Some arbitrary value. This is just to catch - * invalid value for num_devices module parameter. - */ -static const unsigned max_num_devices = 32; - /*-- Configurable parameters */ /* @@ -78,13 +72,15 @@ enum zram_pageflags { struct zram_table_entry { unsigned long handle; unsigned long value; +#ifdef CONFIG_PREEMPT_RT_BASE + spinlock_t lock; +#endif }; struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ atomic64_t num_reads; /* failed + successful */ atomic64_t num_writes; /* --do-- */ - atomic64_t num_migrated; /* no. of migrated object */ atomic64_t failed_reads; /* can happen when memory is too low */ atomic64_t failed_writes; /* can happen when memory is too low */ atomic64_t invalid_io; /* non-page-aligned I/O requests */ @@ -121,5 +117,47 @@ struct zram { */ u64 disksize; /* bytes */ char compressor[10]; + /* + * zram is claimed so open request will be failed + */ + bool claim; /* Protected by bdev->bd_mutex */ }; + +#ifndef CONFIG_PREEMPT_RT_BASE +static inline void zram_lock_table(struct zram_table_entry *table) +{ + bit_spin_lock(ZRAM_ACCESS, &table->value); +} + +static inline void zram_unlock_table(struct zram_table_entry *table) +{ + bit_spin_unlock(ZRAM_ACCESS, &table->value); +} + +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) { } +#else /* CONFIG_PREEMPT_RT_BASE */ +static inline void zram_lock_table(struct zram_table_entry *table) +{ + spin_lock(&table->lock); + __set_bit(ZRAM_ACCESS, &table->value); +} + +static inline void zram_unlock_table(struct zram_table_entry *table) +{ + __clear_bit(ZRAM_ACCESS, &table->value); + spin_unlock(&table->lock); +} + +static inline void zram_meta_init_table_locks(struct zram_meta *meta, u64 disksize) +{ + size_t num_pages = disksize >> PAGE_SHIFT; + size_t index; + + for (index = 0; index < num_pages; index++) { + spinlock_t *lock = &meta->table[index].lock; + spin_lock_init(lock); + } +} +#endif /* CONFIG_PREEMPT_RT_BASE */ + #endif -- cgit 1.2.3-korg
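Note on the zram_drv.h locking change above: under CONFIG_PREEMPT_RT_BASE each table entry is guarded by a per-entry spinlock_t instead of bit_spin_lock() on ZRAM_ACCESS, and every call site goes through the zram_lock_table()/zram_unlock_table() wrappers. A minimal sketch of the call-site pattern, mirroring zram_slot_free_notify() in this patch (example_free_slot() is a hypothetical name used only for illustration, not part of the driver):

	/*
	 * Illustration only: accesses to meta->table[index] are bracketed by
	 * the wrappers from zram_drv.h, which map to bit_spin_lock() on
	 * non-RT kernels and to a per-entry spin_lock() under
	 * CONFIG_PREEMPT_RT_BASE.
	 */
	static void example_free_slot(struct zram *zram, u32 index)
	{
		struct zram_meta *meta = zram->meta;

		zram_lock_table(&meta->table[index]);
		zram_free_page(zram, index);	/* releases the zsmalloc handle */
		zram_unlock_table(&meta->table[index]);

		atomic64_inc(&zram->stats.notify_free);
	}

The per-entry spinlock keeps the critical section compatible with PREEMPT_RT, where bit_spin_lock() would otherwise spin with preemption disabled around zram_free_page().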