diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/drivers/nvdimm | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
The collisions have been removed because the patch's logic was already
present in the source.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/drivers/nvdimm')
-rw-r--r-- | kernel/drivers/nvdimm/Kconfig | 91 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/Makefile | 25 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/blk.c | 386 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/btt.c | 1456 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/btt.h | 188 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/btt_devs.c | 306 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/bus.c | 725 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/claim.c | 201 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/core.c | 454 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/dimm.c | 102 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/dimm_devs.c | 548 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/e820.c | 100 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/label.c | 927 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/label.h | 141 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/namespace_devs.c | 1986 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/nd-core.h | 92 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/nd.h | 281 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/pfn.h | 35 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/pfn_devs.c | 337 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/pmem.c | 464 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/region.c | 116 | ||||
-rw-r--r-- | kernel/drivers/nvdimm/region_devs.c | 756 |
22 files changed, 9717 insertions, 0 deletions
diff --git a/kernel/drivers/nvdimm/Kconfig b/kernel/drivers/nvdimm/Kconfig new file mode 100644 index 000000000..53c11621d --- /dev/null +++ b/kernel/drivers/nvdimm/Kconfig @@ -0,0 +1,91 @@ +menuconfig LIBNVDIMM + tristate "NVDIMM (Non-Volatile Memory Device) Support" + depends on PHYS_ADDR_T_64BIT + depends on BLK_DEV + help + Generic support for non-volatile memory devices including + ACPI-6-NFIT defined resources. On platforms that define an + NFIT, or otherwise can discover NVDIMM resources, a libnvdimm + bus is registered to advertise PMEM (persistent memory) + namespaces (/dev/pmemX) and BLK (sliding mmio window(s)) + namespaces (/dev/ndblkX.Y). A PMEM namespace refers to a + memory resource that may span multiple DIMMs and support DAX + (see CONFIG_DAX). A BLK namespace refers to an NVDIMM control + region which exposes an mmio register set for windowed access + mode to non-volatile memory. + +if LIBNVDIMM + +config BLK_DEV_PMEM + tristate "PMEM: Persistent memory block device support" + default LIBNVDIMM + depends on HAS_IOMEM + select ND_BTT if BTT + select ND_PFN if NVDIMM_PFN + help + Memory ranges for PMEM are described by either an NFIT + (NVDIMM Firmware Interface Table, see CONFIG_NFIT_ACPI), a + non-standard OEM-specific E820 memory type (type-12, see + CONFIG_X86_PMEM_LEGACY), or it is manually specified by the + 'memmap=nn[KMG]!ss[KMG]' kernel command line (see + Documentation/kernel-parameters.txt). This driver converts + these persistent memory ranges into block devices that are + capable of DAX (direct-access) file system mappings. See + Documentation/nvdimm/nvdimm.txt for more details. + + Say Y if you want to use an NVDIMM + +config ND_BLK + tristate "BLK: Block data window (aperture) device support" + default LIBNVDIMM + select ND_BTT if BTT + help + Support NVDIMMs, or other devices, that implement a BLK-mode + access capability. BLK-mode access uses memory-mapped-i/o + apertures to access persistent media. 
+ + Say Y if your platform firmware emits an ACPI.NFIT table + (CONFIG_ACPI_NFIT), or otherwise exposes BLK-mode + capabilities. + +config ND_CLAIM + bool + +config ND_BTT + tristate + +config BTT + bool "BTT: Block Translation Table (atomic sector updates)" + default y if LIBNVDIMM + select ND_CLAIM + help + The Block Translation Table (BTT) provides atomic sector + update semantics for persistent memory devices, so that + applications that rely on sector writes not being torn (a + guarantee that typical disks provide) can continue to do so. + The BTT manifests itself as an alternate personality for an + NVDIMM namespace, i.e. a namespace can be in raw mode (pmemX, + ndblkX.Y, etc...), or 'sectored' mode, (pmemXs, ndblkX.Ys, + etc...). + + Select Y if unsure + +config ND_PFN + tristate + +config NVDIMM_PFN + bool "PFN: Map persistent (device) memory" + default LIBNVDIMM + depends on ZONE_DEVICE + select ND_CLAIM + help + Map persistent memory, i.e. advertise it to the memory + management sub-system. By default persistent memory does + not support direct I/O, RDMA, or any other usage that + requires a 'struct page' to mediate an I/O request. This + driver allocates and initializes the infrastructure needed + to support those use cases. 
+ + Select Y if unsure + +endif diff --git a/kernel/drivers/nvdimm/Makefile b/kernel/drivers/nvdimm/Makefile new file mode 100644 index 000000000..ea84d3c4e --- /dev/null +++ b/kernel/drivers/nvdimm/Makefile @@ -0,0 +1,25 @@ +obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o +obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o +obj-$(CONFIG_ND_BTT) += nd_btt.o +obj-$(CONFIG_ND_BLK) += nd_blk.o +obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o + +nd_pmem-y := pmem.o + +nd_btt-y := btt.o + +nd_blk-y := blk.o + +nd_e820-y := e820.o + +libnvdimm-y := core.o +libnvdimm-y += bus.o +libnvdimm-y += dimm_devs.o +libnvdimm-y += dimm.o +libnvdimm-y += region_devs.o +libnvdimm-y += region.o +libnvdimm-y += namespace_devs.o +libnvdimm-y += label.o +libnvdimm-$(CONFIG_ND_CLAIM) += claim.o +libnvdimm-$(CONFIG_BTT) += btt_devs.o +libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o diff --git a/kernel/drivers/nvdimm/blk.c b/kernel/drivers/nvdimm/blk.c new file mode 100644 index 000000000..91a336ea8 --- /dev/null +++ b/kernel/drivers/nvdimm/blk.c @@ -0,0 +1,386 @@ +/* + * NVDIMM Block Window Driver + * Copyright (c) 2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/genhd.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/nd.h> +#include <linux/sizes.h> +#include "nd.h" + +struct nd_blk_device { + struct request_queue *queue; + struct gendisk *disk; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + size_t disk_size; + u32 sector_size; + u32 internal_lbasize; +}; + +static int nd_blk_major; + +static u32 nd_blk_meta_size(struct nd_blk_device *blk_dev) +{ + return blk_dev->nsblk->lbasize - blk_dev->sector_size; +} + +static resource_size_t to_dev_offset(struct nd_namespace_blk *nsblk, + resource_size_t ns_offset, unsigned int len) +{ + int i; + + for (i = 0; i < nsblk->num_resources; i++) { + if (ns_offset < resource_size(nsblk->res[i])) { + if (ns_offset + len > resource_size(nsblk->res[i])) { + dev_WARN_ONCE(&nsblk->common.dev, 1, + "illegal request\n"); + return SIZE_MAX; + } + return nsblk->res[i]->start + ns_offset; + } + ns_offset -= resource_size(nsblk->res[i]); + } + + dev_WARN_ONCE(&nsblk->common.dev, 1, "request out of range\n"); + return SIZE_MAX; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, u64 lba, + int rw) +{ + unsigned int len = nd_blk_meta_size(blk_dev); + resource_size_t dev_offset, ns_offset; + struct nd_namespace_blk *nsblk; + struct nd_blk_region *ndbr; + int err = 0; + + nsblk = blk_dev->nsblk; + ndbr = blk_dev->ndbr; + ns_offset = lba * blk_dev->internal_lbasize + blk_dev->sector_size; + dev_offset = to_dev_offset(nsblk, ns_offset, len); + if (dev_offset == SIZE_MAX) + return -EIO; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *iobuf; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, 
bv.bv_len); + iobuf = kmap_atomic(bv.bv_page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset, + cur_len, rw); + kunmap_atomic(iobuf); + if (err) + return err; + + len -= cur_len; + dev_offset += cur_len; + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); + } + + return err; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int nd_blk_rw_integrity(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, u64 lba, + int rw) +{ + return 0; +} +#endif + +static int nd_blk_do_bvec(struct nd_blk_device *blk_dev, + struct bio_integrity_payload *bip, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset, ns_offset; + int err = 0; + void *iobuf; + u64 lba; + + while (len) { + unsigned int cur_len; + + /* + * If we don't have an integrity payload, we don't have to + * split the bvec into sectors, as this would cause unnecessary + * Block Window setup/move steps. the do_io routine is capable + * of handling len <= PAGE_SIZE. + */ + cur_len = bip ? 
min(len, blk_dev->sector_size) : len; + + lba = div_u64(sector << SECTOR_SHIFT, blk_dev->sector_size); + ns_offset = lba * blk_dev->internal_lbasize; + dev_offset = to_dev_offset(blk_dev->nsblk, ns_offset, cur_len); + if (dev_offset == SIZE_MAX) + return -EIO; + + iobuf = kmap_atomic(page); + err = ndbr->do_io(ndbr, dev_offset, iobuf + off, cur_len, rw); + kunmap_atomic(iobuf); + if (err) + return err; + + if (bip) { + err = nd_blk_rw_integrity(blk_dev, bip, lba, rw); + if (err) + return err; + } + len -= cur_len; + off += cur_len; + sector += blk_dev->sector_size >> SECTOR_SHIFT; + } + + return err; +} + +static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct gendisk *disk = bdev->bd_disk; + struct bio_integrity_payload *bip; + struct nd_blk_device *blk_dev; + struct bvec_iter iter; + unsigned long start; + struct bio_vec bvec; + int err = 0, rw; + bool do_acct; + + /* + * bio_integrity_enabled also checks if the bio already has an + * integrity payload attached. If it does, we *don't* do a + * bio_integrity_prep here - the payload has been generated by + * another kernel subsystem, and we just pass it through. + */ + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio->bi_error = -EIO; + goto out; + } + + bip = bio_integrity(bio); + blk_dev = disk->private_data; + rw = bio_data_dir(bio); + do_acct = nd_iostat_start(bio, &start); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + BUG_ON(len > PAGE_SIZE); + err = nd_blk_do_bvec(blk_dev, bip, bvec.bv_page, len, + bvec.bv_offset, rw, iter.bi_sector); + if (err) { + dev_info(&blk_dev->nsblk->common.dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? 
"READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); + bio->bi_error = err; + break; + } + } + if (do_acct) + nd_iostat_end(bio, start); + + out: + bio_endio(bio); + return BLK_QC_T_NONE; +} + +static int nd_blk_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *iobuf, size_t n, int rw) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(ndns->claim); + struct nd_namespace_blk *nsblk = blk_dev->nsblk; + struct nd_blk_region *ndbr = blk_dev->ndbr; + resource_size_t dev_offset; + + dev_offset = to_dev_offset(nsblk, offset, n); + + if (unlikely(offset + n > blk_dev->disk_size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (dev_offset == SIZE_MAX) + return -EIO; + + return ndbr->do_io(ndbr, dev_offset, iobuf, n, rw); +} + +static const struct block_device_operations nd_blk_fops = { + .owner = THIS_MODULE, + .revalidate_disk = nvdimm_revalidate_disk, +}; + +static int nd_blk_attach_disk(struct nd_namespace_common *ndns, + struct nd_blk_device *blk_dev) +{ + resource_size_t available_disk_size; + struct gendisk *disk; + u64 internal_nlba; + + internal_nlba = div_u64(blk_dev->disk_size, blk_dev->internal_lbasize); + available_disk_size = internal_nlba * blk_dev->sector_size; + + blk_dev->queue = blk_alloc_queue(GFP_KERNEL); + if (!blk_dev->queue) + return -ENOMEM; + + blk_queue_make_request(blk_dev->queue, nd_blk_make_request); + blk_queue_max_hw_sectors(blk_dev->queue, UINT_MAX); + blk_queue_bounce_limit(blk_dev->queue, BLK_BOUNCE_ANY); + blk_queue_logical_block_size(blk_dev->queue, blk_dev->sector_size); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, blk_dev->queue); + + disk = blk_dev->disk = alloc_disk(0); + if (!disk) { + blk_cleanup_queue(blk_dev->queue); + return -ENOMEM; + } + + disk->driverfs_dev = &ndns->dev; + disk->major = nd_blk_major; + disk->first_minor = 0; + disk->fops = &nd_blk_fops; + disk->private_data = blk_dev; + disk->queue = blk_dev->queue; + disk->flags = 
GENHD_FL_EXT_DEVT; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + set_capacity(disk, 0); + add_disk(disk); + + if (nd_blk_meta_size(blk_dev)) { + int rc = nd_integrity_init(disk, nd_blk_meta_size(blk_dev)); + + if (rc) { + del_gendisk(disk); + put_disk(disk); + blk_cleanup_queue(blk_dev->queue); + return rc; + } + } + + set_capacity(disk, available_disk_size >> SECTOR_SHIFT); + revalidate_disk(disk); + return 0; +} + +static int nd_blk_probe(struct device *dev) +{ + struct nd_namespace_common *ndns; + struct nd_namespace_blk *nsblk; + struct nd_blk_device *blk_dev; + int rc; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + blk_dev = kzalloc(sizeof(*blk_dev), GFP_KERNEL); + if (!blk_dev) + return -ENOMEM; + + nsblk = to_nd_namespace_blk(&ndns->dev); + blk_dev->disk_size = nvdimm_namespace_capacity(ndns); + blk_dev->ndbr = to_nd_blk_region(dev->parent); + blk_dev->nsblk = to_nd_namespace_blk(&ndns->dev); + blk_dev->internal_lbasize = roundup(nsblk->lbasize, + INT_LBASIZE_ALIGNMENT); + blk_dev->sector_size = ((nsblk->lbasize >= 4096) ? 
4096 : 512); + dev_set_drvdata(dev, blk_dev); + + ndns->rw_bytes = nd_blk_rw_bytes; + if (is_nd_btt(dev)) + rc = nvdimm_namespace_attach_btt(ndns); + else if (nd_btt_probe(ndns, blk_dev) == 0) { + /* we'll come back as btt-blk */ + rc = -ENXIO; + } else + rc = nd_blk_attach_disk(ndns, blk_dev); + if (rc) + kfree(blk_dev); + return rc; +} + +static void nd_blk_detach_disk(struct nd_blk_device *blk_dev) +{ + del_gendisk(blk_dev->disk); + put_disk(blk_dev->disk); + blk_cleanup_queue(blk_dev->queue); +} + +static int nd_blk_remove(struct device *dev) +{ + struct nd_blk_device *blk_dev = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(to_nd_btt(dev)->ndns); + else + nd_blk_detach_disk(blk_dev); + kfree(blk_dev); + + return 0; +} + +static struct nd_device_driver nd_blk_driver = { + .probe = nd_blk_probe, + .remove = nd_blk_remove, + .drv = { + .name = "nd_blk", + }, + .type = ND_DRIVER_NAMESPACE_BLK, +}; + +static int __init nd_blk_init(void) +{ + int rc; + + rc = register_blkdev(0, "nd_blk"); + if (rc < 0) + return rc; + + nd_blk_major = rc; + rc = nd_driver_register(&nd_blk_driver); + + if (rc < 0) + unregister_blkdev(nd_blk_major, "nd_blk"); + + return rc; +} + +static void __exit nd_blk_exit(void) +{ + driver_unregister(&nd_blk_driver.drv); + unregister_blkdev(nd_blk_major, "nd_blk"); +} + +MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_BLK); +module_init(nd_blk_init); +module_exit(nd_blk_exit); diff --git a/kernel/drivers/nvdimm/btt.c b/kernel/drivers/nvdimm/btt.c new file mode 100644 index 000000000..efb2c1cee --- /dev/null +++ b/kernel/drivers/nvdimm/btt.c @@ -0,0 +1,1456 @@ +/* + * Block Translation Table + * Copyright (c) 2014-2015, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/highmem.h> +#include <linux/debugfs.h> +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/hdreg.h> +#include <linux/genhd.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/fs.h> +#include <linux/nd.h> +#include "btt.h" +#include "nd.h" + +enum log_ent_request { + LOG_NEW_ENT = 0, + LOG_OLD_ENT +}; + +static int btt_major; + +static int arena_read_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_read_bytes(ndns, offset, buf, n); +} + +static int arena_write_bytes(struct arena_info *arena, resource_size_t offset, + void *buf, size_t n) +{ + struct nd_btt *nd_btt = arena->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* arena offsets are 4K from the base of the device */ + offset += SZ_4K; + return nvdimm_write_bytes(ndns, offset, buf, n); +} + +static int btt_info_write(struct arena_info *arena, struct btt_sb *super) +{ + int ret; + + ret = arena_write_bytes(arena, arena->info2off, super, + sizeof(struct btt_sb)); + if (ret) + return ret; + + return arena_write_bytes(arena, arena->infooff, super, + sizeof(struct btt_sb)); +} + +static int btt_info_read(struct arena_info *arena, struct btt_sb *super) +{ + WARN_ON(!super); + return arena_read_bytes(arena, arena->infooff, 
super, + sizeof(struct btt_sb)); +} + +/* + * 'raw' version of btt_map write + * Assumptions: + * mapping is in little-endian + * mapping contains 'E' and 'Z' flags as desired + */ +static int __btt_map_write(struct arena_info *arena, u32 lba, __le32 mapping) +{ + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + return arena_write_bytes(arena, ns_off, &mapping, MAP_ENT_SIZE); +} + +static int btt_map_write(struct arena_info *arena, u32 lba, u32 mapping, + u32 z_flag, u32 e_flag) +{ + u32 ze; + __le32 mapping_le; + + /* + * This 'mapping' is supposed to be just the LBA mapping, without + * any flags set, so strip the flag bits. + */ + mapping &= MAP_LBA_MASK; + + ze = (z_flag << 1) + e_flag; + switch (ze) { + case 0: + /* + * We want to set neither of the Z or E flags, and + * in the actual layout, this means setting the bit + * positions of both to '1' to indicate a 'normal' + * map entry + */ + mapping |= MAP_ENT_NORMAL; + break; + case 1: + mapping |= (1 << MAP_ERR_SHIFT); + break; + case 2: + mapping |= (1 << MAP_TRIM_SHIFT); + break; + default: + /* + * The case where Z and E are both sent in as '1' could be + * construed as a valid 'normal' case, but we decide not to, + * to avoid confusion + */ + WARN_ONCE(1, "Invalid use of Z and E flags\n"); + return -EIO; + } + + mapping_le = cpu_to_le32(mapping); + return __btt_map_write(arena, lba, mapping_le); +} + +static int btt_map_read(struct arena_info *arena, u32 lba, u32 *mapping, + int *trim, int *error) +{ + int ret; + __le32 in; + u32 raw_mapping, postmap, ze, z_flag, e_flag; + u64 ns_off = arena->mapoff + (lba * MAP_ENT_SIZE); + + WARN_ON(lba >= arena->external_nlba); + + ret = arena_read_bytes(arena, ns_off, &in, MAP_ENT_SIZE); + if (ret) + return ret; + + raw_mapping = le32_to_cpu(in); + + z_flag = (raw_mapping & MAP_TRIM_MASK) >> MAP_TRIM_SHIFT; + e_flag = (raw_mapping & MAP_ERR_MASK) >> MAP_ERR_SHIFT; + ze = (z_flag << 1) + e_flag; + postmap = raw_mapping & 
MAP_LBA_MASK; + + /* Reuse the {z,e}_flag variables for *trim and *error */ + z_flag = 0; + e_flag = 0; + + switch (ze) { + case 0: + /* Initial state. Return postmap = premap */ + *mapping = lba; + break; + case 1: + *mapping = postmap; + e_flag = 1; + break; + case 2: + *mapping = postmap; + z_flag = 1; + break; + case 3: + *mapping = postmap; + break; + default: + return -EIO; + } + + if (trim) + *trim = z_flag; + if (error) + *error = e_flag; + + return ret; +} + +static int btt_log_read_pair(struct arena_info *arena, u32 lane, + struct log_entry *ent) +{ + WARN_ON(!ent); + return arena_read_bytes(arena, + arena->logoff + (2 * lane * LOG_ENT_SIZE), ent, + 2 * LOG_ENT_SIZE); +} + +static struct dentry *debugfs_root; + +static void arena_debugfs_init(struct arena_info *a, struct dentry *parent, + int idx) +{ + char dirname[32]; + struct dentry *d; + + /* If for some reason, parent bttN was not created, exit */ + if (!parent) + return; + + snprintf(dirname, 32, "arena%d", idx); + d = debugfs_create_dir(dirname, parent); + if (IS_ERR_OR_NULL(d)) + return; + a->debugfs_dir = d; + + debugfs_create_x64("size", S_IRUGO, d, &a->size); + debugfs_create_x64("external_lba_start", S_IRUGO, d, + &a->external_lba_start); + debugfs_create_x32("internal_nlba", S_IRUGO, d, &a->internal_nlba); + debugfs_create_u32("internal_lbasize", S_IRUGO, d, + &a->internal_lbasize); + debugfs_create_x32("external_nlba", S_IRUGO, d, &a->external_nlba); + debugfs_create_u32("external_lbasize", S_IRUGO, d, + &a->external_lbasize); + debugfs_create_u32("nfree", S_IRUGO, d, &a->nfree); + debugfs_create_u16("version_major", S_IRUGO, d, &a->version_major); + debugfs_create_u16("version_minor", S_IRUGO, d, &a->version_minor); + debugfs_create_x64("nextoff", S_IRUGO, d, &a->nextoff); + debugfs_create_x64("infooff", S_IRUGO, d, &a->infooff); + debugfs_create_x64("dataoff", S_IRUGO, d, &a->dataoff); + debugfs_create_x64("mapoff", S_IRUGO, d, &a->mapoff); + debugfs_create_x64("logoff", S_IRUGO, d, 
&a->logoff); + debugfs_create_x64("info2off", S_IRUGO, d, &a->info2off); + debugfs_create_x32("flags", S_IRUGO, d, &a->flags); +} + +static void btt_debugfs_init(struct btt *btt) +{ + int i = 0; + struct arena_info *arena; + + btt->debugfs_dir = debugfs_create_dir(dev_name(&btt->nd_btt->dev), + debugfs_root); + if (IS_ERR_OR_NULL(btt->debugfs_dir)) + return; + + list_for_each_entry(arena, &btt->arena_list, list) { + arena_debugfs_init(arena, btt->debugfs_dir, i); + i++; + } +} + +/* + * This function accepts two log entries, and uses the + * sequence number to find the 'older' entry. + * It also updates the sequence number in this old entry to + * make it the 'new' one if the mark_flag is set. + * Finally, it returns which of the entries was the older one. + * + * TODO The logic feels a bit kludge-y. make it better.. + */ +static int btt_log_get_old(struct log_entry *ent) +{ + int old; + + /* + * the first ever time this is seen, the entry goes into [0] + * the next time, the following logic works out to put this + * (next) entry into [1] + */ + if (ent[0].seq == 0) { + ent[0].seq = cpu_to_le32(1); + return 0; + } + + if (ent[0].seq == ent[1].seq) + return -EINVAL; + if (le32_to_cpu(ent[0].seq) + le32_to_cpu(ent[1].seq) > 5) + return -EINVAL; + + if (le32_to_cpu(ent[0].seq) < le32_to_cpu(ent[1].seq)) { + if (le32_to_cpu(ent[1].seq) - le32_to_cpu(ent[0].seq) == 1) + old = 0; + else + old = 1; + } else { + if (le32_to_cpu(ent[0].seq) - le32_to_cpu(ent[1].seq) == 1) + old = 1; + else + old = 0; + } + + return old; +} + +static struct device *to_dev(struct arena_info *arena) +{ + return &arena->nd_btt->dev; +} + +/* + * This function copies the desired (old/new) log entry into ent if + * it is not NULL. It returns the sub-slot number (0 or 1) + * where the desired log entry was found. Negative return values + * indicate errors. 
+ */ +static int btt_log_read(struct arena_info *arena, u32 lane, + struct log_entry *ent, int old_flag) +{ + int ret; + int old_ent, ret_ent; + struct log_entry log[2]; + + ret = btt_log_read_pair(arena, lane, log); + if (ret) + return -EIO; + + old_ent = btt_log_get_old(log); + if (old_ent < 0 || old_ent > 1) { + dev_info(to_dev(arena), + "log corruption (%d): lane %d seq [%d, %d]\n", + old_ent, lane, log[0].seq, log[1].seq); + /* TODO set error state? */ + return -EIO; + } + + ret_ent = (old_flag ? old_ent : (1 - old_ent)); + + if (ent != NULL) + memcpy(ent, &log[ret_ent], LOG_ENT_SIZE); + + return ret_ent; +} + +/* + * This function commits a log entry to media + * It does _not_ prepare the freelist entry for the next write + * btt_flog_write is the wrapper for updating the freelist elements + */ +static int __btt_log_write(struct arena_info *arena, u32 lane, + u32 sub, struct log_entry *ent) +{ + int ret; + /* + * Ignore the padding in log_entry for calculating log_half. + * The entry is 'committed' when we write the sequence number, + * and we want to ensure that that is the last thing written. 
+ * We don't bother writing the padding as that would be extra + * media wear and write amplification + */ + unsigned int log_half = (LOG_ENT_SIZE - 2 * sizeof(u64)) / 2; + u64 ns_off = arena->logoff + (((2 * lane) + sub) * LOG_ENT_SIZE); + void *src = ent; + + /* split the 16B write into atomic, durable halves */ + ret = arena_write_bytes(arena, ns_off, src, log_half); + if (ret) + return ret; + + ns_off += log_half; + src += log_half; + return arena_write_bytes(arena, ns_off, src, log_half); +} + +static int btt_flog_write(struct arena_info *arena, u32 lane, u32 sub, + struct log_entry *ent) +{ + int ret; + + ret = __btt_log_write(arena, lane, sub, ent); + if (ret) + return ret; + + /* prepare the next free entry */ + arena->freelist[lane].sub = 1 - arena->freelist[lane].sub; + if (++(arena->freelist[lane].seq) == 4) + arena->freelist[lane].seq = 1; + arena->freelist[lane].block = le32_to_cpu(ent->old_map); + + return ret; +} + +/* + * This function initializes the BTT map to the initial state, which is + * all-zeroes, and indicates an identity mapping + */ +static int btt_map_init(struct arena_info *arena) +{ + int ret = -EINVAL; + void *zerobuf; + size_t offset = 0; + size_t chunk_size = SZ_2M; + size_t mapsize = arena->logoff - arena->mapoff; + + zerobuf = kzalloc(chunk_size, GFP_KERNEL); + if (!zerobuf) + return -ENOMEM; + + while (mapsize) { + size_t size = min(mapsize, chunk_size); + + ret = arena_write_bytes(arena, arena->mapoff + offset, zerobuf, + size); + if (ret) + goto free; + + offset += size; + mapsize -= size; + cond_resched(); + } + + free: + kfree(zerobuf); + return ret; +} + +/* + * This function initializes the BTT log with 'fake' entries pointing + * to the initial reserved set of blocks as being free + */ +static int btt_log_init(struct arena_info *arena) +{ + int ret; + u32 i; + struct log_entry log, zerolog; + + memset(&zerolog, 0, sizeof(zerolog)); + + for (i = 0; i < arena->nfree; i++) { + log.lba = cpu_to_le32(i); + log.old_map = 
cpu_to_le32(arena->external_nlba + i); + log.new_map = cpu_to_le32(arena->external_nlba + i); + log.seq = cpu_to_le32(LOG_SEQ_INIT); + ret = __btt_log_write(arena, i, 0, &log); + if (ret) + return ret; + ret = __btt_log_write(arena, i, 1, &zerolog); + if (ret) + return ret; + } + + return 0; +} + +static int btt_freelist_init(struct arena_info *arena) +{ + int old, new, ret; + u32 i, map_entry; + struct log_entry log_new, log_old; + + arena->freelist = kcalloc(arena->nfree, sizeof(struct free_entry), + GFP_KERNEL); + if (!arena->freelist) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) { + old = btt_log_read(arena, i, &log_old, LOG_OLD_ENT); + if (old < 0) + return old; + + new = btt_log_read(arena, i, &log_new, LOG_NEW_ENT); + if (new < 0) + return new; + + /* sub points to the next one to be overwritten */ + arena->freelist[i].sub = 1 - new; + arena->freelist[i].seq = nd_inc_seq(le32_to_cpu(log_new.seq)); + arena->freelist[i].block = le32_to_cpu(log_new.old_map); + + /* This implies a newly created or untouched flog entry */ + if (log_new.old_map == log_new.new_map) + continue; + + /* Check if map recovery is needed */ + ret = btt_map_read(arena, le32_to_cpu(log_new.lba), &map_entry, + NULL, NULL); + if (ret) + return ret; + if ((le32_to_cpu(log_new.new_map) != map_entry) && + (le32_to_cpu(log_new.old_map) == map_entry)) { + /* + * Last transaction wrote the flog, but wasn't able + * to complete the map write. So fix up the map. 
+ */ + ret = btt_map_write(arena, le32_to_cpu(log_new.lba), + le32_to_cpu(log_new.new_map), 0, 0); + if (ret) + return ret; + } + + } + + return 0; +} + +static int btt_rtt_init(struct arena_info *arena) +{ + arena->rtt = kcalloc(arena->nfree, sizeof(u32), GFP_KERNEL); + if (arena->rtt == NULL) + return -ENOMEM; + + return 0; +} + +static int btt_maplocks_init(struct arena_info *arena) +{ + u32 i; + + arena->map_locks = kcalloc(arena->nfree, sizeof(struct aligned_lock), + GFP_KERNEL); + if (!arena->map_locks) + return -ENOMEM; + + for (i = 0; i < arena->nfree; i++) + spin_lock_init(&arena->map_locks[i].lock); + + return 0; +} + +static struct arena_info *alloc_arena(struct btt *btt, size_t size, + size_t start, size_t arena_off) +{ + struct arena_info *arena; + u64 logsize, mapsize, datasize; + u64 available = size; + + arena = kzalloc(sizeof(struct arena_info), GFP_KERNEL); + if (!arena) + return NULL; + arena->nd_btt = btt->nd_btt; + + if (!size) + return arena; + + arena->size = size; + arena->external_lba_start = start; + arena->external_lbasize = btt->lbasize; + arena->internal_lbasize = roundup(arena->external_lbasize, + INT_LBASIZE_ALIGNMENT); + arena->nfree = BTT_DEFAULT_NFREE; + arena->version_major = 1; + arena->version_minor = 1; + + if (available % BTT_PG_SIZE) + available -= (available % BTT_PG_SIZE); + + /* Two pages are reserved for the super block and its copy */ + available -= 2 * BTT_PG_SIZE; + + /* The log takes a fixed amount of space based on nfree */ + logsize = roundup(2 * arena->nfree * sizeof(struct log_entry), + BTT_PG_SIZE); + available -= logsize; + + /* Calculate optimal split between map and data area */ + arena->internal_nlba = div_u64(available - BTT_PG_SIZE, + arena->internal_lbasize + MAP_ENT_SIZE); + arena->external_nlba = arena->internal_nlba - arena->nfree; + + mapsize = roundup((arena->external_nlba * MAP_ENT_SIZE), BTT_PG_SIZE); + datasize = available - mapsize; + + /* 'Absolute' values, relative to start of storage space */ + 
arena->infooff = arena_off; + arena->dataoff = arena->infooff + BTT_PG_SIZE; + arena->mapoff = arena->dataoff + datasize; + arena->logoff = arena->mapoff + mapsize; + arena->info2off = arena->logoff + logsize; + return arena; +} + +static void free_arenas(struct btt *btt) +{ + struct arena_info *arena, *next; + + list_for_each_entry_safe(arena, next, &btt->arena_list, list) { + list_del(&arena->list); + kfree(arena->rtt); + kfree(arena->map_locks); + kfree(arena->freelist); + debugfs_remove_recursive(arena->debugfs_dir); + kfree(arena); + } +} + +/* + * This function reads an existing valid btt superblock and + * populates the corresponding arena_info struct + */ +static void parse_arena_meta(struct arena_info *arena, struct btt_sb *super, + u64 arena_off) +{ + arena->internal_nlba = le32_to_cpu(super->internal_nlba); + arena->internal_lbasize = le32_to_cpu(super->internal_lbasize); + arena->external_nlba = le32_to_cpu(super->external_nlba); + arena->external_lbasize = le32_to_cpu(super->external_lbasize); + arena->nfree = le32_to_cpu(super->nfree); + arena->version_major = le16_to_cpu(super->version_major); + arena->version_minor = le16_to_cpu(super->version_minor); + + arena->nextoff = (super->nextoff == 0) ? 0 : (arena_off + + le64_to_cpu(super->nextoff)); + arena->infooff = arena_off; + arena->dataoff = arena_off + le64_to_cpu(super->dataoff); + arena->mapoff = arena_off + le64_to_cpu(super->mapoff); + arena->logoff = arena_off + le64_to_cpu(super->logoff); + arena->info2off = arena_off + le64_to_cpu(super->info2off); + + arena->size = (le64_to_cpu(super->nextoff) > 0) + ? 
(le64_to_cpu(super->nextoff)) + : (arena->info2off - arena->infooff + BTT_PG_SIZE); + + arena->flags = le32_to_cpu(super->flags); +} + +static int discover_arenas(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + struct btt_sb *super; + size_t remaining = btt->rawsize; + u64 cur_nlba = 0; + size_t cur_off = 0; + int num_arenas = 0; + + super = kzalloc(sizeof(*super), GFP_KERNEL); + if (!super) + return -ENOMEM; + + while (remaining) { + /* Alloc memory for arena */ + arena = alloc_arena(btt, 0, 0, 0); + if (!arena) { + ret = -ENOMEM; + goto out_super; + } + + arena->infooff = cur_off; + ret = btt_info_read(arena, super); + if (ret) + goto out; + + if (!nd_btt_arena_is_valid(btt->nd_btt, super)) { + if (remaining == btt->rawsize) { + btt->init_state = INIT_NOTFOUND; + dev_info(to_dev(arena), "No existing arenas\n"); + goto out; + } else { + dev_info(to_dev(arena), + "Found corrupted metadata!\n"); + ret = -ENODEV; + goto out; + } + } + + arena->external_lba_start = cur_nlba; + parse_arena_meta(arena, super, cur_off); + + ret = btt_freelist_init(arena); + if (ret) + goto out; + + ret = btt_rtt_init(arena); + if (ret) + goto out; + + ret = btt_maplocks_init(arena); + if (ret) + goto out; + + list_add_tail(&arena->list, &btt->arena_list); + + remaining -= arena->size; + cur_off += arena->size; + cur_nlba += arena->external_nlba; + num_arenas++; + + if (arena->nextoff == 0) + break; + } + btt->num_arenas = num_arenas; + btt->nlba = cur_nlba; + btt->init_state = INIT_READY; + + kfree(super); + return ret; + + out: + kfree(arena); + free_arenas(btt); + out_super: + kfree(super); + return ret; +} + +static int create_arenas(struct btt *btt) +{ + size_t remaining = btt->rawsize; + size_t cur_off = 0; + + while (remaining) { + struct arena_info *arena; + size_t arena_size = min_t(u64, ARENA_MAX_SIZE, remaining); + + remaining -= arena_size; + if (arena_size < ARENA_MIN_SIZE) + break; + + arena = alloc_arena(btt, arena_size, btt->nlba, cur_off); + if (!arena) { 
+ free_arenas(btt); + return -ENOMEM; + } + btt->nlba += arena->external_nlba; + if (remaining >= ARENA_MIN_SIZE) + arena->nextoff = arena->size; + else + arena->nextoff = 0; + cur_off += arena_size; + list_add_tail(&arena->list, &btt->arena_list); + } + + return 0; +} + +/* + * This function completes arena initialization by writing + * all the metadata. + * It is only called for an uninitialized arena when a write + * to that arena occurs for the first time. + */ +static int btt_arena_write_layout(struct arena_info *arena) +{ + int ret; + u64 sum; + struct btt_sb *super; + struct nd_btt *nd_btt = arena->nd_btt; + const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); + + ret = btt_map_init(arena); + if (ret) + return ret; + + ret = btt_log_init(arena); + if (ret) + return ret; + + super = kzalloc(sizeof(struct btt_sb), GFP_NOIO); + if (!super) + return -ENOMEM; + + strncpy(super->signature, BTT_SIG, BTT_SIG_LEN); + memcpy(super->uuid, nd_btt->uuid, 16); + memcpy(super->parent_uuid, parent_uuid, 16); + super->flags = cpu_to_le32(arena->flags); + super->version_major = cpu_to_le16(arena->version_major); + super->version_minor = cpu_to_le16(arena->version_minor); + super->external_lbasize = cpu_to_le32(arena->external_lbasize); + super->external_nlba = cpu_to_le32(arena->external_nlba); + super->internal_lbasize = cpu_to_le32(arena->internal_lbasize); + super->internal_nlba = cpu_to_le32(arena->internal_nlba); + super->nfree = cpu_to_le32(arena->nfree); + super->infosize = cpu_to_le32(sizeof(struct btt_sb)); + super->nextoff = cpu_to_le64(arena->nextoff); + /* + * Subtract arena->infooff (arena start) so numbers are relative + * to 'this' arena + */ + super->dataoff = cpu_to_le64(arena->dataoff - arena->infooff); + super->mapoff = cpu_to_le64(arena->mapoff - arena->infooff); + super->logoff = cpu_to_le64(arena->logoff - arena->infooff); + super->info2off = cpu_to_le64(arena->info2off - arena->infooff); + + super->flags = 0; + sum = nd_sb_checksum((struct 
nd_gen_sb *) super); + super->checksum = cpu_to_le64(sum); + + ret = btt_info_write(arena, super); + + kfree(super); + return ret; +} + +/* + * This function completes the initialization for the BTT namespace + * such that it is ready to accept IOs + */ +static int btt_meta_init(struct btt *btt) +{ + int ret = 0; + struct arena_info *arena; + + mutex_lock(&btt->init_lock); + list_for_each_entry(arena, &btt->arena_list, list) { + ret = btt_arena_write_layout(arena); + if (ret) + goto unlock; + + ret = btt_freelist_init(arena); + if (ret) + goto unlock; + + ret = btt_rtt_init(arena); + if (ret) + goto unlock; + + ret = btt_maplocks_init(arena); + if (ret) + goto unlock; + } + + btt->init_state = INIT_READY; + + unlock: + mutex_unlock(&btt->init_lock); + return ret; +} + +static u32 btt_meta_size(struct btt *btt) +{ + return btt->lbasize - btt->sector_size; +} + +/* + * This function calculates the arena in which the given LBA lies + * by doing a linear walk. This is acceptable since we expect only + * a few arenas. If we have backing devices that get much larger, + * we can construct a balanced binary tree of arenas at init time + * so that this range search becomes faster. 
+ */ +static int lba_to_arena(struct btt *btt, sector_t sector, __u32 *premap, + struct arena_info **arena) +{ + struct arena_info *arena_list; + __u64 lba = div_u64(sector << SECTOR_SHIFT, btt->sector_size); + + list_for_each_entry(arena_list, &btt->arena_list, list) { + if (lba < arena_list->external_nlba) { + *arena = arena_list; + *premap = lba; + return 0; + } + lba -= arena_list->external_nlba; + } + + return -EIO; +} + +/* + * The following (lock_map, unlock_map) are mostly just to improve + * readability, since they index into an array of locks + */ +static void lock_map(struct arena_info *arena, u32 premap) + __acquires(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_lock(&arena->map_locks[idx].lock); +} + +static void unlock_map(struct arena_info *arena, u32 premap) + __releases(&arena->map_locks[idx].lock) +{ + u32 idx = (premap * MAP_ENT_SIZE / L1_CACHE_BYTES) % arena->nfree; + + spin_unlock(&arena->map_locks[idx].lock); +} + +static u64 to_namespace_offset(struct arena_info *arena, u64 lba) +{ + return arena->dataoff + ((u64)lba * arena->internal_lbasize); +} + +static int btt_data_read(struct arena_info *arena, struct page *page, + unsigned int off, u32 lba, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_read_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static int btt_data_write(struct arena_info *arena, u32 lba, + struct page *page, unsigned int off, u32 len) +{ + int ret; + u64 nsoff = to_namespace_offset(arena, lba); + void *mem = kmap_atomic(page); + + ret = arena_write_bytes(arena, nsoff, mem + off, len); + kunmap_atomic(mem); + + return ret; +} + +static void zero_fill_data(struct page *page, unsigned int off, u32 len) +{ + void *mem = kmap_atomic(page); + + memset(mem + off, 0, len); + kunmap_atomic(mem); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static int btt_rw_integrity(struct 
btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + unsigned int len = btt_meta_size(btt); + u64 meta_nsoff; + int ret = 0; + + if (bip == NULL) + return 0; + + meta_nsoff = to_namespace_offset(arena, postmap) + btt->sector_size; + + while (len) { + unsigned int cur_len; + struct bio_vec bv; + void *mem; + + bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter); + /* + * The 'bv' obtained from bvec_iter_bvec has its .bv_len and + * .bv_offset already adjusted for iter->bi_bvec_done, and we + * can use those directly + */ + + cur_len = min(len, bv.bv_len); + mem = kmap_atomic(bv.bv_page); + if (rw) + ret = arena_write_bytes(arena, meta_nsoff, + mem + bv.bv_offset, cur_len); + else + ret = arena_read_bytes(arena, meta_nsoff, + mem + bv.bv_offset, cur_len); + + kunmap_atomic(mem); + if (ret) + return ret; + + len -= cur_len; + meta_nsoff += cur_len; + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, cur_len); + } + + return ret; +} + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, + struct arena_info *arena, u32 postmap, int rw) +{ + return 0; +} +#endif + +static int btt_read_pg(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int off, sector_t sector, + unsigned int len) +{ + int ret = 0; + int t_flag, e_flag; + struct arena_info *arena = NULL; + u32 lane = 0, premap, postmap; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + + cur_len = min(btt->sector_size, len); + + ret = btt_map_read(arena, premap, &postmap, &t_flag, &e_flag); + if (ret) + goto out_lane; + + /* + * We loop to make sure that the post map LBA didn't change + * from under us between writing the RTT and doing the actual + * read. 
+ */ + while (1) { + u32 new_map; + + if (t_flag) { + zero_fill_data(page, off, cur_len); + goto out_lane; + } + + if (e_flag) { + ret = -EIO; + goto out_lane; + } + + arena->rtt[lane] = RTT_VALID | postmap; + /* + * Barrier to make sure this write is not reordered + * to do the verification map_read before the RTT store + */ + barrier(); + + ret = btt_map_read(arena, premap, &new_map, &t_flag, + &e_flag); + if (ret) + goto out_rtt; + + if (postmap == new_map) + break; + + postmap = new_map; + } + + ret = btt_data_read(arena, page, off, postmap, cur_len); + if (ret) + goto out_rtt; + + if (bip) { + ret = btt_rw_integrity(btt, bip, arena, postmap, READ); + if (ret) + goto out_rtt; + } + + arena->rtt[lane] = RTT_INVALID; + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_rtt: + arena->rtt[lane] = RTT_INVALID; + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_write_pg(struct btt *btt, struct bio_integrity_payload *bip, + sector_t sector, struct page *page, unsigned int off, + unsigned int len) +{ + int ret = 0; + struct arena_info *arena = NULL; + u32 premap = 0, old_postmap, new_postmap, lane = 0, i; + struct log_entry log; + int sub; + + while (len) { + u32 cur_len; + + lane = nd_region_acquire_lane(btt->nd_region); + + ret = lba_to_arena(btt, sector, &premap, &arena); + if (ret) + goto out_lane; + cur_len = min(btt->sector_size, len); + + if ((arena->flags & IB_FLAG_ERROR_MASK) != 0) { + ret = -EIO; + goto out_lane; + } + + new_postmap = arena->freelist[lane].block; + + /* Wait if the new block is being read from */ + for (i = 0; i < arena->nfree; i++) + while (arena->rtt[i] == (RTT_VALID | new_postmap)) + cpu_relax(); + + + if (new_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_lane; + } + + ret = btt_data_write(arena, new_postmap, page, off, cur_len); + if (ret) + goto out_lane; + + if (bip) { + ret = 
btt_rw_integrity(btt, bip, arena, new_postmap, + WRITE); + if (ret) + goto out_lane; + } + + lock_map(arena, premap); + ret = btt_map_read(arena, premap, &old_postmap, NULL, NULL); + if (ret) + goto out_map; + if (old_postmap >= arena->internal_nlba) { + ret = -EIO; + goto out_map; + } + + log.lba = cpu_to_le32(premap); + log.old_map = cpu_to_le32(old_postmap); + log.new_map = cpu_to_le32(new_postmap); + log.seq = cpu_to_le32(arena->freelist[lane].seq); + sub = arena->freelist[lane].sub; + ret = btt_flog_write(arena, lane, sub, &log); + if (ret) + goto out_map; + + ret = btt_map_write(arena, premap, new_postmap, 0, 0); + if (ret) + goto out_map; + + unlock_map(arena, premap); + nd_region_release_lane(btt->nd_region, lane); + + len -= cur_len; + off += cur_len; + sector += btt->sector_size >> SECTOR_SHIFT; + } + + return 0; + + out_map: + unlock_map(arena, premap); + out_lane: + nd_region_release_lane(btt->nd_region, lane); + return ret; +} + +static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, + struct page *page, unsigned int len, unsigned int off, + int rw, sector_t sector) +{ + int ret; + + if (rw == READ) { + ret = btt_read_pg(btt, bip, page, off, sector, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + ret = btt_write_pg(btt, bip, sector, page, off, len); + } + + return ret; +} + +static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct btt *btt = q->queuedata; + struct bvec_iter iter; + unsigned long start; + struct bio_vec bvec; + int err = 0, rw; + bool do_acct; + + /* + * bio_integrity_enabled also checks if the bio already has an + * integrity payload attached. If it does, we *don't* do a + * bio_integrity_prep here - the payload has been generated by + * another kernel subsystem, and we just pass it through. 
+ */ + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio->bi_error = -EIO; + goto out; + } + + do_acct = nd_iostat_start(bio, &start); + rw = bio_data_dir(bio); + bio_for_each_segment(bvec, bio, iter) { + unsigned int len = bvec.bv_len; + + BUG_ON(len > PAGE_SIZE); + /* Make sure len is in multiples of sector size. */ + /* XXX is this right? */ + BUG_ON(len < btt->sector_size); + BUG_ON(len % btt->sector_size); + + err = btt_do_bvec(btt, bip, bvec.bv_page, len, bvec.bv_offset, + rw, iter.bi_sector); + if (err) { + dev_info(&btt->nd_btt->dev, + "io error in %s sector %lld, len %d,\n", + (rw == READ) ? "READ" : "WRITE", + (unsigned long long) iter.bi_sector, len); + bio->bi_error = err; + break; + } + } + if (do_acct) + nd_iostat_end(bio, start); + +out: + bio_endio(bio); + return BLK_QC_T_NONE; +} + +static int btt_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct btt *btt = bdev->bd_disk->private_data; + + btt_do_bvec(btt, NULL, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, 0); + return 0; +} + + +static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bd->bd_disk) >> 11; + return 0; +} + +static const struct block_device_operations btt_fops = { + .owner = THIS_MODULE, + .rw_page = btt_rw_page, + .getgeo = btt_getgeo, + .revalidate_disk = nvdimm_revalidate_disk, +}; + +static int btt_blk_init(struct btt *btt) +{ + struct nd_btt *nd_btt = btt->nd_btt; + struct nd_namespace_common *ndns = nd_btt->ndns; + + /* create a new disk and request queue for btt */ + btt->btt_queue = blk_alloc_queue(GFP_KERNEL); + if (!btt->btt_queue) + return -ENOMEM; + + btt->btt_disk = alloc_disk(0); + if (!btt->btt_disk) { + blk_cleanup_queue(btt->btt_queue); + return -ENOMEM; + } + + nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); + btt->btt_disk->driverfs_dev = 
&btt->nd_btt->dev; + btt->btt_disk->major = btt_major; + btt->btt_disk->first_minor = 0; + btt->btt_disk->fops = &btt_fops; + btt->btt_disk->private_data = btt; + btt->btt_disk->queue = btt->btt_queue; + btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + + blk_queue_make_request(btt->btt_queue, btt_make_request); + blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); + blk_queue_bounce_limit(btt->btt_queue, BLK_BOUNCE_ANY); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, btt->btt_queue); + btt->btt_queue->queuedata = btt; + + set_capacity(btt->btt_disk, 0); + add_disk(btt->btt_disk); + if (btt_meta_size(btt)) { + int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); + + if (rc) { + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); + blk_cleanup_queue(btt->btt_queue); + return rc; + } + } + set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9); + revalidate_disk(btt->btt_disk); + + return 0; +} + +static void btt_blk_cleanup(struct btt *btt) +{ + del_gendisk(btt->btt_disk); + put_disk(btt->btt_disk); + blk_cleanup_queue(btt->btt_queue); +} + +/** + * btt_init - initialize a block translation table for the given device + * @nd_btt: device with BTT geometry and backing device info + * @rawsize: raw size in bytes of the backing device + * @lbasize: lba size of the backing device + * @uuid: A uuid for the backing device - this is stored on media + * @maxlane: maximum number of parallel requests the device can handle + * + * Initialize a Block Translation Table on a backing device to provide + * single sector power fail atomicity. + * + * Context: + * Might sleep. + * + * Returns: + * Pointer to a new struct btt on success, NULL on failure. 
+ */ +static struct btt *btt_init(struct nd_btt *nd_btt, unsigned long long rawsize, + u32 lbasize, u8 *uuid, struct nd_region *nd_region) +{ + int ret; + struct btt *btt; + struct device *dev = &nd_btt->dev; + + btt = kzalloc(sizeof(struct btt), GFP_KERNEL); + if (!btt) + return NULL; + + btt->nd_btt = nd_btt; + btt->rawsize = rawsize; + btt->lbasize = lbasize; + btt->sector_size = ((lbasize >= 4096) ? 4096 : 512); + INIT_LIST_HEAD(&btt->arena_list); + mutex_init(&btt->init_lock); + btt->nd_region = nd_region; + + ret = discover_arenas(btt); + if (ret) { + dev_err(dev, "init: error in arena_discover: %d\n", ret); + goto out_free; + } + + if (btt->init_state != INIT_READY && nd_region->ro) { + dev_info(dev, "%s is read-only, unable to init btt metadata\n", + dev_name(&nd_region->dev)); + goto out_free; + } else if (btt->init_state != INIT_READY) { + btt->num_arenas = (rawsize / ARENA_MAX_SIZE) + + ((rawsize % ARENA_MAX_SIZE) ? 1 : 0); + dev_dbg(dev, "init: %d arenas for %llu rawsize\n", + btt->num_arenas, rawsize); + + ret = create_arenas(btt); + if (ret) { + dev_info(dev, "init: create_arenas: %d\n", ret); + goto out_free; + } + + ret = btt_meta_init(btt); + if (ret) { + dev_err(dev, "init: error in meta_init: %d\n", ret); + goto out_free; + } + } + + ret = btt_blk_init(btt); + if (ret) { + dev_err(dev, "init: error in blk_init: %d\n", ret); + goto out_free; + } + + btt_debugfs_init(btt); + + return btt; + + out_free: + kfree(btt); + return NULL; +} + +/** + * btt_fini - de-initialize a BTT + * @btt: the BTT handle that was generated by btt_init + * + * De-initialize a Block Translation Table on device removal + * + * Context: + * Might sleep. 
+ */ +static void btt_fini(struct btt *btt) +{ + if (btt) { + btt_blk_cleanup(btt); + free_arenas(btt); + debugfs_remove_recursive(btt->debugfs_dir); + kfree(btt); + } +} + +int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct nd_region *nd_region; + struct btt *btt; + size_t rawsize; + + if (!nd_btt->uuid || !nd_btt->ndns || !nd_btt->lbasize) + return -ENODEV; + + rawsize = nvdimm_namespace_capacity(ndns) - SZ_4K; + if (rawsize < ARENA_MIN_SIZE) { + return -ENXIO; + } + nd_region = to_nd_region(nd_btt->dev.parent); + btt = btt_init(nd_btt, rawsize, nd_btt->lbasize, nd_btt->uuid, + nd_region); + if (!btt) + return -ENOMEM; + nd_btt->btt = btt; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_attach_btt); + +int nvdimm_namespace_detach_btt(struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt = to_nd_btt(ndns->claim); + struct btt *btt = nd_btt->btt; + + btt_fini(btt); + nd_btt->btt = NULL; + + return 0; +} +EXPORT_SYMBOL(nvdimm_namespace_detach_btt); + +static int __init nd_btt_init(void) +{ + int rc; + + btt_major = register_blkdev(0, "btt"); + if (btt_major < 0) + return btt_major; + + debugfs_root = debugfs_create_dir("btt", NULL); + if (IS_ERR_OR_NULL(debugfs_root)) { + rc = -ENXIO; + goto err_debugfs; + } + + return 0; + + err_debugfs: + unregister_blkdev(btt_major, "btt"); + + return rc; +} + +static void __exit nd_btt_exit(void) +{ + debugfs_remove_recursive(debugfs_root); + unregister_blkdev(btt_major, "btt"); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_BTT); +MODULE_AUTHOR("Vishal Verma <vishal.l.verma@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); +module_init(nd_btt_init); +module_exit(nd_btt_exit); diff --git a/kernel/drivers/nvdimm/btt.h b/kernel/drivers/nvdimm/btt.h new file mode 100644 index 000000000..b2f8651e5 --- /dev/null +++ b/kernel/drivers/nvdimm/btt.h @@ -0,0 +1,188 @@ +/* + * Block Translation Table library + * Copyright (c) 2014-2015, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _LINUX_BTT_H +#define _LINUX_BTT_H + +#include <linux/types.h> + +#define BTT_SIG_LEN 16 +#define BTT_SIG "BTT_ARENA_INFO\0" +#define MAP_ENT_SIZE 4 +#define MAP_TRIM_SHIFT 31 +#define MAP_TRIM_MASK (1 << MAP_TRIM_SHIFT) +#define MAP_ERR_SHIFT 30 +#define MAP_ERR_MASK (1 << MAP_ERR_SHIFT) +#define MAP_LBA_MASK (~((1 << MAP_TRIM_SHIFT) | (1 << MAP_ERR_SHIFT))) +#define MAP_ENT_NORMAL 0xC0000000 +#define LOG_ENT_SIZE sizeof(struct log_entry) +#define ARENA_MIN_SIZE (1UL << 24) /* 16 MB */ +#define ARENA_MAX_SIZE (1ULL << 39) /* 512 GB */ +#define RTT_VALID (1UL << 31) +#define RTT_INVALID 0 +#define BTT_PG_SIZE 4096 +#define BTT_DEFAULT_NFREE ND_MAX_LANES +#define LOG_SEQ_INIT 1 + +#define IB_FLAG_ERROR 0x00000001 +#define IB_FLAG_ERROR_MASK 0x00000001 + +enum btt_init_state { + INIT_UNCHECKED = 0, + INIT_NOTFOUND, + INIT_READY +}; + +struct log_entry { + __le32 lba; + __le32 old_map; + __le32 new_map; + __le32 seq; + __le64 padding[2]; +}; + +struct btt_sb { + u8 signature[BTT_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le32 external_lbasize; + __le32 external_nlba; + __le32 internal_lbasize; + __le32 internal_nlba; + __le32 nfree; + __le32 infosize; + __le64 nextoff; + __le64 dataoff; + __le64 mapoff; + __le64 logoff; + __le64 info2off; + u8 padding[3968]; + __le64 checksum; +}; + +struct free_entry { + u32 block; + u8 sub; + u8 seq; +}; + +struct aligned_lock { + union { + spinlock_t lock; + u8 cacheline_padding[L1_CACHE_BYTES]; 
+ }; +}; + +/** + * struct arena_info - handle for an arena + * @size: Size in bytes this arena occupies on the raw device. + * This includes arena metadata. + * @external_lba_start: The first external LBA in this arena. + * @internal_nlba: Number of internal blocks available in the arena + * including nfree reserved blocks + * @internal_lbasize: Internal and external lba sizes may be different as + * we can round up 'odd' external lbasizes such as 520B + * to be aligned. + * @external_nlba: Number of blocks contributed by the arena to the number + * reported to upper layers. (internal_nlba - nfree) + * @external_lbasize: LBA size as exposed to upper layers. + * @nfree: A reserve number of 'free' blocks that is used to + * handle incoming writes. + * @version_major: Metadata layout version major. + * @version_minor: Metadata layout version minor. + * @nextoff: Offset in bytes to the start of the next arena. + * @infooff: Offset in bytes to the info block of this arena. + * @dataoff: Offset in bytes to the data area of this arena. + * @mapoff: Offset in bytes to the map area of this arena. + * @logoff: Offset in bytes to the log area of this arena. + * @info2off: Offset in bytes to the backup info block of this arena. + * @freelist: Pointer to in-memory list of free blocks + * @rtt: Pointer to in-memory "Read Tracking Table" + * @map_locks: Spinlocks protecting concurrent map writes + * @nd_btt: Pointer to parent nd_btt structure. + * @list: List head for list of arenas + * @debugfs_dir: Debugfs dentry + * @flags: Arena flags - may signify error states. + * + * arena_info is a per-arena handle. Once an arena is narrowed down for an + * IO, this struct is passed around for the duration of the IO. 
+ */ +struct arena_info { + u64 size; /* Total bytes for this arena */ + u64 external_lba_start; + u32 internal_nlba; + u32 internal_lbasize; + u32 external_nlba; + u32 external_lbasize; + u32 nfree; + u16 version_major; + u16 version_minor; + /* Byte offsets to the different on-media structures */ + u64 nextoff; + u64 infooff; + u64 dataoff; + u64 mapoff; + u64 logoff; + u64 info2off; + /* Pointers to other in-memory structures for this arena */ + struct free_entry *freelist; + u32 *rtt; + struct aligned_lock *map_locks; + struct nd_btt *nd_btt; + struct list_head list; + struct dentry *debugfs_dir; + /* Arena flags */ + u32 flags; +}; + +/** + * struct btt - handle for a BTT instance + * @btt_disk: Pointer to the gendisk for BTT device + * @btt_queue: Pointer to the request queue for the BTT device + * @arena_list: Head of the list of arenas + * @debugfs_dir: Debugfs dentry + * @nd_btt: Parent nd_btt struct + * @nlba: Number of logical blocks exposed to the upper layers + * after removing the amount of space needed by metadata + * @rawsize: Total size in bytes of the available backing device + * @lbasize: LBA size as requested and presented to upper layers. + * This is sector_size + size of any metadata. 
+ * @sector_size: The Linux sector size - 512 or 4096 + * @nd_region: Region this BTT belongs to; its lanes are acquired for each IO + * @init_lock: Mutex used for the BTT initialization + * @init_state: Flag describing the initialization state for the BTT + * @num_arenas: Number of arenas in the BTT instance + */ +struct btt { + struct gendisk *btt_disk; + struct request_queue *btt_queue; + struct list_head arena_list; + struct dentry *debugfs_dir; + struct nd_btt *nd_btt; + u64 nlba; + unsigned long long rawsize; + u32 lbasize; + u32 sector_size; + struct nd_region *nd_region; + struct mutex init_lock; + int init_state; + int num_arenas; +}; + +bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super); + +#endif diff --git a/kernel/drivers/nvdimm/btt_devs.c b/kernel/drivers/nvdimm/btt_devs.c new file mode 100644 index 000000000..cb477518d --- /dev/null +++ b/kernel/drivers/nvdimm/btt_devs.c @@ -0,0 +1,306 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/genhd.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "btt.h" +#include "nd.h" + +static void nd_btt_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_btt *nd_btt = to_nd_btt(dev); + + dev_dbg(dev, "%s\n", __func__); + nd_detach_ndns(&nd_btt->dev, &nd_btt->ndns); + ida_simple_remove(&nd_region->btt_ida, nd_btt->id); + kfree(nd_btt->uuid); + kfree(nd_btt); +} + +static struct device_type nd_btt_device_type = { + .name = "nd_btt", + .release = nd_btt_release, +}; + +bool is_nd_btt(struct device *dev) +{ + return dev->type == &nd_btt_device_type; +} +EXPORT_SYMBOL(is_nd_btt); + +struct nd_btt *to_nd_btt(struct device *dev) +{ + struct nd_btt *nd_btt = container_of(dev, struct nd_btt, dev); + + WARN_ON(!is_nd_btt(dev)); + return nd_btt; +} +EXPORT_SYMBOL(to_nd_btt); + +static const unsigned long btt_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + return nd_sector_size_show(nd_btt->lbasize, btt_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_sector_size_store(dev, buf, &nd_btt->lbasize, + btt_lbasize_supported); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? 
rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + + if (nd_btt->uuid) + return sprintf(buf, "%pUb\n", nd_btt->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_btt->uuid, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_btt->ndns + ? dev_name(&nd_btt->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_btt *nd_btt = to_nd_btt(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_namespace_store(dev, &nd_btt->ndns, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? 
"" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static struct attribute *nd_btt_attributes[] = { + &dev_attr_sector_size.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + NULL, +}; + +static struct attribute_group nd_btt_attribute_group = { + .attrs = nd_btt_attributes, +}; + +static const struct attribute_group *nd_btt_attribute_groups[] = { + &nd_btt_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static struct device *__nd_btt_create(struct nd_region *nd_region, + unsigned long lbasize, u8 *uuid, + struct nd_namespace_common *ndns) +{ + struct nd_btt *nd_btt; + struct device *dev; + + nd_btt = kzalloc(sizeof(*nd_btt), GFP_KERNEL); + if (!nd_btt) + return NULL; + + nd_btt->id = ida_simple_get(&nd_region->btt_ida, 0, 0, GFP_KERNEL); + if (nd_btt->id < 0) { + kfree(nd_btt); + return NULL; + } + + nd_btt->lbasize = lbasize; + if (uuid) + uuid = kmemdup(uuid, 16, GFP_KERNEL); + nd_btt->uuid = uuid; + dev = &nd_btt->dev; + dev_set_name(dev, "btt%d.%d", nd_region->id, nd_btt->id); + dev->parent = &nd_region->dev; + dev->type = &nd_btt_device_type; + dev->groups = nd_btt_attribute_groups; + device_initialize(&nd_btt->dev); + if (ndns && !__nd_attach_ndns(&nd_btt->dev, ndns, &nd_btt->ndns)) { + dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n", + __func__, dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; +} + +struct device *nd_btt_create(struct nd_region *nd_region) +{ + struct device *dev = __nd_btt_create(nd_region, 0, NULL, NULL); + + if (dev) + __nd_device_register(dev); + return dev; +} + +static bool uuid_is_null(u8 *uuid) +{ + static const u8 null_uuid[16]; + + return (memcmp(uuid, null_uuid, 16) == 0); +} + +/** + * nd_btt_arena_is_valid - check if the metadata layout is valid + * @nd_btt: device with BTT geometry and backing device info + * @super: pointer to the arena's info block being tested + * + * Check 
consistency of the btt info block with itself by validating + * the checksum, and with the parent namespace by verifying the + * parent_uuid contained in the info block with the one supplied in. + * + * Returns: + * false for an invalid info block, true for a valid one + */ +bool nd_btt_arena_is_valid(struct nd_btt *nd_btt, struct btt_sb *super) +{ + const u8 *parent_uuid = nd_dev_to_uuid(&nd_btt->ndns->dev); + u64 checksum; + + if (memcmp(super->signature, BTT_SIG, BTT_SIG_LEN) != 0) + return false; + + if (!uuid_is_null(super->parent_uuid)) + if (memcmp(super->parent_uuid, parent_uuid, 16) != 0) + return false; + + checksum = le64_to_cpu(super->checksum); + super->checksum = 0; + if (checksum != nd_sb_checksum((struct nd_gen_sb *) super)) + return false; + super->checksum = cpu_to_le64(checksum); + + /* TODO: figure out action for this */ + if ((le32_to_cpu(super->flags) & IB_FLAG_ERROR_MASK) != 0) + dev_info(&nd_btt->dev, "Found arena with an error flag\n"); + + return true; +} +EXPORT_SYMBOL(nd_btt_arena_is_valid); + +static int __nd_btt_probe(struct nd_btt *nd_btt, + struct nd_namespace_common *ndns, struct btt_sb *btt_sb) +{ + if (!btt_sb || !ndns || !nd_btt) + return -ENODEV; + + if (nvdimm_read_bytes(ndns, SZ_4K, btt_sb, sizeof(*btt_sb))) + return -ENXIO; + + if (nvdimm_namespace_capacity(ndns) < SZ_16M) + return -ENXIO; + + if (!nd_btt_arena_is_valid(nd_btt, btt_sb)) + return -ENODEV; + + nd_btt->lbasize = le32_to_cpu(btt_sb->external_lbasize); + nd_btt->uuid = kmemdup(btt_sb->uuid, 16, GFP_KERNEL); + if (!nd_btt->uuid) + return -ENOMEM; + + __nd_device_register(&nd_btt->dev); + + return 0; +} + +int nd_btt_probe(struct nd_namespace_common *ndns, void *drvdata) +{ + int rc; + struct device *dev; + struct btt_sb *btt_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + nvdimm_bus_lock(&ndns->dev); + dev = __nd_btt_create(nd_region, 0, NULL, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dev) + 
return -ENOMEM; + dev_set_drvdata(dev, drvdata); + btt_sb = kzalloc(sizeof(*btt_sb), GFP_KERNEL); + rc = __nd_btt_probe(to_nd_btt(dev), ndns, btt_sb); + kfree(btt_sb); + dev_dbg(&ndns->dev, "%s: btt: %s\n", __func__, + rc == 0 ? dev_name(dev) : "<none>"); + if (rc < 0) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + __nd_detach_ndns(dev, &nd_btt->ndns); + put_device(dev); + } + + return rc; +} +EXPORT_SYMBOL(nd_btt_probe); diff --git a/kernel/drivers/nvdimm/bus.c b/kernel/drivers/nvdimm/bus.c new file mode 100644 index 000000000..7e2c43f70 --- /dev/null +++ b/kernel/drivers/nvdimm/bus.c @@ -0,0 +1,725 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/vmalloc.h> +#include <linux/uaccess.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/fcntl.h> +#include <linux/async.h> +#include <linux/genhd.h> +#include <linux/ndctl.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/io.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +int nvdimm_major; +static int nvdimm_bus_major; +static struct class *nd_class; + +static int to_nd_device_type(struct device *dev) +{ + if (is_nvdimm(dev)) + return ND_DEVICE_DIMM; + else if (is_nd_pmem(dev)) + return ND_DEVICE_REGION_PMEM; + else if (is_nd_blk(dev)) + return ND_DEVICE_REGION_BLK; + else if (is_nd_pmem(dev->parent) || is_nd_blk(dev->parent)) + return nd_region_to_nstype(to_nd_region(dev->parent)); + + return 0; +} + +static int nvdimm_bus_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + /* + * Ensure that region devices always have their numa node set as + * early as possible. 
+ */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + set_dev_node(dev, to_nd_region(dev)->numa_node); + return add_uevent_var(env, "MODALIAS=" ND_DEVICE_MODALIAS_FMT, + to_nd_device_type(dev)); +} + +static int nvdimm_bus_match(struct device *dev, struct device_driver *drv) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(drv); + + return test_bit(to_nd_device_type(dev), &nd_drv->type); +} + +static struct module *to_bus_provider(struct device *dev) +{ + /* pin bus providers while regions are enabled */ + if (is_nd_pmem(dev) || is_nd_blk(dev)) { + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + return nvdimm_bus->module; + } + return NULL; +} + +static void nvdimm_bus_probe_start(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + nvdimm_bus->probe_active++; + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static void nvdimm_bus_probe_end(struct nvdimm_bus *nvdimm_bus) +{ + nvdimm_bus_lock(&nvdimm_bus->dev); + if (--nvdimm_bus->probe_active == 0) + wake_up(&nvdimm_bus->probe_wait); + nvdimm_bus_unlock(&nvdimm_bus->dev); +} + +static int nvdimm_bus_probe(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + if (!try_module_get(provider)) + return -ENXIO; + + nvdimm_bus_probe_start(nvdimm_bus); + rc = nd_drv->probe(dev); + if (rc == 0) + nd_region_probe_success(nvdimm_bus, dev); + else + nd_region_disable(nvdimm_bus, dev); + nvdimm_bus_probe_end(nvdimm_bus); + + dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + + if (rc != 0) + module_put(provider); + return rc; +} + +static int nvdimm_bus_remove(struct device *dev) +{ + struct nd_device_driver *nd_drv = to_nd_device_driver(dev->driver); + struct module *provider = to_bus_provider(dev); + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + int rc; + + rc = 
nd_drv->remove(dev); + nd_region_disable(nvdimm_bus, dev); + + dev_dbg(&nvdimm_bus->dev, "%s.remove(%s) = %d\n", dev->driver->name, + dev_name(dev), rc); + module_put(provider); + return rc; +} + +static struct bus_type nvdimm_bus_type = { + .name = "nd", + .uevent = nvdimm_bus_uevent, + .match = nvdimm_bus_match, + .probe = nvdimm_bus_probe, + .remove = nvdimm_bus_remove, +}; + +static ASYNC_DOMAIN_EXCLUSIVE(nd_async_domain); + +void nd_synchronize(void) +{ + async_synchronize_full_domain(&nd_async_domain); +} +EXPORT_SYMBOL_GPL(nd_synchronize); + +static void nd_async_device_register(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + if (device_add(dev) != 0) { + dev_err(dev, "%s: failed\n", __func__); + put_device(dev); + } + put_device(dev); +} + +static void nd_async_device_unregister(void *d, async_cookie_t cookie) +{ + struct device *dev = d; + + /* flush bus operations before delete */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + + device_unregister(dev); + put_device(dev); +} + +void __nd_device_register(struct device *dev) +{ + dev->bus = &nvdimm_bus_type; + get_device(dev); + async_schedule_domain(nd_async_device_register, dev, + &nd_async_domain); +} + +void nd_device_register(struct device *dev) +{ + device_initialize(dev); + __nd_device_register(dev); +} +EXPORT_SYMBOL(nd_device_register); + +void nd_device_unregister(struct device *dev, enum nd_async_mode mode) +{ + switch (mode) { + case ND_ASYNC: + get_device(dev); + async_schedule_domain(nd_async_device_unregister, dev, + &nd_async_domain); + break; + case ND_SYNC: + nd_synchronize(); + device_unregister(dev); + break; + } +} +EXPORT_SYMBOL(nd_device_unregister); + +/** + * __nd_driver_register() - register a region or a namespace driver + * @nd_drv: driver to register + * @owner: automatically set by nd_driver_register() macro + * @mod_name: automatically set by nd_driver_register() macro + */ +int __nd_driver_register(struct nd_device_driver *nd_drv, struct module *owner, + 
const char *mod_name) +{ + struct device_driver *drv = &nd_drv->drv; + + if (!nd_drv->type) { + pr_debug("driver type bitmask not set (%pf)\n", + __builtin_return_address(0)); + return -EINVAL; + } + + if (!nd_drv->probe || !nd_drv->remove) { + pr_debug("->probe() and ->remove() must be specified\n"); + return -EINVAL; + } + + drv->bus = &nvdimm_bus_type; + drv->owner = owner; + drv->mod_name = mod_name; + + return driver_register(drv); +} +EXPORT_SYMBOL(__nd_driver_register); + +int nvdimm_revalidate_disk(struct gendisk *disk) +{ + struct device *dev = disk->driverfs_dev; + struct nd_region *nd_region = to_nd_region(dev->parent); + const char *pol = nd_region->ro ? "only" : "write"; + + if (nd_region->ro == get_disk_ro(disk)) + return 0; + + dev_info(dev, "%s read-%s, marking %s read-%s\n", + dev_name(&nd_region->dev), pol, disk->disk_name, pol); + set_disk_ro(disk, nd_region->ro); + + return 0; + +} +EXPORT_SYMBOL(nvdimm_revalidate_disk); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, ND_DEVICE_MODALIAS_FMT "\n", + to_nd_device_type(dev)); +} +static DEVICE_ATTR_RO(modalias); + +static ssize_t devtype_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", dev->type->name); +} +static DEVICE_ATTR_RO(devtype); + +static struct attribute *nd_device_attributes[] = { + &dev_attr_modalias.attr, + &dev_attr_devtype.attr, + NULL, +}; + +/** + * nd_device_attribute_group - generic attributes for all devices on an nd bus + */ +struct attribute_group nd_device_attribute_group = { + .attrs = nd_device_attributes, +}; +EXPORT_SYMBOL_GPL(nd_device_attribute_group); + +static ssize_t numa_node_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", dev_to_node(dev)); +} +static DEVICE_ATTR_RO(numa_node); + +static struct attribute *nd_numa_attributes[] = { + &dev_attr_numa_node.attr, + NULL, +}; + +static umode_t 
nd_numa_attr_visible(struct kobject *kobj, struct attribute *a, + int n) +{ + if (!IS_ENABLED(CONFIG_NUMA)) + return 0; + + return a->mode; +} + +/** + * nd_numa_attribute_group - NUMA attributes for all devices on an nd bus + */ +struct attribute_group nd_numa_attribute_group = { + .attrs = nd_numa_attributes, + .is_visible = nd_numa_attr_visible, +}; +EXPORT_SYMBOL_GPL(nd_numa_attribute_group); + +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + dev_t devt = MKDEV(nvdimm_bus_major, nvdimm_bus->id); + struct device *dev; + + dev = device_create(nd_class, &nvdimm_bus->dev, devt, nvdimm_bus, + "ndctl%d", nvdimm_bus->id); + + if (IS_ERR(dev)) { + dev_dbg(&nvdimm_bus->dev, "failed to register ndctl%d: %ld\n", + nvdimm_bus->id, PTR_ERR(dev)); + return PTR_ERR(dev); + } + return 0; +} + +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus) +{ + device_destroy(nd_class, MKDEV(nvdimm_bus_major, nvdimm_bus->id)); +} + +static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = { + [ND_CMD_IMPLEMENTED] = { }, + [ND_CMD_SMART] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_SMART_THRESHOLD] = { + .out_num = 2, + .out_sizes = { 4, 8, }, + }, + [ND_CMD_DIMM_FLAGS] = { + .out_num = 2, + .out_sizes = { 4, 4 }, + }, + [ND_CMD_GET_CONFIG_SIZE] = { + .out_num = 3, + .out_sizes = { 4, 4, 4, }, + }, + [ND_CMD_GET_CONFIG_DATA] = { + .in_num = 2, + .in_sizes = { 4, 4, }, + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, + [ND_CMD_SET_CONFIG_DATA] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_VENDOR] = { + .in_num = 3, + .in_sizes = { 4, 4, UINT_MAX, }, + .out_num = 3, + .out_sizes = { 4, 4, UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_dimm_descs)) + return &__nd_cmd_dimm_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_dimm_desc); + +static const struct nd_cmd_desc __nd_cmd_bus_descs[] = { + [ND_CMD_IMPLEMENTED] = { 
}, + [ND_CMD_ARS_CAP] = { + .in_num = 2, + .in_sizes = { 8, 8, }, + .out_num = 2, + .out_sizes = { 4, 4, }, + }, + [ND_CMD_ARS_START] = { + .in_num = 4, + .in_sizes = { 8, 8, 2, 6, }, + .out_num = 1, + .out_sizes = { 4, }, + }, + [ND_CMD_ARS_STATUS] = { + .out_num = 2, + .out_sizes = { 4, UINT_MAX, }, + }, +}; + +const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd) +{ + if (cmd < ARRAY_SIZE(__nd_cmd_bus_descs)) + return &__nd_cmd_bus_descs[cmd]; + return NULL; +} +EXPORT_SYMBOL_GPL(nd_cmd_bus_desc); + +u32 nd_cmd_in_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, void *buf) +{ + if (idx >= desc->in_num) + return UINT_MAX; + + if (desc->in_sizes[idx] < UINT_MAX) + return desc->in_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA && idx == 2) { + struct nd_cmd_set_config_hdr *hdr = buf; + + return hdr->in_length; + } else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) { + struct nd_cmd_vendor_hdr *hdr = buf; + + return hdr->in_length; + } + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_in_size); + +u32 nd_cmd_out_size(struct nvdimm *nvdimm, int cmd, + const struct nd_cmd_desc *desc, int idx, const u32 *in_field, + const u32 *out_field) +{ + if (idx >= desc->out_num) + return UINT_MAX; + + if (desc->out_sizes[idx] < UINT_MAX) + return desc->out_sizes[idx]; + + if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && idx == 1) + return in_field[1]; + else if (nvdimm && cmd == ND_CMD_VENDOR && idx == 2) + return out_field[1]; + else if (!nvdimm && cmd == ND_CMD_ARS_STATUS && idx == 1) + return ND_CMD_ARS_STATUS_MAX; + + return UINT_MAX; +} +EXPORT_SYMBOL_GPL(nd_cmd_out_size); + +void wait_nvdimm_bus_probe_idle(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + do { + if (nvdimm_bus->probe_active == 0) + break; + nvdimm_bus_unlock(&nvdimm_bus->dev); + wait_event(nvdimm_bus->probe_wait, + nvdimm_bus->probe_active == 0); + nvdimm_bus_lock(&nvdimm_bus->dev); + } while (true); +} + +/* set_config requires an 
idle interleave set */ +static int nd_cmd_clear_to_send(struct nvdimm *nvdimm, unsigned int cmd) +{ + struct nvdimm_bus *nvdimm_bus; + + if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA) + return 0; + + nvdimm_bus = walk_to_nvdimm_bus(&nvdimm->dev); + wait_nvdimm_bus_probe_idle(&nvdimm_bus->dev); + + if (atomic_read(&nvdimm->busy)) + return -EBUSY; + return 0; +} + +static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm, + int read_only, unsigned int ioctl_cmd, unsigned long arg) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + size_t buf_len = 0, in_len = 0, out_len = 0; + static char out_env[ND_CMD_MAX_ENVELOPE]; + static char in_env[ND_CMD_MAX_ENVELOPE]; + const struct nd_cmd_desc *desc = NULL; + unsigned int cmd = _IOC_NR(ioctl_cmd); + void __user *p = (void __user *) arg; + struct device *dev = &nvdimm_bus->dev; + const char *cmd_name, *dimm_name; + unsigned long dsm_mask; + void *buf; + int rc, i; + + if (nvdimm) { + desc = nd_cmd_dimm_desc(cmd); + cmd_name = nvdimm_cmd_name(cmd); + dsm_mask = nvdimm->dsm_mask ? *(nvdimm->dsm_mask) : 0; + dimm_name = dev_name(&nvdimm->dev); + } else { + desc = nd_cmd_bus_desc(cmd); + cmd_name = nvdimm_bus_cmd_name(cmd); + dsm_mask = nd_desc->dsm_mask; + dimm_name = "bus"; + } + + if (!desc || (desc->out_num + desc->in_num == 0) || + !test_bit(cmd, &dsm_mask)) + return -ENOTTY; + + /* fail write commands (when read-only) */ + if (read_only) + switch (ioctl_cmd) { + case ND_IOCTL_VENDOR: + case ND_IOCTL_SET_CONFIG_DATA: + case ND_IOCTL_ARS_START: + dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n", + nvdimm ? 
nvdimm_cmd_name(cmd) + : nvdimm_bus_cmd_name(cmd)); + return -EPERM; + default: + break; + } + + /* process an input envelope */ + for (i = 0; i < desc->in_num; i++) { + u32 in_size, copy; + + in_size = nd_cmd_in_size(nvdimm, cmd, desc, i, in_env); + if (in_size == UINT_MAX) { + dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + return -ENXIO; + } + if (in_len < sizeof(in_env)) + copy = min_t(u32, sizeof(in_env) - in_len, in_size); + else + copy = 0; + if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) + return -EFAULT; + in_len += in_size; + } + + /* process an output envelope */ + for (i = 0; i < desc->out_num; i++) { + u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i, + (u32 *) in_env, (u32 *) out_env); + u32 copy; + + if (out_size == UINT_MAX) { + dev_dbg(dev, "%s:%s unknown output size cmd: %s field: %d\n", + __func__, dimm_name, cmd_name, i); + return -EFAULT; + } + if (out_len < sizeof(out_env)) + copy = min_t(u32, sizeof(out_env) - out_len, out_size); + else + copy = 0; + if (copy && copy_from_user(&out_env[out_len], + p + in_len + out_len, copy)) + return -EFAULT; + out_len += out_size; + } + + buf_len = out_len + in_len; + if (buf_len > ND_IOCTL_MAX_BUFLEN) { + dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__, + dimm_name, cmd_name, buf_len, + ND_IOCTL_MAX_BUFLEN); + return -EINVAL; + } + + buf = vmalloc(buf_len); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, p, buf_len)) { + rc = -EFAULT; + goto out; + } + + nvdimm_bus_lock(&nvdimm_bus->dev); + rc = nd_cmd_clear_to_send(nvdimm, cmd); + if (rc) + goto out_unlock; + + rc = nd_desc->ndctl(nd_desc, nvdimm, cmd, buf, buf_len); + if (rc < 0) + goto out_unlock; + if (copy_to_user(p, buf, buf_len)) + rc = -EFAULT; + out_unlock: + nvdimm_bus_unlock(&nvdimm_bus->dev); + out: + vfree(buf); + return rc; +} + +static long nd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long id = (long) file->private_data; + int 
rc = -ENXIO, read_only; + struct nvdimm_bus *nvdimm_bus; + + read_only = (O_RDWR != (file->f_flags & O_ACCMODE)); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + if (nvdimm_bus->id == id) { + rc = __nd_ioctl(nvdimm_bus, NULL, read_only, cmd, arg); + break; + } + } + mutex_unlock(&nvdimm_bus_list_mutex); + + return rc; +} + +static int match_dimm(struct device *dev, void *data) +{ + long id = (long) data; + + if (is_nvdimm(dev)) { + struct nvdimm *nvdimm = to_nvdimm(dev); + + return nvdimm->id == id; + } + + return 0; +} + +static long nvdimm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = -ENXIO, read_only; + struct nvdimm_bus *nvdimm_bus; + + read_only = (O_RDWR != (file->f_flags & O_ACCMODE)); + mutex_lock(&nvdimm_bus_list_mutex); + list_for_each_entry(nvdimm_bus, &nvdimm_bus_list, list) { + struct device *dev = device_find_child(&nvdimm_bus->dev, + file->private_data, match_dimm); + struct nvdimm *nvdimm; + + if (!dev) + continue; + + nvdimm = to_nvdimm(dev); + rc = __nd_ioctl(nvdimm_bus, nvdimm, read_only, cmd, arg); + put_device(dev); + break; + } + mutex_unlock(&nvdimm_bus_list_mutex); + + return rc; +} + +static int nd_open(struct inode *inode, struct file *file) +{ + long minor = iminor(inode); + + file->private_data = (void *) minor; + return 0; +} + +static const struct file_operations nvdimm_bus_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = nd_ioctl, + .compat_ioctl = nd_ioctl, + .llseek = noop_llseek, +}; + +static const struct file_operations nvdimm_fops = { + .owner = THIS_MODULE, + .open = nd_open, + .unlocked_ioctl = nvdimm_ioctl, + .compat_ioctl = nvdimm_ioctl, + .llseek = noop_llseek, +}; + +int __init nvdimm_bus_init(void) +{ + int rc; + + rc = bus_register(&nvdimm_bus_type); + if (rc) + return rc; + + rc = register_chrdev(0, "ndctl", &nvdimm_bus_fops); + if (rc < 0) + goto err_bus_chrdev; + nvdimm_bus_major = rc; + + rc = register_chrdev(0, 
"dimmctl", &nvdimm_fops); + if (rc < 0) + goto err_dimm_chrdev; + nvdimm_major = rc; + + nd_class = class_create(THIS_MODULE, "nd"); + if (IS_ERR(nd_class)) { + rc = PTR_ERR(nd_class); + goto err_class; + } + + return 0; + + err_class: + unregister_chrdev(nvdimm_major, "dimmctl"); + err_dimm_chrdev: + unregister_chrdev(nvdimm_bus_major, "ndctl"); + err_bus_chrdev: + bus_unregister(&nvdimm_bus_type); + + return rc; +} + +void nvdimm_bus_exit(void) +{ + class_destroy(nd_class); + unregister_chrdev(nvdimm_bus_major, "ndctl"); + unregister_chrdev(nvdimm_major, "dimmctl"); + bus_unregister(&nvdimm_bus_type); +} diff --git a/kernel/drivers/nvdimm/claim.c b/kernel/drivers/nvdimm/claim.c new file mode 100644 index 000000000..e8f03b0e9 --- /dev/null +++ b/kernel/drivers/nvdimm/claim.c @@ -0,0 +1,201 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/device.h> +#include <linux/sizes.h> +#include "nd-core.h" +#include "pfn.h" +#include "btt.h" +#include "nd.h" + +void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns) +{ + struct nd_namespace_common *ndns = *_ndns; + + dev_WARN_ONCE(dev, !mutex_is_locked(&ndns->dev.mutex) + || ndns->claim != dev, + "%s: invalid claim\n", __func__); + ndns->claim = NULL; + *_ndns = NULL; + put_device(&ndns->dev); +} + +void nd_detach_ndns(struct device *dev, + struct nd_namespace_common **_ndns) +{ + struct nd_namespace_common *ndns = *_ndns; + + if (!ndns) + return; + get_device(&ndns->dev); + device_lock(&ndns->dev); + __nd_detach_ndns(dev, _ndns); + device_unlock(&ndns->dev); + put_device(&ndns->dev); +} + +bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns) +{ + if (attach->claim) + return false; + dev_WARN_ONCE(dev, !mutex_is_locked(&attach->dev.mutex) + || *_ndns, + "%s: invalid claim\n", __func__); + attach->claim = dev; + *_ndns = attach; + get_device(&attach->dev); + return true; +} + +bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns) +{ + bool claimed; + + device_lock(&attach->dev); + claimed = __nd_attach_ndns(dev, attach, _ndns); + device_unlock(&attach->dev); + return claimed; +} + +static int namespace_match(struct device *dev, void *data) +{ + char *name = data; + + return strcmp(name, dev_name(dev)) == 0; +} + +static bool is_idle(struct device *dev, struct nd_namespace_common *ndns) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct device *seed = NULL; + + if (is_nd_btt(dev)) + seed = nd_region->btt_seed; + else if (is_nd_pfn(dev)) + seed = nd_region->pfn_seed; + + if (seed == dev || ndns || dev->driver) + return false; + return true; +} + +static void nd_detach_and_reset(struct device *dev, + struct nd_namespace_common **_ndns) +{ + /* detach the namespace and destroy / 
reset the device */ + nd_detach_ndns(dev, _ndns); + if (is_idle(dev, *_ndns)) { + nd_device_unregister(dev, ND_ASYNC); + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + nd_btt->lbasize = 0; + kfree(nd_btt->uuid); + nd_btt->uuid = NULL; + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + kfree(nd_pfn->uuid); + nd_pfn->uuid = NULL; + nd_pfn->mode = PFN_MODE_NONE; + } +} + +ssize_t nd_namespace_store(struct device *dev, + struct nd_namespace_common **_ndns, const char *buf, + size_t len) +{ + struct nd_namespace_common *ndns; + struct device *found; + char *name; + + if (dev->driver) { + dev_dbg(dev, "%s: -EBUSY\n", __func__); + return -EBUSY; + } + + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + strim(name); + + if (strncmp(name, "namespace", 9) == 0 || strcmp(name, "") == 0) + /* pass */; + else { + len = -EINVAL; + goto out; + } + + ndns = *_ndns; + if (strcmp(name, "") == 0) { + nd_detach_and_reset(dev, _ndns); + goto out; + } else if (ndns) { + dev_dbg(dev, "namespace already set to: %s\n", + dev_name(&ndns->dev)); + len = -EBUSY; + goto out; + } + + found = device_find_child(dev->parent, name, namespace_match); + if (!found) { + dev_dbg(dev, "'%s' not found under %s\n", name, + dev_name(dev->parent)); + len = -ENODEV; + goto out; + } + + ndns = to_ndns(found); + if (__nvdimm_namespace_capacity(ndns) < SZ_16M) { + dev_dbg(dev, "%s too small to host\n", name); + len = -ENXIO; + goto out_attach; + } + + WARN_ON_ONCE(!is_nvdimm_bus_locked(dev)); + if (!nd_attach_ndns(dev, ndns, _ndns)) { + dev_dbg(dev, "%s already claimed\n", + dev_name(&ndns->dev)); + len = -EBUSY; + } + + out_attach: + put_device(&ndns->dev); /* from device_find_child */ + out: + kfree(name); + return len; +} + +/* + * nd_sb_checksum: compute checksum for a generic info block + * + * Returns a fletcher64 checksum of everything in the given info block + * except the last field (since that's where the checksum lives). 
+ */ +u64 nd_sb_checksum(struct nd_gen_sb *nd_gen_sb) +{ + u64 sum; + __le64 sum_save; + + BUILD_BUG_ON(sizeof(struct btt_sb) != SZ_4K); + BUILD_BUG_ON(sizeof(struct nd_pfn_sb) != SZ_4K); + BUILD_BUG_ON(sizeof(struct nd_gen_sb) != SZ_4K); + + sum_save = nd_gen_sb->checksum; + nd_gen_sb->checksum = 0; + sum = nd_fletcher64(nd_gen_sb, sizeof(*nd_gen_sb), 1); + nd_gen_sb->checksum = sum_save; + return sum; +} +EXPORT_SYMBOL(nd_sb_checksum); diff --git a/kernel/drivers/nvdimm/core.c b/kernel/drivers/nvdimm/core.c new file mode 100644 index 000000000..82c49bb87 --- /dev/null +++ b/kernel/drivers/nvdimm/core.c @@ -0,0 +1,454 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/libnvdimm.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/ctype.h> +#include <linux/ndctl.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include "nd-core.h" +#include "nd.h" + +LIST_HEAD(nvdimm_bus_list); +DEFINE_MUTEX(nvdimm_bus_list_mutex); +static DEFINE_IDA(nd_ida); + +void nvdimm_bus_lock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_lock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_lock); + +void nvdimm_bus_unlock(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return; + mutex_unlock(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(nvdimm_bus_unlock); + +bool is_nvdimm_bus_locked(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + return mutex_is_locked(&nvdimm_bus->reconfig_mutex); +} +EXPORT_SYMBOL(is_nvdimm_bus_locked); + +u64 nd_fletcher64(void *addr, size_t len, bool le) +{ + u32 *buf = addr; + u32 lo32 = 0; + u64 hi32 = 0; + int i; + + for (i = 0; i < len / sizeof(u32); i++) { + lo32 += le ? 
le32_to_cpu((__le32) buf[i]) : buf[i]; + hi32 += lo32; + } + + return hi32 << 32 | lo32; +} +EXPORT_SYMBOL_GPL(nd_fletcher64); + +static void nvdimm_bus_release(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + ida_simple_remove(&nd_ida, nvdimm_bus->id); + kfree(nvdimm_bus); +} + +struct nvdimm_bus *to_nvdimm_bus(struct device *dev) +{ + struct nvdimm_bus *nvdimm_bus; + + nvdimm_bus = container_of(dev, struct nvdimm_bus, dev); + WARN_ON(nvdimm_bus->dev.release != nvdimm_bus_release); + return nvdimm_bus; +} +EXPORT_SYMBOL_GPL(to_nvdimm_bus); + +struct nvdimm_bus_descriptor *to_nd_desc(struct nvdimm_bus *nvdimm_bus) +{ + /* struct nvdimm_bus definition is private to libnvdimm */ + return nvdimm_bus->nd_desc; +} +EXPORT_SYMBOL_GPL(to_nd_desc); + +struct nvdimm_bus *walk_to_nvdimm_bus(struct device *nd_dev) +{ + struct device *dev; + + for (dev = nd_dev; dev; dev = dev->parent) + if (dev->release == nvdimm_bus_release) + break; + dev_WARN_ONCE(nd_dev, !dev, "invalid dev, not on nd bus\n"); + if (dev) + return to_nvdimm_bus(dev); + return NULL; +} + +static bool is_uuid_sep(char sep) +{ + if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0') + return true; + return false; +} + +/* + * nd_uuid_parse: decode 16 hex-encoded bytes from @buf into @uuid_out + * + * Each byte is two hex digits, optionally followed by one separator + * character. @uuid_out is written only after all 16 bytes have parsed; + * on a malformed digit pair -EINVAL is returned and @uuid_out is left + * untouched. Returns 0 on success. + */ +static int nd_uuid_parse(struct device *dev, u8 *uuid_out, const char *buf, + size_t len) +{ + const char *str = buf; + u8 uuid[16]; + int i; + + for (i = 0; i < 16; i++) { + if (!isxdigit(str[0]) || !isxdigit(str[1])) { + dev_dbg(dev, "%s: pos: %d buf[%zd]: %c buf[%zd]: %c\n", + __func__, i, str - buf, str[0], + str + 1 - buf, str[1]); + return -EINVAL; + } + + uuid[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]); + str += 2; + /* + * Skip one separator, but never step over the terminating + * NUL: is_uuid_sep() accepts '\0', and advancing past it + * would make the next iteration read beyond the string. + */ + if (*str != '\0' && is_uuid_sep(*str)) + str++; + } + + memcpy(uuid_out, uuid, sizeof(uuid)); + return 0; +} + +/** + * nd_uuid_store: common implementation for writing 'uuid' sysfs attributes + * @dev: container device for the uuid property + * @uuid_out: uuid buffer to replace + * @buf: raw sysfs buffer to 
parse + * + * Enforce that uuids can only be changed while the device is disabled + * (driver detached) + * LOCKING: expects device_lock() is held on entry + */ +int nd_uuid_store(struct device *dev, u8 **uuid_out, const char *buf, + size_t len) +{ + u8 uuid[16]; + int rc; + + if (dev->driver) + return -EBUSY; + + rc = nd_uuid_parse(dev, uuid, buf, len); + if (rc) + return rc; + + kfree(*uuid_out); + *uuid_out = kmemdup(uuid, sizeof(uuid), GFP_KERNEL); + if (!(*uuid_out)) + return -ENOMEM; + + return 0; +} + +ssize_t nd_sector_size_show(unsigned long current_lbasize, + const unsigned long *supported, char *buf) +{ + ssize_t len = 0; + int i; + + for (i = 0; supported[i]; i++) + if (current_lbasize == supported[i]) + len += sprintf(buf + len, "[%ld] ", supported[i]); + else + len += sprintf(buf + len, "%ld ", supported[i]); + len += sprintf(buf + len, "\n"); + return len; +} + +ssize_t nd_sector_size_store(struct device *dev, const char *buf, + unsigned long *current_lbasize, const unsigned long *supported) +{ + unsigned long lbasize; + int rc, i; + + if (dev->driver) + return -EBUSY; + + rc = kstrtoul(buf, 0, &lbasize); + if (rc) + return rc; + + for (i = 0; supported[i]; i++) + if (lbasize == supported[i]) + break; + + if (supported[i]) { + *current_lbasize = lbasize; + return 0; + } else { + return -EINVAL; + } +} + +void __nd_iostat_start(struct bio *bio, unsigned long *start) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + const int rw = bio_data_dir(bio); + int cpu = part_stat_lock(); + + *start = jiffies; + part_round_stats(cpu, &disk->part0); + part_stat_inc(cpu, &disk->part0, ios[rw]); + part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio)); + part_inc_in_flight(&disk->part0, rw); + part_stat_unlock(); +} +EXPORT_SYMBOL(__nd_iostat_start); + +void nd_iostat_end(struct bio *bio, unsigned long start) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + unsigned long duration = jiffies - start; + const int rw = bio_data_dir(bio); + int cpu = 
part_stat_lock(); + + part_stat_add(cpu, &disk->part0, ticks[rw], duration); + part_round_stats(cpu, &disk->part0); + part_dec_in_flight(&disk->part0, rw); + part_stat_unlock(); +} +EXPORT_SYMBOL(nd_iostat_end); + +static ssize_t commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int cmd, len = 0; + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + for_each_set_bit(cmd, &nd_desc->dsm_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_bus_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static const char *nvdimm_bus_provider(struct nvdimm_bus *nvdimm_bus) +{ + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + struct device *parent = nvdimm_bus->dev.parent; + + if (nd_desc->provider_name) + return nd_desc->provider_name; + else if (parent) + return dev_name(parent); + else + return "unknown"; +} + +static ssize_t provider_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); + + return sprintf(buf, "%s\n", nvdimm_bus_provider(nvdimm_bus)); +} +static DEVICE_ATTR_RO(provider); + +static int flush_namespaces(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + return 0; +} + +static int flush_regions_dimms(struct device *dev, void *data) +{ + device_lock(dev); + device_unlock(dev); + device_for_each_child(dev, NULL, flush_namespaces); + return 0; +} + +static ssize_t wait_probe_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + nd_synchronize(); + device_for_each_child(dev, NULL, flush_regions_dimms); + return sprintf(buf, "1\n"); +} +static DEVICE_ATTR_RO(wait_probe); + +static struct attribute *nvdimm_bus_attributes[] = { + &dev_attr_commands.attr, + &dev_attr_wait_probe.attr, + &dev_attr_provider.attr, + NULL, +}; + +struct attribute_group nvdimm_bus_attribute_group = 
{ + .attrs = nvdimm_bus_attributes, +}; +EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); + +/** + * __nvdimm_bus_register() - allocate and register an nvdimm bus device + * @parent: parent assigned to the new bus device + * @nd_desc: bus descriptor; its attr_groups become the device's groups + * @module: stored in nvdimm_bus->module + * + * Returns the new bus after adding it to nvdimm_bus_list, or NULL if the + * allocation, the id allocation, device_register() or + * nvdimm_bus_create_ndctl() fails. + */ +struct nvdimm_bus *__nvdimm_bus_register(struct device *parent, + struct nvdimm_bus_descriptor *nd_desc, struct module *module) +{ + struct nvdimm_bus *nvdimm_bus; + int rc; + + nvdimm_bus = kzalloc(sizeof(*nvdimm_bus), GFP_KERNEL); + if (!nvdimm_bus) + return NULL; + INIT_LIST_HEAD(&nvdimm_bus->list); + init_waitqueue_head(&nvdimm_bus->probe_wait); + nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); + mutex_init(&nvdimm_bus->reconfig_mutex); + if (nvdimm_bus->id < 0) { + kfree(nvdimm_bus); + return NULL; + } + nvdimm_bus->nd_desc = nd_desc; + nvdimm_bus->module = module; + nvdimm_bus->dev.parent = parent; + nvdimm_bus->dev.release = nvdimm_bus_release; + nvdimm_bus->dev.groups = nd_desc->attr_groups; + dev_set_name(&nvdimm_bus->dev, "ndbus%d", nvdimm_bus->id); + rc = device_register(&nvdimm_bus->dev); + if (rc) { + dev_dbg(&nvdimm_bus->dev, "registration failed: %d\n", rc); + goto err; + } + + rc = nvdimm_bus_create_ndctl(nvdimm_bus); + if (rc) { + /* + * The device was added by device_register() above, so it must + * be removed again, not just dereferenced: a bare put_device() + * here would release the bus while it is still registered. + */ + device_unregister(&nvdimm_bus->dev); + return NULL; + } + + mutex_lock(&nvdimm_bus_list_mutex); + list_add_tail(&nvdimm_bus->list, &nvdimm_bus_list); + mutex_unlock(&nvdimm_bus_list_mutex); + + return nvdimm_bus; + err: + /* device_register() failed: drop the reference so ->release frees the bus */ + put_device(&nvdimm_bus->dev); + return NULL; +} +EXPORT_SYMBOL_GPL(__nvdimm_bus_register); + +static int child_unregister(struct device *dev, void *data) +{ + /* + * the singular ndctl class device per bus needs to be + * "device_destroy"ed, so skip it here + * + * i.e. 
remove classless children + */ + if (dev->class) + /* pass */; + else + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus) +{ + if (!nvdimm_bus) + return; + + mutex_lock(&nvdimm_bus_list_mutex); + list_del_init(&nvdimm_bus->list); + mutex_unlock(&nvdimm_bus_list_mutex); + + nd_synchronize(); + device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); + nvdimm_bus_destroy_ndctl(nvdimm_bus); + + device_unregister(&nvdimm_bus->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_bus_unregister); + +#ifdef CONFIG_BLK_DEV_INTEGRITY +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + struct blk_integrity bi; + + if (meta_size == 0) + return 0; + + bi.profile = NULL; + bi.tuple_size = meta_size; + bi.tag_size = meta_size; + + blk_integrity_register(disk, &bi); + blk_queue_max_integrity_segments(disk->queue, 1); + + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#else /* CONFIG_BLK_DEV_INTEGRITY */ +int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) +{ + return 0; +} +EXPORT_SYMBOL(nd_integrity_init); + +#endif + +static __init int libnvdimm_init(void) +{ + int rc; + + rc = nvdimm_bus_init(); + if (rc) + return rc; + rc = nvdimm_init(); + if (rc) + goto err_dimm; + rc = nd_region_init(); + if (rc) + goto err_region; + return 0; + err_region: + nvdimm_exit(); + err_dimm: + nvdimm_bus_exit(); + return rc; +} + +static __exit void libnvdimm_exit(void) +{ + WARN_ON(!list_empty(&nvdimm_bus_list)); + nd_region_exit(); + nvdimm_exit(); + nvdimm_bus_exit(); +} + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +subsys_initcall(libnvdimm_init); +module_exit(libnvdimm_exit); diff --git a/kernel/drivers/nvdimm/dimm.c b/kernel/drivers/nvdimm/dimm.c new file mode 100644 index 000000000..71d12bb67 --- /dev/null +++ b/kernel/drivers/nvdimm/dimm.c @@ -0,0 +1,102 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/sizes.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/nd.h> +#include "label.h" +#include "nd.h" + +static int nvdimm_probe(struct device *dev) +{ + struct nvdimm_drvdata *ndd; + int rc; + + ndd = kzalloc(sizeof(*ndd), GFP_KERNEL); + if (!ndd) + return -ENOMEM; + + dev_set_drvdata(dev, ndd); + ndd->dpa.name = dev_name(dev); + ndd->ns_current = -1; + ndd->ns_next = -1; + ndd->dpa.start = 0; + ndd->dpa.end = -1; + ndd->dev = dev; + get_device(dev); + kref_init(&ndd->kref); + + rc = nvdimm_init_nsarea(ndd); + if (rc) + goto err; + + rc = nvdimm_init_config_data(ndd); + if (rc) + goto err; + + dev_dbg(dev, "config data size: %d\n", ndd->nsarea.config_size); + + nvdimm_bus_lock(dev); + ndd->ns_current = nd_label_validate(ndd); + ndd->ns_next = nd_label_next_nsindex(ndd->ns_current); + nd_label_copy(ndd, to_next_namespace_index(ndd), + to_current_namespace_index(ndd)); + rc = nd_label_reserve_dpa(ndd); + nvdimm_bus_unlock(dev); + + if (rc) + goto err; + + return 0; + + err: + put_ndd(ndd); + return rc; +} + +static int nvdimm_remove(struct device *dev) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + + nvdimm_bus_lock(dev); + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + put_ndd(ndd); + + return 0; +} + +static struct nd_device_driver nvdimm_driver = { + .probe = nvdimm_probe, + .remove = nvdimm_remove, + .drv = { + .name = "nvdimm", + }, + .type = ND_DRIVER_DIMM, 
+}; + +int __init nvdimm_init(void) +{ + return nd_driver_register(&nvdimm_driver); +} + +void nvdimm_exit(void) +{ + driver_unregister(&nvdimm_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_DIMM); diff --git a/kernel/drivers/nvdimm/dimm_devs.c b/kernel/drivers/nvdimm/dimm_devs.c new file mode 100644 index 000000000..651b8d19d --- /dev/null +++ b/kernel/drivers/nvdimm/dimm_devs.c @@ -0,0 +1,548 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/vmalloc.h> +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "label.h" +#include "nd.h" + +static DEFINE_IDA(dimm_ida); + +/* + * Retrieve bus and dimm handle and return if this bus supports + * get_config_data commands + */ +static int __validate_dimm(struct nvdimm_drvdata *ndd) +{ + struct nvdimm *nvdimm; + + if (!ndd) + return -EINVAL; + + nvdimm = to_nvdimm(ndd->dev); + + if (!nvdimm->dsm_mask) + return -ENXIO; + if (!test_bit(ND_CMD_GET_CONFIG_DATA, nvdimm->dsm_mask)) + return -ENXIO; + + return 0; +} + +static int validate_dimm(struct nvdimm_drvdata *ndd) +{ + int rc = __validate_dimm(ndd); + + if (rc && ndd) + dev_dbg(ndd->dev, "%pf: %s error: %d\n", + __builtin_return_address(0), __func__, rc); + return rc; +} + +/** + * nvdimm_init_nsarea - determine the geometry of a dimm's namespace area + * @nvdimm: dimm to initialize + */ +int 
nvdimm_init_nsarea(struct nvdimm_drvdata *ndd) +{ + struct nd_cmd_get_config_size *cmd = &ndd->nsarea; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + + if (rc) + return rc; + + if (cmd->config_size) + return 0; /* already valid */ + + memset(cmd, 0, sizeof(*cmd)); + nd_desc = nvdimm_bus->nd_desc; + return nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_SIZE, cmd, sizeof(*cmd)); +} + +int nvdimm_init_config_data(struct nvdimm_drvdata *ndd) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nd_cmd_get_config_data_hdr *cmd; + struct nvdimm_bus_descriptor *nd_desc; + int rc = validate_dimm(ndd); + u32 max_cmd_size, config_size; + size_t offset; + + if (rc) + return rc; + + if (ndd->data) + return 0; + + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0 + || ndd->nsarea.config_size < ND_LABEL_MIN_SIZE) { + dev_dbg(ndd->dev, "failed to init config data area: (%d:%d)\n", + ndd->nsarea.max_xfer, ndd->nsarea.config_size); + return -ENXIO; + } + + ndd->data = kmalloc(ndd->nsarea.config_size, GFP_KERNEL); + if (!ndd->data) + ndd->data = vmalloc(ndd->nsarea.config_size); + + if (!ndd->data) + return -ENOMEM; + + max_cmd_size = min_t(u32, PAGE_SIZE, ndd->nsarea.max_xfer); + cmd = kzalloc(max_cmd_size + sizeof(*cmd), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + nd_desc = nvdimm_bus->nd_desc; + for (config_size = ndd->nsarea.config_size, offset = 0; + config_size; config_size -= cmd->in_length, + offset += cmd->in_length) { + cmd->in_length = min(config_size, max_cmd_size); + cmd->in_offset = offset; + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_GET_CONFIG_DATA, cmd, + cmd->in_length + sizeof(*cmd)); + if (rc || cmd->status) { + rc = -ENXIO; + break; + } + memcpy(ndd->data + offset, cmd->out_buf, cmd->in_length); + } + dev_dbg(ndd->dev, "%s: len: %zu rc: %d\n", __func__, offset, rc); + kfree(cmd); + + return rc; +} + +int 
nvdimm_set_config_data(struct nvdimm_drvdata *ndd, size_t offset, + void *buf, size_t len) +{ + int rc = validate_dimm(ndd); + size_t max_cmd_size, buf_offset; + struct nd_cmd_set_config_hdr *cmd; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(ndd->dev); + struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc; + + if (rc) + return rc; + + if (!ndd->data) + return -ENXIO; + + if (offset + len > ndd->nsarea.config_size) + return -ENXIO; + + max_cmd_size = min_t(u32, PAGE_SIZE, len); + max_cmd_size = min_t(u32, max_cmd_size, ndd->nsarea.max_xfer); + cmd = kzalloc(max_cmd_size + sizeof(*cmd) + sizeof(u32), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + for (buf_offset = 0; len; len -= cmd->in_length, + buf_offset += cmd->in_length) { + size_t cmd_size; + u32 *status; + + cmd->in_offset = offset + buf_offset; + cmd->in_length = min(max_cmd_size, len); + memcpy(cmd->in_buf, buf + buf_offset, cmd->in_length); + + /* status is output in the last 4-bytes of the command buffer */ + cmd_size = sizeof(*cmd) + cmd->in_length + sizeof(u32); + status = ((void *) cmd) + cmd_size - sizeof(u32); + + rc = nd_desc->ndctl(nd_desc, to_nvdimm(ndd->dev), + ND_CMD_SET_CONFIG_DATA, cmd, cmd_size); + if (rc || *status) { + rc = rc ? 
rc : -ENXIO; + break; + } + } + kfree(cmd); + + return rc; +} + +static void nvdimm_release(struct device *dev) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + ida_simple_remove(&dimm_ida, nvdimm->id); + kfree(nvdimm); +} + +static struct device_type nvdimm_device_type = { + .name = "nvdimm", + .release = nvdimm_release, +}; + +bool is_nvdimm(struct device *dev) +{ + return dev->type == &nvdimm_device_type; +} + +struct nvdimm *to_nvdimm(struct device *dev) +{ + struct nvdimm *nvdimm = container_of(dev, struct nvdimm, dev); + + WARN_ON(!is_nvdimm(dev)); + return nvdimm; +} +EXPORT_SYMBOL_GPL(to_nvdimm); + +struct nvdimm *nd_blk_region_to_dimm(struct nd_blk_region *ndbr) +{ + struct nd_region *nd_region = &ndbr->nd_region; + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + return nd_mapping->nvdimm; +} +EXPORT_SYMBOL_GPL(nd_blk_region_to_dimm); + +struct nvdimm_drvdata *to_ndd(struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm->dev)); + + return dev_get_drvdata(&nvdimm->dev); +} +EXPORT_SYMBOL(to_ndd); + +void nvdimm_drvdata_release(struct kref *kref) +{ + struct nvdimm_drvdata *ndd = container_of(kref, typeof(*ndd), kref); + struct device *dev = ndd->dev; + struct resource *res, *_r; + + dev_dbg(dev, "%s\n", __func__); + + nvdimm_bus_lock(dev); + for_each_dpa_resource_safe(ndd, res, _r) + nvdimm_free_dpa(ndd, res); + nvdimm_bus_unlock(dev); + + kvfree(ndd->data); + kfree(ndd); + put_device(dev); +} + +void get_ndd(struct nvdimm_drvdata *ndd) +{ + kref_get(&ndd->kref); +} + +void put_ndd(struct nvdimm_drvdata *ndd) +{ + if (ndd) + kref_put(&ndd->kref, nvdimm_drvdata_release); +} + +const char *nvdimm_name(struct nvdimm *nvdimm) +{ + return dev_name(&nvdimm->dev); +} +EXPORT_SYMBOL_GPL(nvdimm_name); + +void *nvdimm_provider_data(struct nvdimm *nvdimm) +{ + if (nvdimm) + return nvdimm->provider_data; + return NULL; +} +EXPORT_SYMBOL_GPL(nvdimm_provider_data); + +static ssize_t 
commands_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + int cmd, len = 0; + + if (!nvdimm->dsm_mask) + return sprintf(buf, "\n"); + + for_each_set_bit(cmd, nvdimm->dsm_mask, BITS_PER_LONG) + len += sprintf(buf + len, "%s ", nvdimm_cmd_name(cmd)); + len += sprintf(buf + len, "\n"); + return len; +} +static DEVICE_ATTR_RO(commands); + +static ssize_t state_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvdimm *nvdimm = to_nvdimm(dev); + + /* + * The state may be in the process of changing, userspace should + * quiesce probing if it wants a static answer + */ + nvdimm_bus_lock(dev); + nvdimm_bus_unlock(dev); + return sprintf(buf, "%s\n", atomic_read(&nvdimm->busy) + ? "active" : "idle"); +} +static DEVICE_ATTR_RO(state); + +static ssize_t available_slots_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvdimm_drvdata *ndd = dev_get_drvdata(dev); + ssize_t rc; + u32 nfree; + + if (!ndd) + return -ENXIO; + + nvdimm_bus_lock(dev); + nfree = nd_label_nfree(ndd); + if (nfree - 1 > nfree) { + dev_WARN_ONCE(dev, 1, "we ate our last label?\n"); + nfree = 0; + } else + nfree--; + rc = sprintf(buf, "%d\n", nfree); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(available_slots); + +static struct attribute *nvdimm_attributes[] = { + &dev_attr_state.attr, + &dev_attr_commands.attr, + &dev_attr_available_slots.attr, + NULL, +}; + +struct attribute_group nvdimm_attribute_group = { + .attrs = nvdimm_attributes, +}; +EXPORT_SYMBOL_GPL(nvdimm_attribute_group); + +struct nvdimm *nvdimm_create(struct nvdimm_bus *nvdimm_bus, void *provider_data, + const struct attribute_group **groups, unsigned long flags, + unsigned long *dsm_mask) +{ + struct nvdimm *nvdimm = kzalloc(sizeof(*nvdimm), GFP_KERNEL); + struct device *dev; + + if (!nvdimm) + return NULL; + + nvdimm->id = ida_simple_get(&dimm_ida, 0, 0, GFP_KERNEL); + if (nvdimm->id < 0) { + 
kfree(nvdimm); + return NULL; + } + nvdimm->provider_data = provider_data; + nvdimm->flags = flags; + nvdimm->dsm_mask = dsm_mask; + atomic_set(&nvdimm->busy, 0); + dev = &nvdimm->dev; + dev_set_name(dev, "nmem%d", nvdimm->id); + dev->parent = &nvdimm_bus->dev; + dev->type = &nvdimm_device_type; + dev->devt = MKDEV(nvdimm_major, nvdimm->id); + dev->groups = groups; + nd_device_register(dev); + + return nvdimm; +} +EXPORT_SYMBOL_GPL(nvdimm_create); + +/** + * nd_blk_available_dpa - account the unused dpa of BLK region + * @nd_mapping: container of dpa-resource-root + labels + * + * Unlike PMEM, BLK namespaces can occupy discontiguous DPA ranges. + */ +resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t map_end, busy = 0, available; + struct resource *res; + + if (!ndd) + return 0; + + map_end = nd_mapping->start + nd_mapping->size - 1; + for_each_dpa_resource(ndd, res) + if (res->start >= nd_mapping->start && res->start < map_end) { + resource_size_t end = min(map_end, res->end); + + busy += end - res->start + 1; + } else if (res->end >= nd_mapping->start + && res->end <= map_end) { + busy += res->end - nd_mapping->start; + } else if (nd_mapping->start > res->start + && nd_mapping->start < res->end) { + /* total eclipse of the BLK region mapping */ + busy += nd_mapping->size; + } + + available = map_end - nd_mapping->start + 1; + if (busy < available) + return available - busy; + return 0; +} + +/** + * nd_pmem_available_dpa - for the given dimm+region account unallocated dpa + * @nd_mapping: container of dpa-resource-root + labels + * @nd_region: constrain available space check to this reference region + * @overlap: calculate available space assuming this level of overlap + * + * Validate that a PMEM label, if present, aligns with the start of an + * interleave set and truncate the available size at the lowest BLK + * overlap point. 
+ * + * The expectation is that this routine is called multiple times as it + * probes for the largest BLK encroachment for any single member DIMM of + * the interleave set. Once that value is determined the PMEM-limit for + * the set can be established. + */ +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, resource_size_t *overlap) +{ + resource_size_t map_start, map_end, busy = 0, available, blk_start; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + const char *reason; + + if (!ndd) + return 0; + + map_start = nd_mapping->start; + map_end = map_start + nd_mapping->size - 1; + blk_start = max(map_start, map_end + 1 - *overlap); + for_each_dpa_resource(ndd, res) + if (res->start >= map_start && res->start < map_end) { + if (strncmp(res->name, "blk", 3) == 0) + blk_start = min(blk_start, res->start); + else if (res->start != map_start) { + reason = "misaligned to iset"; + goto err; + } else { + if (busy) { + reason = "duplicate overlapping PMEM reservations?"; + goto err; + } + busy += resource_size(res); + continue; + } + } else if (res->end >= map_start && res->end <= map_end) { + if (strncmp(res->name, "blk", 3) == 0) { + /* + * If a BLK allocation overlaps the start of + * PMEM the entire interleave set may now only + * be used for BLK. + */ + blk_start = map_start; + } else { + reason = "misaligned to iset"; + goto err; + } + } else if (map_start > res->start && map_start < res->end) { + /* total eclipse of the mapping */ + busy += nd_mapping->size; + blk_start = map_start; + } + + *overlap = map_end + 1 - blk_start; + available = blk_start - map_start; + if (busy < available) + return available - busy; + return 0; + + err: + /* + * Something is wrong, PMEM must align with the start of the + * interleave set, and there can only be one allocation per set. 
+ */ + nd_dbg_dpa(nd_region, ndd, res, "%s\n", reason); + return 0; +} + +void nvdimm_free_dpa(struct nvdimm_drvdata *ndd, struct resource *res) +{ + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + kfree(res->name); + __release_region(&ndd->dpa, res->start, resource_size(res)); +} + +struct resource *nvdimm_allocate_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id, resource_size_t start, + resource_size_t n) +{ + char *name = kmemdup(label_id, sizeof(*label_id), GFP_KERNEL); + struct resource *res; + + if (!name) + return NULL; + + WARN_ON_ONCE(!is_nvdimm_bus_locked(ndd->dev)); + res = __request_region(&ndd->dpa, start, n, name, 0); + if (!res) + kfree(name); + return res; +} + +/** + * nvdimm_allocated_dpa - sum up the dpa currently allocated to this label_id + * @nvdimm: container of dpa-resource-root + labels + * @label_id: dpa resource name of the form {pmem|blk}-<human readable uuid> + */ +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id) +{ + resource_size_t allocated = 0; + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + allocated += resource_size(res); + + return allocated; +} + +static int count_dimms(struct device *dev, void *c) +{ + int *count = c; + + if (is_nvdimm(dev)) + (*count)++; + return 0; +} + +int nvdimm_bus_check_dimm_count(struct nvdimm_bus *nvdimm_bus, int dimm_count) +{ + int count = 0; + /* Flush any possible dimm registration failures */ + nd_synchronize(); + + device_for_each_child(&nvdimm_bus->dev, &count, count_dimms); + dev_dbg(&nvdimm_bus->dev, "%s: count: %d\n", __func__, count); + if (count != dimm_count) + return -ENXIO; + return 0; +} +EXPORT_SYMBOL_GPL(nvdimm_bus_check_dimm_count); diff --git a/kernel/drivers/nvdimm/e820.c b/kernel/drivers/nvdimm/e820.c new file mode 100644 index 000000000..b0045a505 --- /dev/null +++ b/kernel/drivers/nvdimm/e820.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015, Christoph Hellwig. 
+ * Copyright (c) 2015, Intel Corporation. + */ +#include <linux/platform_device.h> +#include <linux/memory_hotplug.h> +#include <linux/libnvdimm.h> +#include <linux/module.h> + +static const struct attribute_group *e820_pmem_attribute_groups[] = { + &nvdimm_bus_attribute_group, + NULL, +}; + +static const struct attribute_group *e820_pmem_region_attribute_groups[] = { + &nd_region_attribute_group, + &nd_device_attribute_group, + NULL, +}; + +static int e820_pmem_remove(struct platform_device *pdev) +{ + struct nvdimm_bus *nvdimm_bus = platform_get_drvdata(pdev); + + nvdimm_bus_unregister(nvdimm_bus); + return 0; +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int e820_range_to_nid(resource_size_t addr) +{ + return memory_add_physaddr_to_nid(addr); +} +#else +static int e820_range_to_nid(resource_size_t addr) +{ + return NUMA_NO_NODE; +} +#endif + +static int e820_pmem_probe(struct platform_device *pdev) +{ + static struct nvdimm_bus_descriptor nd_desc; + struct device *dev = &pdev->dev; + struct nvdimm_bus *nvdimm_bus; + struct resource *p; + + nd_desc.attr_groups = e820_pmem_attribute_groups; + nd_desc.provider_name = "e820"; + nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); + if (!nvdimm_bus) + goto err; + platform_set_drvdata(pdev, nvdimm_bus); + + for (p = iomem_resource.child; p ; p = p->sibling) { + struct nd_region_desc ndr_desc; + + if (strncmp(p->name, "Persistent Memory (legacy)", 26) != 0) + continue; + + memset(&ndr_desc, 0, sizeof(ndr_desc)); + ndr_desc.res = p; + ndr_desc.attr_groups = e820_pmem_region_attribute_groups; + ndr_desc.numa_node = e820_range_to_nid(p->start); + set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); + if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) + goto err; + } + + return 0; + + err: + nvdimm_bus_unregister(nvdimm_bus); + dev_err(dev, "failed to register legacy persistent memory ranges\n"); + return -ENXIO; +} + +static struct platform_driver e820_pmem_driver = { + .probe = e820_pmem_probe, + .remove = e820_pmem_remove, + 
.driver = { + .name = "e820_pmem", + }, +}; + +static __init int e820_pmem_init(void) +{ + return platform_driver_register(&e820_pmem_driver); +} + +static __exit void e820_pmem_exit(void) +{ + platform_driver_unregister(&e820_pmem_driver); +} + +MODULE_ALIAS("platform:e820_pmem*"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Intel Corporation"); +module_init(e820_pmem_init); +module_exit(e820_pmem_exit); diff --git a/kernel/drivers/nvdimm/label.c b/kernel/drivers/nvdimm/label.c new file mode 100644 index 000000000..96526dcfd --- /dev/null +++ b/kernel/drivers/nvdimm/label.c @@ -0,0 +1,927 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/device.h> +#include <linux/ndctl.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "label.h" +#include "nd.h" + +static u32 best_seq(u32 a, u32 b) +{ + a &= NSINDEX_SEQ_MASK; + b &= NSINDEX_SEQ_MASK; + + if (a == 0 || a == b) + return b; + else if (b == 0) + return a; + else if (nd_inc_seq(a) == b) + return b; + else + return a; +} + +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd) +{ + u32 index_span; + + if (ndd->nsindex_size) + return ndd->nsindex_size; + + /* + * The minimum index space is 512 bytes, with that amount of + * index we can describe ~1400 labels which is less than a byte + * of overhead per label. Round up to a byte of overhead per + * label and determine the size of the index region. 
Yes, this + * starts to waste space at larger config_sizes, but it's + * unlikely we'll ever see anything but 128K. + */ + index_span = ndd->nsarea.config_size / 129; + index_span /= NSINDEX_ALIGN * 2; + ndd->nsindex_size = index_span * NSINDEX_ALIGN; + + return ndd->nsindex_size; +} + +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd) +{ + return ndd->nsarea.config_size / 129; +} + +int nd_label_validate(struct nvdimm_drvdata *ndd) +{ + /* + * On media label format consists of two index blocks followed + * by an array of labels. None of these structures are ever + * updated in place. A sequence number tracks the current + * active index and the next one to write, while labels are + * written to free slots. + * + * +------------+ + * | | + * | nsindex0 | + * | | + * +------------+ + * | | + * | nsindex1 | + * | | + * +------------+ + * | label0 | + * +------------+ + * | label1 | + * +------------+ + * | | + * ....nslot... + * | | + * +------------+ + * | labelN | + * +------------+ + */ + struct nd_namespace_index *nsindex[] = { + to_namespace_index(ndd, 0), + to_namespace_index(ndd, 1), + }; + const int num_index = ARRAY_SIZE(nsindex); + struct device *dev = ndd->dev; + bool valid[2] = { 0 }; + int i, num_valid = 0; + u32 seq; + + for (i = 0; i < num_index; i++) { + u32 nslot; + u8 sig[NSINDEX_SIG_LEN]; + u64 sum_save, sum, size; + + memcpy(sig, nsindex[i]->sig, NSINDEX_SIG_LEN); + if (memcmp(sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN) != 0) { + dev_dbg(dev, "%s: nsindex%d signature invalid\n", + __func__, i); + continue; + } + sum_save = __le64_to_cpu(nsindex[i]->checksum); + nsindex[i]->checksum = __cpu_to_le64(0); + sum = nd_fletcher64(nsindex[i], sizeof_namespace_index(ndd), 1); + nsindex[i]->checksum = __cpu_to_le64(sum_save); + if (sum != sum_save) { + dev_dbg(dev, "%s: nsindex%d checksum invalid\n", + __func__, i); + continue; + } + + seq = __le32_to_cpu(nsindex[i]->seq); + if ((seq & NSINDEX_SEQ_MASK) == 0) { + dev_dbg(dev, "%s: nsindex%d sequence: %#x 
invalid\n", + __func__, i, seq); + continue; + } + + /* sanity check the index against expected values */ + if (__le64_to_cpu(nsindex[i]->myoff) + != i * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "%s: nsindex%d myoff: %#llx invalid\n", + __func__, i, (unsigned long long) + __le64_to_cpu(nsindex[i]->myoff)); + continue; + } + if (__le64_to_cpu(nsindex[i]->otheroff) + != (!i) * sizeof_namespace_index(ndd)) { + dev_dbg(dev, "%s: nsindex%d otheroff: %#llx invalid\n", + __func__, i, (unsigned long long) + __le64_to_cpu(nsindex[i]->otheroff)); + continue; + } + + size = __le64_to_cpu(nsindex[i]->mysize); + if (size > sizeof_namespace_index(ndd) + || size < sizeof(struct nd_namespace_index)) { + dev_dbg(dev, "%s: nsindex%d mysize: %#llx invalid\n", + __func__, i, size); + continue; + } + + nslot = __le32_to_cpu(nsindex[i]->nslot); + if (nslot * sizeof(struct nd_namespace_label) + + 2 * sizeof_namespace_index(ndd) + > ndd->nsarea.config_size) { + dev_dbg(dev, "%s: nsindex%d nslot: %u invalid, config_size: %#x\n", + __func__, i, nslot, + ndd->nsarea.config_size); + continue; + } + valid[i] = true; + num_valid++; + } + + switch (num_valid) { + case 0: + break; + case 1: + for (i = 0; i < num_index; i++) + if (valid[i]) + return i; + /* can't have num_valid > 0 but valid[] = { false, false } */ + WARN_ON(1); + break; + default: + /* pick the best index... 
*/ + seq = best_seq(__le32_to_cpu(nsindex[0]->seq), + __le32_to_cpu(nsindex[1]->seq)); + if (seq == (__le32_to_cpu(nsindex[1]->seq) & NSINDEX_SEQ_MASK)) + return 1; + else + return 0; + break; + } + + return -1; +} + +void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, + struct nd_namespace_index *src) +{ + if (dst && src) + /* pass */; + else + return; + + memcpy(dst, src, sizeof_namespace_index(ndd)); +} + +static struct nd_namespace_label *nd_label_base(struct nvdimm_drvdata *ndd) +{ + void *base = to_namespace_index(ndd, 0); + + return base + 2 * sizeof_namespace_index(ndd); +} + +static int to_slot(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return nd_label - nd_label_base(ndd); +} + +#define for_each_clear_bit_le(bit, addr, size) \ + for ((bit) = find_next_zero_bit_le((addr), (size), 0); \ + (bit) < (size); \ + (bit) = find_next_zero_bit_le((addr), (size), (bit) + 1)) + +/** + * preamble_index - common variable initialization for nd_label_* routines + * @ndd: dimm container for the relevant label set + * @idx: namespace_index index + * @nsindex_out: on return set to the currently active namespace index + * @free: on return set to the free label bitmap in the index + * @nslot: on return set to the number of slots in the label space + */ +static bool preamble_index(struct nvdimm_drvdata *ndd, int idx, + struct nd_namespace_index **nsindex_out, + unsigned long **free, u32 *nslot) +{ + struct nd_namespace_index *nsindex; + + nsindex = to_namespace_index(ndd, idx); + if (nsindex == NULL) + return false; + + *free = (unsigned long *) nsindex->free; + *nslot = __le32_to_cpu(nsindex->nslot); + *nsindex_out = nsindex; + + return true; +} + +char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags) +{ + if (!label_id || !uuid) + return NULL; + snprintf(label_id->id, ND_LABEL_ID_SIZE, "%s-%pUb", + flags & NSLABEL_FLAG_LOCAL ? 
"blk" : "pmem", uuid); + return label_id->id; +} + +static bool preamble_current(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_current, nsindex, + free, nslot); +} + +static bool preamble_next(struct nvdimm_drvdata *ndd, + struct nd_namespace_index **nsindex, + unsigned long **free, u32 *nslot) +{ + return preamble_index(ndd, ndd->ns_next, nsindex, + free, nslot); +} + +static bool slot_valid(struct nd_namespace_label *nd_label, u32 slot) +{ + /* check that we are written where we expect to be written */ + if (slot != __le32_to_cpu(nd_label->slot)) + return false; + + /* check that DPA allocations are page aligned */ + if ((__le64_to_cpu(nd_label->dpa) + | __le64_to_cpu(nd_label->rawsize)) % SZ_4K) + return false; + + return true; +} + +int nd_label_reserve_dpa(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; /* no label, nothing to reserve */ + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + struct nd_region *nd_region = NULL; + u8 label_uuid[NSLABEL_UUID_LEN]; + struct nd_label_id label_id; + struct resource *res; + u32 flags; + + nd_label = nd_label_base(ndd) + slot; + + if (!slot_valid(nd_label, slot)) + continue; + + memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); + flags = __le32_to_cpu(nd_label->flags); + nd_label_gen_id(&label_id, label_uuid, flags); + res = nvdimm_allocate_dpa(ndd, &label_id, + __le64_to_cpu(nd_label->dpa), + __le64_to_cpu(nd_label->rawsize)); + nd_dbg_dpa(nd_region, ndd, res, "reserve\n"); + if (!res) + return -EBUSY; + } + + return 0; +} + +int nd_label_active_count(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + int count = 0; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return 0; + + 
for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = nd_label_base(ndd) + slot; + + if (!slot_valid(nd_label, slot)) { + u32 label_slot = __le32_to_cpu(nd_label->slot); + u64 size = __le64_to_cpu(nd_label->rawsize); + u64 dpa = __le64_to_cpu(nd_label->dpa); + + dev_dbg(ndd->dev, + "%s: slot%d invalid slot: %d dpa: %llx size: %llx\n", + __func__, slot, label_slot, dpa, size); + continue; + } + count++; + } + return count; +} + +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_current(ndd, &nsindex, &free, &nslot)) + return NULL; + + for_each_clear_bit_le(slot, free, nslot) { + struct nd_namespace_label *nd_label; + + nd_label = nd_label_base(ndd) + slot; + if (!slot_valid(nd_label, slot)) + continue; + + if (n-- == 0) + return nd_label_base(ndd) + slot; + } + + return NULL; +} + +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return UINT_MAX; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + slot = find_next_bit_le(free, nslot, 0); + if (slot == nslot) + return UINT_MAX; + + clear_bit_le(slot, free); + + return slot; +} + +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return false; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (slot < nslot) + return !test_and_set_bit_le(slot, free); + return false; +} + +u32 nd_label_nfree(struct nvdimm_drvdata *ndd) +{ + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot; + + WARN_ON(!is_nvdimm_bus_locked(ndd->dev)); + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return nvdimm_num_label_slots(ndd); + + return bitmap_weight(free, nslot); 
+} + +static int nd_label_write_index(struct nvdimm_drvdata *ndd, int index, u32 seq, + unsigned long flags) +{ + struct nd_namespace_index *nsindex; + unsigned long offset; + u64 checksum; + u32 nslot; + int rc; + + nsindex = to_namespace_index(ndd, index); + if (flags & ND_NSINDEX_INIT) + nslot = nvdimm_num_label_slots(ndd); + else + nslot = __le32_to_cpu(nsindex->nslot); + + memcpy(nsindex->sig, NSINDEX_SIGNATURE, NSINDEX_SIG_LEN); + nsindex->flags = __cpu_to_le32(0); + nsindex->seq = __cpu_to_le32(seq); + offset = (unsigned long) nsindex + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->myoff = __cpu_to_le64(offset); + nsindex->mysize = __cpu_to_le64(sizeof_namespace_index(ndd)); + offset = (unsigned long) to_namespace_index(ndd, + nd_label_next_nsindex(index)) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->otheroff = __cpu_to_le64(offset); + offset = (unsigned long) nd_label_base(ndd) + - (unsigned long) to_namespace_index(ndd, 0); + nsindex->labeloff = __cpu_to_le64(offset); + nsindex->nslot = __cpu_to_le32(nslot); + nsindex->major = __cpu_to_le16(1); + nsindex->minor = __cpu_to_le16(1); + nsindex->checksum = __cpu_to_le64(0); + if (flags & ND_NSINDEX_INIT) { + unsigned long *free = (unsigned long *) nsindex->free; + u32 nfree = ALIGN(nslot, BITS_PER_LONG); + int last_bits, i; + + memset(nsindex->free, 0xff, nfree / 8); + for (i = 0, last_bits = nfree - nslot; i < last_bits; i++) + clear_bit_le(nslot + i, free); + } + checksum = nd_fletcher64(nsindex, sizeof_namespace_index(ndd), 1); + nsindex->checksum = __cpu_to_le64(checksum); + rc = nvdimm_set_config_data(ndd, __le64_to_cpu(nsindex->myoff), + nsindex, sizeof_namespace_index(ndd)); + if (rc < 0) + return rc; + + if (flags & ND_NSINDEX_INIT) + return 0; + + /* copy the index we just wrote to the new 'next' */ + WARN_ON(index != ndd->ns_next); + nd_label_copy(ndd, to_current_namespace_index(ndd), nsindex); + ndd->ns_current = nd_label_next_nsindex(ndd->ns_current); + ndd->ns_next = 
nd_label_next_nsindex(ndd->ns_next); + WARN_ON(ndd->ns_current == ndd->ns_next); + + return 0; +} + +static unsigned long nd_label_offset(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + return (unsigned long) nd_label + - (unsigned long) to_namespace_index(ndd, 0); +} + +static int __pmem_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_pmem *nspm, + int pos) +{ + u64 cookie = nd_region_interleave_set_cookie(nd_region), rawsize; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *victim_label; + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + unsigned long *free; + u32 nslot, slot; + size_t offset; + int rc; + + if (!preamble_next(ndd, &nsindex, &free, &nslot)) + return -ENXIO; + + /* allocate and write the label to the staging (next) index */ + slot = nd_label_alloc_slot(ndd); + if (slot == UINT_MAX) + return -ENXIO; + dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot); + + nd_label = nd_label_base(ndd) + slot; + memset(nd_label, 0, sizeof(struct nd_namespace_label)); + memcpy(nd_label->uuid, nspm->uuid, NSLABEL_UUID_LEN); + if (nspm->alt_name) + memcpy(nd_label->name, nspm->alt_name, NSLABEL_NAME_LEN); + nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_UPDATING); + nd_label->nlabel = __cpu_to_le16(nd_region->ndr_mappings); + nd_label->position = __cpu_to_le16(pos); + nd_label->isetcookie = __cpu_to_le64(cookie); + rawsize = div_u64(resource_size(&nspm->nsio.res), + nd_region->ndr_mappings); + nd_label->rawsize = __cpu_to_le64(rawsize); + nd_label->dpa = __cpu_to_le64(nd_mapping->start); + nd_label->slot = __cpu_to_le32(slot); + + /* update label */ + offset = nd_label_offset(ndd, nd_label); + rc = nvdimm_set_config_data(ndd, offset, nd_label, + sizeof(struct nd_namespace_label)); + if (rc < 0) + return rc; + + /* Garbage collect the previous label */ + victim_label = nd_mapping->labels[0]; + if (victim_label) { + slot = to_slot(ndd, 
victim_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + } + + /* update index */ + rc = nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); + if (rc < 0) + return rc; + + nd_mapping->labels[0] = nd_label; + + return 0; +} + +static void del_label(struct nd_mapping *nd_mapping, int l) +{ + struct nd_namespace_label *next_label, *nd_label; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + unsigned int slot; + int j; + + nd_label = nd_mapping->labels[l]; + slot = to_slot(ndd, nd_label); + dev_vdbg(ndd->dev, "%s: clear: %d\n", __func__, slot); + + for (j = l; (next_label = nd_mapping->labels[j + 1]); j++) + nd_mapping->labels[j] = next_label; + nd_mapping->labels[j] = NULL; +} + +static bool is_old_resource(struct resource *res, struct resource **list, int n) +{ + int i; + + if (res->flags & DPA_RESOURCE_ADJUSTED) + return false; + for (i = 0; i < n; i++) + if (res == list[i]) + return true; + return false; +} + +static struct resource *to_resource(struct nvdimm_drvdata *ndd, + struct nd_namespace_label *nd_label) +{ + struct resource *res; + + for_each_dpa_resource(ndd, res) { + if (res->start != __le64_to_cpu(nd_label->dpa)) + continue; + if (resource_size(res) != __le64_to_cpu(nd_label->rawsize)) + continue; + return res; + } + + return NULL; +} + +/* + * 1/ Account all the labels that can be freed after this update + * 2/ Allocate and write the label to the staging (next) index + * 3/ Record the resources in the namespace device + */ +static int __blk_label_update(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_namespace_blk *nsblk, + int num_labels) +{ + int i, l, alloc, victims, nfree, old_num_resources, nlabel, rc = -ENXIO; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + unsigned long *free, *victim_map = NULL; + struct resource *res, **old_res_list; + struct nd_label_id 
label_id;
+	u8 uuid[NSLABEL_UUID_LEN];
+	u32 nslot, slot;
+
+	if (!preamble_next(ndd, &nsindex, &free, &nslot))
+		return -ENXIO;
+
+	old_res_list = nsblk->res;
+	nfree = nd_label_nfree(ndd);
+	old_num_resources = nsblk->num_resources;
+	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+
+	/*
+	 * We need to loop over the old resources a few times, which seems a
+	 * bit inefficient, but we need to know that we have the label
+	 * space before we start mutating the tracking structures.
+	 * Otherwise the recovery method of last resort for userspace is
+	 * to disable and re-enable the parent region.
+	 *
+	 * First pass: count the dpa resources named after this namespace's
+	 * label id for which is_old_resource() is false, i.e. resources
+	 * that are not in the old list or that carry
+	 * DPA_RESOURCE_ADJUSTED.  The label-writing loop further down
+	 * allocates one slot for each of them.
+	 */
+	alloc = 0;
+	for_each_dpa_resource(ndd, res) {
+		if (strcmp(res->name, label_id.id) != 0)
+			continue;
+		if (!is_old_resource(res, old_res_list, old_num_resources))
+			alloc++;
+	}
+
+	victims = 0;
+	if (old_num_resources) {
+		/* convert old local-label-map to dimm-slot victim-map */
+		victim_map = kcalloc(BITS_TO_LONGS(nslot), sizeof(long),
+				GFP_KERNEL);
+		if (!victim_map)
+			return -ENOMEM;
+
+		/*
+		 * mark unused labels for garbage collection: an allocated
+		 * slot whose label has this namespace's uuid becomes a
+		 * victim unless to_resource() finds a resource for it and
+		 * is_old_resource() accepts that resource as a carry-over
+		 */
+		for_each_clear_bit_le(slot, free, nslot) {
+			nd_label = nd_label_base(ndd) + slot;
+			memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+			if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+				continue;
+			res = to_resource(ndd, nd_label);
+			if (res && is_old_resource(res, old_res_list,
+					old_num_resources))
+				continue;
+			slot = to_slot(ndd, nd_label);
+			set_bit(slot, victim_map);
+			victims++;
+		}
+	}
+
+	/*
+	 * don't allow updates that consume the last label: there must be
+	 * enough free slots for every new label, and at least one slot
+	 * must remain free once the victims are released
+	 */
+	if (nfree - alloc < 0 || nfree - alloc + victims < 1) {
+		dev_info(&nsblk->common.dev, "insufficient label space\n");
+		kfree(victim_map);
+		return -ENOSPC;
+	}
+	/*
+	 * from here on we need to abort on error: nsblk->res is about to
+	 * be replaced and slots will be allocated in the staging index
+	 */
+
+
+	/* assign all resources to the namespace before writing the labels */
+	nsblk->res = NULL;
+	nsblk->num_resources = 0;
+	for_each_dpa_resource(ndd, res) {
+		if (strcmp(res->name, label_id.id) != 0)
+			continue;
+		if (!nsblk_add_resource(nd_region, ndd, nsblk,
res->start)) {
+			rc = -ENOMEM;
+			goto abort;
+		}
+	}
+
+	/*
+	 * Write a label for every resource of the namespace that is not a
+	 * carry-over from the old resource list.
+	 */
+	for (i = 0; i < nsblk->num_resources; i++) {
+		size_t offset;
+
+		res = nsblk->res[i];
+		if (is_old_resource(res, old_res_list, old_num_resources))
+			continue; /* carry-over */
+		slot = nd_label_alloc_slot(ndd);
+		if (slot == UINT_MAX) {
+			/*
+			 * rc may still hold the non-negative result of a
+			 * previous iteration's label write; make sure the
+			 * failed allocation is reported as an error.
+			 */
+			rc = -ENXIO;
+			goto abort;
+		}
+		dev_dbg(ndd->dev, "%s: allocated: %d\n", __func__, slot);
+
+		/* build the label in the in-memory copy of the label area */
+		nd_label = nd_label_base(ndd) + slot;
+		memset(nd_label, 0, sizeof(struct nd_namespace_label));
+		memcpy(nd_label->uuid, nsblk->uuid, NSLABEL_UUID_LEN);
+		if (nsblk->alt_name)
+			memcpy(nd_label->name, nsblk->alt_name,
+					NSLABEL_NAME_LEN);
+		nd_label->flags = __cpu_to_le32(NSLABEL_FLAG_LOCAL);
+		nd_label->nlabel = __cpu_to_le16(0); /* N/A */
+		nd_label->position = __cpu_to_le16(0); /* N/A */
+		nd_label->isetcookie = __cpu_to_le64(0); /* N/A */
+		nd_label->dpa = __cpu_to_le64(res->start);
+		nd_label->rawsize = __cpu_to_le64(resource_size(res));
+		nd_label->lbasize = __cpu_to_le64(nsblk->lbasize);
+		nd_label->slot = __cpu_to_le32(slot);
+
+		/* update label */
+		offset = nd_label_offset(ndd, nd_label);
+		rc = nvdimm_set_config_data(ndd, offset, nd_label,
+				sizeof(struct nd_namespace_label));
+		if (rc < 0)
+			goto abort;
+	}
+
+	/* free up now unused slots in the new index */
+	for_each_set_bit(slot, victim_map, victim_map ?
nslot : 0) {
+		dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot);
+		nd_label_free_slot(ndd, slot);
+	}
+
+	/* update index */
+	rc = nd_label_write_index(ndd, ndd->ns_next,
+			nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0);
+	if (rc)
+		goto abort;
+
+	/*
+	 * Now that the on-dimm labels are up to date, fix up the tracking
+	 * entries in nd_mapping->labels
+	 */
+	nlabel = 0;
+	for_each_label(l, nd_label, nd_mapping->labels) {
+		nlabel++;
+		memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+			continue;
+		/* drop every tracked label of this namespace; re-added below */
+		nlabel--;
+		del_label(nd_mapping, l);
+		l--; /* retry with the new label at this index */
+	}
+	if (nlabel + nsblk->num_resources > num_labels) {
+		/*
+		 * Bug, we can't end up with more resources than
+		 * available labels
+		 */
+		WARN_ON_ONCE(1);
+		rc = -ENXIO;
+		goto out;
+	}
+
+	/* re-track every allocated slot whose label carries this uuid */
+	for_each_clear_bit_le(slot, free, nslot) {
+		nd_label = nd_label_base(ndd) + slot;
+		memcpy(uuid, nd_label->uuid, NSLABEL_UUID_LEN);
+		if (memcmp(uuid, nsblk->uuid, NSLABEL_UUID_LEN) != 0)
+			continue;
+		res = to_resource(ndd, nd_label);
+		/*
+		 * to_resource() returns NULL when no dpa resource matches
+		 * the label's dpa/rawsize.  Such a label cannot be tied to
+		 * this namespace's resources, so skip it instead of
+		 * dereferencing a NULL pointer.
+		 */
+		if (!res)
+			continue;
+		res->flags &= ~DPA_RESOURCE_ADJUSTED;
+		dev_vdbg(&nsblk->common.dev, "assign label[%d] slot: %d\n",
+				l, slot);
+		nd_mapping->labels[l++] = nd_label;
+	}
+	nd_mapping->labels[l] = NULL;
+
+ out:
+	kfree(old_res_list);
+	kfree(victim_map);
+	return rc;
+
+ abort:
+	/*
+	 * 1/ repair the allocated label bitmap in the index
+	 * 2/ restore the resource list
+	 */
+	nd_label_copy(ndd, nsindex, to_current_namespace_index(ndd));
+	kfree(nsblk->res);
+	nsblk->res = old_res_list;
+	nsblk->num_resources = old_num_resources;
+	/* ownership went back to nsblk; keep 'out' from freeing it */
+	old_res_list = NULL;
+	goto out;
+}
+
+static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
+{
+	int i, l, old_num_labels = 0;
+	struct nd_namespace_index *nsindex;
+	struct nd_namespace_label *nd_label;
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	/* one extra pointer for the NULL terminator of the label array */
+	size_t size = (num_labels + 1) * sizeof(struct nd_namespace_label *);
+
+	for_each_label(l, nd_label, nd_mapping->labels)
+
old_num_labels++; + + /* + * We need to preserve all the old labels for the mapping so + * they can be garbage collected after writing the new labels. + */ + if (num_labels > old_num_labels) { + struct nd_namespace_label **labels; + + labels = krealloc(nd_mapping->labels, size, GFP_KERNEL); + if (!labels) + return -ENOMEM; + nd_mapping->labels = labels; + } + if (!nd_mapping->labels) + return -ENOMEM; + + for (i = old_num_labels; i <= num_labels; i++) + nd_mapping->labels[i] = NULL; + + if (ndd->ns_current == -1 || ndd->ns_next == -1) + /* pass */; + else + return max(num_labels, old_num_labels); + + nsindex = to_namespace_index(ndd, 0); + memset(nsindex, 0, ndd->nsarea.config_size); + for (i = 0; i < 2; i++) { + int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); + + if (rc) + return rc; + } + ndd->ns_next = 1; + ndd->ns_current = 0; + + return max(num_labels, old_num_labels); +} + +static int del_labels(struct nd_mapping *nd_mapping, u8 *uuid) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nd_namespace_label *nd_label; + struct nd_namespace_index *nsindex; + u8 label_uuid[NSLABEL_UUID_LEN]; + int l, num_freed = 0; + unsigned long *free; + u32 nslot, slot; + + if (!uuid) + return 0; + + /* no index || no labels == nothing to delete */ + if (!preamble_next(ndd, &nsindex, &free, &nslot) + || !nd_mapping->labels) + return 0; + + for_each_label(l, nd_label, nd_mapping->labels) { + memcpy(label_uuid, nd_label->uuid, NSLABEL_UUID_LEN); + if (memcmp(label_uuid, uuid, NSLABEL_UUID_LEN) != 0) + continue; + slot = to_slot(ndd, nd_label); + nd_label_free_slot(ndd, slot); + dev_dbg(ndd->dev, "%s: free: %d\n", __func__, slot); + del_label(nd_mapping, l); + num_freed++; + l--; /* retry with new label at this index */ + } + + if (num_freed > l) { + /* + * num_freed will only ever be > l when we delete the last + * label + */ + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + dev_dbg(ndd->dev, "%s: no more labels\n", __func__); + } + + return 
nd_label_write_index(ndd, ndd->ns_next, + nd_inc_seq(__le32_to_cpu(nsindex->seq)), 0); +} + +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + if (size == 0) { + rc = del_labels(nd_mapping, nspm->uuid); + if (rc) + return rc; + continue; + } + + rc = init_labels(nd_mapping, 1); + if (rc < 0) + return rc; + + rc = __pmem_label_update(nd_region, nd_mapping, nspm, i); + if (rc) + return rc; + } + + return 0; +} + +int nd_blk_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_blk *nsblk, resource_size_t size) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct resource *res; + int count = 0; + + if (size == 0) + return del_labels(nd_mapping, nsblk->uuid); + + for_each_dpa_resource(to_ndd(nd_mapping), res) + count++; + + count = init_labels(nd_mapping, count); + if (count < 0) + return count; + + return __blk_label_update(nd_region, nd_mapping, nsblk, count); +} diff --git a/kernel/drivers/nvdimm/label.h b/kernel/drivers/nvdimm/label.h new file mode 100644 index 000000000..a59ef6eef --- /dev/null +++ b/kernel/drivers/nvdimm/label.h @@ -0,0 +1,141 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */
+#ifndef __LABEL_H__
+#define __LABEL_H__
+
+#include <linux/ndctl.h>
+#include <linux/sizes.h>
+#include <linux/io.h>
+
+enum {
+	NSINDEX_SIG_LEN = 16,		/* bytes in nd_namespace_index.sig */
+	NSINDEX_ALIGN = 256,
+	NSINDEX_SEQ_MASK = 0x3,
+	NSLABEL_UUID_LEN = 16,		/* bytes in nd_namespace_label.uuid */
+	NSLABEL_NAME_LEN = 64,		/* bytes in nd_namespace_label.name */
+	NSLABEL_FLAG_ROLABEL = 0x1, /* read-only label */
+	NSLABEL_FLAG_LOCAL = 0x2, /* DIMM-local namespace */
+	NSLABEL_FLAG_BTT = 0x4, /* namespace contains a BTT */
+	NSLABEL_FLAG_UPDATING = 0x8, /* label being updated */
+	BTT_ALIGN = 4096, /* all btt structures */
+	BTTINFO_SIG_LEN = 16,
+	BTTINFO_UUID_LEN = 16,
+	BTTINFO_FLAG_ERROR = 0x1, /* error state (read-only) */
+	BTTINFO_MAJOR_VERSION = 1,
+	ND_LABEL_MIN_SIZE = 512 * 129, /* see sizeof_namespace_index() */
+	ND_LABEL_ID_SIZE = 50,		/* bytes in nd_label_id.id */
+	ND_NSINDEX_INIT = 0x1,	/* nd_label_write_index(): write a fresh index */
+};
+/*
+ * Copied into nd_namespace_index.sig with a length of NSINDEX_SIG_LEN;
+ * the explicit \0 is the 16th signature byte.
+ */
+static const char NSINDEX_SIGNATURE[] = "NAMESPACE_INDEX\0";
+
+/**
+ * struct nd_namespace_index - label set superblock
+ * @sig: NAMESPACE_INDEX\0
+ * @flags: placeholder
+ * @seq: sequence number for this index
+ * @myoff: offset of this index in label area
+ * @mysize: size of this index struct
+ * @otheroff: offset of other index
+ * @labeloff: offset of first label slot
+ * @nslot: total number of label slots
+ * @major: label area major version
+ * @minor: label area minor version
+ * @checksum: fletcher64 of all fields
+ * @free[0]: bitmap, nslot bits; a set bit marks a free label slot
+ *
+ * The size of free[] is rounded up so the total struct size is a
+ * multiple of NSINDEX_ALIGN bytes. Any bits this allocates beyond
+ * nslot bits must be zero.
+ */ +struct nd_namespace_index { + u8 sig[NSINDEX_SIG_LEN]; + __le32 flags; + __le32 seq; + __le64 myoff; + __le64 mysize; + __le64 otheroff; + __le64 labeloff; + __le32 nslot; + __le16 major; + __le16 minor; + __le64 checksum; + u8 free[0]; +}; + +/** + * struct nd_namespace_label - namespace superblock + * @uuid: UUID per RFC 4122 + * @name: optional name (NULL-terminated) + * @flags: see NSLABEL_FLAG_* + * @nlabel: num labels to describe this ns + * @position: labels position in set + * @isetcookie: interleave set cookie + * @lbasize: LBA size in bytes or 0 for pmem + * @dpa: DPA of NVM range on this DIMM + * @rawsize: size of namespace + * @slot: slot of this label in label area + * @unused: must be zero + */ +struct nd_namespace_label { + u8 uuid[NSLABEL_UUID_LEN]; + u8 name[NSLABEL_NAME_LEN]; + __le32 flags; + __le16 nlabel; + __le16 position; + __le64 isetcookie; + __le64 lbasize; + __le64 dpa; + __le64 rawsize; + __le32 slot; + __le32 unused; +}; + +/** + * struct nd_label_id - identifier string for dpa allocation + * @id: "{blk|pmem}-<namespace uuid>" + */ +struct nd_label_id { + char id[ND_LABEL_ID_SIZE]; +}; + +/* + * If the 'best' index is invalid, so is the 'next' index. 
Otherwise, + * the next index is MOD(index+1, 2) + */ +static inline int nd_label_next_nsindex(int index) +{ + if (index < 0) + return -1; + + return (index + 1) % 2; +} + +struct nvdimm_drvdata; +int nd_label_validate(struct nvdimm_drvdata *ndd); +void nd_label_copy(struct nvdimm_drvdata *ndd, struct nd_namespace_index *dst, + struct nd_namespace_index *src); +size_t sizeof_namespace_index(struct nvdimm_drvdata *ndd); +int nd_label_active_count(struct nvdimm_drvdata *ndd); +struct nd_namespace_label *nd_label_active(struct nvdimm_drvdata *ndd, int n); +u32 nd_label_alloc_slot(struct nvdimm_drvdata *ndd); +bool nd_label_free_slot(struct nvdimm_drvdata *ndd, u32 slot); +u32 nd_label_nfree(struct nvdimm_drvdata *ndd); +struct nd_region; +struct nd_namespace_pmem; +struct nd_namespace_blk; +int nd_pmem_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size); +int nd_blk_namespace_label_update(struct nd_region *nd_region, + struct nd_namespace_blk *nsblk, resource_size_t size); +#endif /* __LABEL_H__ */ diff --git a/kernel/drivers/nvdimm/namespace_devs.c b/kernel/drivers/nvdimm/namespace_devs.c new file mode 100644 index 000000000..62120c38d --- /dev/null +++ b/kernel/drivers/nvdimm/namespace_devs.c @@ -0,0 +1,1986 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/module.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/pmem.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +static void namespace_io_release(struct device *dev) +{ + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + kfree(nsio); +} + +static void namespace_pmem_release(struct device *dev) +{ + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + kfree(nspm->alt_name); + kfree(nspm->uuid); + kfree(nspm); +} + +static void namespace_blk_release(struct device *dev) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + + if (nsblk->id >= 0) + ida_simple_remove(&nd_region->ns_ida, nsblk->id); + kfree(nsblk->alt_name); + kfree(nsblk->uuid); + kfree(nsblk->res); + kfree(nsblk); +} + +static struct device_type namespace_io_device_type = { + .name = "nd_namespace_io", + .release = namespace_io_release, +}; + +static struct device_type namespace_pmem_device_type = { + .name = "nd_namespace_pmem", + .release = namespace_pmem_release, +}; + +static struct device_type namespace_blk_device_type = { + .name = "nd_namespace_blk", + .release = namespace_blk_release, +}; + +static bool is_namespace_pmem(struct device *dev) +{ + return dev ? dev->type == &namespace_pmem_device_type : false; +} + +static bool is_namespace_blk(struct device *dev) +{ + return dev ? dev->type == &namespace_blk_device_type : false; +} + +static bool is_namespace_io(struct device *dev) +{ + return dev ? 
dev->type == &namespace_io_device_type : false; +} + +static int is_uuid_busy(struct device *dev, void *data) +{ + u8 *uuid1 = data, *uuid2 = NULL; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid2 = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid2 = nsblk->uuid; + } else if (is_nd_btt(dev)) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + uuid2 = nd_btt->uuid; + } else if (is_nd_pfn(dev)) { + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + uuid2 = nd_pfn->uuid; + } + + if (uuid2 && memcmp(uuid1, uuid2, NSLABEL_UUID_LEN) == 0) + return -EBUSY; + + return 0; +} + +static int is_namespace_uuid_busy(struct device *dev, void *data) +{ + if (is_nd_pmem(dev) || is_nd_blk(dev)) + return device_for_each_child(dev, data, is_uuid_busy); + return 0; +} + +/** + * nd_is_uuid_unique - verify that no other namespace has @uuid + * @dev: any device on a nvdimm_bus + * @uuid: uuid to check + */ +bool nd_is_uuid_unique(struct device *dev, u8 *uuid) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!nvdimm_bus) + return false; + WARN_ON_ONCE(!is_nvdimm_bus_locked(&nvdimm_bus->dev)); + if (device_for_each_child(&nvdimm_bus->dev, uuid, + is_namespace_uuid_busy) != 0) + return false; + return true; +} + +bool pmem_should_map_pages(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + if (!IS_ENABLED(CONFIG_ZONE_DEVICE)) + return false; + + if (!test_bit(ND_REGION_PAGEMAP, &nd_region->flags)) + return false; + + if (is_nd_pfn(dev) || is_nd_btt(dev)) + return false; + +#ifdef ARCH_MEMREMAP_PMEM + return ARCH_MEMREMAP_PMEM == MEMREMAP_WB; +#else + return false; +#endif +} +EXPORT_SYMBOL(pmem_should_map_pages); + +const char *nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name) +{ + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + const char *suffix = NULL; + + if (ndns->claim) { + 
if (is_nd_btt(ndns->claim)) + suffix = "s"; + else if (is_nd_pfn(ndns->claim)) + suffix = "m"; + else + dev_WARN_ONCE(&ndns->dev, 1, + "unknown claim type by %s\n", + dev_name(ndns->claim)); + } + + if (is_namespace_pmem(&ndns->dev) || is_namespace_io(&ndns->dev)) { + if (!suffix && pmem_should_map_pages(&ndns->dev)) + suffix = "m"; + sprintf(name, "pmem%d%s", nd_region->id, suffix ? suffix : ""); + } else if (is_namespace_blk(&ndns->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + sprintf(name, "ndblk%d.%d%s", nd_region->id, nsblk->id, + suffix ? suffix : ""); + } else { + return NULL; + } + + return name; +} +EXPORT_SYMBOL(nvdimm_namespace_disk_name); + +const u8 *nd_dev_to_uuid(struct device *dev) +{ + static const u8 null_uuid[16]; + + if (!dev) + return null_uuid; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + return nsblk->uuid; + } else + return null_uuid; +} +EXPORT_SYMBOL(nd_dev_to_uuid); + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t __alt_name_store(struct device *dev, const char *buf, + const size_t len) +{ + char *input, *pos, *alt_name, **ns_altname; + ssize_t rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = &nspm->alt_name; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_altname = &nsblk->alt_name; + } else + return -ENXIO; + + if (dev->driver || to_ndns(dev)->claim) + return -EBUSY; + + input = kmemdup(buf, len + 1, GFP_KERNEL); + if (!input) + return -ENOMEM; + + input[len] = '\0'; + pos = 
strim(input);
+	if (strlen(pos) + 1 > NSLABEL_NAME_LEN) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	alt_name = kzalloc(NSLABEL_NAME_LEN, GFP_KERNEL);
+	if (!alt_name) {
+		rc = -ENOMEM;
+		goto out;
+	}
+	kfree(*ns_altname);
+	*ns_altname = alt_name;
+	sprintf(*ns_altname, "%s", pos); /* fits: length was checked above */
+	rc = len;
+
+out:
+	kfree(input);
+	return rc;
+}
+/*
+ * nd_namespace_blk_size - total size of a BLK namespace's dpa allocations
+ *
+ * Sums resource_size() over every dpa resource of the region's first
+ * mapping whose name equals the label id generated from nsblk->uuid.
+ * Returns 0 when the namespace has no uuid.
+ *
+ * NOTE(review): unlike __nd_namespace_blk_validate(), ndd is not checked
+ * for NULL before it is iterated here -- confirm callers guarantee it.
+ */
+static resource_size_t nd_namespace_blk_size(struct nd_namespace_blk *nsblk)
+{
+	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct nd_label_id label_id;
+	resource_size_t size = 0;
+	struct resource *res;
+
+	if (!nsblk->uuid)
+		return 0;
+	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+	for_each_dpa_resource(ndd, res)
+		if (strcmp(res->name, label_id.id) == 0)
+			size += resource_size(res);
+	return size;
+}
+/*
+ * __nd_namespace_blk_validate - check nsblk against the dimm's dpa resources
+ *
+ * Returns true only when the namespace has a uuid and an lbasize, driver
+ * data is present, none of its dpa resources is flagged
+ * DPA_RESOURCE_ADJUSTED, the number of matching dpa resources equals
+ * nsblk->num_resources, and every entry of nsblk->res[] is still present
+ * in the dimm's resource list.
+ */
+static bool __nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+	struct nd_region *nd_region = to_nd_region(nsblk->common.dev.parent);
+	struct nd_mapping *nd_mapping = &nd_region->mapping[0];
+	struct nvdimm_drvdata *ndd = to_ndd(nd_mapping);
+	struct nd_label_id label_id;
+	struct resource *res;
+	int count, i;
+
+	if (!nsblk->uuid || !nsblk->lbasize || !ndd)
+		return false;
+
+	count = 0;
+	nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL);
+	for_each_dpa_resource(ndd, res) {
+		if (strcmp(res->name, label_id.id) != 0)
+			continue;
+		/*
+		 * Resources with unacknowledged adjustments indicate a
+		 * failure to update labels
+		 */
+		if (res->flags & DPA_RESOURCE_ADJUSTED)
+			return false;
+		count++;
+	}
+
+	/* These values match after a successful label update */
+	if (count != nsblk->num_resources)
+		return false;
+
+	for (i = 0; i < nsblk->num_resources; i++) {
+		struct resource *found = NULL;
+
+		for_each_dpa_resource(ndd, res)
+			if (res == nsblk->res[i]) {
+				found = res;
+				break;
+			}
+		/* stale resource */
+		if (!found)
+			return false;
+	}
+
+	return true;
+}
+/*
+ * nd_namespace_blk_validate - locked wrapper for __nd_namespace_blk_validate()
+ *
+ * Takes the nvdimm bus lock around the validation.  Although the return
+ * type is resource_size_t, the value is the bool result of the helper
+ * (1 when the namespace is valid, 0 otherwise), not a size in bytes;
+ * nd_namespace_blk_size() computes the actual size.
+ */
+resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk)
+{
+	resource_size_t size;
+
+	nvdimm_bus_lock(&nsblk->common.dev);
+	size = __nd_namespace_blk_validate(nsblk);
+	nvdimm_bus_unlock(&nsblk->common.dev);
+
+	return size;
+}
+EXPORT_SYMBOL(nd_namespace_blk_validate);
+
+/*
+ * nd_namespace_label_update - write or delete the labels of a namespace
+ *
+ * Returns 0 without touching the labels when the namespace is bound to a
+ * driver or claimed (after warning once), or when it is not yet fully
+ * configured (no uuid, or for BLK no lbasize).  A namespace with a uuid
+ * and a size of 0 is passed on with size 0, which the pmem/blk update
+ * helpers treat as a label deletion.  Returns -ENXIO for any other
+ * device type.
+ */
+static int nd_namespace_label_update(struct nd_region *nd_region,
+		struct device *dev)
+{
+	dev_WARN_ONCE(dev, dev->driver || to_ndns(dev)->claim,
+			"namespace must be idle during label update\n");
+	if (dev->driver || to_ndns(dev)->claim)
+		return 0;
+
+	/*
+	 * Only allow label writes that will result in a valid namespace
+	 * or deletion of an existing namespace.
+	 */
+	if (is_namespace_pmem(dev)) {
+		struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev);
+		resource_size_t size = resource_size(&nspm->nsio.res);
+
+		if (size == 0 && nspm->uuid)
+			/* delete allocation */;
+		else if (!nspm->uuid)
+			return 0;
+
+		return nd_pmem_namespace_label_update(nd_region, nspm, size);
+	} else if (is_namespace_blk(dev)) {
+		struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev);
+		resource_size_t size = nd_namespace_blk_size(nsblk);
+
+		if (size == 0 && nsblk->uuid)
+			/* delete allocation */;
+		else if (!nsblk->uuid || !nsblk->lbasize)
+			return 0;
+
+		return nd_blk_namespace_label_update(nd_region, nsblk, size);
+	} else
+		return -ENXIO;
+}
+/*
+ * alt_name_store - sysfs store handler for the 'alt_name' attribute
+ *
+ * With the device lock and the nvdimm bus lock held, and after waiting
+ * for bus probing to go idle, stores the new name and then updates the
+ * namespace labels.
+ */
+static ssize_t alt_name_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct nd_region *nd_region = to_nd_region(dev->parent);
+	ssize_t rc;
+
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
+	wait_nvdimm_bus_probe_idle(dev);
+	rc = __alt_name_store(dev, buf, len);
+	if (rc >= 0)
+		rc = nd_namespace_label_update(nd_region, dev);
+	dev_dbg(dev, "%s: %s(%zd)\n", __func__, rc < 0 ? "fail " : "", rc);
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
+
+	return rc < 0 ?
rc : len; +} + +static ssize_t alt_name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + char *ns_altname; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_altname = nspm->alt_name; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_altname = nsblk->alt_name; + } else + return -ENXIO; + + return sprintf(buf, "%s\n", ns_altname ? ns_altname : ""); +} +static DEVICE_ATTR_RW(alt_name); + +static int scan_free(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + bool is_blk = strncmp(label_id->id, "blk", 3) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + int rc = 0; + + while (n) { + struct resource *res, *last; + resource_size_t new_start; + + last = NULL; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id->id) == 0) + last = res; + res = last; + if (!res) + return 0; + + if (n >= resource_size(res)) { + n -= resource_size(res); + nd_dbg_dpa(nd_region, ndd, res, "delete %d\n", rc); + nvdimm_free_dpa(ndd, res); + /* retry with last resource deleted */ + continue; + } + + /* + * Keep BLK allocations relegated to high DPA as much as + * possible + */ + if (is_blk) + new_start = res->start + n; + else + new_start = res->start; + + rc = adjust_resource(res, new_start, resource_size(res) - n); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + nd_dbg_dpa(nd_region, ndd, res, "shrink %d\n", rc); + break; + } + + return rc; +} + +/** + * shrink_dpa_allocation - for each dimm in region free n bytes for label_id + * @nd_region: the set of dimms to reclaim @n bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to release + * + * Assumes resources are ordered. 
Starting from the end try to + * adjust_resource() the allocation to @n, but if @n is larger than the + * allocation delete it and find the 'new' last allocation in the label + * set. + */ +static int shrink_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + int rc; + + rc = scan_free(nd_region, nd_mapping, label_id, n); + if (rc) + return rc; + } + + return 0; +} + +static resource_size_t init_dpa_allocation(struct nd_label_id *label_id, + struct nd_region *nd_region, struct nd_mapping *nd_mapping, + resource_size_t n) +{ + bool is_blk = strncmp(label_id->id, "blk", 3) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + resource_size_t first_dpa; + struct resource *res; + int rc = 0; + + /* allocate blk from highest dpa first */ + if (is_blk) + first_dpa = nd_mapping->start + nd_mapping->size - n; + else + first_dpa = nd_mapping->start; + + /* first resource allocation for this label-id or dimm */ + res = nvdimm_allocate_dpa(ndd, label_id, first_dpa, n); + if (!res) + rc = -EBUSY; + + nd_dbg_dpa(nd_region, ndd, res, "init %d\n", rc); + return rc ? n : 0; +} + +static bool space_valid(bool is_pmem, bool is_reserve, + struct nd_label_id *label_id, struct resource *res) +{ + /* + * For BLK-space any space is valid, for PMEM-space, it must be + * contiguous with an existing allocation unless we are + * reserving pmem. 
+ */ + if (is_reserve || !is_pmem) + return true; + if (!res || strcmp(res->name, label_id->id) == 0) + return true; + return false; +} + +enum alloc_loc { + ALLOC_ERR = 0, ALLOC_BEFORE, ALLOC_MID, ALLOC_AFTER, +}; + +static resource_size_t scan_allocate(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id, + resource_size_t n) +{ + resource_size_t mapping_end = nd_mapping->start + nd_mapping->size - 1; + bool is_reserve = strcmp(label_id->id, "pmem-reserve") == 0; + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + const resource_size_t to_allocate = n; + struct resource *res; + int first; + + retry: + first = 0; + for_each_dpa_resource(ndd, res) { + resource_size_t allocate, available = 0, free_start, free_end; + struct resource *next = res->sibling, *new_res = NULL; + enum alloc_loc loc = ALLOC_ERR; + const char *action; + int rc = 0; + + /* ignore resources outside this nd_mapping */ + if (res->start > mapping_end) + continue; + if (res->end < nd_mapping->start) + continue; + + /* space at the beginning of the mapping */ + if (!first++ && res->start > nd_mapping->start) { + free_start = nd_mapping->start; + available = res->start - free_start; + if (space_valid(is_pmem, is_reserve, label_id, NULL)) + loc = ALLOC_BEFORE; + } + + /* space between allocations */ + if (!loc && next) { + free_start = res->start + resource_size(res); + free_end = min(mapping_end, next->start - 1); + if (space_valid(is_pmem, is_reserve, label_id, res) + && free_start < free_end) { + available = free_end + 1 - free_start; + loc = ALLOC_MID; + } + } + + /* space at the end of the mapping */ + if (!loc && !next) { + free_start = res->start + resource_size(res); + free_end = mapping_end; + if (space_valid(is_pmem, is_reserve, label_id, res) + && free_start < free_end) { + available = free_end + 1 - free_start; + loc = ALLOC_AFTER; + } + } + + if (!loc || !available) + continue; + allocate = 
min(available, n); + switch (loc) { + case ALLOC_BEFORE: + if (strcmp(res->name, label_id->id) == 0) { + /* adjust current resource up */ + if (is_pmem && !is_reserve) + return n; + rc = adjust_resource(res, res->start - allocate, + resource_size(res) + allocate); + action = "cur grow up"; + } else + action = "allocate"; + break; + case ALLOC_MID: + if (strcmp(next->name, label_id->id) == 0) { + /* adjust next resource up */ + if (is_pmem && !is_reserve) + return n; + rc = adjust_resource(next, next->start + - allocate, resource_size(next) + + allocate); + new_res = next; + action = "next grow up"; + } else if (strcmp(res->name, label_id->id) == 0) { + action = "grow down"; + } else + action = "allocate"; + break; + case ALLOC_AFTER: + if (strcmp(res->name, label_id->id) == 0) + action = "grow down"; + else + action = "allocate"; + break; + default: + return n; + } + + if (strcmp(action, "allocate") == 0) { + /* BLK allocate bottom up */ + if (!is_pmem) + free_start += available - allocate; + else if (!is_reserve && free_start != nd_mapping->start) + return n; + + new_res = nvdimm_allocate_dpa(ndd, label_id, + free_start, allocate); + if (!new_res) + rc = -EBUSY; + } else if (strcmp(action, "grow down") == 0) { + /* adjust current resource down */ + rc = adjust_resource(res, res->start, resource_size(res) + + allocate); + if (rc == 0) + res->flags |= DPA_RESOURCE_ADJUSTED; + } + + if (!new_res) + new_res = res; + + nd_dbg_dpa(nd_region, ndd, new_res, "%s(%d) %d\n", + action, loc, rc); + + if (rc) + return n; + + n -= allocate; + if (n) { + /* + * Retry scan with newly inserted resources. + * For example, if we did an ALLOC_BEFORE + * insertion there may also have been space + * available for an ALLOC_AFTER insertion, so we + * need to check this same resource again + */ + goto retry; + } else + return 0; + } + + /* + * If we allocated nothing in the BLK case it may be because we are in + * an initial "pmem-reserve pass". 
Only do an initial BLK allocation + * when none of the DPA space is reserved. + */ + if ((is_pmem || !ndd->dpa.child) && n == to_allocate) + return init_dpa_allocation(label_id, nd_region, nd_mapping, n); + return n; +} + +static int merge_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, struct nd_label_id *label_id) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + if (strncmp("pmem", label_id->id, 4) == 0) + return 0; + retry: + for_each_dpa_resource(ndd, res) { + int rc; + struct resource *next = res->sibling; + resource_size_t end = res->start + resource_size(res); + + if (!next || strcmp(res->name, label_id->id) != 0 + || strcmp(next->name, label_id->id) != 0 + || end != next->start) + continue; + end += resource_size(next); + nvdimm_free_dpa(ndd, next); + rc = adjust_resource(res, res->start, end - res->start); + nd_dbg_dpa(nd_region, ndd, res, "merge %d\n", rc); + if (rc) + return rc; + res->flags |= DPA_RESOURCE_ADJUSTED; + goto retry; + } + + return 0; +} + +static int __reserve_free_pmem(struct device *dev, void *data) +{ + struct nvdimm *nvdimm = data; + struct nd_region *nd_region; + struct nd_label_id label_id; + int i; + + if (!is_nd_pmem(dev)) + return 0; + + nd_region = to_nd_region(dev); + if (nd_region->ndr_mappings == 0) + return 0; + + memset(&label_id, 0, sizeof(label_id)); + strcat(label_id.id, "pmem-reserve"); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t n, rem = 0; + + if (nd_mapping->nvdimm != nvdimm) + continue; + + n = nd_pmem_available_dpa(nd_region, nd_mapping, &rem); + if (n == 0) + return 0; + rem = scan_allocate(nd_region, nd_mapping, &label_id, n); + dev_WARN_ONCE(&nd_region->dev, rem, + "pmem reserve underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + return rem ? 
-ENXIO : 0; + } + + return 0; +} + +static void release_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res, *_res; + + for_each_dpa_resource_safe(ndd, res, _res) + if (strcmp(res->name, "pmem-reserve") == 0) + nvdimm_free_dpa(ndd, res); +} + +static int reserve_free_pmem(struct nvdimm_bus *nvdimm_bus, + struct nd_mapping *nd_mapping) +{ + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int rc; + + rc = device_for_each_child(&nvdimm_bus->dev, nvdimm, + __reserve_free_pmem); + if (rc) + release_free_pmem(nvdimm_bus, nd_mapping); + return rc; +} + +/** + * grow_dpa_allocation - for each dimm allocate n bytes for @label_id + * @nd_region: the set of dimms to allocate @n more bytes from + * @label_id: unique identifier for the namespace consuming this dpa range + * @n: number of bytes per-dimm to add to the existing allocation + * + * Assumes resources are ordered. For BLK regions, first consume + * BLK-only available DPA free space, then consume PMEM-aliased DPA + * space starting at the highest DPA. For PMEM regions start + * allocations from the start of an interleave set and end at the first + * BLK allocation or the end of the interleave set, whichever comes + * first. 
+ */ +static int grow_dpa_allocation(struct nd_region *nd_region, + struct nd_label_id *label_id, resource_size_t n) +{ + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev); + bool is_pmem = strncmp(label_id->id, "pmem", 4) == 0; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + resource_size_t rem = n; + int rc, j; + + /* + * In the BLK case try once with all unallocated PMEM + * reserved, and once without + */ + for (j = is_pmem; j < 2; j++) { + bool blk_only = j == 0; + + if (blk_only) { + rc = reserve_free_pmem(nvdimm_bus, nd_mapping); + if (rc) + return rc; + } + rem = scan_allocate(nd_region, nd_mapping, + label_id, rem); + if (blk_only) + release_free_pmem(nvdimm_bus, nd_mapping); + + /* try again and allow encroachments into PMEM */ + if (rem == 0) + break; + } + + dev_WARN_ONCE(&nd_region->dev, rem, + "allocation underrun: %#llx of %#llx bytes\n", + (unsigned long long) n - rem, + (unsigned long long) n); + if (rem) + return -ENXIO; + + rc = merge_dpa(nd_region, nd_mapping, label_id); + if (rc) + return rc; + } + + return 0; +} + +static void nd_namespace_pmem_set_size(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm, resource_size_t size) +{ + struct resource *res = &nspm->nsio.res; + + res->start = nd_region->ndr_start; + res->end = nd_region->ndr_start + size - 1; +} + +static ssize_t __size_store(struct device *dev, unsigned long long val) +{ + resource_size_t allocated = 0, available = 0; + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_mapping *nd_mapping; + struct nvdimm_drvdata *ndd; + struct nd_label_id label_id; + u32 flags = 0, remainder; + u8 *uuid = NULL; + int rc, i; + + if (dev->driver || to_ndns(dev)->claim) + return -EBUSY; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = 
to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + flags = NSLABEL_FLAG_LOCAL; + } + + /* + * We need a uuid for the allocation-label and dimm(s) on which + * to store the label. + */ + if (!uuid || nd_region->ndr_mappings == 0) + return -ENXIO; + + div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder); + if (remainder) { + dev_dbg(dev, "%llu is not %dK aligned\n", val, + (SZ_4K * nd_region->ndr_mappings) / SZ_1K); + return -EINVAL; + } + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + nd_mapping = &nd_region->mapping[i]; + ndd = to_ndd(nd_mapping); + + /* + * All dimms in an interleave set, or the base dimm for a blk + * region, need to be enabled for the size to be changed. + */ + if (!ndd) + return -ENXIO; + + allocated += nvdimm_allocated_dpa(ndd, &label_id); + } + available = nd_region_available_dpa(nd_region); + + if (val > available + allocated) + return -ENOSPC; + + if (val == allocated) + return 0; + + val = div_u64(val, nd_region->ndr_mappings); + allocated = div_u64(allocated, nd_region->ndr_mappings); + if (val < allocated) + rc = shrink_dpa_allocation(nd_region, &label_id, + allocated - val); + else + rc = grow_dpa_allocation(nd_region, &label_id, val - allocated); + + if (rc) + return rc; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + nd_namespace_pmem_set_size(nd_region, nspm, + val * nd_region->ndr_mappings); + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + /* + * Try to delete the namespace if we deleted all of its + * allocation, this is not the seed device for the + * region, and it is not actively claimed by a btt + * instance. 
+ */ + if (val == 0 && nd_region->ns_seed != dev + && !nsblk->common.claim) + nd_device_unregister(dev, ND_ASYNC); + } + + return rc; +} + +static ssize_t size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + unsigned long long val; + u8 **uuid = NULL; + int rc; + + rc = kstrtoull(buf, 0, &val); + if (rc) + return rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + rc = __size_store(dev, val); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = &nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = &nsblk->uuid; + } + + if (rc == 0 && val == 0 && uuid) { + /* setting size zero == 'delete namespace' */ + kfree(*uuid); + *uuid = NULL; + } + + dev_dbg(dev, "%s: %llx %s (%d)\n", __func__, val, rc < 0 + ? "fail" : "success", rc); + + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? 
rc : len; +} + +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + struct device *dev = &ndns->dev; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + return resource_size(&nspm->nsio.res); + } else if (is_namespace_blk(dev)) { + return nd_namespace_blk_size(to_nd_namespace_blk(dev)); + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + return resource_size(&nsio->res); + } else + WARN_ONCE(1, "unknown namespace type\n"); + return 0; +} + +resource_size_t nvdimm_namespace_capacity(struct nd_namespace_common *ndns) +{ + resource_size_t size; + + nvdimm_bus_lock(&ndns->dev); + size = __nvdimm_namespace_capacity(ndns); + nvdimm_bus_unlock(&ndns->dev); + + return size; +} +EXPORT_SYMBOL(nvdimm_namespace_capacity); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long) + nvdimm_namespace_capacity(to_ndns(dev))); +} +static DEVICE_ATTR(size, S_IRUGO, size_show, size_store); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u8 *uuid; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + } else + return -ENXIO; + + if (uuid) + return sprintf(buf, "%pUb\n", uuid); + return sprintf(buf, "\n"); +} + +/** + * namespace_update_uuid - check for a unique uuid and whether we're "renaming" + * @nd_region: parent region so we can updates all dimms in the set + * @dev: namespace type for generating label_id + * @new_uuid: incoming uuid + * @old_uuid: reference to the uuid storage location in the namespace object + */ +static int namespace_update_uuid(struct nd_region *nd_region, + struct device *dev, u8 *new_uuid, u8 **old_uuid) +{ + 
u32 flags = is_namespace_blk(dev) ? NSLABEL_FLAG_LOCAL : 0; + struct nd_label_id old_label_id; + struct nd_label_id new_label_id; + int i; + + if (!nd_is_uuid_unique(dev, new_uuid)) + return -EINVAL; + + if (*old_uuid == NULL) + goto out; + + /* + * If we've already written a label with this uuid, then it's + * too late to rename because we can't reliably update the uuid + * without losing the old namespace. Userspace must delete this + * namespace to abandon the old uuid. + */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + /* + * This check by itself is sufficient because old_uuid + * would be NULL above if this uuid did not exist in the + * currently written set. + * + * FIXME: can we delete uuid with zero dpa allocated? + */ + if (nd_mapping->labels) + return -EBUSY; + } + + nd_label_gen_id(&old_label_id, *old_uuid, flags); + nd_label_gen_id(&new_label_id, new_uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, old_label_id.id) == 0) + sprintf((void *) res->name, "%s", + new_label_id.id); + } + kfree(*old_uuid); + out: + *old_uuid = new_uuid; + return 0; +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + u8 *uuid = NULL; + ssize_t rc = 0; + u8 **ns_uuid; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + ns_uuid = &nspm->uuid; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + ns_uuid = &nsblk->uuid; + } else + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = 
nd_uuid_store(dev, &uuid, buf, len); + if (rc >= 0) + rc = namespace_update_uuid(nd_region, dev, uuid, ns_uuid); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + else + kfree(uuid); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc < 0 ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t resource_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct resource *res; + + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + res = &nspm->nsio.res; + } else if (is_namespace_io(dev)) { + struct nd_namespace_io *nsio = to_nd_namespace_io(dev); + + res = &nsio->res; + } else + return -ENXIO; + + /* no address to convey if the namespace has no allocation */ + if (resource_size(res) == 0) + return -ENXIO; + return sprintf(buf, "%#llx\n", (unsigned long long) res->start); +} +static DEVICE_ATTR_RO(resource); + +static const unsigned long ns_lbasize_supported[] = { 512, 520, 528, + 4096, 4104, 4160, 4224, 0 }; + +static ssize_t sector_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + if (!is_namespace_blk(dev)) + return -ENXIO; + + return nd_sector_size_show(nsblk->lbasize, ns_lbasize_supported, buf); +} + +static ssize_t sector_size_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + struct nd_region *nd_region = to_nd_region(dev->parent); + ssize_t rc = 0; + + if (!is_namespace_blk(dev)) + return -ENXIO; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (to_ndns(dev)->claim) + rc = -EBUSY; + if (rc >= 0) + rc = nd_sector_size_store(dev, buf, &nsblk->lbasize, + ns_lbasize_supported); + if (rc >= 0) + rc = nd_namespace_label_update(nd_region, dev); + dev_dbg(dev, "%s: result: 
%zd %s: %s%s", __func__, + rc, rc < 0 ? "tried" : "wrote", buf, + buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(sector_size); + +static ssize_t dpa_extents_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_label_id label_id; + int count = 0, i; + u8 *uuid = NULL; + u32 flags = 0; + + nvdimm_bus_lock(dev); + if (is_namespace_pmem(dev)) { + struct nd_namespace_pmem *nspm = to_nd_namespace_pmem(dev); + + uuid = nspm->uuid; + flags = 0; + } else if (is_namespace_blk(dev)) { + struct nd_namespace_blk *nsblk = to_nd_namespace_blk(dev); + + uuid = nsblk->uuid; + flags = NSLABEL_FLAG_LOCAL; + } + + if (!uuid) + goto out; + + nd_label_gen_id(&label_id, uuid, flags); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct resource *res; + + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0) + count++; + } + out: + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%d\n", count); +} +static DEVICE_ATTR_RO(dpa_extents); + +static ssize_t holder_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_namespace_common *ndns = to_ndns(dev); + ssize_t rc; + + device_lock(dev); + rc = sprintf(buf, "%s\n", ndns->claim ? 
dev_name(ndns->claim) : ""); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(holder); + +static ssize_t force_raw_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool force_raw; + int rc = strtobool(buf, &force_raw); + + if (rc) + return rc; + + to_ndns(dev)->force_raw = force_raw; + return len; +} + +static ssize_t force_raw_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", to_ndns(dev)->force_raw); +} +static DEVICE_ATTR_RW(force_raw); + +static struct attribute *nd_namespace_attributes[] = { + &dev_attr_nstype.attr, + &dev_attr_size.attr, + &dev_attr_uuid.attr, + &dev_attr_holder.attr, + &dev_attr_resource.attr, + &dev_attr_alt_name.attr, + &dev_attr_force_raw.attr, + &dev_attr_sector_size.attr, + &dev_attr_dpa_extents.attr, + NULL, +}; + +static umode_t namespace_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + + if (a == &dev_attr_resource.attr) { + if (is_namespace_blk(dev)) + return 0; + return a->mode; + } + + if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { + if (a == &dev_attr_size.attr) + return S_IWUSR | S_IRUGO; + + if (is_namespace_pmem(dev) && a == &dev_attr_sector_size.attr) + return 0; + + return a->mode; + } + + if (a == &dev_attr_nstype.attr || a == &dev_attr_size.attr + || a == &dev_attr_holder.attr + || a == &dev_attr_force_raw.attr) + return a->mode; + + return 0; +} + +static struct attribute_group nd_namespace_attribute_group = { + .attrs = nd_namespace_attributes, + .is_visible = namespace_visible, +}; + +static const struct attribute_group *nd_namespace_attribute_groups[] = { + &nd_device_attribute_group, + &nd_namespace_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +struct nd_namespace_common *nvdimm_namespace_common_probe(struct device *dev) +{ + struct nd_btt *nd_btt = is_nd_btt(dev) ? 
to_nd_btt(dev) : NULL; + struct nd_pfn *nd_pfn = is_nd_pfn(dev) ? to_nd_pfn(dev) : NULL; + struct nd_namespace_common *ndns; + resource_size_t size; + + if (nd_btt || nd_pfn) { + struct device *host = NULL; + + if (nd_btt) { + host = &nd_btt->dev; + ndns = nd_btt->ndns; + } else if (nd_pfn) { + host = &nd_pfn->dev; + ndns = nd_pfn->ndns; + } + + if (!ndns || !host) + return ERR_PTR(-ENODEV); + + /* + * Flush any in-progess probes / removals in the driver + * for the raw personality of this namespace. + */ + device_lock(&ndns->dev); + device_unlock(&ndns->dev); + if (ndns->dev.driver) { + dev_dbg(&ndns->dev, "is active, can't bind %s\n", + dev_name(host)); + return ERR_PTR(-EBUSY); + } + if (dev_WARN_ONCE(&ndns->dev, ndns->claim != host, + "host (%s) vs claim (%s) mismatch\n", + dev_name(host), + dev_name(ndns->claim))) + return ERR_PTR(-ENXIO); + } else { + ndns = to_ndns(dev); + if (ndns->claim) { + dev_dbg(dev, "claimed by %s, failing probe\n", + dev_name(ndns->claim)); + + return ERR_PTR(-ENXIO); + } + } + + size = nvdimm_namespace_capacity(ndns); + if (size < ND_MIN_NAMESPACE_SIZE) { + dev_dbg(&ndns->dev, "%pa, too small must be at least %#x\n", + &size, ND_MIN_NAMESPACE_SIZE); + return ERR_PTR(-ENODEV); + } + + if (is_namespace_pmem(&ndns->dev)) { + struct nd_namespace_pmem *nspm; + + nspm = to_nd_namespace_pmem(&ndns->dev); + if (!nspm->uuid) { + dev_dbg(&ndns->dev, "%s: uuid not set\n", __func__); + return ERR_PTR(-ENODEV); + } + } else if (is_namespace_blk(&ndns->dev)) { + struct nd_namespace_blk *nsblk; + + nsblk = to_nd_namespace_blk(&ndns->dev); + if (!nd_namespace_blk_validate(nsblk)) + return ERR_PTR(-ENODEV); + } + + return ndns; +} +EXPORT_SYMBOL(nvdimm_namespace_common_probe); + +static struct device **create_namespace_io(struct nd_region *nd_region) +{ + struct nd_namespace_io *nsio; + struct device *dev, **devs; + struct resource *res; + + nsio = kzalloc(sizeof(*nsio), GFP_KERNEL); + if (!nsio) + return NULL; + + devs = kcalloc(2, sizeof(struct 
device *), GFP_KERNEL); + if (!devs) { + kfree(nsio); + return NULL; + } + + dev = &nsio->common.dev; + dev->type = &namespace_io_device_type; + dev->parent = &nd_region->dev; + res = &nsio->res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + res->start = nd_region->ndr_start; + res->end = res->start + nd_region->ndr_size - 1; + + devs[0] = dev; + return devs; +} + +static bool has_uuid_at_pos(struct nd_region *nd_region, u8 *uuid, + u64 cookie, u16 pos) +{ + struct nd_namespace_label *found = NULL; + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *nd_label; + bool found_uuid = false; + int l; + + for_each_label(l, nd_label, nd_mapping->labels) { + u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + u16 position = __le16_to_cpu(nd_label->position); + u16 nlabel = __le16_to_cpu(nd_label->nlabel); + + if (isetcookie != cookie) + continue; + + if (memcmp(nd_label->uuid, uuid, NSLABEL_UUID_LEN) != 0) + continue; + + if (found_uuid) { + dev_dbg(to_ndd(nd_mapping)->dev, + "%s duplicate entry for uuid\n", + __func__); + return false; + } + found_uuid = true; + if (nlabel != nd_region->ndr_mappings) + continue; + if (position != pos) + continue; + found = nd_label; + break; + } + if (found) + break; + } + return found != NULL; +} + +static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id) +{ + struct nd_namespace_label *select = NULL; + int i; + + if (!pmem_id) + return -ENODEV; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *nd_label; + u64 hw_start, hw_end, pmem_start, pmem_end; + int l; + + for_each_label(l, nd_label, nd_mapping->labels) + if (memcmp(nd_label->uuid, pmem_id, NSLABEL_UUID_LEN) == 0) + break; + + if (!nd_label) { + WARN_ON(1); + return -EINVAL; + } + + select = nd_label; + /* + * Check that this label is compliant with the dpa + 
* range published in NFIT + */ + hw_start = nd_mapping->start; + hw_end = hw_start + nd_mapping->size; + pmem_start = __le64_to_cpu(select->dpa); + pmem_end = pmem_start + __le64_to_cpu(select->rawsize); + if (pmem_start == hw_start && pmem_end <= hw_end) + /* pass */; + else + return -EINVAL; + + nd_mapping->labels[0] = select; + nd_mapping->labels[1] = NULL; + } + return 0; +} + +/** + * find_pmem_label_set - validate interleave set labelling, retrieve label0 + * @nd_region: region with mappings to validate + */ +static int find_pmem_label_set(struct nd_region *nd_region, + struct nd_namespace_pmem *nspm) +{ + u64 cookie = nd_region_interleave_set_cookie(nd_region); + struct nd_namespace_label *nd_label; + u8 select_id[NSLABEL_UUID_LEN]; + resource_size_t size = 0; + u8 *pmem_id = NULL; + int rc = -ENODEV, l; + u16 i; + + if (cookie == 0) + return -ENXIO; + + /* + * Find a complete set of labels by uuid. By definition we can start + * with any mapping as the reference label + */ + for_each_label(l, nd_label, nd_region->mapping[0].labels) { + u64 isetcookie = __le64_to_cpu(nd_label->isetcookie); + + if (isetcookie != cookie) + continue; + + for (i = 0; nd_region->ndr_mappings; i++) + if (!has_uuid_at_pos(nd_region, nd_label->uuid, + cookie, i)) + break; + if (i < nd_region->ndr_mappings) { + /* + * Give up if we don't find an instance of a + * uuid at each position (from 0 to + * nd_region->ndr_mappings - 1), or if we find a + * dimm with two instances of the same uuid. + */ + rc = -EINVAL; + goto err; + } else if (pmem_id) { + /* + * If there is more than one valid uuid set, we + * need userspace to clean this up. + */ + rc = -EBUSY; + goto err; + } + memcpy(select_id, nd_label->uuid, NSLABEL_UUID_LEN); + pmem_id = select_id; + } + + /* + * Fix up each mapping's 'labels' to have the validated pmem label for + * that position at labels[0], and NULL at labels[1]. In the process, + * check that the namespace aligns with interleave-set. 
We know + * that it does not overlap with any blk namespaces by virtue of + * the dimm being enabled (i.e. nd_label_reserve_dpa() + * succeeded). + */ + rc = select_pmem_id(nd_region, pmem_id); + if (rc) + goto err; + + /* Calculate total size and populate namespace properties from label0 */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nd_namespace_label *label0 = nd_mapping->labels[0]; + + size += __le64_to_cpu(label0->rawsize); + if (__le16_to_cpu(label0->position) != 0) + continue; + WARN_ON(nspm->alt_name || nspm->uuid); + nspm->alt_name = kmemdup((void __force *) label0->name, + NSLABEL_NAME_LEN, GFP_KERNEL); + nspm->uuid = kmemdup((void __force *) label0->uuid, + NSLABEL_UUID_LEN, GFP_KERNEL); + } + + if (!nspm->alt_name || !nspm->uuid) { + rc = -ENOMEM; + goto err; + } + + nd_namespace_pmem_set_size(nd_region, nspm, size); + + return 0; + err: + switch (rc) { + case -EINVAL: + dev_dbg(&nd_region->dev, "%s: invalid label(s)\n", __func__); + break; + case -ENODEV: + dev_dbg(&nd_region->dev, "%s: label not found\n", __func__); + break; + default: + dev_dbg(&nd_region->dev, "%s: unexpected err: %d\n", + __func__, rc); + break; + } + return rc; +} + +static struct device **create_namespace_pmem(struct nd_region *nd_region) +{ + struct nd_namespace_pmem *nspm; + struct device *dev, **devs; + struct resource *res; + int rc; + + nspm = kzalloc(sizeof(*nspm), GFP_KERNEL); + if (!nspm) + return NULL; + + dev = &nspm->nsio.common.dev; + dev->type = &namespace_pmem_device_type; + dev->parent = &nd_region->dev; + res = &nspm->nsio.res; + res->name = dev_name(&nd_region->dev); + res->flags = IORESOURCE_MEM; + rc = find_pmem_label_set(nd_region, nspm); + if (rc == -ENODEV) { + int i; + + /* Pass, try to permit namespace creation... 
*/ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + } + + /* Publish a zero-sized namespace for userspace to configure. */ + nd_namespace_pmem_set_size(nd_region, nspm, 0); + + rc = 0; + } else if (rc) + goto err; + + devs = kcalloc(2, sizeof(struct device *), GFP_KERNEL); + if (!devs) + goto err; + + devs[0] = dev; + return devs; + + err: + namespace_pmem_release(&nspm->nsio.common.dev); + return NULL; +} + +struct resource *nsblk_add_resource(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, + resource_size_t start) +{ + struct nd_label_id label_id; + struct resource *res; + + nd_label_gen_id(&label_id, nsblk->uuid, NSLABEL_FLAG_LOCAL); + res = krealloc(nsblk->res, + sizeof(void *) * (nsblk->num_resources + 1), + GFP_KERNEL); + if (!res) + return NULL; + nsblk->res = (struct resource **) res; + for_each_dpa_resource(ndd, res) + if (strcmp(res->name, label_id.id) == 0 + && res->start == start) { + nsblk->res[nsblk->num_resources++] = res; + return res; + } + return NULL; +} + +static struct device *nd_namespace_blk_create(struct nd_region *nd_region) +{ + struct nd_namespace_blk *nsblk; + struct device *dev; + + if (!is_nd_blk(&nd_region->dev)) + return NULL; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + return NULL; + + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + nsblk->id = ida_simple_get(&nd_region->ns_ida, 0, 0, GFP_KERNEL); + if (nsblk->id < 0) { + kfree(nsblk); + return NULL; + } + dev_set_name(dev, "namespace%d.%d", nd_region->id, nsblk->id); + dev->parent = &nd_region->dev; + dev->groups = nd_namespace_attribute_groups; + + return &nsblk->common.dev; +} + +void nd_region_create_blk_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->ns_seed = nd_namespace_blk_create(nd_region); + /* + * Seed creation 
failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->ns_seed) + dev_err(&nd_region->dev, "failed to create blk namespace\n"); + else + nd_device_register(nd_region->ns_seed); +} + +void nd_region_create_btt_seed(struct nd_region *nd_region) +{ + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + nd_region->btt_seed = nd_btt_create(nd_region); + /* + * Seed creation failures are not fatal, provisioning is simply + * disabled until memory becomes available + */ + if (!nd_region->btt_seed) + dev_err(&nd_region->dev, "failed to create btt namespace\n"); +} + +static struct device **create_namespace_blk(struct nd_region *nd_region) +{ + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + struct nd_namespace_label *nd_label; + struct device *dev, **devs = NULL; + struct nd_namespace_blk *nsblk; + struct nvdimm_drvdata *ndd; + int i, l, count = 0; + struct resource *res; + + if (nd_region->ndr_mappings == 0) + return NULL; + + ndd = to_ndd(nd_mapping); + for_each_label(l, nd_label, nd_mapping->labels) { + u32 flags = __le32_to_cpu(nd_label->flags); + char *name[NSLABEL_NAME_LEN]; + struct device **__devs; + + if (flags & NSLABEL_FLAG_LOCAL) + /* pass */; + else + continue; + + for (i = 0; i < count; i++) { + nsblk = to_nd_namespace_blk(devs[i]); + if (memcmp(nsblk->uuid, nd_label->uuid, + NSLABEL_UUID_LEN) == 0) { + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto err; + nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", + dev_name(&nsblk->common.dev)); + break; + } + } + if (i < count) + continue; + __devs = kcalloc(count + 2, sizeof(dev), GFP_KERNEL); + if (!__devs) + goto err; + memcpy(__devs, devs, sizeof(dev) * count); + kfree(devs); + devs = __devs; + + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + dev_set_name(dev, 
"namespace%d.%d", nd_region->id, count); + devs[count++] = dev; + nsblk->id = -1; + nsblk->lbasize = __le64_to_cpu(nd_label->lbasize); + nsblk->uuid = kmemdup(nd_label->uuid, NSLABEL_UUID_LEN, + GFP_KERNEL); + if (!nsblk->uuid) + goto err; + memcpy(name, nd_label->name, NSLABEL_NAME_LEN); + if (name[0]) + nsblk->alt_name = kmemdup(name, NSLABEL_NAME_LEN, + GFP_KERNEL); + res = nsblk_add_resource(nd_region, ndd, nsblk, + __le64_to_cpu(nd_label->dpa)); + if (!res) + goto err; + nd_dbg_dpa(nd_region, ndd, res, "%s assign\n", + dev_name(&nsblk->common.dev)); + } + + dev_dbg(&nd_region->dev, "%s: discovered %d blk namespace%s\n", + __func__, count, count == 1 ? "" : "s"); + + if (count == 0) { + /* Publish a zero-sized namespace for userspace to configure. */ + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + } + + devs = kcalloc(2, sizeof(dev), GFP_KERNEL); + if (!devs) + goto err; + nsblk = kzalloc(sizeof(*nsblk), GFP_KERNEL); + if (!nsblk) + goto err; + dev = &nsblk->common.dev; + dev->type = &namespace_blk_device_type; + dev->parent = &nd_region->dev; + devs[count++] = dev; + } + + return devs; + +err: + for (i = 0; i < count; i++) { + nsblk = to_nd_namespace_blk(devs[i]); + namespace_blk_release(&nsblk->common.dev); + } + kfree(devs); + return NULL; +} + +static int init_active_labels(struct nd_region *nd_region) +{ + int i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + struct nvdimm *nvdimm = nd_mapping->nvdimm; + int count, j; + + /* + * If the dimm is disabled then prevent the region from + * being activated if it aliases DPA. 
/*
 * nd_region_register_namespaces - create and register the namespace
 * devices of a region
 * @nd_region: region being activated
 * @err: out parameter, set to the number of namespace devices that were
 *       created but could not be registered (0 when all registered)
 *
 * Under the bus lock the active labels are loaded and the namespace
 * devices matching the region's namespace type are created.  The
 * devices are then named and registered outside the lock.
 *
 * Returns the error from init_active_labels(), -ENODEV when no device
 * array was created, otherwise the number of devices registered.
 */
int nd_region_register_namespaces(struct nd_region *nd_region, int *err)
{
	struct device **devs = NULL;
	int i, rc = 0, type;

	*err = 0;
	nvdimm_bus_lock(&nd_region->dev);
	rc = init_active_labels(nd_region);
	if (rc) {
		nvdimm_bus_unlock(&nd_region->dev);
		return rc;
	}

	/* pick the constructor that matches the region's namespace type */
	type = nd_region_to_nstype(nd_region);
	switch (type) {
	case ND_DEVICE_NAMESPACE_IO:
		devs = create_namespace_io(nd_region);
		break;
	case ND_DEVICE_NAMESPACE_PMEM:
		devs = create_namespace_pmem(nd_region);
		break;
	case ND_DEVICE_NAMESPACE_BLK:
		devs = create_namespace_blk(nd_region);
		break;
	default:
		break;
	}
	nvdimm_bus_unlock(&nd_region->dev);

	if (!devs)
		return -ENODEV;

	/* devs is NULL-terminated; stop at the first id allocation failure */
	for (i = 0; devs[i]; i++) {
		struct device *dev = devs[i];
		int id;

		if (type == ND_DEVICE_NAMESPACE_BLK) {
			struct nd_namespace_blk *nsblk;

			/* blk namespaces take their id from the region's ida */
			nsblk = to_nd_namespace_blk(dev);
			id = ida_simple_get(&nd_region->ns_ida, 0, 0,
					GFP_KERNEL);
			nsblk->id = id;
		} else
			id = i;

		if (id < 0)
			break;
		dev_set_name(dev, "namespace%d.%d", nd_region->id, id);
		dev->groups = nd_namespace_attribute_groups;
		nd_device_register(dev);
	}
	/* the first registered namespace becomes the region's seed */
	if (i)
		nd_region->ns_seed = devs[0];

	if (devs[i]) {
		int j;

		/*
		 * Devices from index i onwards were never registered:
		 * initialize each one so that put_device() can drop it.
		 */
		for (j = i; devs[j]; j++) {
			struct device *dev = devs[j];

			device_initialize(dev);
			put_device(dev);
		}
		*err = j - i;
		/*
		 * All of the namespaces we tried to register failed, so
		 * fail region activation.
		 */
		/*
		 * NOTE(review): devs[i] is non-NULL here, so j ends at least
		 * one past i and *err cannot be 0; this check looks
		 * unreachable.  Possibly "i == 0" was intended - confirm.
		 */
		if (*err == 0)
			rc = -ENODEV;
	}
	kfree(devs);

	if (rc == -ENODEV)
		return rc;

	return i;
}
nd_region_create_blk_seed(struct nd_region *nd_region); +void nd_region_create_btt_seed(struct nd_region *nd_region); +void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev); +int nvdimm_bus_create_ndctl(struct nvdimm_bus *nvdimm_bus); +void nvdimm_bus_destroy_ndctl(struct nvdimm_bus *nvdimm_bus); +void nd_synchronize(void); +int nvdimm_bus_register_dimms(struct nvdimm_bus *nvdimm_bus); +int nvdimm_bus_register_regions(struct nvdimm_bus *nvdimm_bus); +int nvdimm_bus_init_interleave_sets(struct nvdimm_bus *nvdimm_bus); +void __nd_device_register(struct device *dev); +int nd_match_dimm(struct device *dev, void *data); +struct nd_label_id; +char *nd_label_gen_id(struct nd_label_id *label_id, u8 *uuid, u32 flags); +bool nd_is_uuid_unique(struct device *dev, u8 *uuid); +struct nd_region; +struct nvdimm_drvdata; +struct nd_mapping; +resource_size_t nd_pmem_available_dpa(struct nd_region *nd_region, + struct nd_mapping *nd_mapping, resource_size_t *overlap); +resource_size_t nd_blk_available_dpa(struct nd_mapping *nd_mapping); +resource_size_t nd_region_available_dpa(struct nd_region *nd_region); +resource_size_t nvdimm_allocated_dpa(struct nvdimm_drvdata *ndd, + struct nd_label_id *label_id); +struct nd_mapping; +struct resource *nsblk_add_resource(struct nd_region *nd_region, + struct nvdimm_drvdata *ndd, struct nd_namespace_blk *nsblk, + resource_size_t start); +int nvdimm_num_label_slots(struct nvdimm_drvdata *ndd); +void get_ndd(struct nvdimm_drvdata *ndd); +resource_size_t __nvdimm_namespace_capacity(struct nd_namespace_common *ndns); +void nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); +void __nd_detach_ndns(struct device *dev, struct nd_namespace_common **_ndns); +bool nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns); +bool __nd_attach_ndns(struct device *dev, struct nd_namespace_common *attach, + struct nd_namespace_common **_ndns); +ssize_t 
nd_namespace_store(struct device *dev, + struct nd_namespace_common **_ndns, const char *buf, + size_t len); +#endif /* __ND_CORE_H__ */ diff --git a/kernel/drivers/nvdimm/nd.h b/kernel/drivers/nvdimm/nd.h new file mode 100644 index 000000000..417e521d2 --- /dev/null +++ b/kernel/drivers/nvdimm/nd.h @@ -0,0 +1,281 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef __ND_H__ +#define __ND_H__ +#include <linux/libnvdimm.h> +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/ndctl.h> +#include <linux/types.h> +#include "label.h" + +enum { + /* + * Limits the maximum number of block apertures a dimm can + * support and is an input to the geometry/on-disk-format of a + * BTT instance + */ + ND_MAX_LANES = 256, + SECTOR_SHIFT = 9, + INT_LBASIZE_ALIGNMENT = 64, +#if IS_ENABLED(CONFIG_NVDIMM_PFN) + ND_PFN_ALIGN = PAGES_PER_SECTION * PAGE_SIZE, + ND_PFN_MASK = ND_PFN_ALIGN - 1, +#else + ND_PFN_ALIGN = 0, + ND_PFN_MASK = 0, +#endif +}; + +struct nvdimm_drvdata { + struct device *dev; + int nsindex_size; + struct nd_cmd_get_config_size nsarea; + void *data; + int ns_current, ns_next; + struct resource dpa; + struct kref kref; +}; + +struct nd_region_namespaces { + int count; + int active; +}; + +static inline struct nd_namespace_index *to_namespace_index( + struct nvdimm_drvdata *ndd, int i) +{ + if (i < 0) + return NULL; + + return ndd->data + sizeof_namespace_index(ndd) * i; +} + +static inline struct nd_namespace_index *to_current_namespace_index( + 
/*
 * for_each_label - iterate over a NULL-terminated array of label pointers
 * @l: integer loop index
 * @label: cursor, set to each array entry in turn
 * @labels: array of pointers terminated by a NULL entry; may itself be NULL,
 *          in which case the body is never executed
 *
 * Every argument is parenthesized so that expression arguments (for
 * example a conditional expression for @labels) expand as intended.
 */
#define for_each_label(l, label, labels) \
	for ((l) = 0; ((label) = (labels) ? (labels)[(l)] : NULL); (l)++)
/*
 * Advance a two-bit sequence number through the cycle 01 -> 10 -> 11 -> 01.
 * Only the low two bits of @seq are considered; 00 maps to itself.
 */
static inline unsigned nd_inc_seq(unsigned seq)
{
	switch (seq & 3) {
	case 1:
		return 2;
	case 2:
		return 3;
	case 3:
		return 1;
	default:
		/* 00 is outside the cycle and stays 00 */
		return 0;
	}
}
struct nd_pfn *to_nd_pfn(struct device *dev);
/*
 * pfn device interface.  With CONFIG_NVDIMM_PFN enabled the functions
 * are provided out of line; otherwise the inline stubs below report
 * that no pfn support is available.
 */
#if IS_ENABLED(CONFIG_NVDIMM_PFN)
int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata);
bool is_nd_pfn(struct device *dev);
struct device *nd_pfn_create(struct nd_region *nd_region);
int nd_pfn_validate(struct nd_pfn *nd_pfn);
#else
/* stub: probing always fails with -ENODEV */
static inline int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata)
{
	return -ENODEV;
}

/* stub: no device is ever a pfn device */
static inline bool is_nd_pfn(struct device *dev)
{
	return false;
}

/* stub: no pfn device can be created */
static inline struct device *nd_pfn_create(struct nd_region *nd_region)
{
	return NULL;
}

/* stub: validation always fails with -ENODEV */
static inline int nd_pfn_validate(struct nd_pfn *nd_pfn)
{
	return -ENODEV;
}
#endif
*nvdimm_namespace_disk_name(struct nd_namespace_common *ndns, + char *name); +int nd_blk_region_init(struct nd_region *nd_region); +void __nd_iostat_start(struct bio *bio, unsigned long *start); +static inline bool nd_iostat_start(struct bio *bio, unsigned long *start) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + if (!blk_queue_io_stat(disk->queue)) + return false; + + __nd_iostat_start(bio, start); + return true; +} +void nd_iostat_end(struct bio *bio, unsigned long start); +resource_size_t nd_namespace_blk_validate(struct nd_namespace_blk *nsblk); +const u8 *nd_dev_to_uuid(struct device *dev); +bool pmem_should_map_pages(struct device *dev); +#endif /* __ND_H__ */ diff --git a/kernel/drivers/nvdimm/pfn.h b/kernel/drivers/nvdimm/pfn.h new file mode 100644 index 000000000..cc243754a --- /dev/null +++ b/kernel/drivers/nvdimm/pfn.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2014-2015, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef __NVDIMM_PFN_H +#define __NVDIMM_PFN_H + +#include <linux/types.h> + +#define PFN_SIG_LEN 16 +#define PFN_SIG "NVDIMM_PFN_INFO\0" + +struct nd_pfn_sb { + u8 signature[PFN_SIG_LEN]; + u8 uuid[16]; + u8 parent_uuid[16]; + __le32 flags; + __le16 version_major; + __le16 version_minor; + __le64 dataoff; + __le64 npfns; + __le32 mode; + u8 padding[4012]; + __le64 checksum; +}; +#endif /* __NVDIMM_PFN_H */ diff --git a/kernel/drivers/nvdimm/pfn_devs.c b/kernel/drivers/nvdimm/pfn_devs.c new file mode 100644 index 000000000..71805a1aa --- /dev/null +++ b/kernel/drivers/nvdimm/pfn_devs.c @@ -0,0 +1,337 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/blkdev.h> +#include <linux/device.h> +#include <linux/genhd.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "nd-core.h" +#include "pfn.h" +#include "nd.h" + +static void nd_pfn_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + dev_dbg(dev, "%s\n", __func__); + nd_detach_ndns(&nd_pfn->dev, &nd_pfn->ndns); + ida_simple_remove(&nd_region->pfn_ida, nd_pfn->id); + kfree(nd_pfn->uuid); + kfree(nd_pfn); +} + +static struct device_type nd_pfn_device_type = { + .name = "nd_pfn", + .release = nd_pfn_release, +}; + +bool is_nd_pfn(struct device *dev) +{ + return dev ? 
dev->type == &nd_pfn_device_type : false; +} +EXPORT_SYMBOL(is_nd_pfn); + +struct nd_pfn *to_nd_pfn(struct device *dev) +{ + struct nd_pfn *nd_pfn = container_of(dev, struct nd_pfn, dev); + + WARN_ON(!is_nd_pfn(dev)); + return nd_pfn; +} +EXPORT_SYMBOL(to_nd_pfn); + +static ssize_t mode_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + switch (nd_pfn->mode) { + case PFN_MODE_RAM: + return sprintf(buf, "ram\n"); + case PFN_MODE_PMEM: + return sprintf(buf, "pmem\n"); + default: + return sprintf(buf, "none\n"); + } +} + +static ssize_t mode_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc = 0; + + device_lock(dev); + nvdimm_bus_lock(dev); + if (dev->driver) + rc = -EBUSY; + else { + size_t n = len - 1; + + if (strncmp(buf, "pmem\n", n) == 0 + || strncmp(buf, "pmem", n) == 0) { + /* TODO: allocate from PMEM support */ + rc = -ENOTTY; + } else if (strncmp(buf, "ram\n", n) == 0 + || strncmp(buf, "ram", n) == 0) + nd_pfn->mode = PFN_MODE_RAM; + else if (strncmp(buf, "none\n", n) == 0 + || strncmp(buf, "none", n) == 0) + nd_pfn->mode = PFN_MODE_NONE; + else + rc = -EINVAL; + } + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc ? 
rc : len; +} +static DEVICE_ATTR_RW(mode); + +static ssize_t uuid_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + + if (nd_pfn->uuid) + return sprintf(buf, "%pUb\n", nd_pfn->uuid); + return sprintf(buf, "\n"); +} + +static ssize_t uuid_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc; + + device_lock(dev); + rc = nd_uuid_store(dev, &nd_pfn->uuid, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? "" : "\n"); + device_unlock(dev); + + return rc ? rc : len; +} +static DEVICE_ATTR_RW(uuid); + +static ssize_t namespace_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + rc = sprintf(buf, "%s\n", nd_pfn->ndns + ? dev_name(&nd_pfn->ndns->dev) : ""); + nvdimm_bus_unlock(dev); + return rc; +} + +static ssize_t namespace_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(dev); + ssize_t rc; + + device_lock(dev); + nvdimm_bus_lock(dev); + rc = nd_namespace_store(dev, &nd_pfn->ndns, buf, len); + dev_dbg(dev, "%s: result: %zd wrote: %s%s", __func__, + rc, buf, buf[len - 1] == '\n' ? 
"" : "\n"); + nvdimm_bus_unlock(dev); + device_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RW(namespace); + +static struct attribute *nd_pfn_attributes[] = { + &dev_attr_mode.attr, + &dev_attr_namespace.attr, + &dev_attr_uuid.attr, + NULL, +}; + +static struct attribute_group nd_pfn_attribute_group = { + .attrs = nd_pfn_attributes, +}; + +static const struct attribute_group *nd_pfn_attribute_groups[] = { + &nd_pfn_attribute_group, + &nd_device_attribute_group, + &nd_numa_attribute_group, + NULL, +}; + +static struct device *__nd_pfn_create(struct nd_region *nd_region, + u8 *uuid, enum nd_pfn_mode mode, + struct nd_namespace_common *ndns) +{ + struct nd_pfn *nd_pfn; + struct device *dev; + + /* we can only create pages for contiguous ranged of pmem */ + if (!is_nd_pmem(&nd_region->dev)) + return NULL; + + nd_pfn = kzalloc(sizeof(*nd_pfn), GFP_KERNEL); + if (!nd_pfn) + return NULL; + + nd_pfn->id = ida_simple_get(&nd_region->pfn_ida, 0, 0, GFP_KERNEL); + if (nd_pfn->id < 0) { + kfree(nd_pfn); + return NULL; + } + + nd_pfn->mode = mode; + if (uuid) + uuid = kmemdup(uuid, 16, GFP_KERNEL); + nd_pfn->uuid = uuid; + dev = &nd_pfn->dev; + dev_set_name(dev, "pfn%d.%d", nd_region->id, nd_pfn->id); + dev->parent = &nd_region->dev; + dev->type = &nd_pfn_device_type; + dev->groups = nd_pfn_attribute_groups; + device_initialize(&nd_pfn->dev); + if (ndns && !__nd_attach_ndns(&nd_pfn->dev, ndns, &nd_pfn->ndns)) { + dev_dbg(&ndns->dev, "%s failed, already claimed by %s\n", + __func__, dev_name(ndns->claim)); + put_device(dev); + return NULL; + } + return dev; +} + +struct device *nd_pfn_create(struct nd_region *nd_region) +{ + struct device *dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, + NULL); + + if (dev) + __nd_device_register(dev); + return dev; +} + +int nd_pfn_validate(struct nd_pfn *nd_pfn) +{ + struct nd_namespace_common *ndns = nd_pfn->ndns; + struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; + struct nd_namespace_io *nsio; + u64 checksum, offset; + + if (!pfn_sb 
|| !ndns) + return -ENODEV; + + if (!is_nd_pmem(nd_pfn->dev.parent)) + return -ENODEV; + + /* section alignment for simple hotplug */ + if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN) + return -ENODEV; + + if (nvdimm_read_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb))) + return -ENXIO; + + if (memcmp(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN) != 0) + return -ENODEV; + + checksum = le64_to_cpu(pfn_sb->checksum); + pfn_sb->checksum = 0; + if (checksum != nd_sb_checksum((struct nd_gen_sb *) pfn_sb)) + return -ENODEV; + pfn_sb->checksum = cpu_to_le64(checksum); + + switch (le32_to_cpu(pfn_sb->mode)) { + case PFN_MODE_RAM: + break; + case PFN_MODE_PMEM: + /* TODO: allocate from PMEM support */ + return -ENOTTY; + default: + return -ENXIO; + } + + if (!nd_pfn->uuid) { + /* from probe we allocate */ + nd_pfn->uuid = kmemdup(pfn_sb->uuid, 16, GFP_KERNEL); + if (!nd_pfn->uuid) + return -ENOMEM; + } else { + /* from init we validate */ + if (memcmp(nd_pfn->uuid, pfn_sb->uuid, 16) != 0) + return -EINVAL; + } + + /* + * These warnings are verbose because they can only trigger in + * the case where the physical address alignment of the + * namespace has changed since the pfn superblock was + * established. 
+ */ + offset = le64_to_cpu(pfn_sb->dataoff); + nsio = to_nd_namespace_io(&ndns->dev); + if (nsio->res.start & ND_PFN_MASK) { + dev_err(&nd_pfn->dev, + "init failed: %s not section aligned\n", + dev_name(&ndns->dev)); + return -EBUSY; + } else if (offset >= resource_size(&nsio->res)) { + dev_err(&nd_pfn->dev, "pfn array size exceeds capacity of %s\n", + dev_name(&ndns->dev)); + return -EBUSY; + } + + return 0; +} +EXPORT_SYMBOL(nd_pfn_validate); + +int nd_pfn_probe(struct nd_namespace_common *ndns, void *drvdata) +{ + int rc; + struct device *dev; + struct nd_pfn *nd_pfn; + struct nd_pfn_sb *pfn_sb; + struct nd_region *nd_region = to_nd_region(ndns->dev.parent); + + if (ndns->force_raw) + return -ENODEV; + + nvdimm_bus_lock(&ndns->dev); + dev = __nd_pfn_create(nd_region, NULL, PFN_MODE_NONE, ndns); + nvdimm_bus_unlock(&ndns->dev); + if (!dev) + return -ENOMEM; + dev_set_drvdata(dev, drvdata); + pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL); + nd_pfn = to_nd_pfn(dev); + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn); + nd_pfn->pfn_sb = NULL; + kfree(pfn_sb); + dev_dbg(&ndns->dev, "%s: pfn: %s\n", __func__, + rc == 0 ? dev_name(dev) : "<none>"); + if (rc < 0) { + __nd_detach_ndns(dev, &nd_pfn->ndns); + put_device(dev); + } else + __nd_device_register(&nd_pfn->dev); + + return rc; +} +EXPORT_SYMBOL(nd_pfn_probe); diff --git a/kernel/drivers/nvdimm/pmem.c b/kernel/drivers/nvdimm/pmem.c new file mode 100644 index 000000000..8ee79893d --- /dev/null +++ b/kernel/drivers/nvdimm/pmem.c @@ -0,0 +1,464 @@ +/* + * Persistent Memory Driver + * + * Copyright (c) 2014-2015, Intel Corporation. + * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>. + * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <asm/cacheflush.h> +#include <linux/blkdev.h> +#include <linux/hdreg.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/module.h> +#include <linux/memory_hotplug.h> +#include <linux/moduleparam.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/pmem.h> +#include <linux/nd.h> +#include "pfn.h" +#include "nd.h" + +struct pmem_device { + struct request_queue *pmem_queue; + struct gendisk *pmem_disk; + struct nd_namespace_common *ndns; + + /* One contiguous memory region per device */ + phys_addr_t phys_addr; + /* when non-zero this device is hosting a 'pfn' instance */ + phys_addr_t data_offset; + void __pmem *virt_addr; + size_t size; +}; + +static int pmem_major; + +static void pmem_do_bvec(struct pmem_device *pmem, struct page *page, + unsigned int len, unsigned int off, int rw, + sector_t sector) +{ + void *mem = kmap_atomic(page); + phys_addr_t pmem_off = sector * 512 + pmem->data_offset; + void __pmem *pmem_addr = pmem->virt_addr + pmem_off; + + if (rw == READ) { + memcpy_from_pmem(mem + off, pmem_addr, len); + flush_dcache_page(page); + } else { + flush_dcache_page(page); + memcpy_to_pmem(pmem_addr, mem + off, len); + } + + kunmap_atomic(mem); +} + +static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) +{ + bool do_acct; + unsigned long start; + struct bio_vec bvec; + struct bvec_iter iter; + struct block_device *bdev = bio->bi_bdev; + struct pmem_device *pmem = bdev->bd_disk->private_data; + + do_acct = nd_iostat_start(bio, &start); + bio_for_each_segment(bvec, bio, iter) + pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset, + bio_data_dir(bio), iter.bi_sector); + if (do_acct) + nd_iostat_end(bio, start); + + if 
(bio_data_dir(bio)) + wmb_pmem(); + + bio_endio(bio); + return BLK_QC_T_NONE; +} + +static int pmem_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct pmem_device *pmem = bdev->bd_disk->private_data; + + pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector); + if (rw & WRITE) + wmb_pmem(); + page_endio(page, rw & WRITE, 0); + + return 0; +} + +static long pmem_direct_access(struct block_device *bdev, sector_t sector, + void __pmem **kaddr, unsigned long *pfn) +{ + struct pmem_device *pmem = bdev->bd_disk->private_data; + resource_size_t offset = sector * 512 + pmem->data_offset; + + *kaddr = pmem->virt_addr + offset; + *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; + + return pmem->size - offset; +} + +static const struct block_device_operations pmem_fops = { + .owner = THIS_MODULE, + .rw_page = pmem_rw_page, + .direct_access = pmem_direct_access, + .revalidate_disk = nvdimm_revalidate_disk, +}; + +static struct pmem_device *pmem_alloc(struct device *dev, + struct resource *res, int id) +{ + struct pmem_device *pmem; + + pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); + if (!pmem) + return ERR_PTR(-ENOMEM); + + pmem->phys_addr = res->start; + pmem->size = resource_size(res); + if (!arch_has_wmb_pmem()) + dev_warn(dev, "unable to guarantee persistence of writes\n"); + + if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size, + dev_name(dev))) { + dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", + &pmem->phys_addr, pmem->size); + return ERR_PTR(-EBUSY); + } + + if (pmem_should_map_pages(dev)) + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); + else + pmem->virt_addr = (void __pmem *) devm_memremap(dev, + pmem->phys_addr, pmem->size, + ARCH_MEMREMAP_PMEM); + + if (IS_ERR(pmem->virt_addr)) + return (void __force *) pmem->virt_addr; + + return pmem; +} + +static void pmem_detach_disk(struct pmem_device *pmem) +{ + if (!pmem->pmem_disk) + return; + + del_gendisk(pmem->pmem_disk); + 
put_disk(pmem->pmem_disk); + blk_cleanup_queue(pmem->pmem_queue); +} + +static int pmem_attach_disk(struct device *dev, + struct nd_namespace_common *ndns, struct pmem_device *pmem) +{ + int nid = dev_to_node(dev); + struct gendisk *disk; + + pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); + if (!pmem->pmem_queue) + return -ENOMEM; + + blk_queue_make_request(pmem->pmem_queue, pmem_make_request); + blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); + blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); + blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue); + + disk = alloc_disk_node(0, nid); + if (!disk) { + blk_cleanup_queue(pmem->pmem_queue); + return -ENOMEM; + } + + disk->major = pmem_major; + disk->first_minor = 0; + disk->fops = &pmem_fops; + disk->private_data = pmem; + disk->queue = pmem->pmem_queue; + disk->flags = GENHD_FL_EXT_DEVT; + nvdimm_namespace_disk_name(ndns, disk->disk_name); + disk->driverfs_dev = dev; + set_capacity(disk, (pmem->size - pmem->data_offset) / 512); + pmem->pmem_disk = disk; + + add_disk(disk); + revalidate_disk(disk); + + return 0; +} + +static int pmem_rw_bytes(struct nd_namespace_common *ndns, + resource_size_t offset, void *buf, size_t size, int rw) +{ + struct pmem_device *pmem = dev_get_drvdata(ndns->claim); + + if (unlikely(offset + size > pmem->size)) { + dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n"); + return -EFAULT; + } + + if (rw == READ) + memcpy_from_pmem(buf, pmem->virt_addr + offset, size); + else { + memcpy_to_pmem(pmem->virt_addr + offset, buf, size); + wmb_pmem(); + } + + return 0; +} + +static int nd_pfn_init(struct nd_pfn *nd_pfn) +{ + struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL); + struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev); + struct nd_namespace_common *ndns = nd_pfn->ndns; + struct nd_region *nd_region; + unsigned long npfns; + phys_addr_t offset; + u64 checksum; + int rc; + + if 
(!pfn_sb) + return -ENOMEM; + + nd_pfn->pfn_sb = pfn_sb; + rc = nd_pfn_validate(nd_pfn); + if (rc == 0 || rc == -EBUSY) + return rc; + + /* section alignment for simple hotplug */ + if (nvdimm_namespace_capacity(ndns) < ND_PFN_ALIGN + || pmem->phys_addr & ND_PFN_MASK) + return -ENODEV; + + nd_region = to_nd_region(nd_pfn->dev.parent); + if (nd_region->ro) { + dev_info(&nd_pfn->dev, + "%s is read-only, unable to init metadata\n", + dev_name(&nd_region->dev)); + goto err; + } + + memset(pfn_sb, 0, sizeof(*pfn_sb)); + npfns = (pmem->size - SZ_8K) / SZ_4K; + /* + * Note, we use 64 here for the standard size of struct page, + * debugging options may cause it to be larger in which case the + * implementation will limit the pfns advertised through + * ->direct_access() to those that are included in the memmap. + */ + if (nd_pfn->mode == PFN_MODE_PMEM) + offset = ALIGN(SZ_8K + 64 * npfns, PMD_SIZE); + else if (nd_pfn->mode == PFN_MODE_RAM) + offset = SZ_8K; + else + goto err; + + npfns = (pmem->size - offset) / SZ_4K; + pfn_sb->mode = cpu_to_le32(nd_pfn->mode); + pfn_sb->dataoff = cpu_to_le64(offset); + pfn_sb->npfns = cpu_to_le64(npfns); + memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN); + memcpy(pfn_sb->uuid, nd_pfn->uuid, 16); + pfn_sb->version_major = cpu_to_le16(1); + checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb); + pfn_sb->checksum = cpu_to_le64(checksum); + + rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb)); + if (rc) + goto err; + + return 0; + err: + nd_pfn->pfn_sb = NULL; + kfree(pfn_sb); + return -ENXIO; +} + +static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns) +{ + struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); + struct pmem_device *pmem; + + /* free pmem disk */ + pmem = dev_get_drvdata(&nd_pfn->dev); + pmem_detach_disk(pmem); + + /* release nd_pfn resources */ + kfree(nd_pfn->pfn_sb); + nd_pfn->pfn_sb = NULL; + + return 0; +} + +static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) +{ + struct 
nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); + struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); + struct device *dev = &nd_pfn->dev; + struct vmem_altmap *altmap; + struct nd_region *nd_region; + struct nd_pfn_sb *pfn_sb; + struct pmem_device *pmem; + phys_addr_t offset; + int rc; + + if (!nd_pfn->uuid || !nd_pfn->ndns) + return -ENODEV; + + nd_region = to_nd_region(dev->parent); + rc = nd_pfn_init(nd_pfn); + if (rc) + return rc; + + if (PAGE_SIZE != SZ_4K) { + dev_err(dev, "only supported on systems with 4K PAGE_SIZE\n"); + return -ENXIO; + } + if (nsio->res.start & ND_PFN_MASK) { + dev_err(dev, "%s not memory hotplug section aligned\n", + dev_name(&ndns->dev)); + return -ENXIO; + } + + pfn_sb = nd_pfn->pfn_sb; + offset = le64_to_cpu(pfn_sb->dataoff); + nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode); + if (nd_pfn->mode == PFN_MODE_RAM) { + if (offset != SZ_8K) + return -EINVAL; + nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); + altmap = NULL; + } else { + rc = -ENXIO; + goto err; + } + + /* establish pfn range for lookup, and switch to direct map */ + pmem = dev_get_drvdata(dev); + devm_memunmap(dev, (void __force *) pmem->virt_addr); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); + if (IS_ERR(pmem->virt_addr)) { + rc = PTR_ERR(pmem->virt_addr); + goto err; + } + + /* attach pmem disk in "pfn-mode" */ + pmem->data_offset = offset; + rc = pmem_attach_disk(dev, ndns, pmem); + if (rc) + goto err; + + return rc; + err: + nvdimm_namespace_detach_pfn(ndns); + return rc; +} + +static int nd_pmem_probe(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev->parent); + struct nd_namespace_common *ndns; + struct nd_namespace_io *nsio; + struct pmem_device *pmem; + + ndns = nvdimm_namespace_common_probe(dev); + if (IS_ERR(ndns)) + return PTR_ERR(ndns); + + nsio = to_nd_namespace_io(&ndns->dev); + pmem = pmem_alloc(dev, &nsio->res, nd_region->id); + if (IS_ERR(pmem)) + return PTR_ERR(pmem); + + pmem->ndns = ndns; + 
dev_set_drvdata(dev, pmem); + ndns->rw_bytes = pmem_rw_bytes; + + if (is_nd_btt(dev)) + return nvdimm_namespace_attach_btt(ndns); + + if (is_nd_pfn(dev)) + return nvdimm_namespace_attach_pfn(ndns); + + if (nd_btt_probe(ndns, pmem) == 0) { + /* we'll come back as btt-pmem */ + return -ENXIO; + } + + if (nd_pfn_probe(ndns, pmem) == 0) { + /* we'll come back as pfn-pmem */ + return -ENXIO; + } + + return pmem_attach_disk(dev, ndns, pmem); +} + +static int nd_pmem_remove(struct device *dev) +{ + struct pmem_device *pmem = dev_get_drvdata(dev); + + if (is_nd_btt(dev)) + nvdimm_namespace_detach_btt(pmem->ndns); + else if (is_nd_pfn(dev)) + nvdimm_namespace_detach_pfn(pmem->ndns); + else + pmem_detach_disk(pmem); + + return 0; +} + +MODULE_ALIAS("pmem"); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM); +static struct nd_device_driver nd_pmem_driver = { + .probe = nd_pmem_probe, + .remove = nd_pmem_remove, + .drv = { + .name = "nd_pmem", + }, + .type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM, +}; + +static int __init pmem_init(void) +{ + int error; + + pmem_major = register_blkdev(0, "pmem"); + if (pmem_major < 0) + return pmem_major; + + error = nd_driver_register(&nd_pmem_driver); + if (error) { + unregister_blkdev(pmem_major, "pmem"); + return error; + } + + return 0; +} +module_init(pmem_init); + +static void pmem_exit(void) +{ + driver_unregister(&nd_pmem_driver.drv); + unregister_blkdev(pmem_major, "pmem"); +} +module_exit(pmem_exit); + +MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/drivers/nvdimm/region.c b/kernel/drivers/nvdimm/region.c new file mode 100644 index 000000000..7da63eac7 --- /dev/null +++ b/kernel/drivers/nvdimm/region.c @@ -0,0 +1,116 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/cpumask.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/nd.h> +#include "nd.h" + +static int nd_region_probe(struct device *dev) +{ + int err, rc; + static unsigned long once; + struct nd_region_namespaces *num_ns; + struct nd_region *nd_region = to_nd_region(dev); + + if (nd_region->num_lanes > num_online_cpus() + && nd_region->num_lanes < num_possible_cpus() + && !test_and_set_bit(0, &once)) { + dev_info(dev, "online cpus (%d) < concurrent i/o lanes (%d) < possible cpus (%d)\n", + num_online_cpus(), nd_region->num_lanes, + num_possible_cpus()); + dev_info(dev, "setting nr_cpus=%d may yield better libnvdimm device performance\n", + nd_region->num_lanes); + } + + rc = nd_blk_region_init(nd_region); + if (rc) + return rc; + + rc = nd_region_register_namespaces(nd_region, &err); + num_ns = devm_kzalloc(dev, sizeof(*num_ns), GFP_KERNEL); + if (!num_ns) + return -ENOMEM; + + if (rc < 0) + return rc; + + num_ns->active = rc; + num_ns->count = rc + err; + dev_set_drvdata(dev, num_ns); + + if (rc && err && rc == err) + return -ENODEV; + + nd_region->btt_seed = nd_btt_create(nd_region); + nd_region->pfn_seed = nd_pfn_create(nd_region); + if (err == 0) + return 0; + + /* + * Given multiple namespaces per region, we do not want to + * disable all the successfully registered peer namespaces upon + * a single registration failure. If userspace is missing a + * namespace that it expects it can disable/re-enable the region + * to retry discovery after correcting the failure. 
+ * <regionX>/namespaces returns the current + * "<async-registered>/<total>" namespace count. + */ + dev_err(dev, "failed to register %d namespace%s, continuing...\n", + err, err == 1 ? "" : "s"); + return 0; +} + +static int child_unregister(struct device *dev, void *data) +{ + nd_device_unregister(dev, ND_SYNC); + return 0; +} + +static int nd_region_remove(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + /* flush attribute readers and disable */ + nvdimm_bus_lock(dev); + nd_region->ns_seed = NULL; + nd_region->btt_seed = NULL; + nd_region->pfn_seed = NULL; + dev_set_drvdata(dev, NULL); + nvdimm_bus_unlock(dev); + + device_for_each_child(dev, NULL, child_unregister); + return 0; +} + +static struct nd_device_driver nd_region_driver = { + .probe = nd_region_probe, + .remove = nd_region_remove, + .drv = { + .name = "nd_region", + }, + .type = ND_DRIVER_REGION_BLK | ND_DRIVER_REGION_PMEM, +}; + +int __init nd_region_init(void) +{ + return nd_driver_register(&nd_region_driver); +} + +void nd_region_exit(void) +{ + driver_unregister(&nd_region_driver.drv); +} + +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_PMEM); +MODULE_ALIAS_ND_DEVICE(ND_DEVICE_REGION_BLK); diff --git a/kernel/drivers/nvdimm/region_devs.c b/kernel/drivers/nvdimm/region_devs.c new file mode 100644 index 000000000..9521696c9 --- /dev/null +++ b/kernel/drivers/nvdimm/region_devs.c @@ -0,0 +1,756 @@ +/* + * Copyright(c) 2013-2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#include <linux/scatterlist.h> +#include <linux/highmem.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sort.h> +#include <linux/io.h> +#include <linux/nd.h> +#include "nd-core.h" +#include "nd.h" + +static DEFINE_IDA(region_ida); + +static void nd_region_release(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + u16 i; + + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + put_device(&nvdimm->dev); + } + free_percpu(nd_region->lane); + ida_simple_remove(®ion_ida, nd_region->id); + if (is_nd_blk(dev)) + kfree(to_nd_blk_region(dev)); + else + kfree(nd_region); +} + +static struct device_type nd_blk_device_type = { + .name = "nd_blk", + .release = nd_region_release, +}; + +static struct device_type nd_pmem_device_type = { + .name = "nd_pmem", + .release = nd_region_release, +}; + +static struct device_type nd_volatile_device_type = { + .name = "nd_volatile", + .release = nd_region_release, +}; + +bool is_nd_pmem(struct device *dev) +{ + return dev ? dev->type == &nd_pmem_device_type : false; +} + +bool is_nd_blk(struct device *dev) +{ + return dev ? 
dev->type == &nd_blk_device_type : false; +} + +struct nd_region *to_nd_region(struct device *dev) +{ + struct nd_region *nd_region = container_of(dev, struct nd_region, dev); + + WARN_ON(dev->type->release != nd_region_release); + return nd_region; +} +EXPORT_SYMBOL_GPL(to_nd_region); + +struct nd_blk_region *to_nd_blk_region(struct device *dev) +{ + struct nd_region *nd_region = to_nd_region(dev); + + WARN_ON(!is_nd_blk(dev)); + return container_of(nd_region, struct nd_blk_region, nd_region); +} +EXPORT_SYMBOL_GPL(to_nd_blk_region); + +void *nd_region_provider_data(struct nd_region *nd_region) +{ + return nd_region->provider_data; +} +EXPORT_SYMBOL_GPL(nd_region_provider_data); + +void *nd_blk_region_provider_data(struct nd_blk_region *ndbr) +{ + return ndbr->blk_provider_data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_provider_data); + +void nd_blk_region_set_provider_data(struct nd_blk_region *ndbr, void *data) +{ + ndbr->blk_provider_data = data; +} +EXPORT_SYMBOL_GPL(nd_blk_region_set_provider_data); + +/** + * nd_region_to_nstype() - region to an integer namespace type + * @nd_region: region-device to interrogate + * + * This is the 'nstype' attribute of a region as well, an input to the + * MODALIAS for namespace devices, and bit number for a nvdimm_bus to match + * namespace devices with namespace drivers. 
+ */ +int nd_region_to_nstype(struct nd_region *nd_region) +{ + if (is_nd_pmem(&nd_region->dev)) { + u16 i, alias; + + for (i = 0, alias = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if (nvdimm->flags & NDD_ALIASING) + alias++; + } + if (alias) + return ND_DEVICE_NAMESPACE_PMEM; + else + return ND_DEVICE_NAMESPACE_IO; + } else if (is_nd_blk(&nd_region->dev)) { + return ND_DEVICE_NAMESPACE_BLK; + } + + return 0; +} +EXPORT_SYMBOL(nd_region_to_nstype); + +static ssize_t size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long size = 0; + + if (is_nd_pmem(dev)) { + size = nd_region->ndr_size; + } else if (nd_region->ndr_mappings == 1) { + struct nd_mapping *nd_mapping = &nd_region->mapping[0]; + + size = nd_mapping->size; + } + + return sprintf(buf, "%llu\n", size); +} +static DEVICE_ATTR_RO(size); + +static ssize_t mappings_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ndr_mappings); +} +static DEVICE_ATTR_RO(mappings); + +static ssize_t nstype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region_to_nstype(nd_region)); +} +static DEVICE_ATTR_RO(nstype); + +static ssize_t set_cookie_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (is_nd_pmem(dev) && nd_set) + /* pass, should be precluded by region_visible */; + else + return -ENXIO; + + return sprintf(buf, "%#llx\n", nd_set->cookie); +} +static DEVICE_ATTR_RO(set_cookie); + +resource_size_t nd_region_available_dpa(struct nd_region *nd_region) +{ + resource_size_t 
blk_max_overlap = 0, available, overlap; + int i; + + WARN_ON(!is_nvdimm_bus_locked(&nd_region->dev)); + + retry: + available = 0; + overlap = blk_max_overlap; + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = to_ndd(nd_mapping); + + /* if a dimm is disabled the available capacity is zero */ + if (!ndd) + return 0; + + if (is_nd_pmem(&nd_region->dev)) { + available += nd_pmem_available_dpa(nd_region, + nd_mapping, &overlap); + if (overlap > blk_max_overlap) { + blk_max_overlap = overlap; + goto retry; + } + } else if (is_nd_blk(&nd_region->dev)) { + available += nd_blk_available_dpa(nd_mapping); + } + } + + return available; +} + +static ssize_t available_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + unsigned long long available = 0; + + /* + * Flush in-flight updates and grab a snapshot of the available + * size. Of course, this value is potentially invalidated the + * memory nvdimm_bus_lock() is dropped, but that's userspace's + * problem to not race itself. 
+ */ + nvdimm_bus_lock(dev); + wait_nvdimm_bus_probe_idle(dev); + available = nd_region_available_dpa(nd_region); + nvdimm_bus_unlock(dev); + + return sprintf(buf, "%llu\n", available); +} +static DEVICE_ATTR_RO(available_size); + +static ssize_t init_namespaces_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region_namespaces *num_ns = dev_get_drvdata(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (num_ns) + rc = sprintf(buf, "%d/%d\n", num_ns->active, num_ns->count); + else + rc = -ENXIO; + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(init_namespaces); + +static ssize_t namespace_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->ns_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->ns_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + return rc; +} +static DEVICE_ATTR_RO(namespace_seed); + +static ssize_t btt_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->btt_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->btt_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(btt_seed); + +static ssize_t pfn_seed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + ssize_t rc; + + nvdimm_bus_lock(dev); + if (nd_region->pfn_seed) + rc = sprintf(buf, "%s\n", dev_name(nd_region->pfn_seed)); + else + rc = sprintf(buf, "\n"); + nvdimm_bus_unlock(dev); + + return rc; +} +static DEVICE_ATTR_RO(pfn_seed); + +static ssize_t read_only_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nd_region *nd_region = to_nd_region(dev); + + return sprintf(buf, "%d\n", nd_region->ro); +} + 
+static ssize_t read_only_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t len) +{ + bool ro; + int rc = strtobool(buf, &ro); + struct nd_region *nd_region = to_nd_region(dev); + + if (rc) + return rc; + + nd_region->ro = ro; + return len; +} +static DEVICE_ATTR_RW(read_only); + +static struct attribute *nd_region_attributes[] = { + &dev_attr_size.attr, + &dev_attr_nstype.attr, + &dev_attr_mappings.attr, + &dev_attr_btt_seed.attr, + &dev_attr_pfn_seed.attr, + &dev_attr_read_only.attr, + &dev_attr_set_cookie.attr, + &dev_attr_available_size.attr, + &dev_attr_namespace_seed.attr, + &dev_attr_init_namespaces.attr, + NULL, +}; + +static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct nd_region *nd_region = to_nd_region(dev); + struct nd_interleave_set *nd_set = nd_region->nd_set; + int type = nd_region_to_nstype(nd_region); + + if (a != &dev_attr_set_cookie.attr + && a != &dev_attr_available_size.attr) + return a->mode; + + if ((type == ND_DEVICE_NAMESPACE_PMEM + || type == ND_DEVICE_NAMESPACE_BLK) + && a == &dev_attr_available_size.attr) + return a->mode; + else if (is_nd_pmem(dev) && nd_set) + return a->mode; + + return 0; +} + +struct attribute_group nd_region_attribute_group = { + .attrs = nd_region_attributes, + .is_visible = region_visible, +}; +EXPORT_SYMBOL_GPL(nd_region_attribute_group); + +u64 nd_region_interleave_set_cookie(struct nd_region *nd_region) +{ + struct nd_interleave_set *nd_set = nd_region->nd_set; + + if (nd_set) + return nd_set->cookie; + return 0; +} + +/* + * Upon successful probe/remove, take/release a reference on the + * associated interleave set (if present), and plant new btt + namespace + * seeds. Also, on the removal of a BLK region, notify the provider to + * disable the region. 
+ */ +static void nd_region_notify_driver_action(struct nvdimm_bus *nvdimm_bus, + struct device *dev, bool probe) +{ + struct nd_region *nd_region; + + if (!probe && (is_nd_pmem(dev) || is_nd_blk(dev))) { + int i; + + nd_region = to_nd_region(dev); + for (i = 0; i < nd_region->ndr_mappings; i++) { + struct nd_mapping *nd_mapping = &nd_region->mapping[i]; + struct nvdimm_drvdata *ndd = nd_mapping->ndd; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + kfree(nd_mapping->labels); + nd_mapping->labels = NULL; + put_ndd(ndd); + nd_mapping->ndd = NULL; + if (ndd) + atomic_dec(&nvdimm->busy); + } + + if (is_nd_pmem(dev)) + return; + + to_nd_blk_region(dev)->disable(nvdimm_bus, dev); + } + if (dev->parent && is_nd_blk(dev->parent) && probe) { + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->ns_seed == dev) + nd_region_create_blk_seed(nd_region); + nvdimm_bus_unlock(dev); + } + if (is_nd_btt(dev) && probe) { + struct nd_btt *nd_btt = to_nd_btt(dev); + + nd_region = to_nd_region(dev->parent); + nvdimm_bus_lock(dev); + if (nd_region->btt_seed == dev) + nd_region_create_btt_seed(nd_region); + if (nd_region->ns_seed == &nd_btt->ndns->dev && + is_nd_blk(dev->parent)) + nd_region_create_blk_seed(nd_region); + nvdimm_bus_unlock(dev); + } +} + +void nd_region_probe_success(struct nvdimm_bus *nvdimm_bus, struct device *dev) +{ + nd_region_notify_driver_action(nvdimm_bus, dev, true); +} + +void nd_region_disable(struct nvdimm_bus *nvdimm_bus, struct device *dev) +{ + nd_region_notify_driver_action(nvdimm_bus, dev, false); +} + +static ssize_t mappingN(struct device *dev, char *buf, int n) +{ + struct nd_region *nd_region = to_nd_region(dev); + struct nd_mapping *nd_mapping; + struct nvdimm *nvdimm; + + if (n >= nd_region->ndr_mappings) + return -ENXIO; + nd_mapping = &nd_region->mapping[n]; + nvdimm = nd_mapping->nvdimm; + + return sprintf(buf, "%s,%llu,%llu\n", dev_name(&nvdimm->dev), + nd_mapping->start, nd_mapping->size); +} + +#define 
REGION_MAPPING(idx) \ +static ssize_t mapping##idx##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + return mappingN(dev, buf, idx); \ +} \ +static DEVICE_ATTR_RO(mapping##idx) + +/* + * 32 should be enough for a while, even in the presence of socket + * interleave a 32-way interleave set is a degenerate case. + */ +REGION_MAPPING(0); +REGION_MAPPING(1); +REGION_MAPPING(2); +REGION_MAPPING(3); +REGION_MAPPING(4); +REGION_MAPPING(5); +REGION_MAPPING(6); +REGION_MAPPING(7); +REGION_MAPPING(8); +REGION_MAPPING(9); +REGION_MAPPING(10); +REGION_MAPPING(11); +REGION_MAPPING(12); +REGION_MAPPING(13); +REGION_MAPPING(14); +REGION_MAPPING(15); +REGION_MAPPING(16); +REGION_MAPPING(17); +REGION_MAPPING(18); +REGION_MAPPING(19); +REGION_MAPPING(20); +REGION_MAPPING(21); +REGION_MAPPING(22); +REGION_MAPPING(23); +REGION_MAPPING(24); +REGION_MAPPING(25); +REGION_MAPPING(26); +REGION_MAPPING(27); +REGION_MAPPING(28); +REGION_MAPPING(29); +REGION_MAPPING(30); +REGION_MAPPING(31); + +static umode_t mapping_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nd_region *nd_region = to_nd_region(dev); + + if (n < nd_region->ndr_mappings) + return a->mode; + return 0; +} + +static struct attribute *mapping_attributes[] = { + &dev_attr_mapping0.attr, + &dev_attr_mapping1.attr, + &dev_attr_mapping2.attr, + &dev_attr_mapping3.attr, + &dev_attr_mapping4.attr, + &dev_attr_mapping5.attr, + &dev_attr_mapping6.attr, + &dev_attr_mapping7.attr, + &dev_attr_mapping8.attr, + &dev_attr_mapping9.attr, + &dev_attr_mapping10.attr, + &dev_attr_mapping11.attr, + &dev_attr_mapping12.attr, + &dev_attr_mapping13.attr, + &dev_attr_mapping14.attr, + &dev_attr_mapping15.attr, + &dev_attr_mapping16.attr, + &dev_attr_mapping17.attr, + &dev_attr_mapping18.attr, + &dev_attr_mapping19.attr, + &dev_attr_mapping20.attr, + &dev_attr_mapping21.attr, + &dev_attr_mapping22.attr, + &dev_attr_mapping23.attr, + 
&dev_attr_mapping24.attr, + &dev_attr_mapping25.attr, + &dev_attr_mapping26.attr, + &dev_attr_mapping27.attr, + &dev_attr_mapping28.attr, + &dev_attr_mapping29.attr, + &dev_attr_mapping30.attr, + &dev_attr_mapping31.attr, + NULL, +}; + +struct attribute_group nd_mapping_attribute_group = { + .is_visible = mapping_visible, + .attrs = mapping_attributes, +}; +EXPORT_SYMBOL_GPL(nd_mapping_attribute_group); + +int nd_blk_region_init(struct nd_region *nd_region) +{ + struct device *dev = &nd_region->dev; + struct nvdimm_bus *nvdimm_bus = walk_to_nvdimm_bus(dev); + + if (!is_nd_blk(dev)) + return 0; + + if (nd_region->ndr_mappings < 1) { + dev_err(dev, "invalid BLK region\n"); + return -ENXIO; + } + + return to_nd_blk_region(dev)->enable(nvdimm_bus, dev); +} + +/** + * nd_region_acquire_lane - allocate and lock a lane + * @nd_region: region id and number of lanes possible + * + * A lane correlates to a BLK-data-window and/or a log slot in the BTT. + * We optimize for the common case where there are 256 lanes, one + * per-cpu. For larger systems we need to lock to share lanes. For now + * this implementation assumes the cost of maintaining an allocator for + * free lanes is on the order of the lock hold time, so it implements a + * static lane = cpu % num_lanes mapping. + * + * In the case of a BTT instance on top of a BLK namespace a lane may be + * acquired recursively. We lock on the first instance. + * + * In the case of a BTT instance on top of PMEM, we only acquire a lane + * for the BTT metadata updates. 
+ */ +unsigned int nd_region_acquire_lane(struct nd_region *nd_region) +{ + unsigned int cpu, lane; + + cpu = get_cpu(); + if (nd_region->num_lanes < nr_cpu_ids) { + struct nd_percpu_lane *ndl_lock, *ndl_count; + + lane = cpu % nd_region->num_lanes; + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (ndl_count->count++ == 0) + spin_lock(&ndl_lock->lock); + } else + lane = cpu; + + return lane; +} +EXPORT_SYMBOL(nd_region_acquire_lane); + +void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane) +{ + if (nd_region->num_lanes < nr_cpu_ids) { + unsigned int cpu = get_cpu(); + struct nd_percpu_lane *ndl_lock, *ndl_count; + + ndl_count = per_cpu_ptr(nd_region->lane, cpu); + ndl_lock = per_cpu_ptr(nd_region->lane, lane); + if (--ndl_count->count == 0) + spin_unlock(&ndl_lock->lock); + put_cpu(); + } + put_cpu(); +} +EXPORT_SYMBOL(nd_region_release_lane); + +static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc, struct device_type *dev_type, + const char *caller) +{ + struct nd_region *nd_region; + struct device *dev; + void *region_buf; + unsigned int i; + int ro = 0; + + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + if ((nd_mapping->start | nd_mapping->size) % SZ_4K) { + dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n", + caller, dev_name(&nvdimm->dev), i); + + return NULL; + } + + if (nvdimm->flags & NDD_UNARMED) + ro = 1; + } + + if (dev_type == &nd_blk_device_type) { + struct nd_blk_region_desc *ndbr_desc; + struct nd_blk_region *ndbr; + + ndbr_desc = to_blk_region_desc(ndr_desc); + ndbr = kzalloc(sizeof(*ndbr) + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + if (ndbr) { + nd_region = &ndbr->nd_region; + ndbr->enable = ndbr_desc->enable; + ndbr->disable = ndbr_desc->disable; + ndbr->do_io = 
ndbr_desc->do_io; + } + region_buf = ndbr; + } else { + nd_region = kzalloc(sizeof(struct nd_region) + + sizeof(struct nd_mapping) + * ndr_desc->num_mappings, + GFP_KERNEL); + region_buf = nd_region; + } + + if (!region_buf) + return NULL; + nd_region->id = ida_simple_get(®ion_ida, 0, 0, GFP_KERNEL); + if (nd_region->id < 0) + goto err_id; + + nd_region->lane = alloc_percpu(struct nd_percpu_lane); + if (!nd_region->lane) + goto err_percpu; + + for (i = 0; i < nr_cpu_ids; i++) { + struct nd_percpu_lane *ndl; + + ndl = per_cpu_ptr(nd_region->lane, i); + spin_lock_init(&ndl->lock); + ndl->count = 0; + } + + memcpy(nd_region->mapping, ndr_desc->nd_mapping, + sizeof(struct nd_mapping) * ndr_desc->num_mappings); + for (i = 0; i < ndr_desc->num_mappings; i++) { + struct nd_mapping *nd_mapping = &ndr_desc->nd_mapping[i]; + struct nvdimm *nvdimm = nd_mapping->nvdimm; + + get_device(&nvdimm->dev); + } + nd_region->ndr_mappings = ndr_desc->num_mappings; + nd_region->provider_data = ndr_desc->provider_data; + nd_region->nd_set = ndr_desc->nd_set; + nd_region->num_lanes = ndr_desc->num_lanes; + nd_region->flags = ndr_desc->flags; + nd_region->ro = ro; + nd_region->numa_node = ndr_desc->numa_node; + ida_init(&nd_region->ns_ida); + ida_init(&nd_region->btt_ida); + ida_init(&nd_region->pfn_ida); + dev = &nd_region->dev; + dev_set_name(dev, "region%d", nd_region->id); + dev->parent = &nvdimm_bus->dev; + dev->type = dev_type; + dev->groups = ndr_desc->attr_groups; + nd_region->ndr_size = resource_size(ndr_desc->res); + nd_region->ndr_start = ndr_desc->res->start; + nd_device_register(dev); + + return nd_region; + + err_percpu: + ida_simple_remove(®ion_ida, nd_region->id); + err_id: + kfree(region_buf); + return NULL; +} + +struct nd_region *nvdimm_pmem_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_pmem_device_type, + __func__); +} 
+EXPORT_SYMBOL_GPL(nvdimm_pmem_region_create); + +struct nd_region *nvdimm_blk_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + if (ndr_desc->num_mappings > 1) + return NULL; + ndr_desc->num_lanes = min(ndr_desc->num_lanes, ND_MAX_LANES); + return nd_region_create(nvdimm_bus, ndr_desc, &nd_blk_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_blk_region_create); + +struct nd_region *nvdimm_volatile_region_create(struct nvdimm_bus *nvdimm_bus, + struct nd_region_desc *ndr_desc) +{ + ndr_desc->num_lanes = ND_MAX_LANES; + return nd_region_create(nvdimm_bus, ndr_desc, &nd_volatile_device_type, + __func__); +} +EXPORT_SYMBOL_GPL(nvdimm_volatile_region_create); |