summary refs log tree commit diff stats
path: root/kernel/drivers/md/dm-thin.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/drivers/md/dm-thin.c')
-rw-r--r-- kernel/drivers/md/dm-thin.c 664
1 files changed, 441 insertions, 223 deletions
diff --git a/kernel/drivers/md/dm-thin.c b/kernel/drivers/md/dm-thin.c
index e22e6c892..a1cc797fe 100644
--- a/kernel/drivers/md/dm-thin.c
+++ b/kernel/drivers/md/dm-thin.c
@@ -112,22 +112,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
/*
* Key building.
*/
-static void build_data_key(struct dm_thin_device *td,
- dm_block_t b, struct dm_cell_key *key)
+enum lock_space { /* which block address space a dm_cell_key's range refers to */
+ VIRTUAL, /* thin device (virtual) blocks; build_key() sets key->virtual for this */
+ PHYSICAL /* data blocks; build_key() leaves key->virtual clear for this */
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+ dm_block_t b, dm_block_t e, struct dm_cell_key *key)
{
- key->virtual = 0;
+ key->virtual = (ls == VIRTUAL);
key->dev = dm_thin_dev_id(td);
key->block_begin = b;
- key->block_end = b + 1ULL;
+ key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+ struct dm_cell_key *key)
+{
+ build_key(td, PHYSICAL, b, b + 1llu, key);
}
static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
struct dm_cell_key *key)
{
- key->virtual = 1;
- key->dev = dm_thin_dev_id(td);
- key->block_begin = b;
- key->block_end = b + 1ULL;
+ build_key(td, VIRTUAL, b, b + 1llu, key);
}
/*----------------------------------------------------------------*/
@@ -313,6 +321,80 @@ struct thin_c {
/*----------------------------------------------------------------*/
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_DISCARD_* flags to control behaviour; only
+ * BLKDEV_DISCARD_SECURE is examined here
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ * Asynchronously issue a discard request for the sectors in question.
+ * A single bio covering the whole range is allocated, chained to
+ * @parent_bio and submitted; this function does not wait for it.
+ *
+ * Return: 0 once the bio has been submitted, -ENXIO if @bdev has no
+ * request queue or @nr_sects is zero, -EOPNOTSUPP if the queue does not
+ * support discards (or secure discards when BLKDEV_DISCARD_SECURE is
+ * set), -ENOMEM if the bio cannot be allocated.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+ struct bio *parent_bio)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+ int type = REQ_WRITE | REQ_DISCARD;
+ struct bio *bio;
+
+ if (!q || !nr_sects)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (flags & BLKDEV_DISCARD_SECURE) {
+ if (!blk_queue_secdiscard(q))
+ return -EOPNOTSUPP;
+ type |= REQ_SECURE;
+ }
+
+ /*
+ * Required bio_put occurs in bio_endio thanks to bio_chain below
+ */
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio)
+ return -ENOMEM;
+
+ bio_chain(bio, parent_bio);
+
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio->bi_iter.bi_size = nr_sects << 9; /* 512-byte sectors to bytes */
+
+ submit_bio(type, bio);
+
+ return 0;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+ return pool->sectors_per_block_shift >= 0; /* a negative shift is reported as "not a power of two"; callers then avoid shifting */
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b) /* converts a block number/count into sectors */
+{
+ return block_size_is_power_of_two(pool) ?
+ (b << pool->sectors_per_block_shift) : /* power-of-two block size: shift */
+ (b * pool->sectors_per_block); /* otherwise: multiply */
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e, /* discards data blocks [data_b, data_e) */
+ struct bio *parent_bio)
+{
+ sector_t s = block_to_sectors(tc->pool, data_b); /* start sector of the run */
+ sector_t len = block_to_sectors(tc->pool, data_e - data_b); /* run length in sectors */
+
+ return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+ GFP_NOWAIT, 0, parent_bio); /* flags 0: no secure discard; the sub bio is chained to parent_bio */
+}
+
+/*----------------------------------------------------------------*/
+
/*
* wake_worker() is used when new work is queued and when pool_resume is
* ready to continue deferred IO processing.
@@ -462,6 +544,7 @@ struct dm_thin_endio_hook {
struct dm_deferred_entry *all_io_entry;
struct dm_thin_new_mapping *overwrite_mapping;
struct rb_node rb_node;
+ struct dm_bio_prison_cell *cell;
};
static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -474,8 +557,10 @@ static void error_bio_list(struct bio_list *bios, int error)
{
struct bio *bio;
- while ((bio = bio_list_pop(bios)))
- bio_endio(bio, error);
+ while ((bio = bio_list_pop(bios))) {
+ bio->bi_error = error;
+ bio_endio(bio);
+ }
}
static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
@@ -525,16 +610,21 @@ static void requeue_io(struct thin_c *tc)
requeue_deferred_cells(tc);
}
-static void error_retry_list(struct pool *pool)
+static void error_retry_list_with_code(struct pool *pool, int error)
{
struct thin_c *tc;
rcu_read_lock();
list_for_each_entry_rcu(tc, &pool->active_thins, list)
- error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
+ error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
rcu_read_unlock();
}
+/*
+ * Fail every bio on the retry_on_resume list of each active thin device
+ * in @pool with -EIO.
+ */
+static void error_retry_list(struct pool *pool)
+{
+	/* Plain call: a void function must not "return" an expression. */
+	error_retry_list_with_code(pool, -EIO);
+}
+
/*
* This section of code contains the logic for processing a thin device's IO.
* Much of the code depends on pool object resources (lists, workqueues, etc)
@@ -542,11 +632,6 @@ static void error_retry_list(struct pool *pool)
* target.
*/
-static bool block_size_is_power_of_two(struct pool *pool)
-{
- return pool->sectors_per_block_shift >= 0;
-}
-
static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
struct pool *pool = tc->pool;
@@ -560,6 +645,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
return block_nr;
}
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ *
+ * The bio's first sector is rounded up to a block boundary and its end
+ * sector is rounded down, giving the half-open block range
+ * [*begin, *end). If the bio does not cover any whole block the range
+ * is empty (*begin == *end).
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+ dm_block_t *begin, dm_block_t *end)
+{
+ struct pool *pool = tc->pool;
+ sector_t b = bio->bi_iter.bi_sector;
+ sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT); /* one past the bio's last sector */
+
+ b += pool->sectors_per_block - 1ull; /* so we round up */
+
+ if (block_size_is_power_of_two(pool)) {
+ b >>= pool->sectors_per_block_shift;
+ e >>= pool->sectors_per_block_shift;
+ } else {
+ (void) sector_div(b, pool->sectors_per_block); /* b is updated in place; the return value is ignored */
+ (void) sector_div(e, pool->sectors_per_block);
+ }
+
+ if (e < b)
+ /* Can happen if the bio is within a single block. */
+ e = b;
+
+ *begin = b;
+ *end = e;
+}
+
static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
struct pool *pool = tc->pool;
@@ -648,7 +761,7 @@ struct dm_thin_new_mapping {
struct list_head list;
bool pass_discard:1;
- bool definitely_not_shared:1;
+ bool maybe_shared:1;
/*
* Track quiescing, copying and zeroing preparation actions. When this
@@ -659,9 +772,9 @@ struct dm_thin_new_mapping {
int err;
struct thin_c *tc;
- dm_block_t virt_block;
+ dm_block_t virt_begin, virt_end;
dm_block_t data_block;
- struct dm_bio_prison_cell *cell, *cell2;
+ struct dm_bio_prison_cell *cell;
/*
* If the bio covers the whole area of a block then we can avoid
@@ -701,12 +814,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
complete_mapping_preparation(m);
}
-static void overwrite_endio(struct bio *bio, int err)
+static void overwrite_endio(struct bio *bio)
{
struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
struct dm_thin_new_mapping *m = h->overwrite_mapping;
- m->err = err;
+ bio->bi_end_io = m->saved_bi_end_io;
+
+ m->err = bio->bi_error;
complete_mapping_preparation(m);
}
@@ -794,10 +909,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
- if (m->bio) {
- m->bio->bi_end_io = m->saved_bi_end_io;
- atomic_inc(&m->bio->bi_remaining);
- }
cell_error(m->tc->pool, m->cell);
list_del(&m->list);
mempool_free(m, m->tc->pool->mapping_pool);
@@ -807,15 +918,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
struct pool *pool = tc->pool;
- struct bio *bio;
+ struct bio *bio = m->bio;
int r;
- bio = m->bio;
- if (bio) {
- bio->bi_end_io = m->saved_bi_end_io;
- atomic_inc(&bio->bi_remaining);
- }
-
if (m->err) {
cell_error(pool, m->cell);
goto out;
@@ -826,7 +931,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
* Any I/O for this block arriving after this point will get
* remapped to it directly.
*/
- r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+ r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
if (r) {
metadata_operation_failed(pool, "dm_thin_insert_block", r);
cell_error(pool, m->cell);
@@ -841,7 +946,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
if (bio) {
inc_remap_and_issue_cell(tc, m->cell, m->data_block);
- bio_endio(bio, 0);
+ bio_endio(bio);
} else {
inc_all_io_entry(tc->pool, m->cell->holder);
remap_and_issue(tc, m->cell->holder, m->data_block);
@@ -853,50 +958,113 @@ out:
mempool_free(m, pool->mapping_pool);
}
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
{
struct thin_c *tc = m->tc;
+ if (m->cell)
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, tc->pool->mapping_pool);
+}
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
bio_io_error(m->bio);
+ free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+ bio_endio(m->bio); /* complete the discard bio; no metadata call is made here */
+ free_discard_mapping(m); /* releases m->cell if set and returns m to the mapping pool */
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+ int r;
+ struct thin_c *tc = m->tc;
+
+ r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+ if (r) {
+ metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+ bio_io_error(m->bio);
+ } else
+ bio_endio(m->bio);
+
cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
mempool_free(m, tc->pool->mapping_pool);
}
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
{
+ /*
+ * We've already unmapped this range of blocks, but before we
+ * passdown we have to check that these blocks are now unused.
+ */
+ int r;
+ bool used = true;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
+ dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
- inc_all_io_entry(tc->pool, m->bio);
- cell_defer_no_holder(tc, m->cell);
- cell_defer_no_holder(tc, m->cell2);
+ while (b != end) {
+ /* find start of unmapped run */
+ for (; b < end; b++) {
+ r = dm_pool_block_is_used(pool->pmd, b, &used);
+ if (r)
+ return r;
- if (m->pass_discard)
- if (m->definitely_not_shared)
- remap_and_issue(tc, m->bio, m->data_block);
- else {
- bool used = false;
- if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
- bio_endio(m->bio, 0);
- else
- remap_and_issue(tc, m->bio, m->data_block);
+ if (!used)
+ break;
}
- else
- bio_endio(m->bio, 0);
- mempool_free(m, tc->pool->mapping_pool);
+ if (b == end)
+ break;
+
+ /* find end of run */
+ for (e = b + 1; e != end; e++) {
+ r = dm_pool_block_is_used(pool->pmd, e, &used);
+ if (r)
+ return r;
+
+ if (used)
+ break;
+ }
+
+ r = issue_discard(tc, b, e, m->bio);
+ if (r)
+ return r;
+
+ b = e;
+ }
+
+ return 0;
}
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+/*
+ * Unmap the virtual range described by @m, pass the discard down to the
+ * data device, then drop this mapping's hold on the parent discard bio.
+ */
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
int r;
struct thin_c *tc = m->tc;
+ struct pool *pool = tc->pool;
- r = dm_thin_remove_block(tc->td, m->virt_block);
+ r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
if (r)
- DMERR_LIMIT("dm_thin_remove_block() failed");
+ metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+ else if (m->maybe_shared)
+ r = passdown_double_checking_shared_status(m);
+ else
+ r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
- process_prepared_discard_passdown(m);
+	/*
+	 * Even if r is set, there could be sub discards in flight that we
+	 * need to wait for.
+	 *
+	 * break_up_discard_bio() hands the same parent bio to every mapping
+	 * it creates, so only record a failure here: storing 0 would wipe
+	 * out an error that another range has already reported on the bio.
+	 */
+	if (r)
+		m->bio->bi_error = r;
+ bio_endio(m->bio);
+ cell_defer_no_holder(tc, m->cell);
+ mempool_free(m, pool->mapping_pool);
}
static void process_prepared(struct pool *pool, struct list_head *head,
@@ -980,7 +1148,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
}
static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
- dm_block_t data_block,
+ dm_block_t data_begin,
struct dm_thin_new_mapping *m)
{
struct pool *pool = tc->pool;
@@ -990,7 +1158,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
m->bio = bio;
save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
inc_all_io_entry(pool, bio);
- remap_and_issue(tc, bio, data_block);
+ remap_and_issue(tc, bio, data_begin);
}
/*
@@ -1007,7 +1175,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
struct dm_thin_new_mapping *m = get_next_mapping(pool);
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_dest;
m->cell = cell;
@@ -1086,7 +1255,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
m->tc = tc;
- m->virt_block = virt_block;
+ m->virt_begin = virt_block;
+ m->virt_end = virt_block + 1u;
m->data_block = data_block;
m->cell = cell;
@@ -1095,16 +1265,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
* zeroing pre-existing data, we can issue the bio immediately.
* Otherwise we use kcopyd to zero the data first.
*/
- if (!pool->pf.zero_new_blocks)
+ if (pool->pf.zero_new_blocks) {
+ if (io_overwrites_block(pool, bio))
+ remap_and_issue_overwrite(tc, bio, data_block, m);
+ else
+ ll_zero(tc, m, data_block * pool->sectors_per_block,
+ (data_block + 1) * pool->sectors_per_block);
+ } else
process_prepared_mapping(m);
-
- else if (io_overwrites_block(pool, bio))
- remap_and_issue_overwrite(tc, bio, data_block, m);
-
- else
- ll_zero(tc, m,
- data_block * pool->sectors_per_block,
- (data_block + 1) * pool->sectors_per_block);
}
static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
@@ -1270,9 +1438,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
int error = should_error_unserviceable_bio(pool);
- if (error)
- bio_endio(bio, error);
- else
+ if (error) {
+ bio->bi_error = error;
+ bio_endio(bio);
+ } else
retry_on_resume(bio);
}
@@ -1295,99 +1464,148 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
retry_on_resume(bio);
}
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+ struct dm_bio_prison_cell *virt_cell)
{
- int r;
- struct bio *bio = cell->holder;
struct pool *pool = tc->pool;
- struct dm_bio_prison_cell *cell2;
- struct dm_cell_key key2;
- dm_block_t block = get_bio_block(tc, bio);
- struct dm_thin_lookup_result lookup_result;
- struct dm_thin_new_mapping *m;
+ struct dm_thin_new_mapping *m = get_next_mapping(pool);
- if (tc->requeue_mode) {
- cell_requeue(pool, cell);
- return;
- }
+ /*
+ * We don't need to lock the data blocks, since there's no
+ * passdown. We only lock data blocks for allocation and breaking sharing.
+ */
+ m->tc = tc;
+ m->virt_begin = virt_cell->key.block_begin;
+ m->virt_end = virt_cell->key.block_end;
+ m->cell = virt_cell;
+ m->bio = virt_cell->holder;
- r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
- switch (r) {
- case 0:
- /*
- * Check nobody is fiddling with this pool block. This can
- * happen if someone's in the process of breaking sharing
- * on this block.
- */
- build_data_key(tc->td, lookup_result.block, &key2);
- if (bio_detain(tc->pool, &key2, bio, &cell2)) {
- cell_defer_no_holder(tc, cell);
- break;
- }
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+}
- if (io_overlaps_block(pool, bio)) {
- /*
- * IO may still be going to the destination block. We must
- * quiesce before we can do the removal.
- */
- m = get_next_mapping(pool);
- m->tc = tc;
- m->pass_discard = pool->pf.discard_passdown;
- m->definitely_not_shared = !lookup_result.shared;
- m->virt_block = block;
- m->data_block = lookup_result.block;
- m->cell = cell;
- m->cell2 = cell2;
- m->bio = bio;
-
- if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
- pool->process_prepared_discard(m);
+/*
+ * __bio_inc_remaining() is used to defer the parent bio's end_io until
+ * we _know_ all chained sub range discard bios have completed.
+ *
+ * Each call takes one extra count on bio->__bi_remaining; the caller
+ * pairs it with a later bio_endio() on the same bio.
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+ bio->bi_flags |= (1 << BIO_CHAIN);
+ smp_mb__before_atomic();
+ atomic_inc(&bio->__bi_remaining);
+}
- } else {
- inc_all_io_entry(pool, bio);
- cell_defer_no_holder(tc, cell);
- cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+ struct bio *bio)
+{
+ struct pool *pool = tc->pool;
+
+ int r;
+ bool maybe_shared;
+ struct dm_cell_key data_key;
+ struct dm_bio_prison_cell *data_cell;
+ struct dm_thin_new_mapping *m;
+ dm_block_t virt_begin, virt_end, data_begin;
+
+ while (begin != end) {
+ r = ensure_next_mapping(pool);
+ if (r)
+ /* we did our best */
+ return;
+ r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+ &data_begin, &maybe_shared);
+ if (r)
/*
- * The DM core makes sure that the discard doesn't span
- * a block boundary. So we submit the discard of a
- * partial block appropriately.
+ * Silently fail, letting any mappings we've
+ * created complete.
*/
- if ((!lookup_result.shared) && pool->pf.discard_passdown)
- remap_and_issue(tc, bio, lookup_result.block);
- else
- bio_endio(bio, 0);
+ break;
+
+ build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+ if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+ /* contention, we'll give up with this range */
+ begin = virt_end;
+ continue;
}
- break;
- case -ENODATA:
/*
- * It isn't provisioned, just forget it.
+ * IO may still be going to the destination block. We must
+ * quiesce before we can do the removal.
*/
- cell_defer_no_holder(tc, cell);
- bio_endio(bio, 0);
- break;
+ m = get_next_mapping(pool);
+ m->tc = tc;
+ m->maybe_shared = maybe_shared;
+ m->virt_begin = virt_begin;
+ m->virt_end = virt_end;
+ m->data_block = data_begin;
+ m->cell = data_cell;
+ m->bio = bio;
- default:
- DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
- __func__, r);
- cell_defer_no_holder(tc, cell);
- bio_io_error(bio);
- break;
+ /*
+ * The parent bio must not complete before sub discard bios are
+ * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+ *
+ * This per-mapping bi_remaining increment is paired with
+ * the implicit decrement that occurs via bio_endio() in
+ * process_prepared_discard_{passdown,no_passdown}.
+ */
+ __bio_inc_remaining(bio);
+ if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+ pool->process_prepared_discard(m);
+
+ begin = virt_end;
}
}
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+ struct bio *bio = virt_cell->holder; /* the discard bio that holds the virtual cell */
+ struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+ /*
+ * The virt_cell will only get freed once the origin bio completes.
+ * This means it will remain locked while all the individual
+ * passdown bios are in flight. Stashing it in the endio hook lets
+ * thin_endio() release it with cell_defer_no_holder().
+ */
+ h->cell = virt_cell;
+ break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+ /*
+ * We complete the bio now, knowing that the bi_remaining field
+ * will prevent completion until the sub range discards have
+ * completed.
+ */
+ bio_endio(bio);
+}
+
static void process_discard_bio(struct thin_c *tc, struct bio *bio)
{
- struct dm_bio_prison_cell *cell;
- struct dm_cell_key key;
- dm_block_t block = get_bio_block(tc, bio);
+ dm_block_t begin, end;
+ struct dm_cell_key virt_key;
+ struct dm_bio_prison_cell *virt_cell;
- build_virtual_key(tc->td, block, &key);
- if (bio_detain(tc->pool, &key, bio, &cell))
+ get_bio_block_range(tc, bio, &begin, &end);
+ if (begin == end) {
+ /*
+ * The discard covers less than a block.
+ */
+ bio_endio(bio);
+ return;
+ }
+
+ build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+ if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+ /*
+ * Potential starvation issue: We're relying on the
+ * fs/application being well behaved, and not trying to
+ * send IO to a region at the same time as discarding it.
+ * If they do this persistently then it's possible this
+ * cell will never be granted.
+ */
return;
- process_discard_cell(tc, cell);
+ tc->pool->process_discard_cell(tc, virt_cell);
}
static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -1517,7 +1735,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
if (bio_data_dir(bio) == READ) {
zero_fill_bio(bio);
cell_defer_no_holder(tc, cell);
- bio_endio(bio, 0);
+ bio_endio(bio);
return;
}
@@ -1582,7 +1800,7 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
} else {
zero_fill_bio(bio);
- bio_endio(bio, 0);
+ bio_endio(bio);
}
} else
provision_block(tc, bio, block, cell);
@@ -1653,7 +1871,7 @@ static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
}
zero_fill_bio(bio);
- bio_endio(bio, 0);
+ bio_endio(bio);
break;
default:
@@ -1678,7 +1896,7 @@ static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell
static void process_bio_success(struct thin_c *tc, struct bio *bio)
{
- bio_endio(bio, 0);
+ bio_endio(bio);
}
static void process_bio_fail(struct thin_c *tc, struct bio *bio)
@@ -2014,18 +2232,23 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
/*
* We're holding onto IO to allow userland time to react. After the
* timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/
static void do_no_space_timeout(struct work_struct *ws)
{
struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
- if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
- set_pool_mode(pool, PM_READ_ONLY);
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+ pool->pf.error_if_no_space = true;
+ notify_of_pool_mode_change_to_oods(pool);
+ error_retry_list_with_code(pool, -ENOSPC);
+ }
}
/*----------------------------------------------------------------*/
@@ -2103,6 +2326,32 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
+static void notify_of_pool_mode_change_to_oods(struct pool *pool) /* "oods" = out-of-data-space */
+{
+ if (!pool->pf.error_if_no_space) /* report which no-space policy is in effect */
+ notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+ else
+ notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
+static bool passdown_enabled(struct pool_c *pt)
+{
+ return pt->adjusted_pf.discard_passdown; /* adjusted flag: disable_passdown_if_not_supported() may have cleared it */
+}
+
+static void set_discard_callbacks(struct pool *pool) /* installs the discard handlers matching the passdown setting */
+{
+ struct pool_c *pt = pool->ti->private;
+
+ if (passdown_enabled(pt)) { /* discards are also sent to the data device */
+ pool->process_discard_cell = process_discard_cell_passdown;
+ pool->process_prepared_discard = process_prepared_discard_passdown;
+ } else { /* discards only unmap blocks in the metadata */
+ pool->process_discard_cell = process_discard_cell_no_passdown;
+ pool->process_prepared_discard = process_prepared_discard_no_passdown;
+ }
+}
+
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
struct pool_c *pt = pool->ti->private;
@@ -2154,7 +2403,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_cell = process_cell_read_only;
pool->process_discard_cell = process_cell_success;
pool->process_prepared_mapping = process_prepared_mapping_fail;
- pool->process_prepared_discard = process_prepared_discard_passdown;
+ pool->process_prepared_discard = process_prepared_discard_success;
error_retry_list(pool);
break;
@@ -2169,13 +2418,12 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
* frequently seeing this mode.
*/
if (old_mode != new_mode)
- notify_of_pool_mode_change(pool, "out-of-data-space");
+ notify_of_pool_mode_change_to_oods(pool);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
if (!pool->pf.error_if_no_space && no_space_timeout)
queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2184,13 +2432,13 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
case PM_WRITE:
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "write");
+ pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell;
- pool->process_discard_cell = process_discard_cell;
pool->process_prepared_mapping = process_prepared_mapping;
- pool->process_prepared_discard = process_prepared_discard;
+ set_discard_callbacks(pool);
break;
}
@@ -2279,6 +2527,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
h->shared_read_entry = NULL;
h->all_io_entry = NULL;
h->overwrite_mapping = NULL;
+ h->cell = NULL;
}
/*
@@ -2297,7 +2546,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio)
thin_hook_bio(tc, bio);
if (tc->requeue_mode) {
- bio_endio(bio, DM_ENDIO_REQUEUE);
+ bio->bi_error = DM_ENDIO_REQUEUE;
+ bio_endio(bio);
return DM_MAPIO_SUBMITTED;
}
@@ -2426,7 +2676,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
struct pool *pool = pt->pool;
struct block_device *data_bdev = pt->data_dev->bdev;
struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
- sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
const char *reason = NULL;
char buf[BDEVNAME_SIZE];
@@ -2439,12 +2688,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
else if (data_limits->max_discard_sectors < pool->sectors_per_block)
reason = "max discard sectors smaller than a block";
- else if (data_limits->discard_granularity > block_size)
- reason = "discard granularity larger than a block";
-
- else if (!is_factor(block_size, data_limits->discard_granularity))
- reason = "discard granularity not a factor of block size";
-
if (reason) {
DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
pt->adjusted_pf.discard_passdown = false;
@@ -2959,7 +3202,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
metadata_low_callback,
pool);
if (r)
- goto out_free_pt;
+ goto out_flags_changed;
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -3210,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- cancel_delayed_work(&pool->waker);
- cancel_delayed_work(&pool->no_space_timeout);
+ cancel_delayed_work_sync(&pool->waker);
+ cancel_delayed_work_sync(&pool->no_space_timeout);
flush_workqueue(pool->wq);
(void) commit(pool);
}
@@ -3389,7 +3632,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
if (get_pool_mode(pool) >= PM_READ_ONLY) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md));
- return -EINVAL;
+ return -EOPNOTSUPP;
}
if (!strcasecmp(argv[0], "create_thin"))
@@ -3447,6 +3690,7 @@ static void emit_flags(struct pool_features *pf, char *result,
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
+ * <pool mode> <discard config> <no space config> <needs_check>
*/
static void pool_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@@ -3548,6 +3792,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
+ if (dm_pool_metadata_needs_check(pool->pmd))
+ DMEMIT("needs_check ");
+ else
+ DMEMIT("- ");
+
break;
case STATUSTYPE_TABLE:
@@ -3573,38 +3822,6 @@ static int pool_iterate_devices(struct dm_target *ti,
return fn(ti, pt->data_dev, 0, ti->len, data);
}
-static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct pool_c *pt = ti->private;
- struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = pt->data_dev->bdev;
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
- struct pool *pool = pt->pool;
- struct queue_limits *data_limits;
-
- limits->max_discard_sectors = pool->sectors_per_block;
-
- /*
- * discard_granularity is just a hint, and not enforced.
- */
- if (pt->adjusted_pf.discard_passdown) {
- data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = max(data_limits->discard_granularity,
- pool->sectors_per_block << SECTOR_SHIFT);
- } else
- limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
struct pool_c *pt = ti->private;
@@ -3659,14 +3876,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
disable_passdown_if_not_supported(pt);
- set_discard_limits(pt, limits);
+ /*
+ * The pool uses the same discard limits as the underlying data
+ * device. DM core has already set this up.
+ */
}
static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
- .version = {1, 14, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@@ -3678,7 +3898,6 @@ static struct target_type pool_target = {
.resume = pool_resume,
.message = pool_message,
.status = pool_status,
- .merge = pool_merge,
.iterate_devices = pool_iterate_devices,
.io_hints = pool_io_hints,
};
@@ -3825,8 +4044,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
if (tc->pool->pf.discard_enabled) {
ti->discards_supported = true;
ti->num_discard_bios = 1;
- /* Discard bios must be split on a block boundary */
- ti->split_discard_bios = true;
+ ti->split_discard_bios = false;
}
mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3913,6 +4131,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
}
}
+ if (h->cell)
+ cell_defer_no_holder(h->tc, h->cell);
+
return 0;
}
@@ -4003,21 +4224,6 @@ err:
DMEMIT("Error");
}
-static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
- struct bio_vec *biovec, int max_size)
-{
- struct thin_c *tc = ti->private;
- struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);
-
- if (!q->merge_bvec_fn)
- return max_size;
-
- bvm->bi_bdev = tc->pool_dev->bdev;
- bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
-
- return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
-}
-
static int thin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
@@ -4040,9 +4246,21 @@ static int thin_iterate_devices(struct dm_target *ti,
return 0;
}
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) /* sets the thin device's discard limits */
+{
+ struct thin_c *tc = ti->private;
+ struct pool *pool = tc->pool;
+
+ if (!pool->pf.discard_enabled)
+ return; /* limits are left untouched when discards are disabled */
+
+ limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; /* one pool block, in bytes */
+ limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G: 2048 sectors (1MiB) * 1024 * 16 */
+}
+
static struct target_type thin_target = {
.name = "thin",
- .version = {1, 14, 0},
+ .version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,
@@ -4052,8 +4270,8 @@ static struct target_type thin_target = {
.presuspend = thin_presuspend,
.postsuspend = thin_postsuspend,
.status = thin_status,
- .merge = thin_merge,
.iterate_devices = thin_iterate_devices,
+ .io_hints = thin_io_hints,
};
/*----------------------------------------------------------------*/