diff options
Diffstat (limited to 'kernel/drivers/md/dm-thin.c')
-rw-r--r-- | kernel/drivers/md/dm-thin.c | 664 |
1 files changed, 441 insertions, 223 deletions
diff --git a/kernel/drivers/md/dm-thin.c b/kernel/drivers/md/dm-thin.c index e22e6c892..a1cc797fe 100644 --- a/kernel/drivers/md/dm-thin.c +++ b/kernel/drivers/md/dm-thin.c @@ -112,22 +112,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle, /* * Key building. */ -static void build_data_key(struct dm_thin_device *td, - dm_block_t b, struct dm_cell_key *key) +enum lock_space { + VIRTUAL, + PHYSICAL +}; + +static void build_key(struct dm_thin_device *td, enum lock_space ls, + dm_block_t b, dm_block_t e, struct dm_cell_key *key) { - key->virtual = 0; + key->virtual = (ls == VIRTUAL); key->dev = dm_thin_dev_id(td); key->block_begin = b; - key->block_end = b + 1ULL; + key->block_end = e; +} + +static void build_data_key(struct dm_thin_device *td, dm_block_t b, + struct dm_cell_key *key) +{ + build_key(td, PHYSICAL, b, b + 1llu, key); } static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, struct dm_cell_key *key) { - key->virtual = 1; - key->dev = dm_thin_dev_id(td); - key->block_begin = b; - key->block_end = b + 1ULL; + build_key(td, VIRTUAL, b, b + 1llu, key); } /*----------------------------------------------------------------*/ @@ -313,6 +321,80 @@ struct thin_c { /*----------------------------------------------------------------*/ +/** + * __blkdev_issue_discard_async - queue a discard with async completion + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * @parent_bio: parent discard bio that all sub discards get chained to + * + * Description: + * Asynchronously issue a discard request for the sectors in question. + */ +static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags, + struct bio *parent_bio) +{ + struct request_queue *q = bdev_get_queue(bdev); + int type = REQ_WRITE | REQ_DISCARD; + struct bio *bio; + + if (!q || !nr_sects) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + type |= REQ_SECURE; + } + + /* + * Required bio_put occurs in bio_endio thanks to bio_chain below + */ + bio = bio_alloc(gfp_mask, 1); + if (!bio) + return -ENOMEM; + + bio_chain(bio, parent_bio); + + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_iter.bi_size = nr_sects << 9; + + submit_bio(type, bio); + + return 0; +} + +static bool block_size_is_power_of_two(struct pool *pool) +{ + return pool->sectors_per_block_shift >= 0; +} + +static sector_t block_to_sectors(struct pool *pool, dm_block_t b) +{ + return block_size_is_power_of_two(pool) ? + (b << pool->sectors_per_block_shift) : + (b * pool->sectors_per_block); +} + +static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e, + struct bio *parent_bio) +{ + sector_t s = block_to_sectors(tc->pool, data_b); + sector_t len = block_to_sectors(tc->pool, data_e - data_b); + + return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len, + GFP_NOWAIT, 0, parent_bio); +} + +/*----------------------------------------------------------------*/ + /* * wake_worker() is used when new work is queued and when pool_resume is * ready to continue deferred IO processing. @@ -462,6 +544,7 @@ struct dm_thin_endio_hook { struct dm_deferred_entry *all_io_entry; struct dm_thin_new_mapping *overwrite_mapping; struct rb_node rb_node; + struct dm_bio_prison_cell *cell; }; static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) @@ -474,8 +557,10 @@ static void error_bio_list(struct bio_list *bios, int error) { struct bio *bio; - while ((bio = bio_list_pop(bios))) - bio_endio(bio, error); + while ((bio = bio_list_pop(bios))) { + bio->bi_error = error; + bio_endio(bio); + } } static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error) @@ -525,16 +610,21 @@ static void requeue_io(struct thin_c *tc) requeue_deferred_cells(tc); } -static void error_retry_list(struct pool *pool) +static void error_retry_list_with_code(struct pool *pool, int error) { struct thin_c *tc; rcu_read_lock(); list_for_each_entry_rcu(tc, &pool->active_thins, list) - error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO); + error_thin_bio_list(tc, &tc->retry_on_resume_list, error); rcu_read_unlock(); } +static void error_retry_list(struct pool *pool) +{ + return error_retry_list_with_code(pool, -EIO); +} + /* * This section of code contains the logic for processing a thin device's IO. * Much of the code depends on pool object resources (lists, workqueues, etc) @@ -542,11 +632,6 @@ static void error_retry_list(struct pool *pool) * target. */ -static bool block_size_is_power_of_two(struct pool *pool) -{ - return pool->sectors_per_block_shift >= 0; -} - static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) { struct pool *pool = tc->pool; @@ -560,6 +645,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio) return block_nr; } +/* + * Returns the _complete_ blocks that this bio covers. + */ +static void get_bio_block_range(struct thin_c *tc, struct bio *bio, + dm_block_t *begin, dm_block_t *end) +{ + struct pool *pool = tc->pool; + sector_t b = bio->bi_iter.bi_sector; + sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT); + + b += pool->sectors_per_block - 1ull; /* so we round up */ + + if (block_size_is_power_of_two(pool)) { + b >>= pool->sectors_per_block_shift; + e >>= pool->sectors_per_block_shift; + } else { + (void) sector_div(b, pool->sectors_per_block); + (void) sector_div(e, pool->sectors_per_block); + } + + if (e < b) + /* Can happen if the bio is within a single block. */ + e = b; + + *begin = b; + *end = e; +} + static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) { struct pool *pool = tc->pool; @@ -648,7 +761,7 @@ struct dm_thin_new_mapping { struct list_head list; bool pass_discard:1; - bool definitely_not_shared:1; + bool maybe_shared:1; /* * Track quiescing, copying and zeroing preparation actions. When this @@ -659,9 +772,9 @@ struct dm_thin_new_mapping { int err; struct thin_c *tc; - dm_block_t virt_block; + dm_block_t virt_begin, virt_end; dm_block_t data_block; - struct dm_bio_prison_cell *cell, *cell2; + struct dm_bio_prison_cell *cell; /* * If the bio covers the whole area of a block then we can avoid @@ -701,12 +814,14 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) complete_mapping_preparation(m); } -static void overwrite_endio(struct bio *bio, int err) +static void overwrite_endio(struct bio *bio) { struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); struct dm_thin_new_mapping *m = h->overwrite_mapping; - m->err = err; + bio->bi_end_io = m->saved_bi_end_io; + + m->err = bio->bi_error; complete_mapping_preparation(m); } @@ -794,10 +909,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc, static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m) { - if (m->bio) { - m->bio->bi_end_io = m->saved_bi_end_io; - atomic_inc(&m->bio->bi_remaining); - } cell_error(m->tc->pool, m->cell); list_del(&m->list); mempool_free(m, m->tc->pool->mapping_pool); @@ -807,15 +918,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; struct pool *pool = tc->pool; - struct bio *bio; + struct bio *bio = m->bio; int r; - bio = m->bio; - if (bio) { - bio->bi_end_io = m->saved_bi_end_io; - atomic_inc(&bio->bi_remaining); - } - if (m->err) { cell_error(pool, m->cell); goto out; @@ -826,7 +931,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) * Any I/O for this block arriving after this point will get * remapped to it directly. */ - r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block); + r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block); if (r) { metadata_operation_failed(pool, "dm_thin_insert_block", r); cell_error(pool, m->cell); @@ -841,7 +946,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m) */ if (bio) { inc_remap_and_issue_cell(tc, m->cell, m->data_block); - bio_endio(bio, 0); + bio_endio(bio); } else { inc_all_io_entry(tc->pool, m->cell->holder); remap_and_issue(tc, m->cell->holder, m->data_block); @@ -853,50 +958,113 @@ out: mempool_free(m, pool->mapping_pool); } -static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) +/*----------------------------------------------------------------*/ + +static void free_discard_mapping(struct dm_thin_new_mapping *m) { struct thin_c *tc = m->tc; + if (m->cell) + cell_defer_no_holder(tc, m->cell); + mempool_free(m, tc->pool->mapping_pool); +} +static void process_prepared_discard_fail(struct dm_thin_new_mapping *m) +{ bio_io_error(m->bio); + free_discard_mapping(m); +} + +static void process_prepared_discard_success(struct dm_thin_new_mapping *m) +{ + bio_endio(m->bio); + free_discard_mapping(m); +} + +static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m) +{ + int r; + struct thin_c *tc = m->tc; + + r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end); + if (r) { + metadata_operation_failed(tc->pool, "dm_thin_remove_range", r); + bio_io_error(m->bio); + } else + bio_endio(m->bio); + cell_defer_no_holder(tc, m->cell); - cell_defer_no_holder(tc, m->cell2); mempool_free(m, tc->pool->mapping_pool); } -static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) +static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m) { + /* + * We've already unmapped this range of blocks, but before we + * passdown we have to check that these blocks are now unused. + */ + int r; + bool used = true; struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; + dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin; - inc_all_io_entry(tc->pool, m->bio); - cell_defer_no_holder(tc, m->cell); - cell_defer_no_holder(tc, m->cell2); + while (b != end) { + /* find start of unmapped run */ + for (; b < end; b++) { + r = dm_pool_block_is_used(pool->pmd, b, &used); + if (r) + return r; - if (m->pass_discard) - if (m->definitely_not_shared) - remap_and_issue(tc, m->bio, m->data_block); - else { - bool used = false; - if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used) - bio_endio(m->bio, 0); - else - remap_and_issue(tc, m->bio, m->data_block); + if (!used) + break; } - else - bio_endio(m->bio, 0); - mempool_free(m, tc->pool->mapping_pool); + if (b == end) + break; + + /* find end of run */ + for (e = b + 1; e != end; e++) { + r = dm_pool_block_is_used(pool->pmd, e, &used); + if (r) + return r; + + if (used) + break; + } + + r = issue_discard(tc, b, e, m->bio); + if (r) + return r; + + b = e; + } + + return 0; } -static void process_prepared_discard(struct dm_thin_new_mapping *m) +static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m) { int r; struct thin_c *tc = m->tc; + struct pool *pool = tc->pool; - r = dm_thin_remove_block(tc->td, m->virt_block); + r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end); if (r) - DMERR_LIMIT("dm_thin_remove_block() failed"); + metadata_operation_failed(pool, "dm_thin_remove_range", r); + + else if (m->maybe_shared) + r = passdown_double_checking_shared_status(m); + else + r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio); - process_prepared_discard_passdown(m); + /* + * Even if r is set, there could be sub discards in flight that we + * need to wait for. + */ + m->bio->bi_error = r; + bio_endio(m->bio); + cell_defer_no_holder(tc, m->cell); + mempool_free(m, pool->mapping_pool); } static void process_prepared(struct pool *pool, struct list_head *head, @@ -980,7 +1148,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m, } static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, - dm_block_t data_block, + dm_block_t data_begin, struct dm_thin_new_mapping *m) { struct pool *pool = tc->pool; @@ -990,7 +1158,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio, m->bio = bio; save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); inc_all_io_entry(pool, bio); - remap_and_issue(tc, bio, data_block); + remap_and_issue(tc, bio, data_begin); } /* @@ -1007,7 +1175,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, struct dm_thin_new_mapping *m = get_next_mapping(pool); m->tc = tc; - m->virt_block = virt_block; + m->virt_begin = virt_block; + m->virt_end = virt_block + 1u; m->data_block = data_dest; m->cell = cell; @@ -1086,7 +1255,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, atomic_set(&m->prepare_actions, 1); /* no need to quiesce */ m->tc = tc; - m->virt_block = virt_block; + m->virt_begin = virt_block; + m->virt_end = virt_block + 1u; m->data_block = data_block; m->cell = cell; @@ -1095,16 +1265,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, * zeroing pre-existing data, we can issue the bio immediately. * Otherwise we use kcopyd to zero the data first. */ - if (!pool->pf.zero_new_blocks) + if (pool->pf.zero_new_blocks) { + if (io_overwrites_block(pool, bio)) + remap_and_issue_overwrite(tc, bio, data_block, m); + else + ll_zero(tc, m, data_block * pool->sectors_per_block, + (data_block + 1) * pool->sectors_per_block); + } else process_prepared_mapping(m); - - else if (io_overwrites_block(pool, bio)) - remap_and_issue_overwrite(tc, bio, data_block, m); - - else - ll_zero(tc, m, - data_block * pool->sectors_per_block, - (data_block + 1) * pool->sectors_per_block); } static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, @@ -1270,9 +1438,10 @@ static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { int error = should_error_unserviceable_bio(pool); - if (error) - bio_endio(bio, error); - else + if (error) { + bio->bi_error = error; + bio_endio(bio); + } else retry_on_resume(bio); } @@ -1295,99 +1464,148 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c retry_on_resume(bio); } -static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) +static void process_discard_cell_no_passdown(struct thin_c *tc, + struct dm_bio_prison_cell *virt_cell) { - int r; - struct bio *bio = cell->holder; struct pool *pool = tc->pool; - struct dm_bio_prison_cell *cell2; - struct dm_cell_key key2; - dm_block_t block = get_bio_block(tc, bio); - struct dm_thin_lookup_result lookup_result; - struct dm_thin_new_mapping *m; + struct dm_thin_new_mapping *m = get_next_mapping(pool); - if (tc->requeue_mode) { - cell_requeue(pool, cell); - return; - } + /* + * We don't need to lock the data blocks, since there's no + * passdown. We only lock data blocks for allocation and breaking sharing. + */ + m->tc = tc; + m->virt_begin = virt_cell->key.block_begin; + m->virt_end = virt_cell->key.block_end; + m->cell = virt_cell; + m->bio = virt_cell->holder; - r = dm_thin_find_block(tc->td, block, 1, &lookup_result); - switch (r) { - case 0: - /* - * Check nobody is fiddling with this pool block. This can - * happen if someone's in the process of breaking sharing - * on this block. - */ - build_data_key(tc->td, lookup_result.block, &key2); - if (bio_detain(tc->pool, &key2, bio, &cell2)) { - cell_defer_no_holder(tc, cell); - break; - } + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); +} - if (io_overlaps_block(pool, bio)) { - /* - * IO may still be going to the destination block. We must - * quiesce before we can do the removal. - */ - m = get_next_mapping(pool); - m->tc = tc; - m->pass_discard = pool->pf.discard_passdown; - m->definitely_not_shared = !lookup_result.shared; - m->virt_block = block; - m->data_block = lookup_result.block; - m->cell = cell; - m->cell2 = cell2; - m->bio = bio; - - if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) - pool->process_prepared_discard(m); +/* + * __bio_inc_remaining() is used to defer parent bios's end_io until + * we _know_ all chained sub range discard bios have completed. + */ +static inline void __bio_inc_remaining(struct bio *bio) +{ + bio->bi_flags |= (1 << BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} - } else { - inc_all_io_entry(pool, bio); - cell_defer_no_holder(tc, cell); - cell_defer_no_holder(tc, cell2); +static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end, + struct bio *bio) +{ + struct pool *pool = tc->pool; + + int r; + bool maybe_shared; + struct dm_cell_key data_key; + struct dm_bio_prison_cell *data_cell; + struct dm_thin_new_mapping *m; + dm_block_t virt_begin, virt_end, data_begin; + + while (begin != end) { + r = ensure_next_mapping(pool); + if (r) + /* we did our best */ + return; + r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end, + &data_begin, &maybe_shared); + if (r) /* - * The DM core makes sure that the discard doesn't span - * a block boundary. So we submit the discard of a - * partial block appropriately. + * Silently fail, letting any mappings we've + * created complete. */ - if ((!lookup_result.shared) && pool->pf.discard_passdown) - remap_and_issue(tc, bio, lookup_result.block); - else - bio_endio(bio, 0); + break; + + build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key); + if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) { + /* contention, we'll give up with this range */ + begin = virt_end; + continue; } - break; - case -ENODATA: /* - * It isn't provisioned, just forget it. + * IO may still be going to the destination block. We must + * quiesce before we can do the removal. */ - cell_defer_no_holder(tc, cell); - bio_endio(bio, 0); - break; + m = get_next_mapping(pool); + m->tc = tc; + m->maybe_shared = maybe_shared; + m->virt_begin = virt_begin; + m->virt_end = virt_end; + m->data_block = data_begin; + m->cell = data_cell; + m->bio = bio; - default: - DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", - __func__, r); - cell_defer_no_holder(tc, cell); - bio_io_error(bio); - break; + /* + * The parent bio must not complete before sub discard bios are + * chained to it (see __blkdev_issue_discard_async's bio_chain)! + * + * This per-mapping bi_remaining increment is paired with + * the implicit decrement that occurs via bio_endio() in + * process_prepared_discard_{passdown,no_passdown}. + */ + __bio_inc_remaining(bio); + if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) + pool->process_prepared_discard(m); + + begin = virt_end; } } +static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell) +{ + struct bio *bio = virt_cell->holder; + struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); + + /* + * The virt_cell will only get freed once the origin bio completes. + * This means it will remain locked while all the individual + * passdown bios are in flight. + */ + h->cell = virt_cell; + break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio); + + /* + * We complete the bio now, knowing that the bi_remaining field + * will prevent completion until the sub range discards have + * completed. + */ + bio_endio(bio); +} + static void process_discard_bio(struct thin_c *tc, struct bio *bio) { - struct dm_bio_prison_cell *cell; - struct dm_cell_key key; - dm_block_t block = get_bio_block(tc, bio); + dm_block_t begin, end; + struct dm_cell_key virt_key; + struct dm_bio_prison_cell *virt_cell; - build_virtual_key(tc->td, block, &key); - if (bio_detain(tc->pool, &key, bio, &cell)) + get_bio_block_range(tc, bio, &begin, &end); + if (begin == end) { + /* + * The discard covers less than a block. + */ + bio_endio(bio); + return; + } + + build_key(tc->td, VIRTUAL, begin, end, &virt_key); + if (bio_detain(tc->pool, &virt_key, bio, &virt_cell)) + /* + * Potential starvation issue: We're relying on the + * fs/application being well behaved, and not trying to + * send IO to a region at the same time as discarding it. + * If they do this persistently then it's possible this + * cell will never be granted. + */ return; - process_discard_cell(tc, cell); + tc->pool->process_discard_cell(tc, virt_cell); } static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, @@ -1517,7 +1735,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block if (bio_data_dir(bio) == READ) { zero_fill_bio(bio); cell_defer_no_holder(tc, cell); - bio_endio(bio, 0); + bio_endio(bio); return; } @@ -1582,7 +1800,7 @@ static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) } else { zero_fill_bio(bio); - bio_endio(bio, 0); + bio_endio(bio); } } else provision_block(tc, bio, block, cell); @@ -1653,7 +1871,7 @@ static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, } zero_fill_bio(bio); - bio_endio(bio, 0); + bio_endio(bio); break; default: @@ -1678,7 +1896,7 @@ static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell static void process_bio_success(struct thin_c *tc, struct bio *bio) { - bio_endio(bio, 0); + bio_endio(bio); } static void process_bio_fail(struct thin_c *tc, struct bio *bio) @@ -2014,18 +2232,23 @@ static void do_waker(struct work_struct *ws) queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); } +static void notify_of_pool_mode_change_to_oods(struct pool *pool); + /* * We're holding onto IO to allow userland time to react. After the * timeout either the pool will have been resized (and thus back in - * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. + * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space. */ static void do_no_space_timeout(struct work_struct *ws) { struct pool *pool = container_of(to_delayed_work(ws), struct pool, no_space_timeout); - if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) - set_pool_mode(pool, PM_READ_ONLY); + if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) { + pool->pf.error_if_no_space = true; + notify_of_pool_mode_change_to_oods(pool); + error_retry_list_with_code(pool, -ENOSPC); + } } /*----------------------------------------------------------------*/ @@ -2103,6 +2326,32 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) dm_device_name(pool->pool_md), new_mode); } +static void notify_of_pool_mode_change_to_oods(struct pool *pool) +{ + if (!pool->pf.error_if_no_space) + notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)"); + else + notify_of_pool_mode_change(pool, "out-of-data-space (error IO)"); +} + +static bool passdown_enabled(struct pool_c *pt) +{ + return pt->adjusted_pf.discard_passdown; +} + +static void set_discard_callbacks(struct pool *pool) +{ + struct pool_c *pt = pool->ti->private; + + if (passdown_enabled(pt)) { + pool->process_discard_cell = process_discard_cell_passdown; + pool->process_prepared_discard = process_prepared_discard_passdown; + } else { + pool->process_discard_cell = process_discard_cell_no_passdown; + pool->process_prepared_discard = process_prepared_discard_no_passdown; + } +} + static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) { struct pool_c *pt = pool->ti->private; @@ -2154,7 +2403,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) pool->process_cell = process_cell_read_only; pool->process_discard_cell = process_cell_success; pool->process_prepared_mapping = process_prepared_mapping_fail; - pool->process_prepared_discard = process_prepared_discard_passdown; + pool->process_prepared_discard = process_prepared_discard_success; error_retry_list(pool); break; @@ -2169,13 +2418,12 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) * frequently seeing this mode. */ if (old_mode != new_mode) - notify_of_pool_mode_change(pool, "out-of-data-space"); + notify_of_pool_mode_change_to_oods(pool); pool->process_bio = process_bio_read_only; pool->process_discard = process_discard_bio; pool->process_cell = process_cell_read_only; - pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; - pool->process_prepared_discard = process_prepared_discard; + set_discard_callbacks(pool); if (!pool->pf.error_if_no_space && no_space_timeout) queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); @@ -2184,13 +2432,13 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) case PM_WRITE: if (old_mode != new_mode) notify_of_pool_mode_change(pool, "write"); + pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space; dm_pool_metadata_read_write(pool->pmd); pool->process_bio = process_bio; pool->process_discard = process_discard_bio; pool->process_cell = process_cell; - pool->process_discard_cell = process_discard_cell; pool->process_prepared_mapping = process_prepared_mapping; - pool->process_prepared_discard = process_prepared_discard; + set_discard_callbacks(pool); break; } @@ -2279,6 +2527,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio) h->shared_read_entry = NULL; h->all_io_entry = NULL; h->overwrite_mapping = NULL; + h->cell = NULL; } /* @@ -2297,7 +2546,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio) thin_hook_bio(tc, bio); if (tc->requeue_mode) { - bio_endio(bio, DM_ENDIO_REQUEUE); + bio->bi_error = DM_ENDIO_REQUEUE; + bio_endio(bio); return DM_MAPIO_SUBMITTED; } @@ -2426,7 +2676,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) struct pool *pool = pt->pool; struct block_device *data_bdev = pt->data_dev->bdev; struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; - sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT; const char *reason = NULL; char buf[BDEVNAME_SIZE]; @@ -2439,12 +2688,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt) else if (data_limits->max_discard_sectors < pool->sectors_per_block) reason = "max discard sectors smaller than a block"; - else if (data_limits->discard_granularity > block_size) - reason = "discard granularity larger than a block"; - - else if (!is_factor(block_size, data_limits->discard_granularity)) - reason = "discard granularity not a factor of block size"; - if (reason) { DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); pt->adjusted_pf.discard_passdown = false; @@ -2959,7 +3202,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) metadata_low_callback, pool); if (r) - goto out_free_pt; + goto out_flags_changed; pt->callbacks.congested_fn = pool_is_congested; dm_table_add_target_callbacks(ti->table, &pt->callbacks); @@ -3210,8 +3453,8 @@ static void pool_postsuspend(struct dm_target *ti) struct pool_c *pt = ti->private; struct pool *pool = pt->pool; - cancel_delayed_work(&pool->waker); - cancel_delayed_work(&pool->no_space_timeout); + cancel_delayed_work_sync(&pool->waker); + cancel_delayed_work_sync(&pool->no_space_timeout); flush_workqueue(pool->wq); (void) commit(pool); } @@ -3389,7 +3632,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) if (get_pool_mode(pool) >= PM_READ_ONLY) { DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", dm_device_name(pool->pool_md)); - return -EINVAL; + return -EOPNOTSUPP; } if (!strcasecmp(argv[0], "create_thin")) @@ -3447,6 +3690,7 @@ static void emit_flags(struct pool_features *pf, char *result, * Status line is: * <transaction id> <used metadata sectors>/<total metadata sectors> * <used data sectors>/<total data sectors> <held metadata root> + * <pool mode> <discard config> <no space config> <needs_check> */ static void pool_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) @@ -3548,6 +3792,11 @@ static void pool_status(struct dm_target *ti, status_type_t type, else DMEMIT("queue_if_no_space "); + if (dm_pool_metadata_needs_check(pool->pmd)) + DMEMIT("needs_check "); + else + DMEMIT("- "); + break; case STATUSTYPE_TABLE: @@ -3573,38 +3822,6 @@ static int pool_iterate_devices(struct dm_target *ti, return fn(ti, pt->data_dev, 0, ti->len, data); } -static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct pool_c *pt = ti->private; - struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = pt->data_dev->bdev; - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - -static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) -{ - struct pool *pool = pt->pool; - struct queue_limits *data_limits; - - limits->max_discard_sectors = pool->sectors_per_block; - - /* - * discard_granularity is just a hint, and not enforced. - */ - if (pt->adjusted_pf.discard_passdown) { - data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; - limits->discard_granularity = max(data_limits->discard_granularity, - pool->sectors_per_block << SECTOR_SHIFT); - } else - limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; -} - static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) { struct pool_c *pt = ti->private; @@ -3659,14 +3876,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) disable_passdown_if_not_supported(pt); - set_discard_limits(pt, limits); + /* + * The pool uses the same discard limits as the underlying data + * device. DM core has already set this up. + */ } static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 14, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -3678,7 +3898,6 @@ static struct target_type pool_target = { .resume = pool_resume, .message = pool_message, .status = pool_status, - .merge = pool_merge, .iterate_devices = pool_iterate_devices, .io_hints = pool_io_hints, }; @@ -3825,8 +4044,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; - /* Discard bios must be split on a block boundary */ - ti->split_discard_bios = true; + ti->split_discard_bios = false; } mutex_unlock(&dm_thin_pool_table.mutex); @@ -3913,6 +4131,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err) } } + if (h->cell) + cell_defer_no_holder(h->tc, h->cell); + return 0; } @@ -4003,21 +4224,6 @@ err: DMEMIT("Error"); } -static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, - struct bio_vec *biovec, int max_size) -{ - struct thin_c *tc = ti->private; - struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev); - - if (!q->merge_bvec_fn) - return max_size; - - bvm->bi_bdev = tc->pool_dev->bdev; - bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector); - - return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); -} - static int thin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { @@ -4040,9 +4246,21 @@ static int thin_iterate_devices(struct dm_target *ti, return 0; } +static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) +{ + struct thin_c *tc = ti->private; + struct pool *pool = tc->pool; + + if (!pool->pf.discard_enabled) + return; + + limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; + limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */ +} + static struct target_type thin_target = { .name = "thin", - .version = {1, 14, 0}, + .version = {1, 16, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, @@ -4052,8 +4270,8 @@ static struct target_type thin_target = { .presuspend = thin_presuspend, .postsuspend = thin_postsuspend, .status = thin_status, - .merge = thin_merge, .iterate_devices = thin_iterate_devices, + .io_hints = thin_io_hints, }; /*----------------------------------------------------------------*/ |