diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/block | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. Kernel sources
are taken from kernel.org, and rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
Collisions have been removed because its logic was found on the
source already.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/block')
34 files changed, 2850 insertions, 2435 deletions
diff --git a/kernel/block/bio-integrity.c b/kernel/block/bio-integrity.c index 39ce74d10..f6325d573 100644 --- a/kernel/block/bio-integrity.c +++ b/kernel/block/bio-integrity.c @@ -32,6 +32,11 @@ static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -140,6 +145,11 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; + if (bip->bip_vcnt && + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; @@ -172,11 +182,11 @@ bool bio_integrity_enabled(struct bio *bio) if (bi == NULL) return false; - if (bio_data_dir(bio) == READ && bi->verify_fn != NULL && + if (bio_data_dir(bio) == READ && bi->profile->verify_fn != NULL && (bi->flags & BLK_INTEGRITY_VERIFY)) return true; - if (bio_data_dir(bio) == WRITE && bi->generate_fn != NULL && + if (bio_data_dir(bio) == WRITE && bi->profile->generate_fn != NULL && (bi->flags & BLK_INTEGRITY_GENERATE)) return true; @@ -197,7 +207,7 @@ EXPORT_SYMBOL(bio_integrity_enabled); static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { - return sectors >> (ilog2(bi->interval) - 9); + return sectors >> (bi->interval_exp - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, @@ -224,7 +234,7 @@ static int bio_integrity_process(struct bio *bio, bip->bip_vec->bv_offset; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = bi->interval; + iter.interval = 1 << bi->interval_exp; iter.seed = bip_get_seed(bip); iter.prot_buf = prot_buf; @@ -335,7 +345,7 @@ int bio_integrity_prep(struct bio *bio) /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) - bio_integrity_process(bio, bi->generate_fn); + bio_integrity_process(bio, bi->profile->generate_fn); return 0; } @@ -355,13 +365,12 @@ static void bio_integrity_verify_fn(struct work_struct *work) container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); - int error; - error = bio_integrity_process(bio, bi->verify_fn); + bio->bi_error = bio_integrity_process(bio, bi->profile->verify_fn); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio); } /** @@ -376,7 +385,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) * in process context. This function postpones completion * accordingly. */ -void bio_integrity_endio(struct bio *bio, int error) +void bio_integrity_endio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -386,9 +395,9 @@ void bio_integrity_endio(struct bio *bio, int error) * integrity metadata. Restore original bio end_io handler * and run it. */ - if (error) { + if (bio->bi_error) { bio->bi_end_io = bip->bip_end_io; - bio_endio_nodec(bio, error); + bio_endio(bio); return; } diff --git a/kernel/block/bio.c b/kernel/block/bio.c index 4441522ca..d4d144363 100644 --- a/kernel/block/bio.c +++ b/kernel/block/bio.c @@ -211,7 +211,7 @@ fallback: bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); /* * Make this allocation restricted and don't dump info on @@ -221,11 +221,11 @@ fallback: __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; /* - * Try a slab allocation. If this fails and __GFP_WAIT + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM * is set, retry with the 1-entry mempool */ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) { + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { *idx = BIOVEC_MAX_IDX; goto fallback; } @@ -269,9 +269,8 @@ static void bio_free(struct bio *bio) void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); - bio->bi_flags = 1 << BIO_UPTODATE; - atomic_set(&bio->bi_remaining, 1); - atomic_set(&bio->bi_cnt, 1); + atomic_set(&bio->__bi_remaining, 1); + atomic_set(&bio->__bi_cnt, 1); } EXPORT_SYMBOL(bio_init); @@ -292,17 +291,31 @@ void bio_reset(struct bio *bio) __bio_free(bio); memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags|(1 << BIO_UPTODATE); - atomic_set(&bio->bi_remaining, 1); + bio->bi_flags = flags; + atomic_set(&bio->__bi_remaining, 1); } EXPORT_SYMBOL(bio_reset); -static void bio_chain_endio(struct bio *bio, int error) +static void bio_chain_endio(struct bio *bio) { - bio_endio(bio->bi_private, error); + struct bio *parent = bio->bi_private; + + parent->bi_error = bio->bi_error; + bio_endio(parent); bio_put(bio); } +/* + * Increment chain count for the bio. Make sure the CHAIN flag update + * is visible before the raised count. + */ +static inline void bio_inc_remaining(struct bio *bio) +{ + bio_set_flag(bio, BIO_CHAIN); + smp_mb__before_atomic(); + atomic_inc(&bio->__bi_remaining); +} + /** * bio_chain - chain bio completions * @bio: the target bio @@ -320,7 +333,7 @@ void bio_chain(struct bio *bio, struct bio *parent) bio->bi_private = parent; bio->bi_end_io = bio_chain_endio; - atomic_inc(&parent->bi_remaining); + bio_inc_remaining(parent); } EXPORT_SYMBOL(bio_chain); @@ -382,12 +395,12 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is * backed by the @bs's mempool. * - * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be - * able to allocate a bio. This is due to the mempool guarantees. To make this - * work, callers must never allocate more than 1 bio at a time from this pool. - * Callers that need to allocate more than 1 bio must always submit the - * previously allocated bio for IO before attempting to allocate a new one. - * Failure to do so can cause deadlocks under memory pressure. + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. * * Note that when running under generic_make_request() (i.e. any block * driver), bios are not submitted until after you return - see the code in @@ -446,13 +459,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) * We solve this, and guarantee forward progress, with a rescuer * workqueue per bio_set. If we go to allocate and there are * bios on current->bio_list, we first try the allocation - * without __GFP_WAIT; if that fails, we punt those bios we - * would be blocking to the rescuer workqueue before we retry - * with the original gfp_flags. + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. */ if (current->bio_list && !bio_list_empty(current->bio_list)) - gfp_mask &= ~__GFP_WAIT; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; p = mempool_alloc(bs->bio_pool, gfp_mask); if (!p && gfp_mask != saved_gfp) { @@ -482,7 +495,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= 1 << BIO_OWNS_VEC; + bio_set_flag(bio, BIO_OWNS_VEC); } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } @@ -524,13 +537,17 @@ EXPORT_SYMBOL(zero_fill_bio); **/ void bio_put(struct bio *bio) { - BIO_BUG_ON(!atomic_read(&bio->bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->bi_cnt)) + if (!bio_flagged(bio, BIO_REFFED)) bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } } EXPORT_SYMBOL(bio_put); @@ -563,7 +580,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) * so we don't set nor calculate new physical/hw segment counts here */ bio->bi_bdev = bio_src->bi_bdev; - bio->bi_flags |= 1 << BIO_CLONED; + bio_set_flag(bio, BIO_CLONED); bio->bi_rw = bio_src->bi_rw; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; @@ -677,31 +694,22 @@ integrity_clone: EXPORT_SYMBOL(bio_clone_bioset); /** - * bio_get_nr_vecs - return approx number of vecs - * @bdev: I/O target + * bio_add_pc_page - attempt to add page to bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset * - * Return the approximate number of pages we can send to this target. - * There's no guarantee that you will be able to fit this number of pages - * into a bio, it does not account for dynamic restrictions that vary - * on offset. + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by REQ_PC bios. */ -int bio_get_nr_vecs(struct block_device *bdev) -{ - struct request_queue *q = bdev_get_queue(bdev); - int nr_pages; - - nr_pages = min_t(unsigned, - queue_max_segments(q), - queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1); - - return min_t(unsigned, nr_pages, BIO_MAX_PAGES); - -} -EXPORT_SYMBOL(bio_get_nr_vecs); - -static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page - *page, unsigned int len, unsigned int offset, - unsigned int max_sectors) +int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page + *page, unsigned int len, unsigned int offset) { int retried_segments = 0; struct bio_vec *bvec; @@ -712,7 +720,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (unlikely(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q)) return 0; /* @@ -725,28 +733,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { - unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; - - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - /* prev_bvec is already charged in - bi_size, discharge it in order to - simulate merging updated prev_bvec - as new bvec. */ - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - - prev_bv_len, - .bi_rw = bio->bi_rw, - }; - - if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { - prev->bv_len -= len; - return 0; - } - } - bio->bi_iter.bi_size += len; goto done; } @@ -755,8 +742,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS) && - bvec_gap_to_prev(prev, offset)) + if (bvec_gap_to_prev(q, prev, offset)) return 0; } @@ -789,30 +775,9 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page blk_recount_segments(q, bio); } - /* - * if queue has other restrictions (eg varying max sector size - * depending on offset), it can specify a merge_bvec_fn in the - * queue to get further control - */ - if (q->merge_bvec_fn) { - struct bvec_merge_data bvm = { - .bi_bdev = bio->bi_bdev, - .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size - len, - .bi_rw = bio->bi_rw, - }; - - /* - * merge_bvec_fn() returns number of bytes it can accept - * at this offset - */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) - goto failed; - } - /* If we may be able to merge these biovecs, force a recount */ if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) - bio->bi_flags &= ~(1 << BIO_SEG_VALID); + bio_clear_flag(bio, BIO_SEG_VALID); done: return len; @@ -826,28 +791,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page blk_recount_segments(q, bio); return 0; } - -/** - * bio_add_pc_page - attempt to add page to bio - * @q: the target queue - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset - * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. - * - * This should only be used by REQ_PC bios. - */ -int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) -{ - return __bio_add_page(q, bio, page, len, offset, - queue_max_hw_sectors(q)); -} EXPORT_SYMBOL(bio_add_pc_page); /** @@ -857,22 +800,47 @@ EXPORT_SYMBOL(bio_add_pc_page); * @len: vec entry length * @offset: vec entry offset * - * Attempt to add a page to the bio_vec maplist. This can fail for a - * number of reasons, such as the bio being full or target block device - * limitations. The target block device must allow bio's up to PAGE_SIZE, - * so it is always possible to add a single page to an empty bio. + * Attempt to add a page to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. */ -int bio_add_page(struct bio *bio, struct page *page, unsigned int len, - unsigned int offset) +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int max_sectors; + struct bio_vec *bv; + + /* + * cloned bio must not modify vec list + */ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + + /* + * For filesystems with a blocksize smaller than the pagesize + * we will often be called with the same page as last time and + * a consecutive offset. Optimize this special case. + */ + if (bio->bi_vcnt > 0) { + bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - max_sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); - if ((max_sectors < (len >> 9)) && !bio->bi_iter.bi_size) - max_sectors = len >> 9; + if (page == bv->bv_page && + offset == bv->bv_offset + bv->bv_len) { + bv->bv_len += len; + goto done; + } + } - return __bio_add_page(q, bio, page, len, offset, max_sectors); + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + + bv = &bio->bi_io_vec[bio->bi_vcnt]; + bv->bv_page = page; + bv->bv_len = len; + bv->bv_offset = offset; + + bio->bi_vcnt++; +done: + bio->bi_iter.bi_size += len; + return len; } EXPORT_SYMBOL(bio_add_page); @@ -881,11 +849,11 @@ struct submit_bio_ret { int error; }; -static void submit_bio_wait_endio(struct bio *bio, int error) +static void submit_bio_wait_endio(struct bio *bio) { struct submit_bio_ret *ret = bio->bi_private; - ret->error = error; + ret->error = bio->bi_error; complete(&ret->event); } @@ -1122,9 +1090,12 @@ int bio_uncopy_user(struct bio *bio) if (!bio_flagged(bio, BIO_NULL_MAPPED)) { /* * if we're in a workqueue, the request is orphaned, so - * don't copy into a random user address space, just free. + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. */ - if (current->mm && bio_data_dir(bio) == READ) + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); @@ -1373,7 +1344,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, if (iter->type & WRITE) bio->bi_rw |= REQ_WRITE; - bio->bi_flags |= (1 << BIO_USER_MAPPED); + bio_set_flag(bio, BIO_USER_MAPPED); /* * subtle -- if __bio_map_user() ended up bouncing a bio, @@ -1430,7 +1401,7 @@ void bio_unmap_user(struct bio *bio) } EXPORT_SYMBOL(bio_unmap_user); -static void bio_map_kern_endio(struct bio *bio, int err) +static void bio_map_kern_endio(struct bio *bio) { bio_put(bio); } @@ -1486,13 +1457,13 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, } EXPORT_SYMBOL(bio_map_kern); -static void bio_copy_kern_endio(struct bio *bio, int err) +static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); bio_put(bio); } -static void bio_copy_kern_endio_read(struct bio *bio, int err) +static void bio_copy_kern_endio_read(struct bio *bio) { char *p = bio->bi_private; struct bio_vec *bvec; @@ -1503,7 +1474,7 @@ static void bio_copy_kern_endio_read(struct bio *bio, int err) p += bvec->bv_len; } - bio_copy_kern_endio(bio, err); + bio_copy_kern_endio(bio); } /** @@ -1741,32 +1712,39 @@ void bio_flush_dcache_pages(struct bio *bi) EXPORT_SYMBOL(bio_flush_dcache_pages); #endif +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) { + bio_clear_flag(bio, BIO_CHAIN); + return true; + } + + return false; +} + /** * bio_endio - end I/O on a bio * @bio: bio - * @error: error, if any * * Description: - * bio_endio() will end I/O on the whole bio. bio_endio() is the - * preferred way to end I/O on a bio, it takes care of clearing - * BIO_UPTODATE on error. @error is 0 on success, and and one of the - * established -Exxxx (-EIO, for instance) error values in case - * something went wrong. No one should call bi_end_io() directly on a - * bio unless they own it and thus know that it has an end_io - * function. + * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred + * way to end I/O on a bio. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io function. **/ -void bio_endio(struct bio *bio, int error) +void bio_endio(struct bio *bio) { while (bio) { - BUG_ON(atomic_read(&bio->bi_remaining) <= 0); - - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (!atomic_dec_and_test(&bio->bi_remaining)) - return; + if (unlikely(!bio_remaining_done(bio))) + break; /* * Need to have a real endio function for chained bios, @@ -1778,11 +1756,12 @@ void bio_endio(struct bio *bio, int error) */ if (bio->bi_end_io == bio_chain_endio) { struct bio *parent = bio->bi_private; + parent->bi_error = bio->bi_error; bio_put(bio); bio = parent; } else { if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio->bi_end_io(bio); bio = NULL; } } @@ -1790,21 +1769,6 @@ void bio_endio(struct bio *bio, int error) EXPORT_SYMBOL(bio_endio); /** - * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining - * @bio: bio - * @error: error, if any - * - * For code that has saved and restored bi_end_io; thing hard before using this - * function, probably you should've cloned the entire bio. - **/ -void bio_endio_nodec(struct bio *bio, int error) -{ - atomic_inc(&bio->bi_remaining); - bio_endio(bio, error); -} -EXPORT_SYMBOL(bio_endio_nodec); - -/** * bio_split - split a bio * @bio: bio to split * @sectors: number of sectors to split from the front of @bio @@ -1865,7 +1829,7 @@ void bio_trim(struct bio *bio, int offset, int size) if (offset == 0 && size == bio->bi_iter.bi_size) return; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); bio_advance(bio, offset << 9); @@ -1980,6 +1944,29 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_ EXPORT_SYMBOL(bioset_create_nobvec); #ifdef CONFIG_BLK_CGROUP + +/** + * bio_associate_blkcg - associate a bio with the specified blkcg + * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. + * + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. + */ +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +{ + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; +} +EXPORT_SYMBOL_GPL(bio_associate_blkcg); + /** * bio_associate_current - associate a bio with %current * @bio: target bio @@ -1996,28 +1983,20 @@ EXPORT_SYMBOL(bioset_create_nobvec); int bio_associate_current(struct bio *bio) { struct io_context *ioc; - struct cgroup_subsys_state *css; - if (bio->bi_ioc) + if (bio->bi_css) return -EBUSY; ioc = current->io_context; if (!ioc) return -ENOENT; - /* acquire active ref on @ioc and associate */ get_io_context_active(ioc); bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget_online(css)) - bio->bi_css = css; - rcu_read_unlock(); - + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } +EXPORT_SYMBOL_GPL(bio_associate_current); /** * bio_disassociate_task - undo bio_associate_current() diff --git a/kernel/block/blk-cgroup.c b/kernel/block/blk-cgroup.c index 6817e2896..5a37188b5 100644 --- a/kernel/block/blk-cgroup.c +++ b/kernel/block/blk-cgroup.c @@ -9,29 +9,46 @@ * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> + * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> -#include "blk-cgroup.h" +#include <linux/ctype.h> +#include <linux/blk-cgroup.h> #include "blk.h" #define MAX_KEY_LEN 100 +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. + */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -52,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -77,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -97,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -113,26 +139,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -150,35 +161,18 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -186,26 +180,34 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_NOWAIT); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; - goto err_put_css; + ret = -ENODEV; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -215,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -229,24 +231,21 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) { - if (blkcg == &blkcg_root) { - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - } + if (!ret) return blkg; - } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -281,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -305,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -323,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); + } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -340,15 +345,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * If root blkg is destroyed. Just clear the pointer since root_rl - * does not take reference on root blkg. - */ - if (blkcg == &blkcg_root) { - blkg->q->root_blkg = NULL; - blkg->q->root_rl.blkg = NULL; - } - - /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ @@ -374,6 +370,9 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } + + q->root_blkg = NULL; + q->root_rl.blkg = NULL; } /* @@ -387,21 +386,14 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); + wb_congested_put(blkg->wb_congested); + blkg_free(blkg); } EXPORT_SYMBOL_GPL(__blkg_release_rcu); @@ -448,20 +440,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; - /* - * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex - * which ends up putting cgroup's internal cgroup_tree_mutex under - * it; however, cgroup_tree_mutex is nested above cgroup file - * active protection and grabbing blkcg_pol_mutex from a cgroup - * file operation creates a possible circular dependency. cgroup - * internal locking is planned to go through further simplification - * and this issue should go away soon. For now, let's trylock - * blkcg_pol_mutex and restart the write on failure. - * - * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com - */ - if (!mutex_trylock(&blkcg_pol_mutex)) - return restart_syscall(); + mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* @@ -470,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -484,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -579,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -618,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg + * + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; + + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -652,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -698,29 +778,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; + + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) + return -EINVAL; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + body = input + key_len; + if (!isspace(*body)) return -EINVAL; + body = skip_spaces(body); disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { put_disk(disk); - return -EINVAL; + return -ENODEV; } rcu_read_lock(); @@ -729,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); @@ -751,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -772,8 +857,56 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { + { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, @@ -813,38 +946,91 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; + + mutex_lock(&blkcg_pol_mutex); + + list_del(&blkcg->all_blkcgs_node); + + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - if (blkcg != &blkcg_root) - kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; + struct cgroup_subsys_state *ret; + int i; + + mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; - goto done; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_alloc_fn) + continue; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; + cpd->plid = i; + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); + } - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); +free_blkcg: + kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + return ret; } /** @@ -859,9 +1045,45 @@ done: */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); - return blk_throtl_init(q); + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. + */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); + + if (IS_ERR(blkg)) { + blkg_free(new_blkg); + return PTR_ERR(blkg); + } + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -905,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -925,12 +1147,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -940,7 +1185,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -961,96 +1206,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; - struct blkg_policy_data *pd, *n; - int cnt = 0, ret; - bool preloaded; + struct blkg_policy_data *pd_prealloc = NULL; + struct blkcg_gq *blkg; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ - spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - - spin_unlock_irq(q->queue_lock); - - /* allocate policy_data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1076,21 +1279,16 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1109,11 +1307,10 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); */ int blkcg_policy_register(struct blkcg_policy *pol) { + struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ @@ -1122,19 +1319,55 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) - goto out_unlock; + goto err_unlock; - /* register and update blkgs */ + /* register @pol */ pol->plid = i; - blkcg_policy[i] = pol; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + mutex_unlock(&blkcg_pol_mutex); + goto err_free_cpds; + } + + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; + cpd->plid = pol->plid; + pol->cpd_init_fn(cpd); + } + } + + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, - pol->cftypes)); - ret = 0; -out_unlock: + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_free_cpds: + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } + blkcg_policy[pol->plid] = NULL; +err_unlock: mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); @@ -1147,18 +1380,34 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - mutex_lock(&blkcg_pol_mutex); + struct blkcg *blkcg; + + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); + + /* remove cpds and unregister */ + mutex_lock(&blkcg_pol_mutex); - /* unregister and update blkgs */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } blkcg_policy[pol->plid] = NULL; -out_unlock: + mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); diff --git a/kernel/block/blk-cgroup.h b/kernel/block/blk-cgroup.h deleted file mode 100644 index c567865b5..000000000 --- a/kernel/block/blk-cgroup.h +++ /dev/null @@ -1,603 +0,0 @@ -#ifndef _BLK_CGROUP_H -#define _BLK_CGROUP_H -/* - * Common Block IO controller cgroup interface - * - * Based on ideas and code from CFQ, CFS and BFQ: - * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> - * - * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> - * Paolo Valente <paolo.valente@unimore.it> - * - * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> - * Nauman Rafique <nauman@google.com> - */ - -#include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> -#include <linux/seq_file.h> -#include <linux/radix-tree.h> -#include <linux/blkdev.h> -#include <linux/atomic.h> - -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX - -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; -}; - -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; -}; - -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; -}; - -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; - - /* used during policy activation */ - struct list_head alloc_node; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - - /* reference count */ - atomic_t refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; -}; - -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; -}; - -extern struct blkcg blkcg_root; - -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; -}; - -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); -} - -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - return task_blkcg(current); -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "<unavailable>", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); -} - -void __blkg_release_rcu(struct rcu_head *rcu); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); -} - -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. - */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - -static inline void blkg_stat_init(struct blkg_stat *stat) -{ - u64_stats_init(&stat->syncp); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_stat_merge - merge a blkg_stat into another - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count to @to. - */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) -{ - blkg_stat_add(to, blkg_stat_read(from)); -} - -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) -{ - u64_stats_init(&rwstat->syncp); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); -} - -/** - * blkg_rwstat_merge - merge a blkg_rwstat into another - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's counts to @to. - */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - struct blkg_rwstat v = blkg_rwstat_read(from); - int i; - - u64_stats_update_begin(&to->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); -} - -#else /* CONFIG_BLK_CGROUP */ - -struct cgroup; -struct blkcg; - -struct blkg_policy_data { -}; - -struct blkcg_gq { -}; - -struct blkcg_policy { -}; - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ diff --git a/kernel/block/blk-core.c b/kernel/block/blk-core.c index 4e7dded64..2f7afb98e 100644 --- a/kernel/block/blk-core.c +++ b/kernel/block/blk-core.c @@ -32,12 +32,12 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); @@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. + */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -121,18 +146,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; + bio->bi_error = error; if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) - bio_endio(bio, error); + bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -187,6 +210,22 @@ void blk_delay_queue(struct request_queue *q, unsigned long msecs) EXPORT_SYMBOL(blk_delay_queue); /** + * blk_start_queue_async - asynchronously restart a previously stopped queue + * @q: The &struct request_queue in question + * + * Description: + * blk_start_queue_async() will clear the stop flag on the queue, and + * ensure that the request_fn for the queue is run from an async + * context. + **/ +void blk_start_queue_async(struct request_queue *q) +{ + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + blk_run_queue_async(q); +} +EXPORT_SYMBOL(blk_start_queue_async); + +/** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question * @@ -288,6 +327,7 @@ inline void __blk_run_queue_uncond(struct request_queue *q) q->request_fn(q); q->request_fn_active--; } +EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); /** * __blk_run_queue - run a single device queue @@ -533,29 +573,30 @@ void blk_cleanup_queue(struct request_queue *q) * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that q->request_fn() gets invoked after draining finished. */ - if (q->mq_ops) { - blk_mq_freeze_queue(q); - spin_lock_irq(lock); - } else { - spin_lock_irq(lock); + blk_freeze_queue(q); + spin_lock_irq(lock); + if (!q->mq_ops) __blk_drain_queue(q, true); - } queue_flag_set(QUEUE_FLAG_DEAD, q); spin_unlock_irq(lock); + /* for synchronous bio-based driver finish in-flight integrity i/o */ + blk_flush_integrity(); + /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); if (q->mq_ops) blk_mq_free_queue(q); + percpu_ref_exit(&q->q_usage_counter); spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_destroy(&q->backing_dev_info); + bdi_unregister(&q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -608,6 +649,40 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask) } EXPORT_SYMBOL(blk_alloc_queue); +int blk_queue_enter(struct request_queue *q, gfp_t gfp) +{ + while (true) { + int ret; + + if (percpu_ref_tryget_live(&q->q_usage_counter)) + return 0; + + if (!gfpflags_allow_blocking(gfp)) + return -EBUSY; + + ret = swait_event_interruptible(q->mq_freeze_wq, + !atomic_read(&q->mq_freeze_depth) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; + if (ret) + return ret; + } +} + +void blk_queue_exit(struct request_queue *q) +{ + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_queue_usage_counter_release(struct percpu_ref *ref) +{ + struct request_queue *q = + container_of(ref, struct request_queue, q_usage_counter); + + swake_up_all(&q->mq_freeze_wq); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -622,16 +697,19 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (q->id < 0) goto fail_q; + q->bio_split = bioset_create(BIO_POOL_SIZE, 0); + if (!q->bio_split) + goto fail_id; + q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = 0; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; err = bdi_init(&q->backing_dev_info); if (err) - goto fail_id; + goto fail_split; setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); @@ -664,15 +742,28 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->bypass_depth = 1; __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); - init_swait_head(&q->mq_freeze_wq); + init_swait_queue_head(&q->mq_freeze_wq); - if (blkcg_init_queue(q)) + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->q_usage_counter, + blk_queue_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; + if (blkcg_init_queue(q)) + goto fail_ref; + return q; +fail_ref: + percpu_ref_exit(&q->q_usage_counter); fail_bdi: bdi_destroy(&q->backing_dev_info); +fail_split: + bioset_free(q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: @@ -737,7 +828,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) } EXPORT_SYMBOL(blk_init_queue_node); -static void blk_queue_bio(struct request_queue *q, struct bio *bio); +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, @@ -848,13 +939,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -887,25 +973,25 @@ static void freed_request(struct request_list *rl, unsigned int flags) int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; + int on_thresh, off_thresh; spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); - blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { @@ -1015,12 +1101,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* @@ -1144,8 +1225,8 @@ rq_starved: * @bio: bio to allocate request for (can be %NULL) * @gfp_mask: allocation mask * - * Get a free request from @q. If %__GFP_WAIT is set in @gfp_mask, this - * function keeps retrying under memory pressure and fails iff @q is dead. + * Get a free request from @q. If %__GFP_DIRECT_RECLAIM is set in @gfp_mask, + * this function keeps retrying under memory pressure and fails iff @q is dead. * * Must be called with @q->queue_lock held and, * Returns ERR_PTR on failure, with @q->queue_lock held. @@ -1165,7 +1246,7 @@ retry: if (!IS_ERR(rq)) return rq; - if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dying(q))) { + if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) { blk_put_rl(rl); return rq; } @@ -1243,11 +1324,11 @@ EXPORT_SYMBOL(blk_get_request); * BUG. * * WARNING: When allocating/cloning a bio-chain, careful consideration should be - * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for - * anything but the first bio in the chain. Otherwise you risk waiting for IO - * completion of a bio that hasn't been submitted yet, thus resulting in a - * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead - * of bio_alloc(), as that avoids the mempool deadlock. + * given to how you allocate bios. In particular, you cannot use + * __GFP_DIRECT_RECLAIM for anything but the first bio in the chain. Otherwise + * you risk waiting for IO completion of a bio that hasn't been submitted yet, + * thus resulting in a deadlock. Alternatively bios should be allocated using + * bio_kmalloc() instead of bio_alloc(), as that avoids the mempool deadlock. * If possible a big IO should be split into smaller parts when allocation * fails. Partial allocation should not be an error, or you risk a live-lock. */ @@ -1513,6 +1594,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @request_count: out parameter for number of traversed plugged requests + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) * * Determine whether @bio being queued on @q can be merged with a request * on %current's plugged list. Returns %true if merge was successful, @@ -1528,7 +1612,8 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count) + unsigned int *request_count, + struct request **same_queue_rq) { struct blk_plug *plug; struct request *rq; @@ -1548,8 +1633,16 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, list_for_each_entry_reverse(rq, plug_list, queuelist) { int el_ret; - if (rq->q == q) + if (rq->q == q) { (*request_count)++; + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + if (same_queue_rq) + *same_queue_rq = rq; + } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) continue; @@ -1569,6 +1662,30 @@ out: return ret; } +unsigned int blk_plug_queued_count(struct request_queue *q) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + unsigned int ret = 0; + + plug = current->plug; + if (!plug) + goto out; + + if (q->mq_ops) + plug_list = &plug->mq_list; + else + plug_list = &plug->list; + + list_for_each_entry(rq, plug_list, queuelist) { + if (rq->q == q) + ret++; + } +out: + return ret; +} + void init_request_from_bio(struct request *req, struct bio *bio) { req->cmd_type = REQ_TYPE_FS; @@ -1583,7 +1700,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) blk_rq_bio_prep(req->q, req, bio); } -static void blk_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) { const bool sync = !!(bio->bi_rw & REQ_SYNC); struct blk_plug *plug; @@ -1598,9 +1715,12 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); + blk_queue_split(q, &bio, q->bio_split); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio->bi_error = -EIO; + bio_endio(bio); + return BLK_QC_T_NONE; } if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { @@ -1613,9 +1733,11 @@ static void blk_queue_bio(struct request_queue *q, struct bio *bio) * Check if we can merge with the plugged list before grabbing * any locks. */ - if (!blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) - return; + if (!blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); spin_lock_irq(q->queue_lock); @@ -1652,7 +1774,8 @@ get_rq: */ req = get_request(q, rw_flags, bio, GFP_NOIO); if (IS_ERR(req)) { - bio_endio(bio, PTR_ERR(req)); /* @q is dead */ + bio->bi_error = PTR_ERR(req); + bio_endio(bio); goto out_unlock; } @@ -1690,6 +1813,8 @@ get_rq: out_unlock: spin_unlock_irq(q->queue_lock); } + + return BLK_QC_T_NONE; } /* @@ -1721,8 +1846,6 @@ static void handle_bad_sector(struct bio *bio) bio->bi_rw, (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); - - set_bit(BIO_EOF, &bio->bi_flags); } #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -1813,15 +1936,6 @@ generic_make_request_checks(struct bio *bio) goto end_io; } - if (likely(bio_is_rw(bio) && - nr_sectors > queue_max_hw_sectors(q))) { - printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - queue_max_hw_sectors(q)); - goto end_io; - } - part = bio->bi_bdev->bd_part; if (should_fail_request(part, bio->bi_iter.bi_size) || should_fail_request(&part_to_disk(part)->part0, @@ -1870,14 +1984,15 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; end_io: - bio_endio(bio, err); + bio->bi_error = err; + bio_endio(bio); return false; } @@ -1905,12 +2020,13 @@ end_io: * a lower device by calling into generic_make_request recursively, which * means the bio should NOT be touched after the call to ->make_request_fn. */ -void generic_make_request(struct bio *bio) +blk_qc_t generic_make_request(struct bio *bio) { struct bio_list bio_list_on_stack; + blk_qc_t ret = BLK_QC_T_NONE; if (!generic_make_request_checks(bio)) - return; + goto out; /* * We only want one ->make_request_fn to be active at a time, else @@ -1924,7 +2040,7 @@ void generic_make_request(struct bio *bio) */ if (current->bio_list) { bio_list_add(current->bio_list, bio); - return; + goto out; } /* following loop may be a bit non-obvious, and so deserves some @@ -1947,11 +2063,24 @@ void generic_make_request(struct bio *bio) do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - q->make_request_fn(q, bio); + if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) { + + ret = q->make_request_fn(q, bio); - bio = bio_list_pop(current->bio_list); + blk_queue_exit(q); + + bio = bio_list_pop(current->bio_list); + } else { + struct bio *bio_next = bio_list_pop(current->bio_list); + + bio_io_error(bio); + bio = bio_next; + } } while (bio); current->bio_list = NULL; /* deactivate */ + +out: + return ret; } EXPORT_SYMBOL(generic_make_request); @@ -1965,7 +2094,7 @@ EXPORT_SYMBOL(generic_make_request); * interfaces; @bio must be presetup and ready for I/O. * */ -void submit_bio(int rw, struct bio *bio) +blk_qc_t submit_bio(int rw, struct bio *bio) { bio->bi_rw |= rw; @@ -1999,12 +2128,13 @@ void submit_bio(int rw, struct bio *bio) } } - generic_make_request(bio); + return generic_make_request(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_rq_check_limits - Helper function to check a request for the queue limit + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for new the queue limits * @q: the queue * @rq: the request being checked * @@ -2015,20 +2145,13 @@ EXPORT_SYMBOL(submit_bio); * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * - * This function should also be useful for request stacking drivers - * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue - * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests against - * the new queue limits again when they dispatch those requests, - * although such checkings are also done against the old queue limits - * when submitting requests. + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. */ -int blk_rq_check_limits(struct request_queue *q, struct request *rq) +static int blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) { - if (!rq_mergeable(rq)) - return 0; - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -2048,7 +2171,6 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) return 0; } -EXPORT_SYMBOL_GPL(blk_rq_check_limits); /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request @@ -2060,7 +2182,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) unsigned long flags; int where = ELEVATOR_INSERT_BACK; - if (blk_rq_check_limits(q, rq)) + if (blk_cloned_rq_check_limits(q, rq)) return -EIO; if (rq->rq_disk && @@ -3037,21 +3159,20 @@ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); - /* - * If this is a nested plug, don't actually assign it. It will be - * flushed on its own. + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier */ - if (!tsk->plug) { - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; - } + tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); @@ -3190,13 +3311,55 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) void blk_finish_plug(struct blk_plug *plug) { + if (plug != current->plug) + return; blk_flush_plug_list(plug, false); - if (plug == current->plug) - current->plug = NULL; + current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); +bool blk_poll(struct request_queue *q, blk_qc_t cookie) +{ + struct blk_plug *plug; + long state; + + if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return false; + + plug = current->plug; + if (plug) + blk_flush_plug_list(plug, false); + + state = current->state; + while (!need_resched()) { + unsigned int queue_num = blk_qc_t_to_queue_num(cookie); + struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie)); + if (ret > 0) { + hctx->poll_success++; + set_current_state(TASK_RUNNING); + return true; + } + + if (signal_pending_state(state, current)) + set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return true; + if (ret < 0) + break; + cpu_relax(); + } + + return false; +} + #ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine @@ -3253,6 +3416,9 @@ int blk_pre_runtime_suspend(struct request_queue *q) { int ret = 0; + if (!q->dev) + return ret; + spin_lock_irq(q->queue_lock); if (q->nr_pending) { ret = -EBUSY; @@ -3280,6 +3446,9 @@ EXPORT_SYMBOL(blk_pre_runtime_suspend); */ void blk_post_runtime_suspend(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; @@ -3304,6 +3473,9 @@ EXPORT_SYMBOL(blk_post_runtime_suspend); */ void blk_pre_runtime_resume(struct request_queue *q) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); q->rpm_status = RPM_RESUMING; spin_unlock_irq(q->queue_lock); @@ -3326,6 +3498,9 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); */ void blk_post_runtime_resume(struct request_queue *q, int err) { + if (!q->dev) + return; + spin_lock_irq(q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; @@ -3343,7 +3518,7 @@ EXPORT_SYMBOL(blk_post_runtime_resume); int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * - sizeof(((struct request *)0)->cmd_flags)); + FIELD_SIZEOF(struct request, cmd_flags)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", diff --git a/kernel/block/blk-exec.c b/kernel/block/blk-exec.c index 9924725fa..3fec8a29d 100644 --- a/kernel/block/blk-exec.c +++ b/kernel/block/blk-exec.c @@ -53,7 +53,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, rq_end_io_fn *done) { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - bool is_pm_resume; WARN_ON(irqs_disabled()); WARN_ON(rq->cmd_type == REQ_TYPE_FS); @@ -70,12 +69,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, return; } - /* - * need to check this before __blk_run_queue(), because rq can - * be freed before that returns. - */ - is_pm_resume = rq->cmd_type == REQ_TYPE_PM_RESUME; - spin_lock_irq(q->queue_lock); if (unlikely(blk_queue_dying(q))) { @@ -88,9 +81,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, __elv_add_request(q, rq, where); __blk_run_queue(q); - /* the queue is stopped so it won't be run */ - if (is_pm_resume) - __blk_run_queue_uncond(q); spin_unlock_irq(q->queue_lock); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/kernel/block/blk-flush.c b/kernel/block/blk-flush.c index 20badd7b9..9c423e533 100644 --- a/kernel/block/blk-flush.c +++ b/kernel/block/blk-flush.c @@ -73,6 +73,7 @@ #include "blk.h" #include "blk-mq.h" +#include "blk-mq-tag.h" /* FLUSH/FUA sequences */ enum { @@ -226,7 +227,12 @@ static void flush_end_io(struct request *flush_rq, int error) struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + + /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } @@ -308,11 +314,18 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq) /* * Borrow tag from the first request since they can't - * be in flight at the same time. + * be in flight at the same time. And acquire the tag's + * ownership for flush req. */ if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->tag = first_rq->tag; + fq->orig_rq = first_rq; + + hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); } flush_rq->cmd_type = REQ_TYPE_FS; diff --git a/kernel/block/blk-integrity.c b/kernel/block/blk-integrity.c index 79ffb4855..d69c5c79f 100644 --- a/kernel/block/blk-integrity.c +++ b/kernel/block/blk-integrity.c @@ -21,6 +21,7 @@ */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> @@ -29,10 +30,6 @@ #include "blk.h" -static struct kmem_cache *integrity_cachep; - -static const char *bi_unsupported_name = "unsupported"; - /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @q: request queue @@ -145,40 +142,40 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); */ int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) { - struct blk_integrity *b1 = gd1->integrity; - struct blk_integrity *b2 = gd2->integrity; + struct blk_integrity *b1 = &gd1->queue->integrity; + struct blk_integrity *b2 = &gd2->queue->integrity; - if (!b1 && !b2) + if (!b1->profile && !b2->profile) return 0; - if (!b1 || !b2) + if (!b1->profile || !b2->profile) return -1; - if (b1->interval != b2->interval) { + if (b1->interval_exp != b2->interval_exp) { pr_err("%s: %s/%s protection interval %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, - b1->interval, b2->interval); + 1 << b1->interval_exp, 1 << b2->interval_exp); return -1; } if (b1->tuple_size != b2->tuple_size) { - printk(KERN_ERR "%s: %s/%s tuple sz %u != %u\n", __func__, + pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tuple_size, b2->tuple_size); return -1; } if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { - printk(KERN_ERR "%s: %s/%s tag sz %u != %u\n", __func__, + pr_err("%s: %s/%s tag sz %u != %u\n", __func__, gd1->disk_name, gd2->disk_name, b1->tag_size, b2->tag_size); return -1; } - if (strcmp(b1->name, b2->name)) { - printk(KERN_ERR "%s: %s/%s type %s != %s\n", __func__, + if (b1->profile != b2->profile) { + pr_err("%s: %s/%s type %s != %s\n", __func__, gd1->disk_name, gd2->disk_name, - b1->name, b2->name); + b1->profile->name, b2->profile->name); return -1; } @@ -203,6 +200,9 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, q->limits.max_integrity_segments) return false; + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + return true; } EXPORT_SYMBOL(blk_integrity_merge_rq); @@ -245,8 +245,8 @@ struct integrity_sysfs_entry { static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); @@ -257,8 +257,8 @@ static ssize_t integrity_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t count) { - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; struct integrity_sysfs_entry *entry = container_of(attr, struct integrity_sysfs_entry, attr); ssize_t ret = 0; @@ -271,18 +271,21 @@ static ssize_t integrity_attr_store(struct kobject *kobj, static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) { - if (bi != NULL && bi->name != NULL) - return sprintf(page, "%s\n", bi->name); + if (bi->profile && bi->profile->name) + return sprintf(page, "%s\n", bi->profile->name); else return sprintf(page, "none\n"); } static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) { - if (bi != NULL) - return sprintf(page, "%u\n", bi->tag_size); - else - return sprintf(page, "0\n"); + return sprintf(page, "%u\n", bi->tag_size); +} + +static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + bi->interval_exp ? 1 << bi->interval_exp : 0); } static ssize_t integrity_verify_store(struct blk_integrity *bi, @@ -339,6 +342,11 @@ static struct integrity_sysfs_entry integrity_tag_size_entry = { .show = integrity_tag_size_show, }; +static struct integrity_sysfs_entry integrity_interval_entry = { + .attr = { .name = "protection_interval_bytes", .mode = S_IRUGO }, + .show = integrity_interval_show, +}; + static struct integrity_sysfs_entry integrity_verify_entry = { .attr = { .name = "read_verify", .mode = S_IRUGO | S_IWUSR }, .show = integrity_verify_show, @@ -359,6 +367,7 @@ static struct integrity_sysfs_entry integrity_device_entry = { static struct attribute *integrity_attrs[] = { &integrity_format_entry.attr, &integrity_tag_size_entry.attr, + &integrity_interval_entry.attr, &integrity_verify_entry.attr, &integrity_generate_entry.attr, &integrity_device_entry.attr, @@ -370,114 +379,89 @@ static const struct sysfs_ops integrity_ops = { .store = &integrity_attr_store, }; -static int __init blk_dev_integrity_init(void) -{ - integrity_cachep = kmem_cache_create("blkdev_integrity", - sizeof(struct blk_integrity), - 0, SLAB_PANIC, NULL); - return 0; -} -subsys_initcall(blk_dev_integrity_init); - -static void blk_integrity_release(struct kobject *kobj) -{ - struct blk_integrity *bi = - container_of(kobj, struct blk_integrity, kobj); - - kmem_cache_free(integrity_cachep, bi); -} - static struct kobj_type integrity_ktype = { .default_attrs = integrity_attrs, .sysfs_ops = &integrity_ops, - .release = blk_integrity_release, }; -bool blk_integrity_is_initialized(struct gendisk *disk) +static int blk_integrity_nop_fn(struct blk_integrity_iter *iter) { - struct blk_integrity *bi = blk_get_integrity(disk); - - return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); + return 0; } -EXPORT_SYMBOL(blk_integrity_is_initialized); + +static struct blk_integrity_profile nop_profile = { + .name = "nop", + .generate_fn = blk_integrity_nop_fn, + .verify_fn = blk_integrity_nop_fn, +}; /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware - * @template: optional integrity profile to register + * @template: block integrity profile to register * - * Description: When a device needs to advertise itself as being able - * to send/receive integrity metadata it must use this function to - * register the capability with the block layer. The template is a - * blk_integrity struct with values appropriate for the underlying - * hardware. If template is NULL the new profile is allocated but - * not filled out. See Documentation/block/data-integrity.txt. + * Description: When a device needs to advertise itself as being able to + * send/receive integrity metadata it must use this function to register + * the capability with the block layer. The template is a blk_integrity + * struct with values appropriate for the underlying hardware. See + * Documentation/block/data-integrity.txt. */ -int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { - struct blk_integrity *bi; - - BUG_ON(disk == NULL); - - if (disk->integrity == NULL) { - bi = kmem_cache_alloc(integrity_cachep, - GFP_KERNEL | __GFP_ZERO); - if (!bi) - return -1; - - if (kobject_init_and_add(&bi->kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, - "%s", "integrity")) { - kmem_cache_free(integrity_cachep, bi); - return -1; - } - - kobject_uevent(&bi->kobj, KOBJ_ADD); + struct blk_integrity *bi = &disk->queue->integrity; - bi->flags |= BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE; - bi->interval = queue_logical_block_size(disk->queue); - disk->integrity = bi; - } else - bi = disk->integrity; + bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | + template->flags; + bi->interval_exp = ilog2(queue_logical_block_size(disk->queue)); + bi->profile = template->profile ? template->profile : &nop_profile; + bi->tuple_size = template->tuple_size; + bi->tag_size = template->tag_size; - /* Use the provided profile as template */ - if (template != NULL) { - bi->name = template->name; - bi->generate_fn = template->generate_fn; - bi->verify_fn = template->verify_fn; - bi->tuple_size = template->tuple_size; - bi->tag_size = template->tag_size; - bi->flags |= template->flags; - } else - bi->name = bi_unsupported_name; - - disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; - - return 0; + blk_integrity_revalidate(disk); } EXPORT_SYMBOL(blk_integrity_register); /** - * blk_integrity_unregister - Remove block integrity profile - * @disk: disk whose integrity profile to deallocate + * blk_integrity_unregister - Unregister block integrity profile + * @disk: disk whose integrity profile to unregister * - * Description: This function frees all memory used by the block - * integrity profile. To be called at device teardown. + * Description: This function unregisters the integrity capability from + * a block device. */ void blk_integrity_unregister(struct gendisk *disk) { - struct blk_integrity *bi; + blk_integrity_revalidate(disk); + memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); +} +EXPORT_SYMBOL(blk_integrity_unregister); - if (!disk || !disk->integrity) +void blk_integrity_revalidate(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->queue->integrity; + + if (!(disk->flags & GENHD_FL_UP)) return; - disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES; + if (bi->profile) + disk->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + disk->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; +} + +void blk_integrity_add(struct gendisk *disk) +{ + if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity")) + return; - bi = disk->integrity; + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); +} - kobject_uevent(&bi->kobj, KOBJ_REMOVE); - kobject_del(&bi->kobj); - kobject_put(&bi->kobj); - disk->integrity = NULL; +void blk_integrity_del(struct gendisk *disk) +{ + kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); + kobject_del(&disk->integrity_kobj); + kobject_put(&disk->integrity_kobj); } -EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/kernel/block/blk-ioc.c b/kernel/block/blk-ioc.c index 28f467e63..dc8785233 100644 --- a/kernel/block/blk-ioc.c +++ b/kernel/block/blk-ioc.c @@ -290,7 +290,7 @@ struct io_context *get_task_io_context(struct task_struct *task, { struct io_context *ioc; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); diff --git a/kernel/block/blk-lib.c b/kernel/block/blk-lib.c index 7688ee3f5..9ebf65379 100644 --- a/kernel/block/blk-lib.c +++ b/kernel/block/blk-lib.c @@ -11,16 +11,16 @@ struct bio_batch { atomic_t done; - unsigned long flags; + int error; struct completion *wait; }; -static void bio_batch_end_io(struct bio *bio, int err) +static void bio_batch_end_io(struct bio *bio) { struct bio_batch *bb = bio->bi_private; - if (err && (err != -EOPNOTSUPP)) - clear_bit(BIO_UPTODATE, &bb->flags); + if (bio->bi_error && bio->bi_error != -EOPNOTSUPP) + bb->error = bio->bi_error; if (atomic_dec_and_test(&bb->done)) complete(bb->wait); bio_put(bio); @@ -43,7 +43,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); struct request_queue *q = bdev_get_queue(bdev); int type = REQ_WRITE | REQ_DISCARD; - unsigned int max_discard_sectors, granularity; + unsigned int granularity; int alignment; struct bio_batch bb; struct bio *bio; @@ -60,17 +60,6 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, granularity = max(q->limits.discard_granularity >> 9, 1U); alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; - /* - * Ensure that max_discard_sectors is of the proper - * granularity, so that requests stay aligned after a split. - */ - max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - max_discard_sectors -= max_discard_sectors % granularity; - if (unlikely(!max_discard_sectors)) { - /* Avoid infinite loop below. Being cautious never hurts. */ - return -EOPNOTSUPP; - } - if (flags & BLKDEV_DISCARD_SECURE) { if (!blk_queue_secdiscard(q)) return -EOPNOTSUPP; @@ -78,7 +67,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; blk_start_plug(&plug); @@ -92,7 +81,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, break; } - req_sects = min_t(sector_t, nr_sects, max_discard_sectors); + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); /* * If splitting a request, and the next starting sector would be @@ -134,9 +124,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_discard); @@ -166,13 +155,11 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!q) return -ENXIO; - max_write_same_sectors = q->limits.max_write_same_sectors; - - if (max_write_same_sectors == 0) - return -EOPNOTSUPP; + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ + max_write_same_sectors = UINT_MAX >> 9; atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; while (nr_sects) { @@ -208,9 +195,8 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - ret = -ENOTSUPP; - + if (bb.error) + return bb.error; return ret; } EXPORT_SYMBOL(blkdev_issue_write_same); @@ -236,7 +222,7 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, DECLARE_COMPLETION_ONSTACK(wait); atomic_set(&bb.done, 1); - bb.flags = 1 << BIO_UPTODATE; + bb.error = 0; bb.wait = &wait; ret = 0; @@ -270,10 +256,8 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, if (!atomic_dec_and_test(&bb.done)) wait_for_completion_io(&wait); - if (!test_bit(BIO_UPTODATE, &bb.flags)) - /* One of bios in the batch was completed with error.*/ - ret = -EIO; - + if (bb.error) + return bb.error; return ret; } diff --git a/kernel/block/blk-map.c b/kernel/block/blk-map.c index da310a105..f565e11f4 100644 --- a/kernel/block/blk-map.c +++ b/kernel/block/blk-map.c @@ -9,6 +9,24 @@ #include "blk.h" +static bool iovec_gap_to_prv(struct request_queue *q, + struct iovec *prv, struct iovec *cur) +{ + unsigned long prev_end; + + if (!queue_virt_boundary(q)) + return false; + + if (prv->iov_base == NULL && prv->iov_len == 0) + /* prv is not set - don't check */ + return false; + + prev_end = (unsigned long)(prv->iov_base + prv->iov_len); + + return (((unsigned long)cur->iov_base & queue_virt_boundary(q)) || + prev_end & queue_virt_boundary(q)); +} + int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -67,7 +85,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct bio *bio; int unaligned = 0; struct iov_iter i; - struct iovec iov; + struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0}; if (!iter || !iter->count) return -EINVAL; @@ -81,8 +99,12 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, /* * Keep going so we check length of all segments */ - if (uaddr & queue_dma_alignment(q)) + if ((uaddr & queue_dma_alignment(q)) || + iovec_gap_to_prv(q, &prv, &iov)) unaligned = 1; + + prv.iov_base = iov.iov_base; + prv.iov_len = iov.iov_len; } if (unaligned || (q->dma_pad_mask & iter->count) || map_data) @@ -94,7 +116,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, return PTR_ERR(bio); if (map_data && map_data->null_mapped) - bio->bi_flags |= (1 << BIO_NULL_MAPPED); + bio_set_flag(bio, BIO_NULL_MAPPED); if (bio->bi_iter.bi_size != iter->count) { /* @@ -103,7 +125,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, * normal IO completion path */ bio_get(bio); - bio_endio(bio, 0); + bio_endio(bio); __blk_rq_unmap_user(bio); return -EINVAL; } diff --git a/kernel/block/blk-merge.c b/kernel/block/blk-merge.c index fd3fee81c..b966db8f3 100644 --- a/kernel/block/blk-merge.c +++ b/kernel/block/blk-merge.c @@ -9,12 +9,196 @@ #include "blk.h" +static struct bio *blk_bio_discard_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *nsegs) +{ + unsigned int max_discard_sectors, granularity; + int alignment; + sector_t tmp; + unsigned split_sectors; + + *nsegs = 1; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + + max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors -= max_discard_sectors % granularity; + + if (unlikely(!max_discard_sectors)) { + /* XXX: warn */ + return NULL; + } + + if (bio_sectors(bio) <= max_discard_sectors) + return NULL; + + split_sectors = max_discard_sectors; + + /* + * If the next starting sector would be misaligned, stop the discard at + * the previous aligned sector. + */ + alignment = (q->limits.discard_alignment >> 9) % granularity; + + tmp = bio->bi_iter.bi_sector + split_sectors - alignment; + tmp = sector_div(tmp, granularity); + + if (split_sectors > tmp) + split_sectors -= tmp; + + return bio_split(bio, split_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_write_same_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *nsegs) +{ + *nsegs = 1; + + if (!q->limits.max_write_same_sectors) + return NULL; + + if (bio_sectors(bio) <= q->limits.max_write_same_sectors) + return NULL; + + return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); +} + +static inline unsigned get_max_io_size(struct request_queue *q, + struct bio *bio) +{ + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); + unsigned mask = queue_logical_block_size(q) - 1; + + /* aligned to logical block size */ + sectors &= ~(mask >> 9); + + return sectors; +} + +static struct bio *blk_bio_segment_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *segs) +{ + struct bio_vec bv, bvprv, *bvprvp = NULL; + struct bvec_iter iter; + unsigned seg_size = 0, nsegs = 0, sectors = 0; + unsigned front_seg_size = bio->bi_seg_front_size; + bool do_split = true; + struct bio *new = NULL; + const unsigned max_sectors = get_max_io_size(q, bio); + + bio_for_each_segment(bv, bio, iter) { + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) + goto split; + + if (sectors + (bv.bv_len >> 9) > max_sectors) { + /* + * Consider this a new segment if we're splitting in + * the middle of this vector. + */ + if (nsegs < queue_max_segments(q) && + sectors < max_sectors) { + nsegs++; + sectors = max_sectors; + } + if (sectors) + goto split; + /* Make this single bvec as the 1st segment */ + } + + if (bvprvp && blk_queue_cluster(q)) { + if (seg_size + bv.bv_len > queue_max_segment_size(q)) + goto new_segment; + if (!BIOVEC_PHYS_MERGEABLE(bvprvp, &bv)) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, bvprvp, &bv)) + goto new_segment; + + seg_size += bv.bv_len; + bvprv = bv; + bvprvp = &bvprv; + sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + continue; + } +new_segment: + if (nsegs == queue_max_segments(q)) + goto split; + + nsegs++; + bvprv = bv; + bvprvp = &bvprv; + seg_size = bv.bv_len; + sectors += bv.bv_len >> 9; + + if (nsegs == 1 && seg_size > front_seg_size) + front_seg_size = seg_size; + } + + do_split = false; +split: + *segs = nsegs; + + if (do_split) { + new = bio_split(bio, sectors, GFP_NOIO, bs); + if (new) + bio = new; + } + + bio->bi_seg_front_size = front_seg_size; + if (seg_size > bio->bi_seg_back_size) + bio->bi_seg_back_size = seg_size; + + return do_split ? new : NULL; +} + +void blk_queue_split(struct request_queue *q, struct bio **bio, + struct bio_set *bs) +{ + struct bio *split, *res; + unsigned nsegs; + + if ((*bio)->bi_rw & REQ_DISCARD) + split = blk_bio_discard_split(q, *bio, bs, &nsegs); + else if ((*bio)->bi_rw & REQ_WRITE_SAME) + split = blk_bio_write_same_split(q, *bio, bs, &nsegs); + else + split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + + /* physical segments can be figured out during splitting */ + res = split ? split : *bio; + res->bi_phys_segments = nsegs; + bio_set_flag(res, BIO_SEG_VALID); + + if (split) { + /* there isn't chance to merge the splitted bio */ + split->bi_rw |= REQ_NOMERGE; + + bio_chain(split, *bio); + generic_make_request(*bio); + *bio = split; + } +} +EXPORT_SYMBOL(blk_queue_split); + static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio, bool no_sg_merge) { struct bio_vec bv, bvprv = { NULL }; - int cluster, high, highprv = 1; + int cluster, prev = 0; unsigned int seg_size, nr_phys_segs; struct bio *fbio, *bbio; struct bvec_iter iter; @@ -36,7 +220,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, cluster = blk_queue_cluster(q); seg_size = 0; nr_phys_segs = 0; - high = 0; for_each_bio(bio) { bio_for_each_segment(bv, bio, iter) { /* @@ -46,13 +229,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, if (no_sg_merge) goto new_segment; - /* - * the trick here is making sure that a high page is - * never considered part of another segment, since - * that might change with the bounce page. - */ - high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); - if (!high && !highprv && cluster) { + if (prev && cluster) { if (seg_size + bv.bv_len > queue_max_segment_size(q)) goto new_segment; @@ -72,8 +249,8 @@ new_segment: nr_phys_segs++; bvprv = bv; + prev = 1; seg_size = bv.bv_len; - highprv = high; } bbio = bio; } @@ -116,7 +293,7 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio->bi_next = nxt; } - bio->bi_flags |= (1 << BIO_SEG_VALID); + bio_set_flag(bio, BIO_SEG_VALID); } EXPORT_SYMBOL(blk_recount_segments); @@ -266,7 +443,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); - sg->page_link &= ~0x02; + sg_unmark_end(sg); sg = sg_next(sg); sg_set_page(sg, virt_to_page(q->dma_drain_buffer), q->dma_drain_size, @@ -279,6 +456,12 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, if (sg) sg_mark_end(sg); + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > rq->nr_phys_segments); + return nsegs; } EXPORT_SYMBOL(blk_rq_map_sg); @@ -312,6 +495,11 @@ no_merge: int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + if (req_gap_back_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -330,6 +518,12 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { + + if (req_gap_front_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; if (blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; @@ -356,14 +550,6 @@ static bool req_no_special_merge(struct request *req) return !q->mq_ops && req->special; } -static int req_gap_to_prev(struct request *req, struct request *next) -{ - struct bio *prev = req->biotail; - - return bvec_gap_to_prev(&prev->bi_io_vec[prev->bi_vcnt - 1], - next->bio->bi_io_vec[0].bv_offset); -} - static int ll_merge_requests_fn(struct request_queue *q, struct request *req, struct request *next) { @@ -378,8 +564,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, if (req_no_special_merge(req) || req_no_special_merge(next)) return 0; - if (test_bit(QUEUE_FLAG_SG_GAPS, &q->queue_flags) && - req_gap_to_prev(req, next)) + if (req_gap_back_merge(req, next->bio)) return 0; /* @@ -564,8 +749,6 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_rq_merge_ok(struct request *rq, struct bio *bio) { - struct request_queue *q = rq->q; - if (!rq_mergeable(rq) || !bio_mergeable(bio)) return false; @@ -589,14 +772,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) !blk_write_same_mergeable(rq->bio, bio)) return false; - if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { - struct bio_vec *bprev; - - bprev = &rq->biotail->bi_io_vec[rq->biotail->bi_vcnt - 1]; - if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) - return false; - } - return true; } diff --git a/kernel/block/blk-mq-cpumap.c b/kernel/block/blk-mq-cpumap.c index 5f13f4d0b..8764c241e 100644 --- a/kernel/block/blk-mq-cpumap.c +++ b/kernel/block/blk-mq-cpumap.c @@ -24,14 +24,15 @@ static int get_first_sibling(unsigned int cpu) { unsigned int ret; - ret = cpumask_first(topology_thread_cpumask(cpu)); + ret = cpumask_first(topology_sibling_cpumask(cpu)); if (ret < nr_cpu_ids) return ret; return cpu; } -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask) { unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling; cpumask_var_t cpus; @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) cpumask_clear(cpus); nr_cpus = nr_uniq_cpus = 0; - for_each_online_cpu(i) { + for_each_cpu(i, online_mask) { nr_cpus++; first_sibling = get_first_sibling(i); if (!cpumask_test_cpu(first_sibling, cpus)) @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues) queue = 0; for_each_possible_cpu(i) { - if (!cpu_online(i)) { + if (!cpumask_test_cpu(i, online_mask)) { map[i] = 0; continue; } @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) if (!map) return NULL; - if (!blk_mq_update_queue_map(map, set->nr_hw_queues)) + if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask)) return map; kfree(map); diff --git a/kernel/block/blk-mq-sysfs.c b/kernel/block/blk-mq-sysfs.c index 279c5d674..1cf18784c 100644 --- a/kernel/block/blk-mq-sysfs.c +++ b/kernel/block/blk-mq-sysfs.c @@ -174,6 +174,11 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) return ret; } +static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); +} + static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -229,8 +234,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) unsigned int i, first = 1; ssize_t ret = 0; - blk_mq_disable_hotplug(); - for_each_cpu(i, hctx->cpumask) { if (first) ret += sprintf(ret + page, "%u", i); @@ -240,8 +243,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) first = 0; } - blk_mq_enable_hotplug(); - ret += sprintf(ret + page, "\n"); return ret; } @@ -299,6 +300,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_cpus_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { + .attr = {.name = "io_poll", .mode = S_IRUGO }, + .show = blk_mq_hw_sysfs_poll_show, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -308,6 +313,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_tags.attr, &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, + &blk_mq_hw_sysfs_poll.attr, NULL, }; @@ -343,7 +349,7 @@ static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) @@ -358,7 +364,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) struct blk_mq_ctx *ctx; int i, ret; - if (!hctx->nr_ctx || !(hctx->flags & BLK_MQ_F_SYSFS_UP)) + if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); @@ -381,6 +387,8 @@ void blk_mq_unregister_disk(struct gendisk *disk) struct blk_mq_ctx *ctx; int i, j; + blk_mq_disable_hotplug(); + queue_for_each_hw_ctx(q, hctx, i) { blk_mq_unregister_hctx(hctx); @@ -395,6 +403,9 @@ void blk_mq_unregister_disk(struct gendisk *disk) kobject_put(&q->mq_kobj); kobject_put(&disk_to_dev(disk)->kobj); + + q->mq_sysfs_init_done = false; + blk_mq_enable_hotplug(); } static void blk_mq_sysfs_init(struct request_queue *q) @@ -412,12 +423,6 @@ static void blk_mq_sysfs_init(struct request_queue *q) kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } -/* see blk_register_queue() */ -void blk_mq_finish_init(struct request_queue *q) -{ - percpu_ref_switch_to_percpu(&q->mq_usage_counter); -} - int blk_mq_register_disk(struct gendisk *disk) { struct device *dev = disk_to_dev(disk); @@ -425,27 +430,30 @@ int blk_mq_register_disk(struct gendisk *disk) struct blk_mq_hw_ctx *hctx; int ret, i; + blk_mq_disable_hotplug(); + blk_mq_sysfs_init(q); ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) - return ret; + goto out; kobject_uevent(&q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { - hctx->flags |= BLK_MQ_F_SYSFS_UP; ret = blk_mq_register_hctx(hctx); if (ret) break; } - if (ret) { + if (ret) blk_mq_unregister_disk(disk); - return ret; - } + else + q->mq_sysfs_init_done = true; +out: + blk_mq_enable_hotplug(); - return 0; + return ret; } EXPORT_SYMBOL_GPL(blk_mq_register_disk); @@ -454,6 +462,9 @@ void blk_mq_sysfs_unregister(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; + if (!q->mq_sysfs_init_done) + return; + queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } @@ -463,6 +474,9 @@ int blk_mq_sysfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i, ret = 0; + if (!q->mq_sysfs_init_done) + return ret; + queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) diff --git a/kernel/block/blk-mq-tag.c b/kernel/block/blk-mq-tag.c index be3290cc0..a07ca3488 100644 --- a/kernel/block/blk-mq-tag.c +++ b/kernel/block/blk-mq-tag.c @@ -75,6 +75,10 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) struct blk_mq_bitmap_tags *bt; int i, wake_index; + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); bt = &tags->bitmap_tags; wake_index = atomic_read(&bt->wake_index); for (i = 0; i < BT_WAIT_QUEUES; i++) { @@ -264,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) return tag; - if (!(data->gfp & __GFP_WAIT)) + if (!gfpflags_allow_blocking(data->gfp)) return -1; bs = bt_wait_ptr(bt, hctx); @@ -429,7 +433,7 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, for (bit = find_first_bit(&bm->word, bm->depth); bit < bm->depth; bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = blk_mq_tag_to_rq(hctx->tags, off + bit); + rq = hctx->tags->rqs[off + bit]; if (rq->q == hctx->queue) fn(hctx, rq, data, reserved); } @@ -438,17 +442,63 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, } } -void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn, - void *priv) +static void bt_tags_for_each(struct blk_mq_tags *tags, + struct blk_mq_bitmap_tags *bt, unsigned int off, + busy_tag_iter_fn *fn, void *data, bool reserved) { - struct blk_mq_tags *tags = hctx->tags; + struct request *rq; + int bit, i; + + if (!tags->rqs) + return; + for (i = 0; i < bt->map_nr; i++) { + struct blk_align_bitmap *bm = &bt->map[i]; + + for (bit = find_first_bit(&bm->word, bm->depth); + bit < bm->depth; + bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { + rq = tags->rqs[off + bit]; + fn(rq, data, reserved); + } + + off += (1 << bt->bits_per_word); + } +} +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv) +{ if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, false); } -EXPORT_SYMBOL(blk_mq_tag_busy_iter); +EXPORT_SYMBOL(blk_mq_all_tag_busy_iter); + +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If not software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, + false); + } + +} static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) { @@ -580,6 +630,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, if (!tags) return NULL; + if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) { + kfree(tags); + return NULL; + } + tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; @@ -590,6 +645,7 @@ void blk_mq_free_tags(struct blk_mq_tags *tags) { bt_free(&tags->bitmap_tags); bt_free(&tags->breserved_tags); + free_cpumask_var(tags->cpumask); kfree(tags); } diff --git a/kernel/block/blk-mq-tag.h b/kernel/block/blk-mq-tag.h index 90767b370..d468a79f2 100644 --- a/kernel/block/blk-mq-tag.h +++ b/kernel/block/blk-mq-tag.h @@ -44,6 +44,7 @@ struct blk_mq_tags { struct list_head page_list; int alloc_policy; + cpumask_var_t cpumask; }; @@ -57,6 +58,8 @@ extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); enum { BLK_MQ_TAG_CACHE_MIN = 1, @@ -88,4 +91,16 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) __blk_mq_tag_idle(hctx); } +/* + * This helper should only be used for flush request to share tag + * with the request cloned from, and both the two requests can't be + * in flight at the same time. The caller has to make sure the tag + * can't be freed. + */ +static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, + unsigned int tag, struct request *rq) +{ + hctx->tags->rqs[tag] = rq; +} + #endif diff --git a/kernel/block/blk-mq.c b/kernel/block/blk-mq.c index c473bd192..7cdf19e4a 100644 --- a/kernel/block/blk-mq.c +++ b/kernel/block/blk-mq.c @@ -9,6 +9,7 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> @@ -77,49 +78,13 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); } -static int blk_mq_queue_enter(struct request_queue *q, gfp_t gfp) -{ - while (true) { - int ret; - - if (percpu_ref_tryget_live(&q->mq_usage_counter)) - return 0; - - if (!(gfp & __GFP_WAIT)) - return -EBUSY; - - ret = swait_event_interruptible(q->mq_freeze_wq, - !q->mq_freeze_depth || blk_queue_dying(q)); - if (blk_queue_dying(q)) - return -ENODEV; - if (ret) - return ret; - } -} - -static void blk_mq_queue_exit(struct request_queue *q) -{ - percpu_ref_put(&q->mq_usage_counter); -} - -static void blk_mq_usage_counter_release(struct percpu_ref *ref) -{ - struct request_queue *q = - container_of(ref, struct request_queue, mq_usage_counter); - - swait_wake_all(&q->mq_freeze_wq); -} - void blk_mq_freeze_queue_start(struct request_queue *q) { - bool freeze; + int freeze_depth; - spin_lock_irq(q->queue_lock); - freeze = !q->mq_freeze_depth++; - spin_unlock_irq(q->queue_lock); - - if (freeze) { - percpu_ref_kill(&q->mq_usage_counter); + freeze_depth = atomic_inc_return(&q->mq_freeze_depth); + if (freeze_depth == 1) { + percpu_ref_kill(&q->q_usage_counter); blk_mq_run_hw_queues(q, false); } } @@ -127,31 +92,45 @@ EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); static void blk_mq_freeze_queue_wait(struct request_queue *q) { - swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); + swait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. */ -void blk_mq_freeze_queue(struct request_queue *q) +void blk_freeze_queue(struct request_queue *q) { + /* + * In the !blk_mq case we are only calling this to kill the + * q_usage_counter, otherwise this increases the freeze depth + * and waits for it to return to zero. For this reason there is + * no blk_unfreeze_queue(), and blk_freeze_queue() is not + * exported to drivers as the only user for unfreeze is blk_mq. + */ blk_mq_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } + +void blk_mq_freeze_queue(struct request_queue *q) +{ + /* + * ...just an alias to keep freeze and unfreeze actions balanced + * in the blk_mq_* namespace + */ + blk_freeze_queue(q); +} EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void blk_mq_unfreeze_queue(struct request_queue *q) { - bool wake; + int freeze_depth; - spin_lock_irq(q->queue_lock); - wake = !--q->mq_freeze_depth; - WARN_ON_ONCE(q->mq_freeze_depth < 0); - spin_unlock_irq(q->queue_lock); - if (wake) { - percpu_ref_reinit(&q->mq_usage_counter); - swait_wake_all(&q->mq_freeze_wq); + freeze_depth = atomic_dec_return(&q->mq_freeze_depth); + WARN_ON_ONCE(freeze_depth < 0); + if (!freeze_depth) { + percpu_ref_reinit(&q->q_usage_counter); + swake_up_all(&q->mq_freeze_wq); } } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); @@ -170,7 +149,7 @@ void blk_mq_wake_waiters(struct request_queue *q) * dying, we need to ensure that processes currently waiting on * the queue are notified as well. */ - swait_wake_all(&q->mq_freeze_wq); + swake_up_all(&q->mq_freeze_wq); } bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) @@ -262,17 +241,17 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, struct blk_mq_alloc_data alloc_data; int ret; - ret = blk_mq_queue_enter(q, gfp); + ret = blk_queue_enter(q, gfp); if (ret) return ERR_PTR(ret); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, + blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM, reserved, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); - if (!rq && (gfp & __GFP_WAIT)) { + if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) { __blk_mq_run_hw_queue(hctx); blk_mq_put_ctx(ctx); @@ -285,7 +264,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, } blk_mq_put_ctx(ctx); if (!rq) { - blk_mq_queue_exit(q); + blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } return rq; @@ -304,7 +283,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); blk_mq_put_tag(hctx, tag, &ctx->last_tag); - blk_mq_queue_exit(q); + blk_queue_exit(q); } void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) @@ -399,7 +378,7 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu_light(); } -void __blk_mq_complete_request(struct request *rq) +static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; @@ -417,14 +396,16 @@ void __blk_mq_complete_request(struct request *rq) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq, int error) { struct request_queue *q = rq->q; if (unlikely(blk_should_fake_timeout(q))) return; - if (!blk_mark_rq_complete(rq)) + if (!blk_mark_rq_complete(rq)) { + rq->errors = error; __blk_mq_complete_request(rq); + } } EXPORT_SYMBOL(blk_mq_complete_request); @@ -583,23 +564,9 @@ void blk_mq_abort_requeue_list(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_abort_requeue_list); -static inline bool is_flush_request(struct request *rq, - struct blk_flush_queue *fq, unsigned int tag) -{ - return ((rq->cmd_flags & REQ_FLUSH_SEQ) && - fq->flush_rq->tag == tag); -} - struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - struct request *rq = tags->rqs[tag]; - /* mq_ctx of flush rq is always cloned from the corresponding req */ - struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); - - if (!is_flush_request(rq, fq, tag)) - return rq; - - return fq->flush_rq; + return tags->rqs[tag]; } EXPORT_SYMBOL(blk_mq_tag_to_rq); @@ -654,10 +621,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * If a request wasn't started before the queue was * marked dying, kill it here or it'll go unnoticed. */ - if (unlikely(blk_queue_dying(rq->q))) { - rq->errors = -EIO; - blk_mq_complete_request(rq); - } + if (unlikely(blk_queue_dying(rq->q))) + blk_mq_complete_request(rq, -EIO); return; } if (rq->cmd_flags & REQ_NO_TIMEOUT) @@ -679,24 +644,16 @@ static void blk_mq_rq_timer(unsigned long priv) .next = 0, .next_set = 0, }; - struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - /* - * If not software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; - - blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); - } + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { data.next = blk_rq_timeout(round_jiffies_up(data.next)); mod_timer(&q->timeout, data.next); } else { + struct blk_mq_hw_ctx *hctx; + queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) @@ -1035,18 +992,25 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) } EXPORT_SYMBOL(blk_mq_delay_queue); -static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, - struct request *rq, bool at_head) +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct request *rq, + bool at_head) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - trace_block_rq_insert(hctx->queue, rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_list); else list_add_tail(&rq->queuelist, &ctx->rq_list); +} + +static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, + struct request *rq, bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + __blk_mq_insert_req_list(hctx, ctx, rq, at_head); blk_mq_hctx_mark_pending(hctx, ctx); } @@ -1102,8 +1066,9 @@ static void blk_mq_insert_requests(struct request_queue *q, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); rq->mq_ctx = ctx; - __blk_mq_insert_request(hctx, rq, false); + __blk_mq_insert_req_list(hctx, ctx, rq, false); } + blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); blk_mq_run_hw_queue(hctx, from_schedule); @@ -1185,7 +1150,7 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct request *rq, struct bio *bio) { - if (!hctx_allow_merges(hctx)) { + if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { blk_mq_bio_to_request(rq, bio); spin_lock(&ctx->lock); insert_rq: @@ -1222,11 +1187,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, int rw = bio_data_dir(bio); struct blk_mq_alloc_data alloc_data; - if (unlikely(blk_mq_queue_enter(q, GFP_KERNEL))) { - bio_endio(bio, -EIO); - return NULL; - } - + blk_queue_enter_live(q); ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); @@ -1245,7 +1206,7 @@ static struct request *blk_mq_map_request(struct request_queue *q, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, - __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); + __GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, rw); ctx = alloc_data.ctx; hctx = alloc_data.hctx; @@ -1257,28 +1218,79 @@ static struct request *blk_mq_map_request(struct request_queue *q, return rq; } +static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie) +{ + int ret; + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, + rq->mq_ctx->cpu); + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; + blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num); + + /* + * For OK queue, we are done. For error, kill it. Any other + * error (busy), just add it to our list as we previously + * would have done + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + if (ret == BLK_MQ_RQ_QUEUE_OK) { + *cookie = new_cookie; + return 0; + } + + __blk_mq_requeue_request(rq); + + if (ret == BLK_MQ_RQ_QUEUE_ERROR) { + *cookie = BLK_QC_T_NONE; + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + return 0; + } + + return -1; +} + /* * Multiple hardware queue variant. This will not use per-process plugs, * but will attempt to bypass the hctx queueing if we can go straight to * hardware for SYNC IO. */ -static void blk_mq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); struct blk_map_ctx data; struct request *rq; + unsigned int request_count = 0; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio_io_error(bio); + return BLK_QC_T_NONE; } + blk_queue_split(q, &bio, q->bio_split); + + if (!is_flush_fua && !blk_queue_nomerges(q)) { + if (blk_attempt_plug_merge(q, bio, &request_count, + &same_queue_rq)) + return BLK_QC_T_NONE; + } else + request_count = blk_plug_queued_count(q); + rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1286,38 +1298,43 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } + plug = current->plug; /* * If the driver supports defer issued based on 'last', then * queue it up like normal since we can potentially save some * CPU this way. */ - if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { - struct blk_mq_queue_data bd = { - .rq = rq, - .list = NULL, - .last = 1 - }; - int ret; + if (((plug && !blk_queue_nomerges(q)) || is_sync) && + !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct request *old_rq = NULL; blk_mq_bio_to_request(rq, bio); /* - * For OK queue, we are done. For error, kill it. Any other - * error (busy), just add it to our list as we previously - * would have done + * We do limited pluging. If the bio can be merged, do that. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most */ - ret = q->mq_ops->queue_rq(data.hctx, &bd); - if (ret == BLK_MQ_RQ_QUEUE_OK) - goto done; - else { - __blk_mq_requeue_request(rq); - - if (ret == BLK_MQ_RQ_QUEUE_ERROR) { - rq->errors = -EIO; - blk_mq_end_request(rq, rq->errors); - goto done; + if (plug) { + /* + * The plug list might get flushed before this. If that + * happens, same_queue_rq is invalid and plug list is + * empty + */ + if (same_queue_rq && !list_empty(&plug->mq_list)) { + old_rq = same_queue_rq; + list_del_init(&old_rq->queuelist); } - } + list_add_tail(&rq->queuelist, &plug->mq_list); + } else /* is_sync */ + old_rq = rq; + blk_mq_put_ctx(data.ctx); + if (!old_rq) + goto done; + if (!blk_mq_direct_issue_request(old_rq, &cookie)) + goto done; + blk_mq_insert_request(old_rq, false, true, true); + goto done; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1330,42 +1347,43 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) run_queue: blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); } -done: blk_mq_put_ctx(data.ctx); +done: + return cookie; } /* * Single hardware queue variant. This will attempt to use any per-process * plug for merging and IO deferral. */ -static void blk_sq_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = rw_is_sync(bio->bi_rw); const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); - unsigned int use_plug, request_count = 0; + struct blk_plug *plug; + unsigned int request_count = 0; struct blk_map_ctx data; struct request *rq; - - /* - * If we have multiple hardware queues, just go directly to - * one of those for sync IO. - */ - use_plug = !is_flush_fua && !is_sync; + blk_qc_t cookie; blk_queue_bounce(q, &bio); if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { - bio_endio(bio, -EIO); - return; + bio_io_error(bio); + return BLK_QC_T_NONE; } - if (use_plug && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count)) - return; + blk_queue_split(q, &bio, q->bio_split); + + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, &request_count, NULL)) + return BLK_QC_T_NONE; rq = blk_mq_map_request(q, bio, &data); if (unlikely(!rq)) - return; + return BLK_QC_T_NONE; + + cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num); if (unlikely(is_flush_fua)) { blk_mq_bio_to_request(rq, bio); @@ -1378,21 +1396,21 @@ static void blk_sq_make_request(struct request_queue *q, struct bio *bio) * utilize that to temporarily store requests until the task is * either done or scheduled away. */ - if (use_plug) { - struct blk_plug *plug = current->plug; + plug = current->plug; + if (plug) { + blk_mq_bio_to_request(rq, bio); + if (!request_count) + trace_block_plug(q); - if (plug) { - blk_mq_bio_to_request(rq, bio); - if (list_empty(&plug->mq_list)) - trace_block_plug(q); - else if (request_count >= BLK_MAX_REQUEST_COUNT) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - list_add_tail(&rq->queuelist, &plug->mq_list); - blk_mq_put_ctx(data.ctx); - return; + blk_mq_put_ctx(data.ctx); + + if (request_count >= BLK_MAX_REQUEST_COUNT) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); } + + list_add_tail(&rq->queuelist, &plug->mq_list); + return cookie; } if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { @@ -1407,6 +1425,7 @@ run_queue: } blk_mq_put_ctx(data.ctx); + return cookie; } /* @@ -1438,6 +1457,11 @@ static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_init_rq_map(). + */ + kmemleak_free(page_address(page)); __free_pages(page, page->private); } @@ -1510,6 +1534,11 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, list_add_tail(&page->lru, &tags->page_list); p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). + */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, set->queue_depth - i); left -= to_do * rq_size; @@ -1528,7 +1557,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, i++; } } - return tags; fail: @@ -1682,7 +1710,7 @@ static int blk_mq_init_hctx(struct request_queue *q, INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->queue_num = hctx_idx; - hctx->flags = set->flags; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1790,13 +1818,19 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static void blk_mq_map_swqueue(struct request_queue *q) +static void blk_mq_map_swqueue(struct request_queue *q, + const struct cpumask *online_mask) { unsigned int i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; + /* + * Avoid others reading imcomplete hctx->cpumask through sysfs + */ + mutex_lock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; @@ -1807,7 +1841,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ queue_for_each_ctx(q, ctx, i) { /* If the cpu isn't online, the cpu is mapped to first hctx */ - if (!cpu_online(i)) + if (!cpumask_test_cpu(i, online_mask)) continue; hctx = q->mq_ops->map_queue(q, i); @@ -1816,6 +1850,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->ctxs[hctx->nr_ctx++] = ctx; } + mutex_unlock(&q->sysfs_lock); + queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_ctxmap *map = &hctx->ctx_map; @@ -1851,29 +1887,36 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx->next_cpu = cpumask_first(hctx->cpumask); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } + + queue_for_each_ctx(q, ctx, i) { + if (!cpumask_test_cpu(i, online_mask)) + continue; + + hctx = q->mq_ops->map_queue(q, i); + cpumask_set_cpu(i, hctx->tags->cpumask); + } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) +static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - bool shared; int i; - if (set->tag_list.next == set->tag_list.prev) - shared = false; - else - shared = true; + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + } +} + +static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared) +{ + struct request_queue *q; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; - else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; - } + queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } @@ -1884,7 +1927,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) mutex_lock(&set->tag_list_lock); list_del_init(&q->tag_set_list); - blk_mq_update_tag_set_depth(set); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, false); + } mutex_unlock(&set->tag_list_lock); } @@ -1894,8 +1942,17 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, q->tag_set = set; mutex_lock(&set->tag_list_lock); + + /* Check to see if we're transitioning to shared (from 1 to 2 queues). */ + if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) { + set->flags |= BLK_MQ_F_TAG_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_depth(set, true); + } + if (set->flags & BLK_MQ_F_TAG_SHARED) + queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); - blk_mq_update_tag_set_depth(set); + mutex_unlock(&set->tag_list_lock); } @@ -1918,6 +1975,9 @@ void blk_mq_release(struct request_queue *q) kfree(hctx); } + kfree(q->mq_map); + q->mq_map = NULL; + kfree(q->queue_hw_ctx); /* ctx kobj stays in queue_ctx */ @@ -1979,14 +2039,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - /* - * Init percpu_ref in atomic mode so that it's faster to shutdown. - * See blk_register_queue() for details. - */ - if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto err_hctxs; - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); @@ -2027,13 +2079,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (blk_mq_init_hw_queues(q, set)) goto err_hctxs; + get_online_cpus(); mutex_lock(&all_q_mutex); - list_add_tail(&q->all_q_node, &all_q_list); - mutex_unlock(&all_q_mutex); + list_add_tail(&q->all_q_node, &all_q_list); blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q, cpu_online_mask); - blk_mq_map_swqueue(q); + mutex_unlock(&all_q_mutex); + put_online_cpus(); return q; @@ -2057,30 +2111,25 @@ void blk_mq_free_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; + mutex_lock(&all_q_mutex); + list_del_init(&q->all_q_node); + mutex_unlock(&all_q_mutex); + blk_mq_del_queue_tag_set(q); blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); blk_mq_free_hw_queues(q, set); - - percpu_ref_exit(&q->mq_usage_counter); - - kfree(q->mq_map); - - q->mq_map = NULL; - - mutex_lock(&all_q_mutex); - list_del_init(&q->all_q_node); - mutex_unlock(&all_q_mutex); } /* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) +static void blk_mq_queue_reinit(struct request_queue *q, + const struct cpumask *online_mask) { - WARN_ON_ONCE(!q->mq_freeze_depth); + WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); blk_mq_sysfs_unregister(q); - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask); /* * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe @@ -2088,7 +2137,7 @@ static void blk_mq_queue_reinit(struct request_queue *q) * involves free and re-allocate memory, worthy doing?) */ - blk_mq_map_swqueue(q); + blk_mq_map_swqueue(q, online_mask); blk_mq_sysfs_register(q); } @@ -2097,16 +2146,43 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, unsigned long action, void *hcpu) { struct request_queue *q; + int cpu = (unsigned long)hcpu; + /* + * New online cpumask which is going to be set in this hotplug event. + * Declare this cpumasks as global as cpu-hotplug operation is invoked + * one-by-one and dynamically allocating this could result in a failure. + */ + static struct cpumask online_new; /* - * Before new mappings are established, hotadded cpu might already - * start handling requests. This doesn't break anything as we map - * offline CPUs to first hardware queue. We will re-init the queue - * below to get optimal settings. + * Before hotadded cpu starts handling requests, new mappings must + * be established. Otherwise, these requests in hw queue might + * never be dispatched. + * + * For example, there is a single hw queue (hctx) and two CPU queues + * (ctx0 for CPU0, and ctx1 for CPU1). + * + * Now CPU1 is just onlined and a request is inserted into + * ctx1->rq_list and set bit0 in pending bitmap as ctx1->index_hw is + * still zero. + * + * And then while running hw queue, flush_busy_ctxs() finds bit0 is + * set in pending bitmap and tries to retrieve requests in + * hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, + * so the request in ctx1->rq_list is ignored. */ - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DEAD: + case CPU_UP_CANCELED: + cpumask_copy(&online_new, cpu_online_mask); + break; + case CPU_UP_PREPARE: + cpumask_copy(&online_new, cpu_online_mask); + cpumask_set_cpu(cpu, &online_new); + break; + default: return NOTIFY_OK; + } mutex_lock(&all_q_mutex); @@ -2130,7 +2206,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, } list_for_each_entry(q, &all_q_list, all_q_node) - blk_mq_queue_reinit(q); + blk_mq_queue_reinit(q, &online_new); list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_unfreeze_queue(q); @@ -2193,6 +2269,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) return 0; } +struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags) +{ + return tags->cpumask; +} +EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask); + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the diff --git a/kernel/block/blk-mq.h b/kernel/block/blk-mq.h index 4b7cbf0e6..3cb6feb4f 100644 --- a/kernel/block/blk-mq.h +++ b/kernel/block/blk-mq.h @@ -25,12 +25,9 @@ struct blk_mq_ctx { struct kobject kobj; } ____cacheline_aligned_in_smp; -void __blk_mq_complete_request(struct request *rq); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_free_queue(struct request_queue *q); -void blk_mq_clone_flush_request(struct request *flush_rq, - struct request *orig_rq); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); @@ -51,7 +48,8 @@ void blk_mq_disable_hotplug(void); * CPU -> queue mappings */ extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set); -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues); +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues, + const struct cpumask *online_mask); extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); /* diff --git a/kernel/block/blk-settings.c b/kernel/block/blk-settings.c index e0057d035..c7bb666aa 100644 --- a/kernel/block/blk-settings.c +++ b/kernel/block/blk-settings.c @@ -53,28 +53,6 @@ void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) } EXPORT_SYMBOL(blk_queue_unprep_rq); -/** - * blk_queue_merge_bvec - set a merge_bvec function for queue - * @q: queue - * @mbfn: merge_bvec_fn - * - * Usually queues have static limitations on the max sectors or segments that - * we can put in a request. Stacking drivers may have some settings that - * are dynamic, and thus we have to query the queue whether it is ok to - * add a new bio_vec to a bio at a given offset or not. If the block device - * has such limitations, it needs to register a merge_bvec_fn to control - * the size of bio's sent to it. Note that a block device *must* allow a - * single page to be added to an empty bio. The block device driver may want - * to use the bio_split() function to deal with these bio's. By default - * no merge_bvec_fn is defined for a queue, and only the fixed limits are - * honored. - */ -void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) -{ - q->merge_bvec_fn = mbfn; -} -EXPORT_SYMBOL(blk_queue_merge_bvec); - void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) { q->softirq_done_fn = fn; @@ -111,11 +89,14 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segments = BLK_MAX_SEGMENTS; lim->max_integrity_segments = 0; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_dev_sectors = 0; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; + lim->max_hw_discard_sectors = 0; lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; @@ -147,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -234,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request - * @limits: the queue limits + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -244,34 +226,29 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * the device driver based upon the capabilities of the I/O * controller. * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. + * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block/<device>/queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", __func__, max_hw_sectors); } - limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; -} -EXPORT_SYMBOL(blk_limits_max_hw_sectors); - -/** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit - * - * Description: - * See description for blk_limits_max_hw_sectors(). - **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) -{ - blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); + limits->max_hw_sectors = max_hw_sectors; + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -303,6 +280,7 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { + q->limits.max_hw_discard_sectors = max_discard_sectors; q->limits.max_discard_sectors = max_discard_sectors; } EXPORT_SYMBOL(blk_queue_max_discard_sectors); @@ -544,12 +522,15 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); + t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, + b->virt_boundary_mask); t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_integrity_segments = min_not_zero(t->max_integrity_segments, @@ -641,6 +622,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_discard_sectors = min_not_zero(t->max_discard_sectors, b->max_discard_sectors); + t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, + b->max_hw_discard_sectors); t->discard_granularity = max(t->discard_granularity, b->discard_granularity); t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % @@ -788,6 +771,17 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) EXPORT_SYMBOL(blk_queue_segment_boundary); /** + * blk_queue_virt_boundary - set boundary rules for bio merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) +{ + q->limits.virt_boundary_mask = mask; +} +EXPORT_SYMBOL(blk_queue_virt_boundary); + +/** * blk_queue_dma_alignment - set dma length and memory alignment * @q: the request queue for the device * @mask: alignment mask diff --git a/kernel/block/blk-sysfs.c b/kernel/block/blk-sysfs.c index 2b8fd302f..e140cc487 100644 --- a/kernel/block/blk-sysfs.c +++ b/kernel/block/blk-sysfs.c @@ -6,11 +6,12 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/blk-mq.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" struct queue_sysfs_entry { @@ -144,12 +145,43 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag return queue_var_show(q->limits.discard_granularity, page); } +static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) +{ + unsigned long long val; + + val = q->limits.max_hw_discard_sectors << 9; + return sprintf(page, "%llu\n", val); +} + static ssize_t queue_discard_max_show(struct request_queue *q, char *page) { return sprintf(page, "%llu\n", (unsigned long long)q->limits.max_discard_sectors << 9); } +static ssize_t queue_discard_max_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long max_discard; + ssize_t ret = queue_var_store(&max_discard, page, count); + + if (ret < 0) + return ret; + + if (max_discard & (q->limits.discard_granularity - 1)) + return -EINVAL; + + max_discard >>= 9; + if (max_discard > UINT_MAX) + return -EINVAL; + + if (max_discard > q->limits.max_hw_discard_sectors) + max_discard = q->limits.max_hw_discard_sectors; + + q->limits.max_discard_sectors = max_discard; + return ret; +} + static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) { return queue_var_show(queue_discard_zeroes_data(q), page); @@ -173,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; @@ -285,6 +320,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_poll_show(struct request_queue *q, char *page) +{ + return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); +} + +static ssize_t queue_poll_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_on; + ssize_t ret; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + ret = queue_var_store(&poll_on, page, count); + if (ret < 0) + return ret; + + spin_lock_irq(q->queue_lock); + if (poll_on) + queue_flag_set(QUEUE_FLAG_POLL, q); + else + queue_flag_clear(QUEUE_FLAG_POLL, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -359,9 +422,15 @@ static struct queue_sysfs_entry queue_discard_granularity_entry = { .show = queue_discard_granularity_show, }; +static struct queue_sysfs_entry queue_discard_max_hw_entry = { + .attr = {.name = "discard_max_hw_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_hw_show, +}; + static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO | S_IWUSR }, .show = queue_discard_max_show, + .store = queue_discard_max_store, }; static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { @@ -404,6 +473,12 @@ static struct queue_sysfs_entry queue_random_entry = { .store = queue_store_random, }; +static struct queue_sysfs_entry queue_poll_entry = { + .attr = {.name = "io_poll", .mode = S_IRUGO | S_IWUSR }, + .show = queue_poll_show, + .store = queue_poll_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -420,6 +495,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, &queue_write_same_max_entry.attr, &queue_nonrot_entry.attr, @@ -427,6 +503,7 @@ static struct attribute *default_attrs[] = { &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_random_entry.attr, + &queue_poll_entry.attr, NULL, }; @@ -501,6 +578,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { @@ -522,6 +600,9 @@ static void blk_release_queue(struct kobject *kobj) blk_trace_shutdown(q); + if (q->bio_split) + bioset_free(q->bio_split); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -557,9 +638,8 @@ int blk_register_queue(struct gendisk *disk) */ if (!blk_queue_init_done(q)) { queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); blk_queue_bypass_end(q); - if (q->mq_ops) - blk_mq_finish_init(q); } ret = blk_trace_init_sysfs(dev); diff --git a/kernel/block/blk-throttle.c b/kernel/block/blk-throttle.c index 5b9c6d5c3..2149a1ddb 100644 --- a/kernel/block/blk-throttle.c +++ b/kernel/block/blk-throttle.c @@ -9,7 +9,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" /* Max dispatch from a group in 1 round */ @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -141,12 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +154,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - del_timer_sync(&sq->pending_timer); + struct throtl_grp *tg; + int rw; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + + return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - unsigned long flags; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ - parent_sq = &td->service_queue; - - if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &td->service_queue; + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; - - /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); -} - -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); - - throtl_service_queue_exit(&tg->service_queue); -} - -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) - return; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } -} - -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + tg_update_has_rules(pd_to_tg(pd)); } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static void throtl_pd_free(struct blkg_policy_data *pd) { - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) - tg = td_root_tg(td); - } + struct throtl_grp *tg = pd_to_tg(pd); - return tg; + del_timer_sync(&tg->service_queue.pending_timer); + kfree(tg); } static struct throtl_grp * @@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - if (tg->stats_cpu == NULL) - return 0; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, @@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, }, { } /* terminate */ }; @@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, + .pd_free_fn = throtl_pd_free, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; sq = &tg->service_queue; @@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we @@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/kernel/block/blk-timeout.c b/kernel/block/blk-timeout.c index 246dfb16c..aa40aa933 100644 --- a/kernel/block/blk-timeout.c +++ b/kernel/block/blk-timeout.c @@ -158,11 +158,13 @@ void blk_abort_request(struct request *req) { if (blk_mark_rq_complete(req)) return; - blk_delete_timer(req); - if (req->q->mq_ops) + + if (req->q->mq_ops) { blk_mq_rq_timed_out(req, false); - else + } else { + blk_delete_timer(req); blk_rq_timed_out(req); + } } EXPORT_SYMBOL_GPL(blk_abort_request); diff --git a/kernel/block/blk.h b/kernel/block/blk.h index 43b036185..c43926d3d 100644 --- a/kernel/block/blk.h +++ b/kernel/block/blk.h @@ -22,6 +22,12 @@ struct blk_flush_queue { struct list_head flush_queue[2]; struct list_head flush_data_in_flight; struct request *flush_rq; + + /* + * flush_rq shares tag with this rq, both can't be active + * at the same time + */ + struct request *orig_rq; spinlock_t mq_flush_lock; }; @@ -66,6 +72,26 @@ void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); bool __blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); +void blk_freeze_queue(struct request_queue *q); + +static inline void blk_queue_enter_live(struct request_queue *q) +{ + /* + * Given that running in generic_make_request() context + * guarantees that a live reference against q_usage_counter has + * been established, further references under that same context + * need not check that the queue has been frozen (marked dead). + */ + percpu_ref_get(&q->q_usage_counter); +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +void blk_flush_integrity(void); +#else +static inline void blk_flush_integrity(void) +{ +} +#endif void blk_rq_timed_out_timer(unsigned long data); unsigned long blk_rq_timeout(unsigned long timeout); @@ -78,7 +104,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, bool bio_attempt_back_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count); + unsigned int *request_count, + struct request **same_queue_rq); +unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); @@ -193,8 +221,6 @@ int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); -void __blk_run_queue_uncond(struct request_queue *q); - int blk_dev_init(void); @@ -267,15 +293,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/kernel/block/bounce.c b/kernel/block/bounce.c index 39d123e0a..2f1ec8a67 100644 --- a/kernel/block/bounce.c +++ b/kernel/block/bounce.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> @@ -122,20 +123,19 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) } } -static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) +static void bounce_end_io(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, *org_vec; int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + int start = bio_orig->bi_iter.bi_idx; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; + org_vec = bio_orig->bi_io_vec + i + start; + if (bvec->bv_page == org_vec->bv_page) continue; @@ -143,61 +143,44 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) mempool_free(bvec->bv_page, pool); } - bio_endio(bio_orig, err); + bio_orig->bi_error = bio->bi_error; + bio_endio(bio_orig); bio_put(bio); } -static void bounce_end_io_write(struct bio *bio, int err) +static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, page_pool, err); + bounce_end_io(bio, page_pool); } -static void bounce_end_io_write_isa(struct bio *bio, int err) +static void bounce_end_io_write_isa(struct bio *bio) { - bounce_end_io(bio, isa_page_pool, err); + bounce_end_io(bio, isa_page_pool); } -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) { struct bio *bio_orig = bio->bi_private; - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!bio->bi_error) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool, err); + bounce_end_io(bio, pool); } -static void bounce_end_io_read(struct bio *bio, int err) +static void bounce_end_io_read(struct bio *bio) { - __bounce_end_io_read(bio, page_pool, err); + __bounce_end_io_read(bio, page_pool); } -static void bounce_end_io_read_isa(struct bio *bio, int err) +static void bounce_end_io_read_isa(struct bio *bio) { - __bounce_end_io_read(bio, isa_page_pool, err); + __bounce_end_io_read(bio, isa_page_pool); } -#ifdef CONFIG_NEED_BOUNCE_POOL -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - if (bio_data_dir(bio) != WRITE) - return 0; - - if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) - return 0; - - return test_bit(BIO_SNAP_STABLE, &bio->bi_flags); -} -#else -static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) -{ - return 0; -} -#endif /* CONFIG_NEED_BOUNCE_POOL */ - static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool, int force) + mempool_t *pool) { struct bio *bio; int rw = bio_data_dir(*bio_orig); @@ -205,8 +188,6 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, struct bvec_iter iter; unsigned i; - if (force) - goto bounce; bio_for_each_segment(from, *bio_orig, iter) if (page_to_pfn(from.bv_page) > queue_bounce_pfn(q)) goto bounce; @@ -218,7 +199,7 @@ bounce: bio_for_each_segment_all(to, bio, i) { struct page *page = to->bv_page; - if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; to->bv_page = mempool_alloc(pool, q->bounce_gfp); @@ -256,7 +237,6 @@ bounce: void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { - int must_bounce; mempool_t *pool; /* @@ -265,15 +245,13 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bio_has_data(*bio_orig)) return; - must_bounce = must_snapshot_stable_pages(q, *bio_orig); - /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { @@ -284,7 +262,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * slow path */ - __blk_queue_bounce(q, bio_orig, pool, must_bounce); + __blk_queue_bounce(q, bio_orig, pool); } EXPORT_SYMBOL(blk_queue_bounce); diff --git a/kernel/block/cfq-iosched.c b/kernel/block/cfq-iosched.c index 5da8e6e9a..1f9093e90 100644 --- a/kernel/block/cfq-iosched.c +++ b/kernel/block/cfq-iosched.c @@ -14,8 +14,8 @@ #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" /* * tunables @@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool; #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) +/* blkio-related constants */ +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 + struct cfq_ttime { unsigned long last_end_request; @@ -172,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -184,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -212,6 +211,15 @@ struct cfqg_stats { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ }; +/* Per-cgroup data */ +struct cfq_group_data { + /* must be the first member */ + struct blkcg_policy_data cpd; + + unsigned int weight; + unsigned int leaf_weight; +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { /* must be the first member */ @@ -290,7 +298,11 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -356,12 +368,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -387,6 +393,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -446,16 +453,6 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* cfqg stats flags */ @@ -600,6 +597,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct cfq_group, pd) : NULL; +} + +static struct cfq_group_data +*cpd_to_cfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + static struct blkcg_policy blkcg_policy_cfq; static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) @@ -607,6 +620,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) +{ + return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); +} + static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; @@ -668,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -693,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -711,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. */ @@ -745,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -770,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -858,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -1521,115 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; +} + +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) +{ + struct cfq_group_data *cgd; + + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + +static void cfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; +} + +static void cfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_cfqgd(cpd)); +} + +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); + unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); } -static void cfq_pd_init(struct blkcg_gq *blkg) +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; cfq_init_cfqg_base(cfqg); - cfqg->weight = blkg->blkcg->cfq_weight; - cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } + + return &cfqg->pd; +} + +static void cfq_pd_init(struct blkg_policy_data *pd) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); + + cfqg->weight = cgd->weight; + cfqg->leaf_weight = cgd->leaf_weight; } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { + struct cfq_group *cfqg = pd_to_cfqg(pd); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); -} - -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; + cfqg_stats_xfer_dead(cfqg); } -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) +static void cfq_pd_free(struct blkg_policy_data *pd) { - struct blkg_rwstat a, b; + struct cfq_group *cfqg = pd_to_cfqg(pd); - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. - */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -1673,42 +1730,74 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->leaf_weight; + + seq_printf(sf, "%u\n", val); return 0; } static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; + struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; - ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out_finish; + } + cfqg = blkg_to_cfqg(ctx.blkg); - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + cfqgd = blkcg_to_cfqgd(blkcg); + + ret = -ERANGE; + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1716,30 +1805,39 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + struct cfq_group_data *cfqgd; + int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqgd) { + ret = -EINVAL; + goto out; + } if (!is_leaf_weight) - blkcg->cfq_weight = val; + cfqgd->weight = val; else - blkcg->cfq_leaf_weight = val; + cfqgd->leaf_weight = val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1748,28 +1846,33 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = cfqgd->leaf_weight; } } +out: spin_unlock_irq(&blkcg->lock); - return 0; + return ret; } static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -1789,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -1818,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1844,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -1892,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -1934,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", @@ -2000,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, true, false, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -2805,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -3438,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3504,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); @@ -3546,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3560,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3577,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) -{ - struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_group *cfqg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. - */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } -out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - - rcu_read_unlock(); - return cfqq; -} - static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3659,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3672,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && !(*async_cfqq)) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } @@ -4221,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4230,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + if (cfqq) + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -4336,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4363,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); @@ -4418,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4431,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. Lose @@ -4477,6 +4595,18 @@ out_free: return ret; } +static void cfq_registered_queue(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + struct cfq_data *cfqd = e->elevator_data; + + /* + * Default to IOPS mode with no idling for SSDs + */ + if (blk_queue_nonrot(q)) + cfqd->cfq_slice_idle = 0; +} + /* * sysfs parts below --> */ @@ -4592,6 +4722,7 @@ static struct elevator_type iosched_cfq = { .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, + .elevator_registered_fn = cfq_registered_queue, }, .icq_size = sizeof(struct cfq_io_cq), .icq_align = __alignof__(struct cfq_io_cq), @@ -4602,11 +4733,18 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cftypes = cfq_blkcg_files, + .dfl_cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, + + .cpd_alloc_fn = cfq_cpd_alloc, + .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/kernel/block/elevator.c b/kernel/block/elevator.c index 8985038f3..c3555c9c6 100644 --- a/kernel/block/elevator.c +++ b/kernel/block/elevator.c @@ -35,11 +35,11 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -420,7 +420,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ - if (blk_queue_nomerges(q)) + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /* @@ -806,6 +806,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; + if (e->type->ops.elevator_registered_fn) + e->type->ops.elevator_registered_fn(q); } return error; } diff --git a/kernel/block/genhd.c b/kernel/block/genhd.c index ea982eada..e5cafa515 100644 --- a/kernel/block/genhd.c +++ b/kernel/block/genhd.c @@ -8,6 +8,7 @@ #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> @@ -629,6 +630,7 @@ void add_disk(struct gendisk *disk) WARN_ON(retval); disk_add_events(disk); + blk_integrity_add(disk); } EXPORT_SYMBOL(add_disk); @@ -637,6 +639,7 @@ void del_gendisk(struct gendisk *disk) struct disk_part_iter piter; struct hd_struct *part; + blk_integrity_del(disk); disk_del_events(disk); /* invalidate stuff */ @@ -1109,8 +1112,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - free_part_stats(&disk->part0); - free_part_info(&disk->part0); + hd_free_part(&disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1284,7 +1286,11 @@ struct gendisk *alloc_disk_node(int minors, int node_id) * converted to make use of bd_mutex and sequence counters. */ seqcount_init(&disk->part0.nr_sects_seq); - hd_ref_init(&disk->part0); + if (hd_ref_init(&disk->part0)) { + hd_free_part(&disk->part0); + kfree(disk); + return NULL; + } disk->minors = minors; rand_initialize_disk(disk); diff --git a/kernel/block/ioctl.c b/kernel/block/ioctl.c index 7d8befde2..0918aed2d 100644 --- a/kernel/block/ioctl.c +++ b/kernel/block/ioctl.c @@ -7,6 +7,7 @@ #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> +#include <linux/pr.h> #include <asm/uaccess.h> static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) @@ -150,26 +151,63 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user } } -static int blkdev_reread_part(struct block_device *bdev) +/* + * This is an exported API for the block driver, and will not + * acquire bd_mutex. This API should be used in case that + * caller has held bd_mutex already. + */ +int __blkdev_reread_part(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; - int res; if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (!mutex_trylock(&bdev->bd_mutex)) - return -EBUSY; - res = rescan_partitions(disk, bdev); + + lockdep_assert_held(&bdev->bd_mutex); + + return rescan_partitions(disk, bdev); +} +EXPORT_SYMBOL(__blkdev_reread_part); + +/* + * This is an exported API for the block driver, and will + * try to acquire bd_mutex. If bd_mutex has been held already + * in current context, please call __blkdev_reread_part(). + * + * Make sure the held locks in current context aren't required + * in open()/close() handler and I/O path for avoiding ABBA deadlock: + * - bd_mutex is held before calling block driver's open/close + * handler + * - reading partition table may submit I/O to the block device + */ +int blkdev_reread_part(struct block_device *bdev) +{ + int res; + + mutex_lock(&bdev->bd_mutex); + res = __blkdev_reread_part(bdev); mutex_unlock(&bdev->bd_mutex); + return res; } +EXPORT_SYMBOL(blkdev_reread_part); -static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, - uint64_t len, int secure) +static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, + unsigned long arg, unsigned long flags) { - unsigned long flags = 0; + uint64_t range[2]; + uint64_t start, len; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; if (start & 511) return -EINVAL; @@ -180,14 +218,24 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - if (secure) - flags |= BLKDEV_DISCARD_SECURE; return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, flags); } -static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start, - uint64_t len) +static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, + unsigned long arg) { + uint64_t range[2]; + uint64_t start, len; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if (start & 511) return -EINVAL; if (len & 511) @@ -248,6 +296,96 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, */ EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); +static int blkdev_pr_register(struct block_device *bdev, + struct pr_registration __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_registration reg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_register) + return -EOPNOTSUPP; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); +} + +static int blkdev_pr_reserve(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_reserve) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); +} + +static int blkdev_pr_release(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_release) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags) + return -EOPNOTSUPP; + return ops->pr_release(bdev, rsv.key, rsv.type); +} + +static int blkdev_pr_preempt(struct block_device *bdev, + struct pr_preempt __user *arg, bool abort) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_preempt p; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_preempt) + return -EOPNOTSUPP; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + + if (p.flags) + return -EOPNOTSUPP; + return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); +} + +static int blkdev_pr_clear(struct block_device *bdev, + struct pr_clear __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_clear c; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_clear) + return -EOPNOTSUPP; + if (copy_from_user(&c, arg, sizeof(c))) + return -EFAULT; + + if (c.flags) + return -EOPNOTSUPP; + return ops->pr_clear(bdev, c.key); +} + /* * Is it an unrecognized ioctl? The correct returns are either * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a @@ -268,89 +406,115 @@ static inline int is_unrecognized_ioctl(int ret) ret == -ENOIOCTLCMD; } -/* - * always keep this in sync with compat_blkdev_ioctl() - */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) { - struct gendisk *disk = bdev->bd_disk; - struct backing_dev_info *bdi; - loff_t size; - int ret, n; - unsigned int max_sectors; - - switch(cmd) { - case BLKFLSBUF: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + int ret; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; - fsync_bdev(bdev); - invalidate_bdev(bdev); - return 0; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; - case BLKROSET: - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (get_user(n, (int __user *)(arg))) - return -EFAULT; - set_device_ro(bdev, n); - return 0; + fsync_bdev(bdev); + invalidate_bdev(bdev); + return 0; +} - case BLKDISCARD: - case BLKSECDISCARD: { - uint64_t range[2]; +static int blkdev_roset(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + int ret, n; - if (!(mode & FMODE_WRITE)) - return -EBADF; + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (get_user(n, (int __user *)arg)) + return -EFAULT; + set_device_ro(bdev, n); + return 0; +} - if (copy_from_user(range, (void __user *)arg, sizeof(range))) - return -EFAULT; +static int blkdev_getgeo(struct block_device *bdev, + struct hd_geometry __user *argp) +{ + struct gendisk *disk = bdev->bd_disk; + struct hd_geometry geo; + int ret; - return blk_ioctl_discard(bdev, range[0], range[1], - cmd == BLKSECDISCARD); - } - case BLKZEROOUT: { - uint64_t range[2]; + if (!argp) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + memset(&geo, 0, sizeof(geo)); + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + if (copy_to_user(argp, &geo, sizeof(geo))) + return -EFAULT; + return 0; +} - if (!(mode & FMODE_WRITE)) - return -EBADF; +/* set the logical block size */ +static int blkdev_bszset(struct block_device *bdev, fmode_t mode, + int __user *argp) +{ + int ret, n; - if (copy_from_user(range, (void __user *)arg, sizeof(range))) - return -EFAULT; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!argp) + return -EINVAL; + if (get_user(n, argp)) + return -EFAULT; - return blk_ioctl_zeroout(bdev, range[0], range[1]); + if (!(mode & FMODE_EXCL)) { + bdgrab(bdev); + if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) + return -EBUSY; } - case HDIO_GETGEO: { - struct hd_geometry geo; + ret = set_blocksize(bdev, n); + if (!(mode & FMODE_EXCL)) + blkdev_put(bdev, mode | FMODE_EXCL); + return ret; +} - if (!arg) - return -EINVAL; - if (!disk->fops->getgeo) - return -ENOTTY; - - /* - * We need to set the startsect first, the driver may - * want to override it. - */ - memset(&geo, 0, sizeof(geo)); - geo.start = get_start_sect(bdev); - ret = disk->fops->getgeo(bdev, &geo); - if (ret) - return ret; - if (copy_to_user((struct hd_geometry __user *)arg, &geo, - sizeof(geo))) - return -EFAULT; - return 0; - } +/* + * always keep this in sync with compat_blkdev_ioctl() + */ +int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, + unsigned long arg) +{ + struct backing_dev_info *bdi; + void __user *argp = (void __user *)arg; + loff_t size; + unsigned int max_sectors; + + switch (cmd) { + case BLKFLSBUF: + return blkdev_flushbuf(bdev, mode, cmd, arg); + case BLKROSET: + return blkdev_roset(bdev, mode, cmd, arg); + case BLKDISCARD: + return blk_ioctl_discard(bdev, mode, arg, 0); + case BLKSECDISCARD: + return blk_ioctl_discard(bdev, mode, arg, + BLKDEV_DISCARD_SECURE); + case BLKZEROOUT: + return blk_ioctl_zeroout(bdev, mode, arg); + case HDIO_GETGEO: + return blkdev_getgeo(bdev, argp); case BLKRAGET: case BLKFRAGET: if (!arg) @@ -387,28 +551,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: - /* set the logical block size */ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (!arg) - return -EINVAL; - if (get_user(n, (int __user *) arg)) - return -EFAULT; - if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) - return -EBUSY; - } - ret = set_blocksize(bdev, n); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); - return ret; + return blkdev_bszset(bdev, mode, argp); case BLKPG: - ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); - break; + return blkpg_ioctl(bdev, argp); case BLKRRPART: - ret = blkdev_reread_part(bdev); - break; + return blkdev_reread_part(bdev); case BLKGETSIZE: size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) @@ -420,11 +567,21 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKTRACESTOP: case BLKTRACESETUP: case BLKTRACETEARDOWN: - ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg); - break; + return blk_trace_ioctl(bdev, cmd, argp); + case IOC_PR_REGISTER: + return blkdev_pr_register(bdev, argp); + case IOC_PR_RESERVE: + return blkdev_pr_reserve(bdev, argp); + case IOC_PR_RELEASE: + return blkdev_pr_release(bdev, argp); + case IOC_PR_PREEMPT: + return blkdev_pr_preempt(bdev, argp, false); + case IOC_PR_PREEMPT_ABORT: + return blkdev_pr_preempt(bdev, argp, true); + case IOC_PR_CLEAR: + return blkdev_pr_clear(bdev, argp); default: - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + return __blkdev_driver_ioctl(bdev, mode, cmd, arg); } - return ret; } EXPORT_SYMBOL_GPL(blkdev_ioctl); diff --git a/kernel/block/ioprio.c b/kernel/block/ioprio.c index 31666c92b..cc7800e9e 100644 --- a/kernel/block/ioprio.c +++ b/kernel/block/ioprio.c @@ -123,7 +123,8 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), uid)) + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) continue; ret = set_task_ioprio(p, ioprio); if (ret) @@ -220,7 +221,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) break; do_each_thread(g, p) { - if (!uid_eq(task_uid(p), user->uid)) + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) continue; tmpio = get_task_ioprio(p); if (tmpio < 0) diff --git a/kernel/block/noop-iosched.c b/kernel/block/noop-iosched.c index 3de89d469..a163c487c 100644 --- a/kernel/block/noop-iosched.c +++ b/kernel/block/noop-iosched.c @@ -21,10 +21,10 @@ static void noop_merged_requests(struct request_queue *q, struct request *rq, static int noop_dispatch(struct request_queue *q, int force) { struct noop_data *nd = q->elevator->elevator_data; + struct request *rq; - if (!list_empty(&nd->queue)) { - struct request *rq; - rq = list_entry(nd->queue.next, struct request, queuelist); + rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); + if (rq) { list_del_init(&rq->queuelist); elv_dispatch_sort(q, rq); return 1; @@ -46,7 +46,7 @@ noop_former_request(struct request_queue *q, struct request *rq) if (rq->queuelist.prev == &nd->queue) return NULL; - return list_entry(rq->queuelist.prev, struct request, queuelist); + return list_prev_entry(rq, queuelist); } static struct request * @@ -56,7 +56,7 @@ noop_latter_request(struct request_queue *q, struct request *rq) if (rq->queuelist.next == &nd->queue) return NULL; - return list_entry(rq->queuelist.next, struct request, queuelist); + return list_next_entry(rq, queuelist); } static int noop_init_queue(struct request_queue *q, struct elevator_type *e) diff --git a/kernel/block/partition-generic.c b/kernel/block/partition-generic.c index 0d9e5f97f..746935a59 100644 --- a/kernel/block/partition-generic.c +++ b/kernel/block/partition-generic.c @@ -212,8 +212,7 @@ static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); blk_free_devt(dev->devt); - free_part_stats(p); - free_part_info(p); + hd_free_part(p); kfree(p); } @@ -233,8 +232,9 @@ static void delete_partition_rcu_cb(struct rcu_head *head) put_device(part_to_dev(part)); } -void __delete_partition(struct hd_struct *part) +void __delete_partition(struct percpu_ref *ref) { + struct hd_struct *part = container_of(ref, struct hd_struct, ref); call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -255,7 +255,7 @@ void delete_partition(struct gendisk *disk, int partno) kobject_put(part->holder_dir); device_del(part_to_dev(part)); - hd_struct_put(part); + hd_struct_kill(part); } static ssize_t whole_disk_show(struct device *dev, @@ -356,8 +356,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - hd_ref_init(p); - return p; + if (!hd_ref_init(p)) + return p; out_free_info: free_part_info(p); @@ -397,7 +397,7 @@ static int drop_partitions(struct gendisk *disk, struct block_device *bdev) struct hd_struct *part; int res; - if (bdev->bd_part_count) + if (bdev->bd_part_count || bdev->bd_super) return -EBUSY; res = invalidate_partition(disk, 0); if (res) @@ -428,6 +428,7 @@ rescan: if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); + blk_integrity_revalidate(disk); check_disk_size_change(disk, bdev); bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) diff --git a/kernel/block/partitions/mac.c b/kernel/block/partitions/mac.c index c2c48ec64..621317ac4 100644 --- a/kernel/block/partitions/mac.c +++ b/kernel/block/partitions/mac.c @@ -32,7 +32,7 @@ int mac_partition(struct parsed_partitions *state) Sector sect; unsigned char *data; int slot, blocks_in_map; - unsigned secsize; + unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; @@ -50,10 +50,14 @@ int mac_partition(struct parsed_partitions *state) } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); - data = read_part_sector(state, secsize/512, §); + datasize = round_down(secsize, 512); + data = read_part_sector(state, datasize / 512, §); if (!data) return -1; - part = (struct mac_partition *) (data + secsize%512); + partoffset = secsize % 512; + if (partoffset + sizeof(*part) > datasize) + return -1; + part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ diff --git a/kernel/block/scsi_ioctl.c b/kernel/block/scsi_ioctl.c index 55b6f15da..077479994 100644 --- a/kernel/block/scsi_ioctl.c +++ b/kernel/block/scsi_ioctl.c @@ -326,8 +326,8 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, goto out_put_request; } - ret = -EFAULT; - if (blk_fill_sghdr_rq(q, rq, hdr, mode)) + ret = blk_fill_sghdr_rq(q, rq, hdr, mode); + if (ret < 0) goto out_free_cdb; ret = 0; @@ -444,7 +444,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, } - rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_RECLAIM); if (IS_ERR(rq)) { err = PTR_ERR(rq); goto error_free_buffer; @@ -495,7 +495,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, break; } - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_RECLAIM)) { err = DRIVER_ERROR << 24; goto error; } @@ -536,7 +536,7 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, struct request *rq; int err; - rq = blk_get_request(q, WRITE, __GFP_WAIT); + rq = blk_get_request(q, WRITE, __GFP_RECLAIM); if (IS_ERR(rq)) return PTR_ERR(rq); blk_rq_set_block_pc(rq); diff --git a/kernel/block/t10-pi.c b/kernel/block/t10-pi.c index 24d6e9715..2c9791233 100644 --- a/kernel/block/t10-pi.c +++ b/kernel/block/t10-pi.c @@ -160,38 +160,30 @@ static int t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) return t10_pi_verify(iter, t10_pi_ip_fn, 3); } -struct blk_integrity t10_pi_type1_crc = { +struct blk_integrity_profile t10_pi_type1_crc = { .name = "T10-DIF-TYPE1-CRC", .generate_fn = t10_pi_type1_generate_crc, .verify_fn = t10_pi_type1_verify_crc, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type1_crc); -struct blk_integrity t10_pi_type1_ip = { +struct blk_integrity_profile t10_pi_type1_ip = { .name = "T10-DIF-TYPE1-IP", .generate_fn = t10_pi_type1_generate_ip, .verify_fn = t10_pi_type1_verify_ip, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type1_ip); -struct blk_integrity t10_pi_type3_crc = { +struct blk_integrity_profile t10_pi_type3_crc = { .name = "T10-DIF-TYPE3-CRC", .generate_fn = t10_pi_type3_generate_crc, .verify_fn = t10_pi_type3_verify_crc, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type3_crc); -struct blk_integrity t10_pi_type3_ip = { +struct blk_integrity_profile t10_pi_type3_ip = { .name = "T10-DIF-TYPE3-IP", .generate_fn = t10_pi_type3_generate_ip, .verify_fn = t10_pi_type3_verify_ip, - .tuple_size = sizeof(struct t10_pi_tuple), - .tag_size = 0, }; EXPORT_SYMBOL(t10_pi_type3_ip); |