From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001
From: Yunhong Jiang
Date: Tue, 4 Aug 2015 12:17:53 -0700
Subject: Add the rt linux 4.1.3-rt3 as base

Import the rt linux 4.1.3-rt3 as the OPNFV kvm base. It comes from the
linux-4.1.y-rt branch of

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git

and the base is:

  commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
  Author: Sebastian Andrzej Siewior
  Date:   Sat Jul 25 12:13:34 2015 +0200

      Prepare v4.1.3-rt3

      Signed-off-by: Sebastian Andrzej Siewior

We lose all the git history this way, which is not ideal; we should
move to a separate OPNFV project repo in the future.

Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang
---
 kernel/block/blk-mq-tag.c | 666 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 666 insertions(+)
 create mode 100644 kernel/block/blk-mq-tag.c

diff --git a/kernel/block/blk-mq-tag.c b/kernel/block/blk-mq-tag.c
new file mode 100644
index 000000000..be3290cc0
--- /dev/null
+++ b/kernel/block/blk-mq-tag.c
@@ -0,0 +1,666 @@
+/*
+ * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread
+ * over multiple cachelines to avoid ping-pong between multiple submitters
+ * or submitter and completer. Uses rolling wakeups to avoid falling off
+ * the scaling cliff when we run out of tags and have to start putting
+ * submitters to sleep.
+ *
+ * Uses active queue tracking to support fairer distribution of tags
+ * between multiple submitters when a shared tag map is used.
+ *
+ * Copyright (C) 2013-2014 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt)
+{
+	int i;
+
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+		int ret;
+
+		ret = find_first_zero_bit(&bm->word, bm->depth);
+		if (ret < bm->depth)
+			return true;
+	}
+
+	return false;
+}
+
+bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
+{
+	if (!tags)
+		return true;
+
+	return bt_has_free_tags(&tags->bitmap_tags);
+}
+
+static inline int bt_index_inc(int index)
+{
+	return (index + 1) & (BT_WAIT_QUEUES - 1);
+}
+
+static inline void bt_index_atomic_inc(atomic_t *index)
+{
+	int old = atomic_read(index);
+	int new = bt_index_inc(old);
+	atomic_cmpxchg(index, old, new);
+}
+
+/*
+ * If a previously inactive queue goes active, bump the active user count.
+ */
+bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
+	    !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		atomic_inc(&hctx->tags->active_queues);
+
+	return true;
+}
+
+/*
+ * Wake up all tasks potentially sleeping on tags.
+ */
+void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
+{
+	struct blk_mq_bitmap_tags *bt;
+	int i, wake_index;
+
+	bt = &tags->bitmap_tags;
+	wake_index = atomic_read(&bt->wake_index);
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait))
+			wake_up(&bs->wait);
+
+		wake_index = bt_index_inc(wake_index);
+	}
+
+	if (include_reserve) {
+		bt = &tags->breserved_tags;
+		if (waitqueue_active(&bt->bs[0].wait))
+			wake_up(&bt->bs[0].wait);
+	}
+}
+
+/*
+ * If a previously busy queue goes inactive, potential waiters could now
+ * be allowed to queue. Wake them up and check.
+ */
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return;
+
+	atomic_dec(&tags->active_queues);
+
+	blk_mq_tag_wakeup_all(tags, false);
+}
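/*
 * A minimal user-space sketch (not part of the patch) of the rolling
 * wake-up index used by bt_index_inc() above: it relies on BT_WAIT_QUEUES
 * being a power of two, so "increment and mask" wraps around. The value
 * 8 is assumed here to match blk-mq-tag.h.
 */
#include <stdio.h>

#define BT_WAIT_QUEUES	8

static int bt_index_inc(int index)
{
	return (index + 1) & (BT_WAIT_QUEUES - 1);
}

int main(void)
{
	int idx = 6, i;

	/* Walks 6, 7, 0, 1, ... cycling through every wait queue. */
	for (i = 0; i < 10; i++) {
		printf("%d ", idx);
		idx = bt_index_inc(idx);
	}
	printf("\n");
	return 0;
}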
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+				  struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int depth, users;
+
+	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
+		return true;
+	if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+		return true;
+
+	/*
+	 * Don't try dividing an ant
+	 */
+	if (bt->depth == 1)
+		return true;
+
+	users = atomic_read(&hctx->tags->active_queues);
+	if (!users)
+		return true;
+
+	/*
+	 * Allow at least some tags
+	 */
+	depth = max((bt->depth + users - 1) / users, 4U);
+	return atomic_read(&hctx->nr_active) < depth;
+}
+
+static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag,
+			 bool nowrap)
+{
+	int tag, org_last_tag = last_tag;
+
+	while (1) {
+		tag = find_next_zero_bit(&bm->word, bm->depth, last_tag);
+		if (unlikely(tag >= bm->depth)) {
+			/*
+			 * We started with an offset, and we didn't reset the
+			 * offset to 0 in a failure case, so start from 0 to
+			 * exhaust the map.
+			 */
+			if (org_last_tag && last_tag && !nowrap) {
+				last_tag = org_last_tag = 0;
+				continue;
+			}
+			return -1;
+		}
+
+		if (!test_and_set_bit(tag, &bm->word))
+			break;
+
+		last_tag = tag + 1;
+		if (last_tag >= bm->depth - 1)
+			last_tag = 0;
+	}
+
+	return tag;
+}
+
+#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR)
+
+/*
+ * Straightforward bitmap tag implementation, where each bit is a tag
+ * (cleared == free, and set == busy). The small twist is using per-cpu
+ * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue
+ * contexts. This enables us to drastically limit the space searched,
+ * without dirtying an extra shared cacheline like we would if we stored
+ * the cache value inside the shared blk_mq_bitmap_tags structure. On top
+ * of that, each word of tags is in a separate cacheline. This means that
+ * multiple users will tend to stick to different cachelines, at least
+ * until the map is exhausted.
+ */
+static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt,
+		    unsigned int *tag_cache, struct blk_mq_tags *tags)
+{
+	unsigned int last_tag, org_last_tag;
+	int index, i, tag;
+
+	if (!hctx_may_queue(hctx, bt))
+		return -1;
+
+	last_tag = org_last_tag = *tag_cache;
+	index = TAG_TO_INDEX(bt, last_tag);
+
+	for (i = 0; i < bt->map_nr; i++) {
+		tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag),
+				    BT_ALLOC_RR(tags));
+		if (tag != -1) {
+			tag += (index << bt->bits_per_word);
+			goto done;
+		}
+
+		/*
+		 * Jump to next index, and reset the last tag to be the
+		 * first tag of that index
+		 */
+		index++;
+		last_tag = (index << bt->bits_per_word);
+
+		if (index >= bt->map_nr) {
+			index = 0;
+			last_tag = 0;
+		}
+	}
+
+	*tag_cache = 0;
+	return -1;
+
+	/*
+	 * Only update the cache from the allocation path, if we ended
+	 * up using the specific cached tag.
+	 */
+done:
+	if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) {
+		last_tag = tag + 1;
+		if (last_tag >= bt->depth - 1)
+			last_tag = 0;
+
+		*tag_cache = last_tag;
+	}
+
+	return tag;
+}
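/*
 * A minimal sketch (not from the patch) of the fair-share computation in
 * hctx_may_queue(): each of "users" active queues gets a ceiling-divided
 * share of the total depth, but never fewer than 4 tags.
 */
#include <stdio.h>

static unsigned int fair_share(unsigned int depth, unsigned int users)
{
	unsigned int share = (depth + users - 1) / users; /* ceil(depth/users) */

	return share > 4U ? share : 4U;		/* max(share, 4U) */
}

int main(void)
{
	/* 128 tags shared by 6 queues: ceil(128/6) = 22 tags each. */
	printf("%u\n", fair_share(128, 6));
	/* 16 tags shared by 8 queues: ceil(16/8) = 2, floored to 4. */
	printf("%u\n", fair_share(16, 8));
	return 0;
}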
+
+static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt,
+					 struct blk_mq_hw_ctx *hctx)
+{
+	struct bt_wait_state *bs;
+	int wait_index;
+
+	if (!hctx)
+		return &bt->bs[0];
+
+	wait_index = atomic_read(&hctx->wait_index);
+	bs = &bt->bs[wait_index];
+	bt_index_atomic_inc(&hctx->wait_index);
+	return bs;
+}
+
+static int bt_get(struct blk_mq_alloc_data *data,
+		  struct blk_mq_bitmap_tags *bt,
+		  struct blk_mq_hw_ctx *hctx,
+		  unsigned int *last_tag, struct blk_mq_tags *tags)
+{
+	struct bt_wait_state *bs;
+	DEFINE_WAIT(wait);
+	int tag;
+
+	tag = __bt_get(hctx, bt, last_tag, tags);
+	if (tag != -1)
+		return tag;
+
+	if (!(data->gfp & __GFP_WAIT))
+		return -1;
+
+	bs = bt_wait_ptr(bt, hctx);
+	do {
+		prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		tag = __bt_get(hctx, bt, last_tag, tags);
+		if (tag != -1)
+			break;
+
+		/*
+		 * We're out of tags on this hardware queue, kick any
+		 * pending IO submits before going to sleep waiting for
+		 * some to complete. Note that hctx can be NULL here for
+		 * reserved tag allocation.
+		 */
+		if (hctx)
+			blk_mq_run_hw_queue(hctx, false);
+
+		/*
+		 * Retry tag allocation after running the hardware queue,
+		 * as running the queue may also have found completions.
+		 */
+		tag = __bt_get(hctx, bt, last_tag, tags);
+		if (tag != -1)
+			break;
+
+		blk_mq_put_ctx(data->ctx);
+
+		io_schedule();
+
+		data->ctx = blk_mq_get_ctx(data->q);
+		data->hctx = data->q->mq_ops->map_queue(data->q,
+				data->ctx->cpu);
+		if (data->reserved) {
+			bt = &data->hctx->tags->breserved_tags;
+		} else {
+			last_tag = &data->ctx->last_tag;
+			hctx = data->hctx;
+			bt = &hctx->tags->bitmap_tags;
+		}
+		finish_wait(&bs->wait, &wait);
+		bs = bt_wait_ptr(bt, hctx);
+	} while (1);
+
+	finish_wait(&bs->wait, &wait);
+	return tag;
+}
+
+static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
+{
+	int tag;
+
+	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
+			&data->ctx->last_tag, data->hctx->tags);
+	if (tag >= 0)
+		return tag + data->hctx->tags->nr_reserved_tags;
+
+	return BLK_MQ_TAG_FAIL;
+}
+
+static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
+{
+	int tag, zero = 0;
+
+	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
+		WARN_ON_ONCE(1);
+		return BLK_MQ_TAG_FAIL;
+	}
+
+	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero,
+		     data->hctx->tags);
+	if (tag < 0)
+		return BLK_MQ_TAG_FAIL;
+
+	return tag;
+}
+
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
+{
+	if (!data->reserved)
+		return __blk_mq_get_tag(data);
+
+	return __blk_mq_get_reserved_tag(data);
+}
+
+static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
+{
+	int i, wake_index;
+
+	wake_index = atomic_read(&bt->wake_index);
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		struct bt_wait_state *bs = &bt->bs[wake_index];
+
+		if (waitqueue_active(&bs->wait)) {
+			int o = atomic_read(&bt->wake_index);
+			if (wake_index != o)
+				atomic_cmpxchg(&bt->wake_index, o, wake_index);
+
+			return bs;
+		}
+
+		wake_index = bt_index_inc(wake_index);
+	}
+
+	return NULL;
+}
+
+static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
+{
+	const int index = TAG_TO_INDEX(bt, tag);
+	struct bt_wait_state *bs;
+	int wait_cnt;
+
+	clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
+
+	/* Ensure that the wait list checks occur after clear_bit(). */
+	smp_mb();
+
+	bs = bt_wake_ptr(bt);
+	if (!bs)
+		return;
+
+	wait_cnt = atomic_dec_return(&bs->wait_cnt);
+	if (unlikely(wait_cnt < 0))
+		wait_cnt = atomic_inc_return(&bs->wait_cnt);
+	if (wait_cnt == 0) {
+		atomic_add(bt->wake_cnt, &bs->wait_cnt);
+		bt_index_atomic_inc(&bt->wake_index);
+		wake_up(&bs->wait);
+	}
+}
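/*
 * A minimal sketch (not from the patch) of the batched wake-up counter in
 * bt_clear_tag(): only every wake_cnt-th freed tag wakes a wait queue, so
 * a flood of completions does not become a flood of wakeups.
 */
#include <stdio.h>

int main(void)
{
	int wake_cnt = 8;		/* models bt->wake_cnt */
	int wait_cnt = 8;		/* models bs->wait_cnt */
	int freed, wakeups = 0;

	for (freed = 1; freed <= 32; freed++) {
		if (--wait_cnt == 0) {		/* atomic_dec_return() hit 0 */
			wait_cnt += wake_cnt;	/* refill the batch */
			wakeups++;		/* wake_up(&bs->wait) */
		}
	}

	/* 32 freed tags with a batch of 8 produce only 4 wakeups. */
	printf("%d wakeups\n", wakeups);
	return 0;
}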
+
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
+		    unsigned int *last_tag)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	if (tag >= tags->nr_reserved_tags) {
+		const int real_tag = tag - tags->nr_reserved_tags;
+
+		BUG_ON(real_tag >= tags->nr_tags);
+		bt_clear_tag(&tags->bitmap_tags, real_tag);
+		if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO))
+			*last_tag = real_tag;
+	} else {
+		BUG_ON(tag >= tags->nr_reserved_tags);
+		bt_clear_tag(&tags->breserved_tags, tag);
+	}
+}
+
+static void bt_for_each(struct blk_mq_hw_ctx *hctx,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
+			if (rq->q == hctx->queue)
+				fn(hctx, rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
+		void *priv)
+{
+	struct blk_mq_tags *tags = hctx->tags;
+
+	if (tags->nr_reserved_tags)
+		bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true);
+	bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_tag_busy_iter);
+
+static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt)
+{
+	unsigned int i, used;
+
+	for (i = 0, used = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		used += bitmap_weight(&bm->word, bm->depth);
+	}
+
+	return bt->depth - used;
+}
+
+static void bt_update_count(struct blk_mq_bitmap_tags *bt,
+			    unsigned int depth)
+{
+	unsigned int tags_per_word = 1U << bt->bits_per_word;
+	unsigned int map_depth = depth;
+
+	if (depth) {
+		int i;
+
+		for (i = 0; i < bt->map_nr; i++) {
+			bt->map[i].depth = min(map_depth, tags_per_word);
+			map_depth -= bt->map[i].depth;
+		}
+	}
+
+	bt->wake_cnt = BT_WAIT_BATCH;
+	if (bt->wake_cnt > depth / BT_WAIT_QUEUES)
+		bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES);
+
+	bt->depth = depth;
+}
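/*
 * A minimal sketch (not from the patch) of how bt_update_count() spreads
 * a tag depth across the per-cacheline bitmap words: each word receives up
 * to tags_per_word tags, and the last word takes the remainder.
 */
#include <stdio.h>

int main(void)
{
	unsigned int tags_per_word = 32;	/* models 1U << bits_per_word */
	unsigned int map_nr = 3;		/* words allocated by bt_alloc() */
	unsigned int map_depth = 70;		/* total depth to distribute */
	unsigned int i;

	for (i = 0; i < map_nr; i++) {
		unsigned int d = map_depth < tags_per_word ?
					map_depth : tags_per_word;

		map_depth -= d;
		printf("word %u: depth %u\n", i, d);	/* 32, 32, 6 */
	}
	return 0;
}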
+
+static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth,
+			int node, bool reserved)
+{
+	int i;
+
+	bt->bits_per_word = ilog2(BITS_PER_LONG);
+
+	/*
+	 * Depth can be zero for reserved tags, that's not a failure
+	 * condition.
+	 */
+	if (depth) {
+		unsigned int nr, tags_per_word;
+
+		tags_per_word = (1 << bt->bits_per_word);
+
+		/*
+		 * If the tag space is small, shrink the number of tags
+		 * per word so we spread over a few cachelines, at least.
+		 * If less than 4 tags, just forget about it, it's not
+		 * going to work optimally anyway.
+		 */
+		if (depth >= 4) {
+			while (tags_per_word * 4 > depth) {
+				bt->bits_per_word--;
+				tags_per_word = (1 << bt->bits_per_word);
+			}
+		}
+
+		nr = ALIGN(depth, tags_per_word) / tags_per_word;
+		bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap),
+						GFP_KERNEL, node);
+		if (!bt->map)
+			return -ENOMEM;
+
+		bt->map_nr = nr;
+	}
+
+	bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL);
+	if (!bt->bs) {
+		kfree(bt->map);
+		bt->map = NULL;
+		return -ENOMEM;
+	}
+
+	bt_update_count(bt, depth);
+
+	for (i = 0; i < BT_WAIT_QUEUES; i++) {
+		init_waitqueue_head(&bt->bs[i].wait);
+		atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt);
+	}
+
+	return 0;
+}
+
+static void bt_free(struct blk_mq_bitmap_tags *bt)
+{
+	kfree(bt->map);
+	kfree(bt->bs);
+}
+
+static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+						   int node, int alloc_policy)
+{
+	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+	tags->alloc_policy = alloc_policy;
+
+	if (bt_alloc(&tags->bitmap_tags, depth, node, false))
+		goto enomem;
+	if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true))
+		goto enomem;
+
+	return tags;
+enomem:
+	bt_free(&tags->bitmap_tags);
+	kfree(tags);
+	return NULL;
+}
+
+struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
+				     unsigned int reserved_tags,
+				     int node, int alloc_policy)
+{
+	struct blk_mq_tags *tags;
+
+	if (total_tags > BLK_MQ_TAG_MAX) {
+		pr_err("blk-mq: tag depth too large\n");
+		return NULL;
+	}
+
+	tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
+	if (!tags)
+		return NULL;
+
+	tags->nr_tags = total_tags;
+	tags->nr_reserved_tags = reserved_tags;
+
+	return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
+}
+
+void blk_mq_free_tags(struct blk_mq_tags *tags)
+{
+	bt_free(&tags->bitmap_tags);
+	bt_free(&tags->breserved_tags);
+	kfree(tags);
+}
+
+void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag)
+{
+	unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+
+	*tag = prandom_u32() % depth;
+}
+
+int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth)
+{
+	tdepth -= tags->nr_reserved_tags;
+	if (tdepth > tags->nr_tags)
+		return -EINVAL;
+
+	/*
+	 * Don't need (or can't) update reserved tags here, they remain
+	 * static and should never need resizing.
+	 */
+	bt_update_count(&tags->bitmap_tags, tdepth);
+	blk_mq_tag_wakeup_all(tags, false);
+	return 0;
+}
+
+/**
+ * blk_mq_unique_tag() - return a tag that is unique queue-wide
+ * @rq: request for which to compute a unique tag
+ *
+ * The tag field in struct request is unique per hardware queue but not over
+ * all hardware queues. Hence this function, which returns a tag with the
+ * hardware context index in the upper bits and the per hardware queue tag in
+ * the lower bits.
+ *
+ * Note: When called for a request that is queued on a non-multiqueue request
+ * queue, the hardware context index is set to zero.
+ */
+u32 blk_mq_unique_tag(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct blk_mq_hw_ctx *hctx;
+	int hwq = 0;
+
+	if (q->mq_ops) {
+		hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
+		hwq = hctx->queue_num;
+	}
+
+	return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
+		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
+}
+EXPORT_SYMBOL(blk_mq_unique_tag);
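/*
 * A minimal sketch (not from the patch) of the blk_mq_unique_tag()
 * encoding: hardware queue index in the upper bits, per-queue tag in the
 * lower bits. The 16-bit split mirrors what BLK_MQ_UNIQUE_TAG_BITS and
 * BLK_MQ_UNIQUE_TAG_MASK provide in blk-mq.h, assumed here for
 * illustration.
 */
#include <stdio.h>

#define UNIQUE_TAG_BITS	16
#define UNIQUE_TAG_MASK	((1U << UNIQUE_TAG_BITS) - 1)

int main(void)
{
	unsigned int hwq = 3, tag = 42;
	unsigned int unique = (hwq << UNIQUE_TAG_BITS) | (tag & UNIQUE_TAG_MASK);

	/* Decoding recovers both halves. */
	printf("hwq=%u tag=%u\n",
	       unique >> UNIQUE_TAG_BITS, unique & UNIQUE_TAG_MASK);
	return 0;
}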
+
+ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page)
+{
+	char *orig_page = page;
+	unsigned int free, res;
+
+	if (!tags)
+		return 0;
+
+	page += sprintf(page, "nr_tags=%u, reserved_tags=%u, "
+			"bits_per_word=%u\n",
+			tags->nr_tags, tags->nr_reserved_tags,
+			tags->bitmap_tags.bits_per_word);
+
+	free = bt_unused_tags(&tags->bitmap_tags);
+	res = bt_unused_tags(&tags->breserved_tags);
+
+	page += sprintf(page, "nr_free=%u, nr_reserved=%u\n", free, res);
+	page += sprintf(page, "active_queues=%u\n", atomic_read(&tags->active_queues));
+
+	return page - orig_page;
+}
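/*
 * A minimal user-space model (not from the patch) of the __bt_get_word()
 * search above: look for a zero bit starting at a cached offset, wrap to
 * bit 0 once before giving up, then claim the bit that was found.
 */
#include <stdio.h>

#define DEPTH 8

/* Find the next clear bit at or after 'from', or DEPTH if none. */
static int next_zero_bit(unsigned long word, int from)
{
	int bit;

	for (bit = from; bit < DEPTH; bit++)
		if (!(word & (1UL << bit)))
			return bit;
	return DEPTH;
}

static int get_tag(unsigned long *word, int last_tag)
{
	int tag = next_zero_bit(*word, last_tag);

	if (tag >= DEPTH) {		/* wrap once, like org_last_tag */
		tag = next_zero_bit(*word, 0);
		if (tag >= DEPTH)
			return -1;	/* word is full */
	}
	*word |= 1UL << tag;		/* models test_and_set_bit() */
	return tag;
}

int main(void)
{
	unsigned long word = 0xf0;	/* bits 4..7 busy */

	printf("%d\n", get_tag(&word, 4));	/* 4..7 busy, wraps to 0 */
	printf("%d\n", get_tag(&word, 1));	/* finds bit 1 directly */
	return 0;
}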