From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/fs/f2fs/gc.c | 746 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 746 insertions(+) create mode 100644 kernel/fs/f2fs/gc.c (limited to 'kernel/fs/f2fs/gc.c') diff --git a/kernel/fs/f2fs/gc.c b/kernel/fs/f2fs/gc.c new file mode 100644 index 000000000..ed58211fe --- /dev/null +++ b/kernel/fs/f2fs/gc.c @@ -0,0 +1,746 @@ +/* + * fs/f2fs/gc.c + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "f2fs.h" +#include "node.h" +#include "segment.h" +#include "gc.h" +#include + +static int gc_thread_func(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + long wait_ms; + + wait_ms = gc_th->min_sleep_time; + + do { + if (try_to_freeze()) + continue; + else + wait_event_interruptible_timeout(*wq, + kthread_should_stop(), + msecs_to_jiffies(wait_ms)); + if (kthread_should_stop()) + break; + + if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { + increase_sleep_time(gc_th, &wait_ms); + continue; + } + + /* + * [GC triggering condition] + * 0. GC is not conducted currently. + * 1. There are enough dirty segments. + * 2. IO subsystem is idle by checking the # of writeback pages. + * 3. IO subsystem is idle by checking the # of requests in + * bdev's request list. + * + * Note) We have to avoid triggering GCs frequently. + * Because it is possible that some segments can be + * invalidated soon after by user update or deletion. + * So, I'd like to wait some time to collect dirty segments. + */ + if (!mutex_trylock(&sbi->gc_mutex)) + continue; + + if (!is_idle(sbi)) { + increase_sleep_time(gc_th, &wait_ms); + mutex_unlock(&sbi->gc_mutex); + continue; + } + + if (has_enough_invalid_blocks(sbi)) + decrease_sleep_time(gc_th, &wait_ms); + else + increase_sleep_time(gc_th, &wait_ms); + + stat_inc_bggc_count(sbi); + + /* if return value is not zero, no victim was selected */ + if (f2fs_gc(sbi)) + wait_ms = gc_th->no_gc_sleep_time; + + /* balancing f2fs's metadata periodically */ + f2fs_balance_fs_bg(sbi); + + } while (!kthread_should_stop()); + return 0; +} + +int start_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th; + dev_t dev = sbi->sb->s_bdev->bd_dev; + int err = 0; + + gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + if (!gc_th) { + err = -ENOMEM; + goto out; + } + + gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; + gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; + gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; + + gc_th->gc_idle = 0; + + sbi->gc_thread = gc_th; + init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, + "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(gc_th->f2fs_gc_task)) { + err = PTR_ERR(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; + } +out: + return err; +} + +void stop_gc_thread(struct f2fs_sb_info *sbi) +{ + struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) + return; + kthread_stop(gc_th->f2fs_gc_task); + kfree(gc_th); + sbi->gc_thread = NULL; +} + +static int select_gc_type(struct f2fs_gc_kthread *gc_th, int gc_type) +{ + int gc_mode = (gc_type == BG_GC) ? GC_CB : GC_GREEDY; + + if (gc_th && gc_th->gc_idle) { + if (gc_th->gc_idle == 1) + gc_mode = GC_CB; + else if (gc_th->gc_idle == 2) + gc_mode = GC_GREEDY; + } + return gc_mode; +} + +static void select_policy(struct f2fs_sb_info *sbi, int gc_type, + int type, struct victim_sel_policy *p) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + + if (p->alloc_mode == SSR) { + p->gc_mode = GC_GREEDY; + p->dirty_segmap = dirty_i->dirty_segmap[type]; + p->max_search = dirty_i->nr_dirty[type]; + p->ofs_unit = 1; + } else { + p->gc_mode = select_gc_type(sbi->gc_thread, gc_type); + p->dirty_segmap = dirty_i->dirty_segmap[DIRTY]; + p->max_search = dirty_i->nr_dirty[DIRTY]; + p->ofs_unit = sbi->segs_per_sec; + } + + if (p->max_search > sbi->max_victim_search) + p->max_search = sbi->max_victim_search; + + p->offset = sbi->last_victim[p->gc_mode]; +} + +static unsigned int get_max_cost(struct f2fs_sb_info *sbi, + struct victim_sel_policy *p) +{ + /* SSR allocates in a segment unit */ + if (p->alloc_mode == SSR) + return 1 << sbi->log_blocks_per_seg; + if (p->gc_mode == GC_GREEDY) + return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + else if (p->gc_mode == GC_CB) + return UINT_MAX; + else /* No other gc_mode */ + return 0; +} + +static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + unsigned int secno; + + /* + * If the gc_type is FG_GC, we can select victim segments + * selected by background GC before. + * Those segments guarantee they have small valid blocks. + */ + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { + if (sec_usage_check(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); + return secno * sbi->segs_per_sec; + } + return NULL_SEGNO; +} + +static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) +{ + struct sit_info *sit_i = SIT_I(sbi); + unsigned int secno = GET_SECNO(sbi, segno); + unsigned int start = secno * sbi->segs_per_sec; + unsigned long long mtime = 0; + unsigned int vblocks; + unsigned char age = 0; + unsigned char u; + unsigned int i; + + for (i = 0; i < sbi->segs_per_sec; i++) + mtime += get_seg_entry(sbi, start + i)->mtime; + vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + + mtime = div_u64(mtime, sbi->segs_per_sec); + vblocks = div_u64(vblocks, sbi->segs_per_sec); + + u = (vblocks * 100) >> sbi->log_blocks_per_seg; + + /* Handle if the system time has changed by the user */ + if (mtime < sit_i->min_mtime) + sit_i->min_mtime = mtime; + if (mtime > sit_i->max_mtime) + sit_i->max_mtime = mtime; + if (sit_i->max_mtime != sit_i->min_mtime) + age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime), + sit_i->max_mtime - sit_i->min_mtime); + + return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); +} + +static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, + unsigned int segno, struct victim_sel_policy *p) +{ + if (p->alloc_mode == SSR) + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + + /* alloc_mode == LFS */ + if (p->gc_mode == GC_GREEDY) + return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + else + return get_cb_cost(sbi, segno); +} + +/* + * This function is called from two paths. + * One is garbage collection and the other is SSR segment selection. + * When it is called during GC, it just gets a victim segment + * and it does not remove it from dirty seglist. + * When it is called from SSR segment selection, it finds a segment + * which has minimum valid blocks and removes it from dirty seglist. + */ +static int get_victim_by_default(struct f2fs_sb_info *sbi, + unsigned int *result, int gc_type, int type, char alloc_mode) +{ + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct victim_sel_policy p; + unsigned int secno, max_cost; + int nsearched = 0; + + mutex_lock(&dirty_i->seglist_lock); + + p.alloc_mode = alloc_mode; + select_policy(sbi, gc_type, type, &p); + + p.min_segno = NULL_SEGNO; + p.min_cost = max_cost = get_max_cost(sbi, &p); + + if (p.alloc_mode == LFS && gc_type == FG_GC) { + p.min_segno = check_bg_victims(sbi); + if (p.min_segno != NULL_SEGNO) + goto got_it; + } + + while (1) { + unsigned long cost; + unsigned int segno; + + segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); + if (segno >= MAIN_SEGS(sbi)) { + if (sbi->last_victim[p.gc_mode]) { + sbi->last_victim[p.gc_mode] = 0; + p.offset = 0; + continue; + } + break; + } + + p.offset = segno + p.ofs_unit; + if (p.ofs_unit > 1) + p.offset -= segno % p.ofs_unit; + + secno = GET_SECNO(sbi, segno); + + if (sec_usage_check(sbi, secno)) + continue; + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) + continue; + + cost = get_gc_cost(sbi, segno, &p); + + if (p.min_cost > cost) { + p.min_segno = segno; + p.min_cost = cost; + } else if (unlikely(cost == max_cost)) { + continue; + } + + if (nsearched++ >= p.max_search) { + sbi->last_victim[p.gc_mode] = segno; + break; + } + } + if (p.min_segno != NULL_SEGNO) { +got_it: + if (p.alloc_mode == LFS) { + secno = GET_SECNO(sbi, p.min_segno); + if (gc_type == FG_GC) + sbi->cur_victim_sec = secno; + else + set_bit(secno, dirty_i->victim_secmap); + } + *result = (p.min_segno / p.ofs_unit) * p.ofs_unit; + + trace_f2fs_get_victim(sbi->sb, type, gc_type, &p, + sbi->cur_victim_sec, + prefree_segments(sbi), free_segments(sbi)); + } + mutex_unlock(&dirty_i->seglist_lock); + + return (p.min_segno == NULL_SEGNO) ? 0 : 1; +} + +static const struct victim_selection default_v_ops = { + .get_victim = get_victim_by_default, +}; + +static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino) +{ + struct inode_entry *ie; + + ie = radix_tree_lookup(&gc_list->iroot, ino); + if (ie) + return ie->inode; + return NULL; +} + +static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) +{ + struct inode_entry *new_ie; + + if (inode == find_gc_inode(gc_list, inode->i_ino)) { + iput(inode); + return; + } + new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); + new_ie->inode = inode; + + f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); + list_add_tail(&new_ie->list, &gc_list->ilist); +} + +static void put_gc_inode(struct gc_inode_list *gc_list) +{ + struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { + radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); + iput(ie->inode); + list_del(&ie->list); + kmem_cache_free(inode_entry_slab, ie); + } +} + +static int check_valid_map(struct f2fs_sb_info *sbi, + unsigned int segno, int offset) +{ + struct sit_info *sit_i = SIT_I(sbi); + struct seg_entry *sentry; + int ret; + + mutex_lock(&sit_i->sentry_lock); + sentry = get_seg_entry(sbi, segno); + ret = f2fs_test_bit(offset, sentry->cur_valid_map); + mutex_unlock(&sit_i->sentry_lock); + return ret; +} + +/* + * This function compares node address got in summary with that in NAT. + * On validity, copy that node with cold status, otherwise (invalid node) + * ignore that. + */ +static void gc_node_segment(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, unsigned int segno, int gc_type) +{ + bool initial = true; + struct f2fs_summary *entry; + int off; + +next_step: + entry = sum; + + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + nid_t nid = le32_to_cpu(entry->nid); + struct page *node_page; + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (initial) { + ra_node_page(sbi, nid); + continue; + } + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + continue; + + /* block may become invalid during get_node_page */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_put_page(node_page, 1); + continue; + } + + /* set page dirty and write it */ + if (gc_type == FG_GC) { + f2fs_wait_on_page_writeback(node_page, NODE); + set_page_dirty(node_page); + } else { + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } + f2fs_put_page(node_page, 1); + stat_inc_node_blk_count(sbi, 1, gc_type); + } + + if (initial) { + initial = false; + goto next_step; + } + + if (gc_type == FG_GC) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .for_reclaim = 0, + }; + sync_node_pages(sbi, 0, &wbc); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) + goto next_step; + } +} + +/* + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. + */ +block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +{ + unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; + unsigned int bidx; + + if (node_ofs == 0) + return 0; + + if (node_ofs <= 2) { + bidx = node_ofs - 1; + } else if (node_ofs <= indirect_blks) { + int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; + } else { + int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; + } + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); +} + +static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct node_info *dni, block_t blkaddr, unsigned int *nofs) +{ + struct page *node_page; + nid_t nid; + unsigned int ofs_in_node; + block_t source_blkaddr; + + nid = le32_to_cpu(sum->nid); + ofs_in_node = le16_to_cpu(sum->ofs_in_node); + + node_page = get_node_page(sbi, nid); + if (IS_ERR(node_page)) + return 0; + + get_node_info(sbi, nid, dni); + + if (sum->version != dni->version) { + f2fs_put_page(node_page, 1); + return 0; + } + + *nofs = ofs_of_node(node_page); + source_blkaddr = datablock_addr(node_page, ofs_in_node); + f2fs_put_page(node_page, 1); + + if (source_blkaddr != blkaddr) + return 0; + return 1; +} + +static void move_data_page(struct inode *inode, struct page *page, int gc_type) +{ + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + + if (gc_type == BG_GC) { + if (PageWriteback(page)) + goto out; + set_page_dirty(page); + set_cold_data(page); + } else { + f2fs_wait_on_page_writeback(page, DATA); + + if (clear_page_dirty_for_io(page)) + inode_dec_dirty_pages(inode); + set_cold_data(page); + do_write_data_page(page, &fio); + clear_cold_data(page); + } +out: + f2fs_put_page(page, 1); +} + +/* + * This function tries to get parent node of victim data block, and identifies + * data block validity. If the block is valid, copy that with cold status and + * modify parent node. + * If the parent node is not valid or the data block address is different, + * the victim data block is ignored. + */ +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + struct gc_inode_list *gc_list, unsigned int segno, int gc_type) +{ + struct super_block *sb = sbi->sb; + struct f2fs_summary *entry; + block_t start_addr; + int off; + int phase = 0; + + start_addr = START_BLOCK(sbi, segno); + +next_step: + entry = sum; + + for (off = 0; off < sbi->blocks_per_seg; off++, entry++) { + struct page *data_page; + struct inode *inode; + struct node_info dni; /* dnode info for the data */ + unsigned int ofs_in_node, nofs; + block_t start_bidx; + + /* stop BG_GC if there is not enough free sections. */ + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) + return; + + if (check_valid_map(sbi, segno, off) == 0) + continue; + + if (phase == 0) { + ra_node_page(sbi, le32_to_cpu(entry->nid)); + continue; + } + + /* Get an inode by ino with checking validity */ + if (check_dnode(sbi, entry, &dni, start_addr + off, &nofs) == 0) + continue; + + if (phase == 1) { + ra_node_page(sbi, dni.ino); + continue; + } + + ofs_in_node = le16_to_cpu(entry->ofs_in_node); + + if (phase == 2) { + inode = f2fs_iget(sb, dni.ino); + if (IS_ERR(inode) || is_bad_inode(inode)) + continue; + + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + + data_page = find_data_page(inode, + start_bidx + ofs_in_node, false); + if (IS_ERR(data_page)) { + iput(inode); + continue; + } + + f2fs_put_page(data_page, 0); + add_gc_inode(gc_list, inode); + continue; + } + + /* phase 3 */ + inode = find_gc_inode(gc_list, dni.ino); + if (inode) { + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = get_lock_data_page(inode, + start_bidx + ofs_in_node); + if (IS_ERR(data_page)) + continue; + move_data_page(inode, data_page, gc_type); + stat_inc_data_blk_count(sbi, 1, gc_type); + } + } + + if (++phase < 4) + goto next_step; + + if (gc_type == FG_GC) { + f2fs_submit_merged_bio(sbi, DATA, WRITE); + + /* + * In the case of FG_GC, it'd be better to reclaim this victim + * completely. + */ + if (get_valid_blocks(sbi, segno, 1) != 0) { + phase = 2; + goto next_step; + } + } +} + +static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, + int gc_type) +{ + struct sit_info *sit_i = SIT_I(sbi); + int ret; + + mutex_lock(&sit_i->sentry_lock); + ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, + NO_CHECK_TYPE, LFS); + mutex_unlock(&sit_i->sentry_lock); + return ret; +} + +static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, + struct gc_inode_list *gc_list, int gc_type) +{ + struct page *sum_page; + struct f2fs_summary_block *sum; + struct blk_plug plug; + + /* read segment summary of victim */ + sum_page = get_sum_page(sbi, segno); + + blk_start_plug(&plug); + + sum = page_address(sum_page); + + switch (GET_SUM_TYPE((&sum->footer))) { + case SUM_TYPE_NODE: + gc_node_segment(sbi, sum->entries, segno, gc_type); + break; + case SUM_TYPE_DATA: + gc_data_segment(sbi, sum->entries, gc_list, segno, gc_type); + break; + } + blk_finish_plug(&plug); + + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + stat_inc_call_count(sbi->stat_info); + + f2fs_put_page(sum_page, 1); +} + +int f2fs_gc(struct f2fs_sb_info *sbi) +{ + unsigned int segno, i; + int gc_type = BG_GC; + int nfree = 0; + int ret = -1; + struct cp_control cpc; + struct gc_inode_list gc_list = { + .ilist = LIST_HEAD_INIT(gc_list.ilist), + .iroot = RADIX_TREE_INIT(GFP_NOFS), + }; + + cpc.reason = __get_cp_reason(sbi); +gc_more: + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + goto stop; + if (unlikely(f2fs_cp_error(sbi))) + goto stop; + + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { + gc_type = FG_GC; + write_checkpoint(sbi, &cpc); + } + + if (!__get_victim(sbi, &segno, gc_type)) + goto stop; + ret = 0; + + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, + META_SSA); + + for (i = 0; i < sbi->segs_per_sec; i++) + do_garbage_collect(sbi, segno + i, &gc_list, gc_type); + + if (gc_type == FG_GC) { + sbi->cur_victim_sec = NULL_SEGNO; + nfree++; + WARN_ON(get_valid_blocks(sbi, segno, sbi->segs_per_sec)); + } + + if (has_not_enough_free_secs(sbi, nfree)) + goto gc_more; + + if (gc_type == FG_GC) + write_checkpoint(sbi, &cpc); +stop: + mutex_unlock(&sbi->gc_mutex); + + put_gc_inode(&gc_list); + return ret; +} + +void build_gc_manager(struct f2fs_sb_info *sbi) +{ + DIRTY_I(sbi)->v_ops = &default_v_ops; +} -- cgit 1.2.3-korg