diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-11 10:41:07 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-04-13 08:17:18 +0300 |
commit | e09b41010ba33a20a87472ee821fa407a5b8da36 (patch) | |
tree | d10dc367189862e7ca5c592f033dc3726e1df4e3 /kernel/mm | |
parent | f93b97fd65072de626c074dbe099a1fff05ce060 (diff) |
These changes are the raw update to linux-4.4.6-rt14. The kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.
During the rebasing, the following patch collided:
Force tick interrupt and get rid of softirq magic(I70131fb85).
The collisions have been removed because the patch's logic was already
present in the source.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/mm')
86 files changed, 8721 insertions, 4669 deletions
diff --git a/kernel/mm/Kconfig b/kernel/mm/Kconfig index 0cc453705..9614351e6 100644 --- a/kernel/mm/Kconfig +++ b/kernel/mm/Kconfig @@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION -# -# If we have space for more page flags then we can enable additional -# optimizations and functionality. -# -# Regular Sparsemem takes page flag bits for the sectionid if it does not -# use a virtual memmap. Disable extended page flags for 32 bit platforms -# that require the use of a sectionid in the page flags. -# -config PAGEFLAGS_EXTENDED - def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM - # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. @@ -299,15 +287,9 @@ config BOUNCE # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often # have more than 4GB of memory, but we don't currently use the IOTLB to present # a 32-bit address to OHCI. So we need to use a bounce pool instead. -# -# We also use the bounce pool to provide stable page writes for jbd. jbd -# initiates buffer writeback without locking the page or setting PG_writeback, -# and fixing that behavior (a second time; jbd2 doesn't have this problem) is -# a major rework effort. Instead, use the bounce buffer to snapshot pages -# (until jbd goes away). The only jbd user is ext3. config NEED_BOUNCE_POOL bool - default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) + default y if TILE && USB_OHCI_HCD config NR_QUICK int @@ -368,6 +350,7 @@ config MEMORY_FAILURE depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" select MEMORY_ISOLATION + select RAS help Enables code to recover from some memory failures on systems with MCA recovery. 
This allows a system to continue running @@ -635,3 +618,53 @@ config MAX_STACK_SIZE_MB changed to a smaller value in which case that is used. A sane initial value is 80 MB. + +# For architectures that support deferred memory initialisation +config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + bool + +config DEFERRED_STRUCT_PAGE_INIT + bool "Defer initialisation of struct pages to kswapd" + default n + depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT + depends on MEMORY_HOTPLUG + help + Ordinarily all struct pages are initialised during early boot in a + single thread. On very large machines this can take a considerable + amount of time. If this option is set, large machines will bring up + a subset of memmap at boot and then initialise the rest in parallel + when kswapd starts. This has a potential performance impact on + processes running early in the lifetime of the systemm until kswapd + finishes the initialisation. + +config IDLE_PAGE_TRACKING + bool "Enable idle page tracking" + depends on SYSFS && MMU + select PAGE_EXTENSION if !64BIT + help + This feature allows to estimate the amount of user pages that have + not been touched during a given period of time. This information can + be useful to tune memory cgroup limits and/or for job placement + within a compute cluster. + + See Documentation/vm/idle_page_tracking.txt for more details. + +config ZONE_DEVICE + bool "Device memory (pmem, etc...) hotplug support" if EXPERT + default !ZONE_DMA + depends on !ZONE_DMA + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on X86_64 #arch_add_memory() comprehends device memory + + help + Device memory hotplug support allows for establishing pmem, + or other device driver discovered memory regions, in the + memmap. This allows pfn_to_page() lookups of otherwise + "device-physical" addresses which is needed for using a DAX + mapping in an O_DIRECT operation, among other things. + + If FS_DAX is enabled, then say Y. 
+ +config FRAME_VECTOR + bool diff --git a/kernel/mm/Makefile b/kernel/mm/Makefile index 98c4eaeab..2ed43191f 100644 --- a/kernel/mm/Makefile +++ b/kernel/mm/Makefile @@ -78,3 +78,6 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o +obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o diff --git a/kernel/mm/backing-dev.c b/kernel/mm/backing-dev.c index 000e7b3b9..6871838f0 100644 --- a/kernel/mm/backing-dev.c +++ b/kernel/mm/backing-dev.c @@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; @@ -48,25 +49,25 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; + unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_wb_list) + list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; - list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + wb_thresh = wb_calc_thresh(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) 
seq_printf(m, @@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", - (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), + (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), + (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), + K(wb_thresh), K(dirty_thresh), K(background_thresh), - (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), - (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), - (unsigned long) K(bdi->write_bandwidth), + (unsigned long) K(wb_stat(wb, WB_DIRTIED)), + (unsigned long) K(wb_stat(wb, WB_WRITTEN)), + (unsigned long) K(wb->write_bandwidth), nr_dirty, nr_io, nr_more_io, nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->state); + !list_empty(&bdi->bdi_list), bdi->wb.state); #undef K return 0; @@ -255,13 +256,8 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -int bdi_has_dirty_io(struct backing_dev_info *bdi) -{ - return wb_has_dirty_io(&bdi->wb); -} - /* - * This function is used when the first inode for this bdi is marked dirty. It + * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just @@ -274,172 +270,597 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). 
*/ -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_bh(&wb->work_lock); } /* - * Remove bdi from bdi_list, and ensure that it is no longer visible + * Initial write bandwidth: 100 MB/s */ -static void bdi_remove_from_list(struct backing_dev_info *bdi) +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + +static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, + int blkcg_id, gfp_t gfp) { - spin_lock_bh(&bdi_lock); - list_del_rcu(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); + int i, err; - synchronize_rcu_expedited(); -} + memset(wb, 0, sizeof(*wb)); -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) 
-{ - va_list args; - struct device *dev; + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + INIT_LIST_HEAD(&wb->b_dirty_time); + spin_lock_init(&wb->list_lock); - if (bdi->dev) /* The driver needs to use separate queues per device */ - return 0; + wb->bw_time_stamp = jiffies; + wb->balanced_dirty_ratelimit = INIT_BW; + wb->dirty_ratelimit = INIT_BW; + wb->write_bandwidth = INIT_BW; + wb->avg_write_bandwidth = INIT_BW; - va_start(args, fmt); - dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); - va_end(args); - if (IS_ERR(dev)) - return PTR_ERR(dev); + spin_lock_init(&wb->work_lock); + INIT_LIST_HEAD(&wb->work_list); + INIT_DELAYED_WORK(&wb->dwork, wb_workfn); - bdi->dev = dev; + wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); + if (!wb->congested) + return -ENOMEM; - bdi_debug_register(bdi, dev_name(dev)); - set_bit(BDI_registered, &bdi->state); + err = fprop_local_init_percpu(&wb->completions, gfp); + if (err) + goto out_put_cong; - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); + for (i = 0; i < NR_WB_STAT_ITEMS; i++) { + err = percpu_counter_init(&wb->stat[i], 0, gfp); + if (err) + goto out_destroy_stat; + } - trace_writeback_bdi_register(bdi); return 0; -} -EXPORT_SYMBOL(bdi_register); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) -{ - return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +out_destroy_stat: + while (--i) + percpu_counter_destroy(&wb->stat[i]); + fprop_local_destroy_percpu(&wb->completions); +out_put_cong: + wb_congested_put(wb->congested); + return err; } -EXPORT_SYMBOL(bdi_register_dev); /* * Remove bdi from the global list and shutdown any threads we have running */ -static void bdi_wb_shutdown(struct backing_dev_info *bdi) +static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ - 
spin_lock_bh(&bdi->wb_lock); - if (!test_and_clear_bit(BDI_registered, &bdi->state)) { - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (!test_and_clear_bit(WB_registered, &wb->state)) { + spin_unlock_bh(&wb->work_lock); return; } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); /* - * Make sure nobody finds us on the bdi_list anymore + * Drain work list and shutdown the delayed_work. !WB_registered + * tells wb_workfn() that @wb is dying and its work_list needs to + * be drained no matter what. */ - bdi_remove_from_list(bdi); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + flush_delayed_work(&wb->dwork); + WARN_ON(!list_empty(&wb->work_list)); +} + +static void wb_exit(struct bdi_writeback *wb) +{ + int i; + + WARN_ON(delayed_work_pending(&wb->dwork)); + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) + percpu_counter_destroy(&wb->stat[i]); + + fprop_local_destroy_percpu(&wb->completions); + wb_congested_put(wb->congested); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +#include <linux/memcontrol.h> + +/* + * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, + * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU + * protected. cgwb_release_wait is used to wait for the completion of cgwb + * releases from bdi destruction path. + */ +static DEFINE_SPINLOCK(cgwb_lock); +static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); + +/** + * wb_congested_get_create - get or create a wb_congested + * @bdi: associated bdi + * @blkcg_id: ID of the associated blkcg + * @gfp: allocation mask + * + * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. + * The returned wb_congested has its reference count incremented. Returns + * NULL on failure. 
+ */ +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + struct bdi_writeback_congested *new_congested = NULL, *congested; + struct rb_node **node, *parent; + unsigned long flags; +retry: + spin_lock_irqsave(&cgwb_lock, flags); + + node = &bdi->cgwb_congested_tree.rb_node; + parent = NULL; + + while (*node != NULL) { + parent = *node; + congested = container_of(parent, struct bdi_writeback_congested, + rb_node); + if (congested->blkcg_id < blkcg_id) + node = &parent->rb_left; + else if (congested->blkcg_id > blkcg_id) + node = &parent->rb_right; + else + goto found; + } + + if (new_congested) { + /* !found and storage for new one already allocated, insert */ + congested = new_congested; + new_congested = NULL; + rb_link_node(&congested->rb_node, parent, node); + rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); + goto found; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + + /* allocate storage for new one and retry */ + new_congested = kzalloc(sizeof(*new_congested), gfp); + if (!new_congested) + return NULL; + + atomic_set(&new_congested->refcnt, 0); + new_congested->bdi = bdi; + new_congested->blkcg_id = blkcg_id; + goto retry; + +found: + atomic_inc(&congested->refcnt); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(new_congested); + return congested; +} + +/** + * wb_congested_put - put a wb_congested + * @congested: wb_congested to put + * + * Put @congested and destroy it if the refcnt reaches zero. 
+ */ +void wb_congested_put(struct bdi_writeback_congested *congested) +{ + unsigned long flags; + + local_irq_save_nort(flags); + if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + local_irq_restore_nort(flags); + return; + } + + /* bdi might already have been destroyed leaving @congested unlinked */ + if (congested->bdi) { + rb_erase(&congested->rb_node, + &congested->bdi->cgwb_congested_tree); + congested->bdi = NULL; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(congested); +} + +static void cgwb_release_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(work, struct bdi_writeback, + release_work); + struct backing_dev_info *bdi = wb->bdi; + + spin_lock_irq(&cgwb_lock); + list_del_rcu(&wb->bdi_node); + spin_unlock_irq(&cgwb_lock); + + wb_shutdown(wb); + + css_put(wb->memcg_css); + css_put(wb->blkcg_css); + + fprop_local_destroy_percpu(&wb->memcg_completions); + percpu_ref_exit(&wb->refcnt); + wb_exit(wb); + kfree_rcu(wb, rcu); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release(struct percpu_ref *refcnt) +{ + struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, + refcnt); + schedule_work(&wb->release_work); +} + +static void cgwb_kill(struct bdi_writeback *wb) +{ + lockdep_assert_held(&cgwb_lock); + + WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + list_del(&wb->memcg_node); + list_del(&wb->blkcg_node); + percpu_ref_kill(&wb->refcnt); +} + +static int cgwb_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, gfp_t gfp) +{ + struct mem_cgroup *memcg; + struct cgroup_subsys_state *blkcg_css; + struct blkcg *blkcg; + struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; + struct bdi_writeback *wb; + unsigned long flags; + int ret = 0; + + memcg = mem_cgroup_from_css(memcg_css); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + blkcg = css_to_blkcg(blkcg_css); + 
memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + blkcg_cgwb_list = &blkcg->cgwb_list; + + /* look up again under lock and discard on blkcg mismatch */ + spin_lock_irqsave(&cgwb_lock, flags); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb && wb->blkcg_css != blkcg_css) { + cgwb_kill(wb); + wb = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (wb) + goto out_put; + + /* need to create a new one */ + wb = kmalloc(sizeof(*wb), gfp); + if (!wb) + return -ENOMEM; + + ret = wb_init(wb, bdi, blkcg_css->id, gfp); + if (ret) + goto err_free; + + ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); + if (ret) + goto err_wb_exit; + + ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); + if (ret) + goto err_ref_exit; + + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); /* - * Drain work list and shutdown the delayed_work. At this point, - * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi - * is dying and its work_list needs to be drained no matter what. + * The root wb determines the registered state of the whole bdi and + * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate + * whether they're still online. Don't link @wb if any is dead. + * See wb_memcg_offline() and wb_blkcg_offline(). 
*/ - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - flush_delayed_work(&bdi->wb.dwork); + ret = -ENODEV; + spin_lock_irqsave(&cgwb_lock, flags); + if (test_bit(WB_registered, &bdi->wb.state) && + blkcg_cgwb_list->next && memcg_cgwb_list->next) { + /* we might have raced another instance of this function */ + ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + if (!ret) { + atomic_inc(&bdi->usage_cnt); + list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); + list_add(&wb->memcg_node, memcg_cgwb_list); + list_add(&wb->blkcg_node, blkcg_cgwb_list); + css_get(memcg_css); + css_get(blkcg_css); + } + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto err_fprop_exit; + } + goto out_put; + +err_fprop_exit: + fprop_local_destroy_percpu(&wb->memcg_completions); +err_ref_exit: + percpu_ref_exit(&wb->refcnt); +err_wb_exit: + wb_exit(wb); +err_free: + kfree(wb); +out_put: + css_put(blkcg_css); + return ret; } -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. The returned wb has its refcount incremented. + * + * This function uses css_get() on @memcg_css and thus expects its refcnt + * to be positive on invocation. IOW, rcu_read_lock() protection on + * @memcg_css isn't enough. try_get it before calling this function. + * + * A wb is keyed by its associated memcg. As blkcg implicitly enables + * memcg on the default hierarchy, memcg association is guaranteed to be + * more specific (equal or descendant to the associated blkcg) and thus can + * identify both the memcg and blkcg associations. 
+ * + * Because the blkcg associated with a memcg may change as blkcg is enabled + * and disabled closer to root in the hierarchy, each wb keeps track of + * both the memcg and blkcg associated with it and verifies the blkcg on + * each lookup. On mismatch, the existing wb is discarded and a new one is + * created. + */ +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp) { - memset(wb, 0, sizeof(*wb)); + struct bdi_writeback *wb; + + might_sleep_if(gfpflags_allow_blocking(gfp)); + + if (!memcg_css->parent) + return &bdi->wb; + + do { + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, + &io_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || + !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + + return wb; +} - wb->bdi = bdi; - wb->last_old_flush = jiffies; - INIT_LIST_HEAD(&wb->b_dirty); - INIT_LIST_HEAD(&wb->b_io); - INIT_LIST_HEAD(&wb->b_more_io); - INIT_LIST_HEAD(&wb->b_dirty_time); - spin_lock_init(&wb->list_lock); - INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); +static int cgwb_bdi_init(struct backing_dev_info *bdi) +{ + int ret; + + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + bdi->cgwb_congested_tree = RB_ROOT; + atomic_set(&bdi->usage_cnt, 1); + + ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + if (!ret) { + bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.blkcg_css = blkcg_root_css; + } + return ret; } -/* - * Initial write bandwidth: 100 MB/s +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +{ + struct radix_tree_iter iter; + struct rb_node *rbn; + void **slot; + + WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + + spin_lock_irq(&cgwb_lock); + + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, 
&iter, 0) + cgwb_kill(*slot); + + while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { + struct bdi_writeback_congested *congested = + rb_entry(rbn, struct bdi_writeback_congested, rb_node); + + rb_erase(rbn, &bdi->cgwb_congested_tree); + congested->bdi = NULL; /* mark @congested unlinked */ + } + + spin_unlock_irq(&cgwb_lock); + + /* + * All cgwb's and their congested states must be shutdown and + * released before returning. Drain the usage counter to wait for + * all cgwb's and cgwb_congested's ever created on @bdi. + */ + atomic_dec(&bdi->usage_cnt); + wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); +} + +/** + * wb_memcg_offline - kill all wb's associated with a memcg being offlined + * @memcg: memcg being offlined + * + * Also prevents creation of any new wb's associated with @memcg. */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +void wb_memcg_offline(struct mem_cgroup *memcg) +{ + LIST_HEAD(to_destroy); + struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) + cgwb_kill(wb); + memcg_cgwb_list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +/** + * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined + * @blkcg: blkcg being offlined + * + * Also prevents creation of any new wb's associated with @blkcg. 
+ */ +void wb_blkcg_offline(struct blkcg *blkcg) +{ + LIST_HEAD(to_destroy); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + cgwb_kill(wb); + blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int cgwb_bdi_init(struct backing_dev_info *bdi) +{ + int err; + + bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL); + if (!bdi->wb_congested) + return -ENOMEM; + + err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + if (err) { + kfree(bdi->wb_congested); + return err; + } + return 0; +} + +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +#endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) { - int i, err; + int ret; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; - spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->work_list); + INIT_LIST_HEAD(&bdi->wb_list); + init_waitqueue_head(&bdi->wb_waitq); - bdi_wb_init(&bdi->wb, bdi); + ret = cgwb_bdi_init(bdi); - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); - if (err) - goto err; - } + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + + return ret; +} +EXPORT_SYMBOL(bdi_init); - bdi->dirty_exceeded = 0; +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) 
+{ + va_list args; + struct device *dev; + + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; - bdi->bw_time_stamp = jiffies; - bdi->written_stamp = 0; + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); - bdi->balanced_dirty_ratelimit = INIT_BW; - bdi->dirty_ratelimit = INIT_BW; - bdi->write_bandwidth = INIT_BW; - bdi->avg_write_bandwidth = INIT_BW; + bdi->dev = dev; - err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); - if (err) { -err: - while (i--) - percpu_counter_destroy(&bdi->bdi_stat[i]); - } + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); - return err; + trace_writeback_bdi_register(bdi); + return 0; } -EXPORT_SYMBOL(bdi_init); +EXPORT_SYMBOL(bdi_register); -void bdi_destroy(struct backing_dev_info *bdi) +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) { - int i; + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); - bdi_wb_shutdown(bdi); - bdi_set_min_ratio(bdi, 0); +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); - WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); + synchronize_rcu_expedited(); +} + +void bdi_unregister(struct backing_dev_info *bdi) +{ + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } +} - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) - percpu_counter_destroy(&bdi->bdi_stat[i]); - 
fprop_local_destroy_percpu(&bdi->completions); +void bdi_exit(struct backing_dev_info *bdi) +{ + WARN_ON_ONCE(bdi->dev); + wb_exit(&bdi->wb); +} + +void bdi_destroy(struct backing_dev_info *bdi) +{ + bdi_unregister(bdi); + bdi_exit(bdi); } EXPORT_SYMBOL(bdi_destroy); @@ -472,31 +893,31 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; -static atomic_t nr_bdi_congested[2]; +static atomic_t nr_wb_congested[2]; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum bdi_state bit; wait_queue_head_t *wqh = &congestion_wqh[sync]; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (test_and_clear_bit(bit, &bdi->state)) - atomic_dec(&nr_bdi_congested[sync]); + bit = sync ? WB_sync_congested : WB_async_congested; + if (test_and_clear_bit(bit, &congested->state)) + atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } -EXPORT_SYMBOL(clear_bdi_congested); +EXPORT_SYMBOL(clear_wb_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int sync) +void set_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum bdi_state bit; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (!test_and_set_bit(bit, &bdi->state)) - atomic_inc(&nr_bdi_congested[sync]); + bit = sync ? WB_sync_congested : WB_async_congested; + if (!test_and_set_bit(bit, &congested->state)) + atomic_inc(&nr_wb_congested[sync]); } -EXPORT_SYMBOL(set_bdi_congested); +EXPORT_SYMBOL(set_wb_congested); /** * congestion_wait - wait for a backing_dev to become uncongested @@ -536,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. 
* - * In the absence of zone congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the absence of zone congestion, a short sleep or a cond_resched is + * performed to yield the processor and to allow other subsystems to make + * a forward progress. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function @@ -555,9 +977,21 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * encountered in the current zone, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_bdi_congested[sync]) == 0 || + if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { - cond_resched(); + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that a particular + * WQ is congested if the worker thread is looping without + * ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). 
+ */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); diff --git a/kernel/mm/balloon_compaction.c b/kernel/mm/balloon_compaction.c index fcad8322e..300117f1a 100644 --- a/kernel/mm/balloon_compaction.c +++ b/kernel/mm/balloon_compaction.c @@ -61,6 +61,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) bool dequeued_page; dequeued_page = false; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { /* * Block others from accessing the 'page' while we get around @@ -75,15 +76,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) continue; } #endif - spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_delete(page); __count_vm_event(BALLOON_DEFLATE); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); dequeued_page = true; break; } } + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); if (!dequeued_page) { /* @@ -199,23 +199,17 @@ int balloon_page_migrate(struct page *newpage, struct balloon_dev_info *balloon = balloon_page_device(page); int rc = -EAGAIN; - /* - * Block others from accessing the 'newpage' when we get around to - * establishing additional references. We should be the only one - * holding a reference to the 'newpage' at this point. 
- */ - BUG_ON(!trylock_page(newpage)); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); if (WARN_ON(!__is_movable_balloon_page(page))) { dump_page(page, "not movable balloon page"); - unlock_page(newpage); return rc; } if (balloon && balloon->migratepage) rc = balloon->migratepage(balloon, newpage, page, mode); - unlock_page(newpage); return rc; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/kernel/mm/bootmem.c b/kernel/mm/bootmem.c index 477be6965..3b6380784 100644 --- a/kernel/mm/bootmem.c +++ b/kernel/mm/bootmem.c @@ -164,7 +164,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -172,7 +172,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) return 0; @@ -210,17 +210,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); count += BITS_PER_LONG; start += BITS_PER_LONG; } else { - unsigned long cur = start; + cur = start; start = ALIGN(start + 1, BITS_PER_LONG); while (vec && cur != start) { if (vec & 1) { page = pfn_to_page(cur); - __free_pages_bootmem(page, 0); + __free_pages_bootmem(page, cur, 0); count++; } vec >>= 1; @@ -229,12 +229,14 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) } } + cur = bdata->node_min_pfn; page = virt_to_page(bdata->node_bootmem_map); pages = bdata->node_low_pfn - 
bdata->node_min_pfn; pages = bootmem_bootmap_pages(pages); count += pages; while (pages--) - __free_pages_bootmem(page++, 0); + __free_pages_bootmem(page++, cur++, 0); + bdata->node_bootmem_map = NULL; bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); @@ -293,6 +295,9 @@ static void __init __free(bootmem_data_t *bdata, sidx + bdata->node_min_pfn, eidx + bdata->node_min_pfn); + if (WARN_ON(bdata->node_bootmem_map == NULL)) + return; + if (bdata->hint_idx > sidx) bdata->hint_idx = sidx; @@ -313,6 +318,9 @@ static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, eidx + bdata->node_min_pfn, flags); + if (WARN_ON(bdata->node_bootmem_map == NULL)) + return 0; + for (idx = sidx; idx < eidx; idx++) if (test_and_set_bit(idx, bdata->node_bootmem_map)) { if (exclusive) { diff --git a/kernel/mm/cma.c b/kernel/mm/cma.c index 3a7a67b93..ea506eb18 100644 --- a/kernel/mm/cma.c +++ b/kernel/mm/cma.c @@ -182,7 +182,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; - /* ensure minimal alignment requied by mm core */ + /* ensure minimal alignment required by mm core */ alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); /* alignment should be aligned with order_per_bit */ @@ -238,7 +238,7 @@ int __init cma_declare_contiguous(phys_addr_t base, /* * high_memory isn't direct mapped memory so retrieving its physical * address isn't appropriate. But it would be useful to check the - * physical address of the highmem boundary so it's justfiable to get + * physical address of the highmem boundary so it's justifiable to get * the physical address from it. On x86 there is a validation check for * this case, so the following workaround is needed to avoid it. 
*/ @@ -316,13 +316,15 @@ int __init cma_declare_contiguous(phys_addr_t base, */ if (base < highmem_start && limit > highmem_start) { addr = memblock_alloc_range(size, alignment, - highmem_start, limit); + highmem_start, limit, + MEMBLOCK_NONE); limit = highmem_start; } if (!addr) { addr = memblock_alloc_range(size, alignment, base, - limit); + limit, + MEMBLOCK_NONE); if (!addr) { ret = -ENOMEM; goto err; @@ -359,9 +361,11 @@ err: * This function allocates part of contiguous memory on specific * contiguous memory area. */ -struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) +struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align) { - unsigned long mask, offset, pfn, start = 0; + unsigned long mask, offset; + unsigned long pfn = -1; + unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; struct page *page = NULL; int ret; @@ -369,7 +373,7 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) if (!cma || !cma->count) return NULL; - pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma, count, align); if (!count) @@ -416,7 +420,7 @@ struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) start = bitmap_no + mask + 1; } - trace_cma_alloc(page ? 
pfn : -1UL, page, count, align); + trace_cma_alloc(pfn, page, count, align); pr_debug("%s(): returned %p\n", __func__, page); return page; diff --git a/kernel/mm/cma.h b/kernel/mm/cma.h index 1132d7335..17c75a424 100644 --- a/kernel/mm/cma.h +++ b/kernel/mm/cma.h @@ -16,7 +16,7 @@ struct cma { extern struct cma cma_areas[MAX_CMA_AREAS]; extern unsigned cma_area_count; -static unsigned long cma_bitmap_maxno(struct cma *cma) +static inline unsigned long cma_bitmap_maxno(struct cma *cma) { return cma->count >> cma->order_per_bit; } diff --git a/kernel/mm/cma_debug.c b/kernel/mm/cma_debug.c index 7621ee34d..f8e4b60db 100644 --- a/kernel/mm/cma_debug.c +++ b/kernel/mm/cma_debug.c @@ -39,7 +39,7 @@ static int cma_used_get(void *data, u64 *val) mutex_lock(&cma->lock); /* pages counter is smaller than sizeof(int) */ - used = bitmap_weight(cma->bitmap, (int)cma->count); + used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma)); mutex_unlock(&cma->lock); *val = (u64)used << cma->order_per_bit; @@ -52,13 +52,14 @@ static int cma_maxchunk_get(void *data, u64 *val) struct cma *cma = data; unsigned long maxchunk = 0; unsigned long start, end = 0; + unsigned long bitmap_maxno = cma_bitmap_maxno(cma); mutex_lock(&cma->lock); for (;;) { - start = find_next_zero_bit(cma->bitmap, cma->count, end); + start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); if (start >= cma->count) break; - end = find_next_bit(cma->bitmap, cma->count, start); + end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); } mutex_unlock(&cma->lock); @@ -170,10 +171,10 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) tmp = debugfs_create_dir(name, cma_debugfs_root); - debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, + debugfs_create_file("alloc", S_IWUSR, tmp, cma, &cma_alloc_fops); - debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma, + debugfs_create_file("free", S_IWUSR, tmp, cma, &cma_free_fops); debugfs_create_file("base_pfn", 
S_IRUGO, tmp, diff --git a/kernel/mm/compaction.c b/kernel/mm/compaction.c index 0af17fef6..ba0f146d8 100644 --- a/kernel/mm/compaction.c +++ b/kernel/mm/compaction.c @@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#ifdef CONFIG_TRACEPOINTS -static const char *const compaction_status_string[] = { - "deferred", - "skipped", - "continue", - "partial", - "complete", - "no_suitable_page", - "not_suitable_zone", -}; -#endif #define CREATE_TRACE_POINTS #include <trace/events/compaction.h> @@ -207,6 +196,13 @@ static inline bool isolation_suitable(struct compact_control *cc, return !get_pageblock_skip(page); } +static void reset_cached_positions(struct zone *zone) +{ + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; + zone->compact_cached_free_pfn = zone_end_pfn(zone); +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -218,9 +214,6 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn[0] = start_pfn; - zone->compact_cached_migrate_pfn[1] = start_pfn; - zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; /* Walk the zone and mark every pageblock as suitable for isolation */ @@ -238,6 +231,8 @@ static void __reset_isolation_suitable(struct zone *zone) clear_pageblock_skip(page); } + + reset_cached_positions(zone); } void reset_isolation_suitable(pg_data_t *pgdat) @@ -431,6 +426,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (!valid_page) valid_page = page; + + /* + * For compound pages such as THP and hugetlbfs, we can save + * potentially a lot of iterations if we skip them at once. 
+ * The check is racy, but we can consider only valid values + * and the only danger is skipping too much. + */ + if (PageCompound(page)) { + unsigned int comp_order = compound_order(page); + + if (likely(comp_order < MAX_ORDER)) { + blockpfn += (1UL << comp_order) - 1; + cursor += (1UL << comp_order) - 1; + } + + goto isolate_fail; + } + if (!PageBuddy(page)) goto isolate_fail; @@ -490,6 +503,13 @@ isolate_fail: } + /* + * There is a tiny chance that we have read bogus compound_order(), + * so be careful to not go outside of the pageblock. + */ + if (unlikely(blockpfn > end_pfn)) + blockpfn = end_pfn; + trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, nr_scanned, total_isolated); @@ -674,6 +694,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { + bool is_lru; + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort async compaction @@ -717,36 +739,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * It's possible to migrate LRU pages and balloon pages * Skip any other type of page */ - if (!PageLRU(page)) { + is_lru = PageLRU(page); + if (!is_lru) { if (unlikely(balloon_page_movable(page))) { if (balloon_page_isolate(page)) { /* Successfully isolated */ goto isolate_success; } } - continue; } /* - * PageLRU is set. lru_lock normally excludes isolation - * splitting and collapsing (collapsing has already happened - * if PageLRU is set) but the lock is not necessarily taken - * here and it is wasteful to take it just to check transhuge. - * Check TransHuge without lock and skip the whole pageblock if - * it's either a transhuge or hugetlbfs page, as calling - * compound_order() without preventing THP from splitting the - * page underneath us may return surprising results. 
+ * Regardless of being on LRU, compound pages such as THP and + * hugetlbfs are not to be compacted. We can potentially save + * a lot of iterations if we skip them at once. The check is + * racy, but we can consider only valid values and the only + * danger is skipping too much. */ - if (PageTransHuge(page)) { - if (!locked) - low_pfn = ALIGN(low_pfn + 1, - pageblock_nr_pages) - 1; - else - low_pfn += (1 << compound_order(page)) - 1; + if (PageCompound(page)) { + unsigned int comp_order = compound_order(page); + + if (likely(comp_order < MAX_ORDER)) + low_pfn += (1UL << comp_order) - 1; continue; } + if (!is_lru) + continue; + /* * Migration will fail if an anonymous page is pinned in memory, * so avoid taking lru_lock and isolating it unnecessarily in an @@ -763,11 +784,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!locked) break; - /* Recheck PageLRU and PageTransHuge under lock */ + /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) continue; - if (PageTransHuge(page)) { - low_pfn += (1 << compound_order(page)) - 1; + + /* + * Page become compound since the non-locked check, + * and it's on LRU. It can only be a THP so the order + * is safe to read and it's 0 for tail pages. + */ + if (unlikely(PageCompound(page))) { + low_pfn += (1UL << compound_order(page)) - 1; continue; } } @@ -778,7 +805,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (__isolate_lru_page(page, isolate_mode) != 0) continue; - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_PAGE(PageCompound(page), page); /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); @@ -898,6 +925,16 @@ static bool suitable_migration_target(struct page *page) } /* + * Test whether the free scanner has reached the same or lower pageblock than + * the migration scanner, and compaction should thus terminate. 
+ */ +static inline bool compact_scanners_met(struct compact_control *cc) +{ + return (cc->free_pfn >> pageblock_order) + <= (cc->migrate_pfn >> pageblock_order); +} + +/* * Based on information in the current compact_control, find blocks * suitable for isolating free pages from and then isolate them. */ @@ -933,8 +970,7 @@ static void isolate_freepages(struct compact_control *cc) * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; block_start_pfn >= low_pfn && - cc->nr_migratepages > cc->nr_freepages; + for (; block_start_pfn >= low_pfn; block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { @@ -966,6 +1002,8 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn, freelist, false); /* + * If we isolated enough freepages, or aborted due to async + * compaction being contended, terminate the loop. * Remember where the free scanner should restart next time, * which is where isolate_freepages_block() left off. * But if it scanned the whole pageblock, isolate_start_pfn @@ -974,27 +1012,31 @@ static void isolate_freepages(struct compact_control *cc) * In that case we will however want to restart at the start * of the previous pageblock. */ - cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? 
- isolate_start_pfn : - block_start_pfn - pageblock_nr_pages; - - /* - * isolate_freepages_block() might have aborted due to async - * compaction being contended - */ - if (cc->contended) + if ((cc->nr_freepages >= cc->nr_migratepages) + || cc->contended) { + if (isolate_start_pfn >= block_end_pfn) + isolate_start_pfn = + block_start_pfn - pageblock_nr_pages; break; + } else { + /* + * isolate_freepages_block() should not terminate + * prematurely unless contended, or isolated enough + */ + VM_BUG_ON(isolate_start_pfn < block_end_pfn); + } } /* split_free_page does not map the pages */ map_pages(freelist); /* - * If we crossed the migrate scanner, we want to keep it that way - * so that compact_finished() may detect this + * Record where the free scanner will restart next time. Either we + * broke from the loop and set isolate_start_pfn based on the last + * call to isolate_freepages_block(), or we met the migration scanner + * and the loop terminated due to isolate_start_pfn < low_pfn */ - if (block_start_pfn < low_pfn) - cc->free_pfn = cc->migrate_pfn; + cc->free_pfn = isolate_start_pfn; } /* @@ -1062,6 +1104,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | @@ -1110,6 +1153,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ + isolate_start_pfn = low_pfn; low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, isolate_mode); @@ -1119,6 +1163,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that could have been isolated and + * then freed by migration. 
+ */ + if (cc->nr_migratepages && !cc->last_migrated_pfn) + cc->last_migrated_pfn = isolate_start_pfn; + + /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should * continue or not. @@ -1127,16 +1180,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, } acct_isolated(zone, cc); - /* - * Record where migration scanner will be restarted. If we end up in - * the same pageblock as the free scanner, make the scanners fully - * meet so that compact_finished() terminates compaction. - */ - cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn; + /* Record where migration scanner will be restarted. */ + cc->migrate_pfn = low_pfn; return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; } +/* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ +static inline bool is_via_compact_memory(int order) +{ + return order == -1; +} + static int __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { @@ -1144,14 +1202,12 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, unsigned long watermark; if (cc->contended || fatal_signal_pending(current)) - return COMPACT_PARTIAL; + return COMPACT_CONTENDED; /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) { + if (compact_scanners_met(cc)) { /* Let the next compaction start anew. 
*/ - zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; - zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + reset_cached_positions(zone); /* * Mark that the PG_migrate_skip information should be cleared @@ -1165,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_COMPLETE; } - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; /* Compaction run is not finished if the watermark is not met */ @@ -1232,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, int fragindex; unsigned long watermark; - /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory - */ - if (order == -1) + if (is_via_compact_memory(order)) return COMPACT_CONTINUE; watermark = low_wmark_pages(zone); @@ -1295,7 +1343,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; - unsigned long last_migrated_pfn = 0; ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); @@ -1333,6 +1380,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync); @@ -1342,11 +1390,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) while ((ret = compact_finished(zone, cc, migratetype)) == COMPACT_CONTINUE) { int err; - unsigned long isolate_start_pfn = cc->migrate_pfn; switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: - ret = COMPACT_PARTIAL; + ret 
= COMPACT_CONTENDED; putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; @@ -1376,22 +1423,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it */ - if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) { - ret = COMPACT_PARTIAL; + if (err == -ENOMEM && !compact_scanners_met(cc)) { + ret = COMPACT_CONTENDED; goto out; } } - /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. We use the pfn that - * isolate_migratepages() started from in this loop iteration - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (!last_migrated_pfn) - last_migrated_pfn = isolate_start_pfn; - check_drain: /* * Has the migration scanner moved away from the previous @@ -1400,12 +1437,12 @@ check_drain: * compact_finished() can detect immediately if allocation * would succeed. */ - if (cc->order > 0 && last_migrated_pfn) { + if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = cc->migrate_pfn & ~((1UL << cc->order) - 1); - if (last_migrated_pfn < current_block_start) { + if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu_light(); local_lock_irq(swapvec_lock); lru_add_drain_cpu(cpu); @@ -1413,7 +1450,7 @@ check_drain: drain_local_pages(zone); put_cpu_light(); /* No more flushing until we migrate again */ - last_migrated_pfn = 0; + cc->last_migrated_pfn = 0; } } @@ -1442,6 +1479,9 @@ out: trace_mm_compaction_end(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn, sync, ret); + if (ret == COMPACT_CONTENDED) + ret = COMPACT_PARTIAL; + return ret; } @@ -1613,10 +1653,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) * this makes sure we compact the whole zone regardless of * cached scanner positions. 
*/ - if (cc->order == -1) + if (is_via_compact_memory(cc->order)) __reset_isolation_suitable(zone); - if (cc->order == -1 || !compaction_deferred(zone, cc->order)) + if (is_via_compact_memory(cc->order) || + !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); if (cc->order > 0) { diff --git a/kernel/mm/debug.c b/kernel/mm/debug.c index 3eb3ac2fc..668aa3519 100644 --- a/kernel/mm/debug.c +++ b/kernel/mm/debug.c @@ -7,7 +7,7 @@ #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/memcontrol.h> static const struct trace_print_flags pageflag_names[] = { @@ -25,12 +25,7 @@ static const struct trace_print_flags pageflag_names[] = { {1UL << PG_private, "private" }, {1UL << PG_private_2, "private_2" }, {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, @@ -48,6 +43,10 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) + {1UL << PG_young, "young" }, + {1UL << PG_idle, "idle" }, +#endif }; static void dump_flags(unsigned long flags, @@ -121,6 +120,7 @@ static const struct trace_print_flags vmaflags_names[] = { {VM_GROWSDOWN, "growsdown" }, {VM_PFNMAP, "pfnmap" }, {VM_DENYWRITE, "denywrite" }, + {VM_LOCKONFAULT, "lockonfault" }, {VM_LOCKED, "locked" }, {VM_IO, "io" }, {VM_SEQ_READ, "seqread" }, diff --git a/kernel/mm/dmapool.c b/kernel/mm/dmapool.c index fd5fe4342..57312b5d6 100644 --- a/kernel/mm/dmapool.c +++ b/kernel/mm/dmapool.c @@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return page; } -static inline int is_page_busy(struct dma_page 
*page) +static inline bool is_page_busy(struct dma_page *page) { return page->in_use != 0; } @@ -271,6 +271,9 @@ void dma_pool_destroy(struct dma_pool *pool) { bool empty = false; + if (unlikely(!pool)) + return; + mutex_lock(&pools_reg_lock); mutex_lock(&pools_lock); list_del(&pool->pools); @@ -323,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(mem_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(mem_flags)); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { @@ -334,7 +337,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ spin_unlock_irqrestore(&pool->lock, flags); - page = pool_alloc_page(pool, mem_flags); + page = pool_alloc_page(pool, mem_flags & (~__GFP_ZERO)); if (!page) return NULL; @@ -372,9 +375,14 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, break; } } - memset(retval, POOL_POISON_ALLOCATED, pool->size); + if (!(mem_flags & __GFP_ZERO)) + memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif spin_unlock_irqrestore(&pool->lock, flags); + + if (mem_flags & __GFP_ZERO) + memset(retval, 0, pool->size); + return retval; } EXPORT_SYMBOL(dma_pool_alloc); @@ -386,7 +394,7 @@ static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; - if (dma < (page->dma + pool->allocation)) + if ((dma - page->dma) < pool->allocation) return page; } return NULL; diff --git a/kernel/mm/early_ioremap.c b/kernel/mm/early_ioremap.c index e10ccd299..6d5717bd7 100644 --- a/kernel/mm/early_ioremap.c +++ b/kernel/mm/early_ioremap.c @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <asm/fixmap.h> +#include <asm/early_ioremap.h> #ifdef CONFIG_MMU static int early_ioremap_debug __initdata; @@ -125,7 +126,7 @@ 
__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot) /* * Mappings have to be page-aligned */ - offset = phys_addr & ~PAGE_MASK; + offset = offset_in_page(phys_addr); phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr + 1) - phys_addr; @@ -188,7 +189,7 @@ void __init early_iounmap(void __iomem *addr, unsigned long size) if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))) return; - offset = virt_addr & ~PAGE_MASK; + offset = offset_in_page(virt_addr); nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT; idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; @@ -217,6 +218,35 @@ early_memremap(resource_size_t phys_addr, unsigned long size) return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_NORMAL); } +#ifdef FIXMAP_PAGE_RO +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); +} +#endif + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) + +void __init copy_from_early_mem(void *dest, phys_addr_t src, unsigned long size) +{ + unsigned long slop, clen; + char *p; + + while (size) { + slop = offset_in_page(src); + clen = size; + if (clen > MAX_MAP_CHUNK - slop) + clen = MAX_MAP_CHUNK - slop; + p = early_memremap(src & PAGE_MASK, clen + slop); + memcpy(dest, p + slop, clen); + early_memunmap(p, clen + slop); + dest += clen; + src += clen; + size -= clen; + } +} + #else /* CONFIG_MMU */ void __init __iomem * @@ -231,6 +261,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size) { return (void *)phys_addr; } +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} void __init early_iounmap(void __iomem *addr, unsigned long size) { diff --git a/kernel/mm/fadvise.c b/kernel/mm/fadvise.c index 4a3907cf7..b8a5bc66b 100644 --- a/kernel/mm/fadvise.c +++ b/kernel/mm/fadvise.c @@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, 
advice) case POSIX_FADV_NOREUSE: break; case POSIX_FADV_DONTNEED: - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(mapping->host)) __filemap_fdatawrite_range(mapping, offset, endbyte, WB_SYNC_NONE); diff --git a/kernel/mm/failslab.c b/kernel/mm/failslab.c index fefaabaab..79171b4a5 100644 --- a/kernel/mm/failslab.c +++ b/kernel/mm/failslab.c @@ -3,12 +3,12 @@ static struct { struct fault_attr attr; - u32 ignore_gfp_wait; - int cache_filter; + bool ignore_gfp_reclaim; + bool cache_filter; } failslab = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = 1, - .cache_filter = 0, + .ignore_gfp_reclaim = true, + .cache_filter = false, }; bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) @@ -16,7 +16,7 @@ bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) + if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) @@ -42,7 +42,7 @@ static int __init failslab_debugfs_init(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait)) + &failslab.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) diff --git a/kernel/mm/filemap.c b/kernel/mm/filemap.c index 01cf28476..44301361c 100644 --- a/kernel/mm/filemap.c +++ b/kernel/mm/filemap.c @@ -100,6 +100,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) + * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -176,9 +177,11 @@ static void page_cache_tree_delete(struct address_space *mapping, /* 
* Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock and + * mem_cgroup_begin_page_stat(). */ -void __delete_from_page_cache(struct page *page, void *shadow) +void __delete_from_page_cache(struct page *page, void *shadow, + struct mem_cgroup *memcg) { struct address_space *mapping = page->mapping; @@ -198,7 +201,9 @@ void __delete_from_page_cache(struct page *page, void *shadow) page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - __dec_zone_page_state(page, NR_FILE_PAGES); + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(page)) + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); @@ -212,7 +217,8 @@ void __delete_from_page_cache(struct page *page, void *shadow) * anyway will be cleared before returning page into buddy allocator. 
*/ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping); + account_page_cleaned(page, mapping, memcg, + inode_to_wb(mapping->host)); } /** @@ -226,14 +232,20 @@ void __delete_from_page_cache(struct page *page, void *shadow) void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); freepage = mapping->a_ops->freepage; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -283,7 +295,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, if (!mapping_cap_writeback_dirty(mapping)) return 0; + wbc_attach_fdatawrite_inode(&wbc, mapping->host); ret = do_writepages(mapping, &wbc); + wbc_detach_inode(&wbc); return ret; } @@ -319,23 +333,14 @@ int filemap_flush(struct address_space *mapping) } EXPORT_SYMBOL(filemap_flush); -/** - * filemap_fdatawait_range - wait for writeback to complete - * @mapping: address space structure to wait for - * @start_byte: offset in bytes where the range starts - * @end_byte: offset in bytes where the range ends (inclusive) - * - * Walk the list of under-writeback pages of the given address space - * in the given range and wait for all of them. 
- */ -int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, - loff_t end_byte) +static int __filemap_fdatawait_range(struct address_space *mapping, + loff_t start_byte, loff_t end_byte) { pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; - int ret2, ret = 0; + int ret = 0; if (end_byte < start_byte) goto out; @@ -362,6 +367,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, cond_resched(); } out: + return ret; +} + +/** + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. Check error status of + * the address space and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. + */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) +{ + int ret, ret2; + + ret = __filemap_fdatawait_range(mapping, start_byte, end_byte); ret2 = filemap_check_errors(mapping); if (!ret) ret = ret2; @@ -371,11 +399,38 @@ out: EXPORT_SYMBOL(filemap_fdatawait_range); /** + * filemap_fdatawait_keep_errors - wait for writeback without clearing errors + * @mapping: address space structure to wait for + * + * Walk the list of under-writeback pages of the given address space + * and wait for all of them. Unlike filemap_fdatawait(), this function + * does not clear error status of the address space. + * + * Use this function if callers don't handle errors themselves. Expected + * call sites are system-wide / filesystem-wide data flushers: e.g. 
sync(2), + * fsfreeze(8) + */ +void filemap_fdatawait_keep_errors(struct address_space *mapping) +{ + loff_t i_size = i_size_read(mapping->host); + + if (i_size == 0) + return; + + __filemap_fdatawait_range(mapping, 0, i_size - 1); +} + +/** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for * * Walk the list of under-writeback pages of the given address space - * and wait for all of them. + * and wait for all of them. Check error status of the address space + * and return it. + * + * Since the error status of the address space is cleared by this function, + * callers are responsible for checking the return value and handling and/or + * reporting the error. */ int filemap_fdatawait(struct address_space *mapping) { @@ -472,6 +527,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); + struct mem_cgroup *memcg; + unsigned long flags; pgoff_t offset = old->index; freepage = mapping->a_ops->freepage; @@ -480,16 +537,23 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old, NULL); + memcg = mem_cgroup_begin_page_stat(old); + spin_lock_irqsave(&mapping->tree_lock, flags); + __delete_from_page_cache(old, NULL, memcg); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; - __inc_zone_page_state(new, NR_FILE_PAGES); + + /* + * hugetlb pages do not participate in page cache accounting. 
+ */ + if (!PageHuge(new)) + __inc_zone_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_migrate(old, new, true); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); + mem_cgroup_replace_page(old, new); radix_tree_preload_end(); if (freepage) freepage(old); @@ -580,7 +644,10 @@ static int __add_to_page_cache_locked(struct page *page, radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - __inc_zone_page_state(page, NR_FILE_PAGES); + + /* hugetlb pages do not participate in page cache accounting. */ + if (!huge) + __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) mem_cgroup_commit_charge(page, memcg, false); @@ -653,7 +720,7 @@ struct page *__page_cache_alloc(gfp_t gfp) do { cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); + page = __alloc_pages_node(n, gfp, 0); } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; @@ -1659,8 +1726,8 @@ no_cached_page: error = -ENOMEM; goto out; } - error = add_to_page_cache_lru(page, mapping, - index, GFP_KERNEL); + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1761,7 +1828,8 @@ static int page_cache_read(struct file *file, pgoff_t offset) if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + ret = add_to_page_cache_lru(page, mapping, offset, + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -1785,7 +1853,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct file *file, pgoff_t offset) { - unsigned long ra_pages; struct address_space *mapping = file->f_mapping; /* If we don't want any 
read-ahead, don't bother */ @@ -1814,10 +1881,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, /* * mmap read-around */ - ra_pages = max_sane_readahead(ra->ra_pages); - ra->start = max_t(long, 0, offset - ra_pages / 2); - ra->size = ra_pages; - ra->async_size = ra_pages / 4; + ra->start = max_t(long, 0, offset - ra->ra_pages / 2); + ra->size = ra->ra_pages; + ra->async_size = ra->ra_pages / 4; ra_submit(ra, mapping, file); } @@ -2466,6 +2532,11 @@ again: break; } + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); if (unlikely(status < 0)) @@ -2503,10 +2574,6 @@ again: written += copied; balance_dirty_pages_ratelimited(mapping); - if (fatal_signal_pending(current)) { - status = -EINTR; - break; - } } while (iov_iter_count(i)); return written ? written : status; @@ -2541,7 +2608,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -2651,7 +2718,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * page is known to the local caching routines. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). 
* */ int try_to_release_page(struct page *page, gfp_t gfp_mask) diff --git a/kernel/mm/frame_vector.c b/kernel/mm/frame_vector.c new file mode 100644 index 000000000..7cf2b7163 --- /dev/null +++ b/kernel/mm/frame_vector.c @@ -0,0 +1,230 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/sched.h> + +/** + * get_vaddr_frames() - map virtual addresses to pfns + * @start: starting user address + * @nr_frames: number of pages / pfns from start to map + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. See description of the same argument of + get_user_pages(). + * @vec: structure which receives pages / pfns of the addresses mapped. + * It should have space for at least nr_frames entries. + * + * This function maps virtual addresses from @start and fills @vec structure + * with page frame numbers or page pointers to corresponding pages (choice + * depends on the type of the vma underlying the virtual address). If @start + * belongs to a normal vma, the function grabs reference to each of the pages + * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't + * touch page structures and the caller must make sure pfns aren't reused for + * anything else while he is using them. + * + * The function returns number of pages mapped which may be less than + * @nr_frames. In particular we stop mapping if there are more vmas of + * different type underlying the specified range of virtual addresses. + * When the function isn't able to map a single page, it returns error. + * + * This function takes care of grabbing mmap_sem as necessary. 
+ */ +int get_vaddr_frames(unsigned long start, unsigned int nr_frames, + bool write, bool force, struct frame_vector *vec) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int ret = 0; + int err; + int locked; + + if (nr_frames == 0) + return 0; + + if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) + nr_frames = vec->nr_allocated; + + down_read(&mm->mmap_sem); + locked = 1; + vma = find_vma_intersection(mm, start, start + 1); + if (!vma) { + ret = -EFAULT; + goto out; + } + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { + vec->got_ref = true; + vec->is_pfns = false; + ret = get_user_pages_locked(current, mm, start, nr_frames, + write, force, (struct page **)(vec->ptrs), &locked); + goto out; + } + + vec->got_ref = false; + vec->is_pfns = true; + do { + unsigned long *nums = frame_vector_pfns(vec); + + while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) { + err = follow_pfn(vma, start, &nums[ret]); + if (err) { + if (ret == 0) + ret = err; + goto out; + } + start += PAGE_SIZE; + ret++; + } + /* + * We stop if we have enough pages or if VMA doesn't completely + * cover the tail page. + */ + if (ret >= nr_frames || start < vma->vm_end) + break; + vma = find_vma_intersection(mm, start, start + 1); + } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); +out: + if (locked) + up_read(&mm->mmap_sem); + if (!ret) + ret = -EFAULT; + if (ret > 0) + vec->nr_frames = ret; + return ret; +} +EXPORT_SYMBOL(get_vaddr_frames); + +/** + * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired + * them + * @vec: frame vector to put + * + * Drop references to pages if get_vaddr_frames() acquired them. We also + * invalidate the frame vector so that it is prepared for the next call into + * get_vaddr_frames(). 
+ */ +void put_vaddr_frames(struct frame_vector *vec) +{ + int i; + struct page **pages; + + if (!vec->got_ref) + goto out; + pages = frame_vector_pages(vec); + /* + * frame_vector_pages() might needed to do a conversion when + * get_vaddr_frames() got pages but vec was later converted to pfns. + * But it shouldn't really fail to convert pfns back... + */ + if (WARN_ON(IS_ERR(pages))) + goto out; + for (i = 0; i < vec->nr_frames; i++) + put_page(pages[i]); + vec->got_ref = false; +out: + vec->nr_frames = 0; +} +EXPORT_SYMBOL(put_vaddr_frames); + +/** + * frame_vector_to_pages - convert frame vector to contain page pointers + * @vec: frame vector to convert + * + * Convert @vec to contain array of page pointers. If the conversion is + * successful, return 0. Otherwise return an error. Note that we do not grab + * page references for the page structures. + */ +int frame_vector_to_pages(struct frame_vector *vec) +{ + int i; + unsigned long *nums; + struct page **pages; + + if (!vec->is_pfns) + return 0; + nums = frame_vector_pfns(vec); + for (i = 0; i < vec->nr_frames; i++) + if (!pfn_valid(nums[i])) + return -EINVAL; + pages = (struct page **)nums; + for (i = 0; i < vec->nr_frames; i++) + pages[i] = pfn_to_page(nums[i]); + vec->is_pfns = false; + return 0; +} +EXPORT_SYMBOL(frame_vector_to_pages); + +/** + * frame_vector_to_pfns - convert frame vector to contain pfns + * @vec: frame vector to convert + * + * Convert @vec to contain array of pfns. 
+ */ +void frame_vector_to_pfns(struct frame_vector *vec) +{ + int i; + unsigned long *nums; + struct page **pages; + + if (vec->is_pfns) + return; + pages = (struct page **)(vec->ptrs); + nums = (unsigned long *)pages; + for (i = 0; i < vec->nr_frames; i++) + nums[i] = page_to_pfn(pages[i]); + vec->is_pfns = true; +} +EXPORT_SYMBOL(frame_vector_to_pfns); + +/** + * frame_vector_create() - allocate & initialize structure for pinned pfns + * @nr_frames: number of pfns slots we should reserve + * + * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns + * pfns. + */ +struct frame_vector *frame_vector_create(unsigned int nr_frames) +{ + struct frame_vector *vec; + int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames; + + if (WARN_ON_ONCE(nr_frames == 0)) + return NULL; + /* + * This is absurdly high. It's here just to avoid strange effects when + * arithmetics overflows. + */ + if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2)) + return NULL; + /* + * Avoid higher order allocations, use vmalloc instead. It should + * be rare anyway. + */ + if (size <= PAGE_SIZE) + vec = kmalloc(size, GFP_KERNEL); + else + vec = vmalloc(size); + if (!vec) + return NULL; + vec->nr_allocated = nr_frames; + vec->nr_frames = 0; + return vec; +} +EXPORT_SYMBOL(frame_vector_create); + +/** + * frame_vector_destroy() - free memory allocated to carry frame vector + * @vec: Frame vector to free + * + * Free structure allocated by frame_vector_create() to carry frames. + */ +void frame_vector_destroy(struct frame_vector *vec) +{ + /* Make sure put_vaddr_frames() got called properly... 
*/ + VM_BUG_ON(vec->nr_frames > 0); + kvfree(vec); +} +EXPORT_SYMBOL(frame_vector_destroy); diff --git a/kernel/mm/frontswap.c b/kernel/mm/frontswap.c index 8d82809eb..27a9924ca 100644 --- a/kernel/mm/frontswap.c +++ b/kernel/mm/frontswap.c @@ -21,11 +21,16 @@ #include <linux/swapfile.h> /* - * frontswap_ops is set by frontswap_register_ops to contain the pointers - * to the frontswap "backend" implementation functions. + * frontswap_ops are added by frontswap_register_ops, and provide the + * frontswap "backend" implementation functions. Multiple implementations + * may be registered, but implementations can never deregister. This + * is a simple singly-linked list of all registered implementations. */ static struct frontswap_ops *frontswap_ops __read_mostly; +#define for_each_frontswap_ops(ops) \ + for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next) + /* * If enabled, frontswap_store will return failure even on success. As * a result, the swap subsystem will always write the page to swap, in @@ -79,15 +84,6 @@ static inline void inc_frontswap_invalidates(void) { } * on all frontswap functions to not call the backend until the backend * has registered. * - * Specifically when no backend is registered (nobody called - * frontswap_register_ops) all calls to frontswap_init (which is done via - * swapon -> enable_swap_info -> frontswap_init) are registered and remembered - * (via the setting of need_init bitmap) but fail to create tmem_pools. When a - * backend registers with frontswap at some later point the previous - * calls to frontswap_init are executed (by iterating over the need_init - * bitmap) to create tmem_pools and set the respective poolids. All of that is - * guarded by us using atomic bit operations on the 'need_init' bitmap. - * * This would not guards us against the user deciding to call swapoff right as * we are calling the backend to initialize (so swapon is in action). 
* Fortunatly for us, the swapon_mutex has been taked by the callee so we are @@ -106,37 +102,64 @@ static inline void inc_frontswap_invalidates(void) { } * * Obviously the opposite (unloading the backend) must be done after all * the frontswap_[store|load|invalidate_area|invalidate_page] start - * ignorning or failing the requests - at which point frontswap_ops - * would have to be made in some fashion atomic. + * ignoring or failing the requests. However, there is currently no way + * to unload a backend once it is registered. */ -static DECLARE_BITMAP(need_init, MAX_SWAPFILES); /* - * Register operations for frontswap, returning previous thus allowing - * detection of multiple backends and possible nesting. + * Register operations for frontswap */ -struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops) +void frontswap_register_ops(struct frontswap_ops *ops) { - struct frontswap_ops *old = frontswap_ops; - int i; - - for (i = 0; i < MAX_SWAPFILES; i++) { - if (test_and_clear_bit(i, need_init)) { - struct swap_info_struct *sis = swap_info[i]; - /* __frontswap_init _should_ have set it! */ - if (!sis->frontswap_map) - return ERR_PTR(-EINVAL); - ops->init(i); - } + DECLARE_BITMAP(a, MAX_SWAPFILES); + DECLARE_BITMAP(b, MAX_SWAPFILES); + struct swap_info_struct *si; + unsigned int i; + + bitmap_zero(a, MAX_SWAPFILES); + bitmap_zero(b, MAX_SWAPFILES); + + spin_lock(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + if (!WARN_ON(!si->frontswap_map)) + set_bit(si->type, a); } + spin_unlock(&swap_lock); + + /* the new ops needs to know the currently active swap devices */ + for_each_set_bit(i, a, MAX_SWAPFILES) + ops->init(i); + /* - * We MUST have frontswap_ops set _after_ the frontswap_init's - * have been called. Otherwise __frontswap_store might fail. Hence - * the barrier to make sure compiler does not re-order us. 
+ * Setting frontswap_ops must happen after the ops->init() calls + * above; cmpxchg implies smp_mb() which will ensure the init is + * complete at this point. */ - barrier(); - frontswap_ops = ops; - return old; + do { + ops->next = frontswap_ops; + } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); + + spin_lock(&swap_lock); + plist_for_each_entry(si, &swap_active_head, list) { + if (si->frontswap_map) + set_bit(si->type, b); + } + spin_unlock(&swap_lock); + + /* + * On the very unlikely chance that a swap device was added or + * removed between setting the "a" list bits and the ops init + * calls, we re-check and do init or invalidate for any changed + * bits. + */ + if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) { + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!test_bit(i, a) && test_bit(i, b)) + ops->init(i); + else if (test_bit(i, a) && !test_bit(i, b)) + ops->invalidate_area(i); + } + } } EXPORT_SYMBOL(frontswap_register_ops); @@ -164,6 +187,7 @@ EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); void __frontswap_init(unsigned type, unsigned long *map) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; BUG_ON(sis == NULL); @@ -179,28 +203,30 @@ void __frontswap_init(unsigned type, unsigned long *map) * p->frontswap set to something valid to work properly. 
*/ frontswap_map_set(sis, map); - if (frontswap_ops) - frontswap_ops->init(type); - else { - BUG_ON(type >= MAX_SWAPFILES); - set_bit(type, need_init); - } + + for_each_frontswap_ops(ops) + ops->init(type); } EXPORT_SYMBOL(__frontswap_init); bool __frontswap_test(struct swap_info_struct *sis, pgoff_t offset) { - bool ret = false; - - if (frontswap_ops && sis->frontswap_map) - ret = test_bit(offset, sis->frontswap_map); - return ret; + if (sis->frontswap_map) + return test_bit(offset, sis->frontswap_map); + return false; } EXPORT_SYMBOL(__frontswap_test); +static inline void __frontswap_set(struct swap_info_struct *sis, + pgoff_t offset) +{ + set_bit(offset, sis->frontswap_map); + atomic_inc(&sis->frontswap_pages); +} + static inline void __frontswap_clear(struct swap_info_struct *sis, - pgoff_t offset) + pgoff_t offset) { clear_bit(offset, sis->frontswap_map); atomic_dec(&sis->frontswap_pages); @@ -215,39 +241,46 @@ static inline void __frontswap_clear(struct swap_info_struct *sis, */ int __frontswap_store(struct page *page) { - int ret = -1, dup = 0; + int ret = -1; swp_entry_t entry = { .val = page_private(page), }; int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + struct frontswap_ops *ops; /* * Return if no backend registed. * Don't need to inc frontswap_failed_stores here. */ if (!frontswap_ops) - return ret; + return -1; BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - if (__frontswap_test(sis, offset)) - dup = 1; - ret = frontswap_ops->store(type, offset, page); + + /* + * If a dup, we must remove the old page first; we can't leave the + * old page no matter if the store of the new page succeeds or fails, + * and we can't rely on the new page replacing the old page as we may + * not store to the same implementation that contains the old page. 
+ */ + if (__frontswap_test(sis, offset)) { + __frontswap_clear(sis, offset); + for_each_frontswap_ops(ops) + ops->invalidate_page(type, offset); + } + + /* Try to store in each implementation, until one succeeds. */ + for_each_frontswap_ops(ops) { + ret = ops->store(type, offset, page); + if (!ret) /* successful store */ + break; + } if (ret == 0) { - set_bit(offset, sis->frontswap_map); + __frontswap_set(sis, offset); inc_frontswap_succ_stores(); - if (!dup) - atomic_inc(&sis->frontswap_pages); } else { - /* - failed dup always results in automatic invalidate of - the (older) page from frontswap - */ inc_frontswap_failed_stores(); - if (dup) { - __frontswap_clear(sis, offset); - frontswap_ops->invalidate_page(type, offset); - } } if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ @@ -268,14 +301,22 @@ int __frontswap_load(struct page *page) int type = swp_type(entry); struct swap_info_struct *sis = swap_info[type]; pgoff_t offset = swp_offset(entry); + struct frontswap_ops *ops; + + if (!frontswap_ops) + return -1; BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); - /* - * __frontswap_test() will check whether there is backend registered - */ - if (__frontswap_test(sis, offset)) - ret = frontswap_ops->load(type, offset, page); + if (!__frontswap_test(sis, offset)) + return -1; + + /* Try loading from each implementation, until one succeeds. 
*/ + for_each_frontswap_ops(ops) { + ret = ops->load(type, offset, page); + if (!ret) /* successful load */ + break; + } if (ret == 0) { inc_frontswap_loads(); if (frontswap_tmem_exclusive_gets_enabled) { @@ -294,16 +335,19 @@ EXPORT_SYMBOL(__frontswap_load); void __frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; + + if (!frontswap_ops) + return; BUG_ON(sis == NULL); - /* - * __frontswap_test() will check whether there is backend registered - */ - if (__frontswap_test(sis, offset)) { - frontswap_ops->invalidate_page(type, offset); - __frontswap_clear(sis, offset); - inc_frontswap_invalidates(); - } + if (!__frontswap_test(sis, offset)) + return; + + for_each_frontswap_ops(ops) + ops->invalidate_page(type, offset); + __frontswap_clear(sis, offset); + inc_frontswap_invalidates(); } EXPORT_SYMBOL(__frontswap_invalidate_page); @@ -314,16 +358,19 @@ EXPORT_SYMBOL(__frontswap_invalidate_page); void __frontswap_invalidate_area(unsigned type) { struct swap_info_struct *sis = swap_info[type]; + struct frontswap_ops *ops; - if (frontswap_ops) { - BUG_ON(sis == NULL); - if (sis->frontswap_map == NULL) - return; - frontswap_ops->invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - bitmap_zero(sis->frontswap_map, sis->max); - } - clear_bit(type, need_init); + if (!frontswap_ops) + return; + + BUG_ON(sis == NULL); + if (sis->frontswap_map == NULL) + return; + + for_each_frontswap_ops(ops) + ops->invalidate_area(type); + atomic_set(&sis->frontswap_pages, 0); + bitmap_zero(sis->frontswap_map, sis->max); } EXPORT_SYMBOL(__frontswap_invalidate_area); diff --git a/kernel/mm/gup.c b/kernel/mm/gup.c index 6297f6bcc..deafa2c91 100644 --- a/kernel/mm/gup.c +++ b/kernel/mm/gup.c @@ -12,7 +12,9 @@ #include <linux/sched.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> + #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include "internal.h" @@ -32,6 +34,30 @@ static struct page 
*no_page_table(struct vm_area_struct *vma, return NULL; } +static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, unsigned int flags) +{ + /* No page to get reference */ + if (flags & FOLL_GET) + return -EFAULT; + + if (flags & FOLL_TOUCH) { + pte_t entry = *pte; + + if (flags & FOLL_WRITE) + entry = pte_mkdirty(entry); + entry = pte_mkyoung(entry); + + if (!pte_same(*pte, entry)) { + set_pte_at(vma->vm_mm, address, pte, entry); + update_mmu_cache(vma, address, pte); + } + } + + /* Proper page table entry exists, but no corresponding struct page */ + return -EEXIST; +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -73,10 +99,21 @@ retry: page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); + if (flags & FOLL_DUMP) { + /* Avoid special (like zero) pages in core dumps */ + page = ERR_PTR(-EFAULT); + goto out; + } + + if (is_zero_pfn(pte_pfn(pte))) { + page = pte_page(pte); + } else { + int ret; + + ret = follow_pfn_pte(vma, address, ptep, flags); + page = ERR_PTR(ret); + goto out; + } } if (flags & FOLL_GET) @@ -92,7 +129,7 @@ retry: */ mark_page_accessed(page); } - if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -114,12 +151,9 @@ retry: unlock_page(page); } } +out: pte_unmap_unlock(ptep, ptl); return page; -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) @@ -265,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned int fault_flags = 0; int ret; + /* mlock all present pages, but do not fault in new pages */ + if ((*flags & (FOLL_POPULATE 
| FOLL_MLOCK)) == FOLL_MLOCK) + return -ENOENT; /* For mm_populate(), just skip the stack guard page. */ if ((*flags & FOLL_POPULATE) && (stack_guard_page_start(vma, address) || @@ -489,9 +526,15 @@ retry: goto next_page; } BUG(); - } - if (IS_ERR(page)) + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. + */ + goto next_page; + } else if (IS_ERR(page)) { return i ? i : PTR_ERR(page); + } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); @@ -850,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma, VM_BUG_ON_VMA(end > vma->vm_end, vma); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); - gup_flags = FOLL_TOUCH | FOLL_POPULATE; + gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; + if (vma->vm_flags & VM_LOCKONFAULT) + gup_flags &= ~FOLL_POPULATE; + /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW diff --git a/kernel/mm/huge_memory.c b/kernel/mm/huge_memory.c index 078832cf3..62fe06bb7 100644 --- a/kernel/mm/huge_memory.c +++ b/kernel/mm/huge_memory.c @@ -16,6 +16,7 @@ #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> +#include <linux/dax.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> @@ -23,6 +24,8 @@ #include <linux/pagemap.h> #include <linux/migrate.h> #include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> +#include <linux/page_idle.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -104,7 +107,7 @@ static struct khugepaged_scan khugepaged_scan = { }; -static int set_recommended_min_free_kbytes(void) +static void set_recommended_min_free_kbytes(void) { struct zone *zone; int nr_zones = 0; @@ -113,7 +116,7 @@ static int set_recommended_min_free_kbytes(void) for_each_populated_zone(zone) nr_zones++; - /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + /* Ensure 2 pageblocks are free 
to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* @@ -139,7 +142,6 @@ static int set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } setup_per_zone_wmarks(); - return 0; } static int start_stop_khugepaged(void) @@ -149,7 +151,7 @@ static int start_stop_khugepaged(void) if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); - if (unlikely(IS_ERR(khugepaged_thread))) { + if (IS_ERR(khugepaged_thread)) { pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; @@ -171,12 +173,7 @@ fail: static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -static inline bool is_huge_zero_pmd(pmd_t pmd) -{ - return is_huge_zero_page(pmd_page(pmd)); -} - -static struct page *get_huge_zero_page(void) +struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -716,21 +713,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - struct page *page, gfp_t gfp) + unsigned long address, pmd_t *pmd, + struct page *page, gfp_t gfp, + unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; + unsigned long haddr = address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) - return VM_FAULT_OOM; + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg); + put_page(page); return VM_FAULT_OOM; } @@ -750,6 +753,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; + + /* Deliver the page fault to userland */ + if 
(userfaultfd_missing(vma)) { + int ret; + + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); @@ -760,6 +778,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&mm->nr_ptes); spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); } return 0; @@ -767,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; } /* Caller must hold page table lock. */ @@ -806,6 +825,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pgtable_t pgtable; struct page *zero_page; bool set; + int ret; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -816,14 +836,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(ptl); + ret = 0; + set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, mm, vma, + haddr, pmd, + zero_page); + spin_unlock(ptl); + set = true; + } + } else + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); } - return 0; + return ret; } gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); page = 
alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -831,14 +865,51 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; + return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, + flags); +} + +static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t entry; + spinlock_t *ptl; + + ptl = pmd_lock(mm, pmd); + if (pmd_none(*pmd)) { + entry = pmd_mkhuge(pfn_pmd(pfn, prot)); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); } + spin_unlock(ptl); +} - count_vm_event(THP_FAULT_ALLOC); - return 0; +int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned long pfn, bool write) +{ + pgprot_t pgprot = vma->vm_page_prot; + /* + * If we had pmd_special, we could avoid all these restrictions, + * but we need to be consistent with PTEs and architectures that + * can't support a 'special' bit. 
+ */ + BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); + BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == + (VM_PFNMAP|VM_MIXEDMAP)); + BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); + BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + if (track_pfn_insert(vma, &pgprot, pfn)) + return VM_FAULT_SIGBUS; + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); + return VM_FAULT_NOPAGE; } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -873,16 +944,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { struct page *zero_page; - bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ zero_page = get_huge_zero_page(); - set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); - BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } @@ -1031,7 +1100,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -1174,7 +1243,7 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); mem_cgroup_commit_charge(new_page, memcg, false); lru_cache_add_active_or_unevictable(new_page, vma); @@ -1238,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, pmd, _pmd, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - if ((flags & 
FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) @@ -1387,41 +1456,41 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + pmd_t orig_pmd; spinlock_t *ptl; - int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - struct page *page; - pgtable_t pgtable; - pmd_t orig_pmd; - /* - * For architectures like ppc64 we look at deposited pgtable - * when calling pmdp_get_and_clear. So do the - * pgtable_trans_huge_withdraw after finishing pmdp related - * operations. - */ - orig_pmd = pmdp_get_and_clear_full(tlb->mm, addr, pmd, - tlb->fullmm); - tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); - if (is_huge_zero_pmd(orig_pmd)) { - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); + if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) + return 0; + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_huge_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. 
+ */ + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + if (vma_is_dax(vma)) { + spin_unlock(ptl); + if (is_huge_zero_pmd(orig_pmd)) put_huge_zero_page(); - } else { - page = pmd_page(orig_pmd); - page_remove_rmap(page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); - VM_BUG_ON_PAGE(!PageHead(page), page); - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); - tlb_remove_page(tlb, page); - } - pte_free(tlb->mm, pgtable); - ret = 1; + } else if (is_huge_zero_pmd(orig_pmd)) { + pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + put_huge_zero_page(); + } else { + struct page *page = pmd_page(orig_pmd); + page_remove_rmap(page); + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON_PAGE(!PageHead(page), page); + pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); + atomic_long_dec(&tlb->mm->nr_ptes); + spin_unlock(ptl); + tlb_remove_page(tlb, page); } - return ret; + return 1; } int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, @@ -1459,7 +1528,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl)) { @@ -1505,7 +1574,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } if (!prot_numa || !pmd_protnone(*pmd)) { - entry = pmdp_get_and_clear_notify(mm, addr, pmd); + entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mkwrite(entry); @@ -1676,12 +1745,7 @@ static void 
__split_huge_page_refcount(struct page *page, /* after clearing PageTail the gup refcount can be released */ smp_mb__after_atomic(); - /* - * retain hwpoison flag of the poisoned tail page: - * fix for the unsuitable process killed on Guest Machine(KVM) - * by the memory-failure. - */ - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | @@ -1691,8 +1755,12 @@ static void __split_huge_page_refcount(struct page *page, (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); - /* clear PageTail before overwriting first_page */ - smp_wmb(); + clear_compound_head(page_tail); + + if (page_is_young(page)) + set_page_young(page_tail); + if (page_is_idle(page)) + set_page_idle(page_tail); /* * __split_huge_page_splitting() already set the @@ -1811,7 +1879,7 @@ static int __split_huge_page_map(struct page *page, * here). But it is generally safer to never allow * small and huge TLB entries for the same virtual * address to be loaded simultaneously. So instead of - * doing "pmd_populate(); flush_tlb_range();" we first + * doing "pmd_populate(); flush_pmd_tlb_range();" we first * mark the current pmd notpresent (atomically because * here the pmd_trans_huge and pmd_trans_splitting * must remain set at all times on the pmd until the @@ -1941,7 +2009,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) + if (*vm_flags & VM_NO_THP) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; @@ -1957,7 +2025,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! 
*/ - if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) + if (*vm_flags & VM_NO_THP) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; @@ -2137,8 +2205,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (pte_none(pteval) || (pte_present(pteval) && + is_zero_pfn(pte_pfn(pteval)))) { + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2198,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } @@ -2262,8 +2333,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void khugepaged_alloc_sleep(void) { - wait_event_freezable_timeout(khugepaged_wait, false, - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + DEFINE_WAIT(wait); + + add_wait_queue(&khugepaged_wait, &wait); + freezable_schedule_timeout_interruptible( + msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); } static int khugepaged_node_load[MAX_NUMNODES]; @@ -2337,8 +2412,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { VM_BUG_ON_PAGE(*hpage, *hpage); @@ -2350,7 +2424,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, */ up_read(&mm->mmap_sem); - *hpage = 
alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); + *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); @@ -2405,8 +2479,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); @@ -2454,7 +2527,7 @@ static void collapse_huge_page(struct mm_struct *mm, __GFP_THISNODE; /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); + new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); if (!new_page) return; @@ -2499,7 +2572,7 @@ static void collapse_huge_page(struct mm_struct *mm, * huge and small TLB entries for the same virtual address * to avoid the risk of CPU bugs in that area. 
*/ - _pmd = pmdp_clear_flush(vma, address, pmd); + _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -2591,7 +2664,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; @@ -2624,7 +2698,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, */ if (page_count(page) != 1 + !!PageSwapCache(page)) goto out_unmap; - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } @@ -2799,7 +2874,7 @@ static void khugepaged_do_scan(void) cond_resched(); - if (unlikely(kthread_should_stop() || freezing(current))) + if (unlikely(kthread_should_stop() || try_to_freeze())) break; spin_lock(&khugepaged_mm_lock); @@ -2820,8 +2895,6 @@ static void khugepaged_do_scan(void) static void khugepaged_wait_work(void) { - try_to_freeze(); - if (khugepaged_has_work()) { if (!khugepaged_scan_sleep_millisecs) return; @@ -2865,7 +2938,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_clear_flush_notify(vma, haddr, pmd); + pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); @@ -2889,7 +2962,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { spinlock_t *ptl; - struct page *page; + struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2902,25 +2975,27 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, 
again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - return; - } - if (is_huge_zero_pmd(*pmd)) { + if (unlikely(!pmd_trans_huge(*pmd))) + goto unlock; + if (vma_is_dax(vma)) { + pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + if (is_huge_zero_pmd(_pmd)) + put_huge_zero_page(); + } else if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - return; + } else { + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + get_page(page); } - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!page_count(page), page); - get_page(page); + unlock: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - split_huge_page(page); + if (!page) + return; + split_huge_page(page); put_page(page); /* @@ -2969,7 +3044,7 @@ static void split_huge_page_address(struct mm_struct *mm, split_huge_page_pmd_mm(mm, address, pmd); } -void __vma_adjust_trans_huge(struct vm_area_struct *vma, +void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) diff --git a/kernel/mm/hugetlb.c b/kernel/mm/hugetlb.c index 8c4c1f9f9..ef6963b57 100644 --- a/kernel/mm/hugetlb.c +++ b/kernel/mm/hugetlb.c @@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock); * prevent spurious OOMs when the hugepage pool is fully utilized. 
*/ static int num_fault_mutexes; -static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; +struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -217,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are embedded into a resv_map and - * protected by a resv_map's lock + * The region data structures are embedded into a resv_map and protected + * by a resv_map's lock. The set of regions within the resv_map represent + * reservations for huge pages, or huge pages that have already been + * instantiated within the map. The from and to elements are huge page + * indicies into the associated mapping. from indicates the starting index + * of the region. to represents the first index past the end of the region. + * + * For example, a file region structure with from == 0 and to == 4 represents + * four huge pages in a mapping. It is important to note that the to element + * represents the first element past the end of the region. This is used in + * arithmetic as 4(to) - 0(from) = 4 huge pages in the region. + * + * Interval notation of the form [from, to) will be used to indicate that + * the endpoint from is inclusive and to is exclusive. */ struct file_region { struct list_head link; @@ -226,10 +238,25 @@ struct file_region { long to; }; +/* + * Add the huge page range represented by [f, t) to the reserve + * map. In the normal case, existing regions will be expanded + * to accommodate the specified range. Sufficient regions should + * exist for expansion due to the previous call to region_chg + * with the same range. 
However, it is possible that region_del + * could have been called after region_chg and modifed the map + * in such a way that no region exists to be expanded. In this + * case, pull a region descriptor from the cache associated with + * the map and use that for the new range. + * + * Return the number of new huge pages added to the map. This + * number is greater than or equal to zero. + */ static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + long add = 0; spin_lock(&resv->lock); /* Locate the region we are either in or before. */ @@ -237,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t) if (f <= rg->to) break; + /* + * If no region exists which can be expanded to include the + * specified range, the list must have been modified by an + * interleving call to region_del(). Pull a region descriptor + * from the cache and use it for this range. + */ + if (&rg->link == head || t < rg->from) { + VM_BUG_ON(resv->region_cache_count <= 0); + + resv->region_cache_count--; + nrg = list_first_entry(&resv->region_cache, struct file_region, + link); + list_del(&nrg->link); + + nrg->from = f; + nrg->to = t; + list_add(&nrg->link, rg->link.prev); + + add += t - f; + goto out_locked; + } + /* Round our left edge to the current segment if it encloses us. */ if (f > rg->from) f = rg->from; @@ -255,16 +304,50 @@ static long region_add(struct resv_map *resv, long f, long t) if (rg->to > t) t = rg->to; if (rg != nrg) { + /* Decrement return value by the deleted range. 
+ * Another range will span this area so that by + * end of routine add will be >= zero + */ + add -= (rg->to - rg->from); list_del(&rg->link); kfree(rg); } } + + add += (nrg->from - f); /* Added to beginning of region */ nrg->from = f; + add += t - nrg->to; /* Added to end of region */ nrg->to = t; + +out_locked: + resv->adds_in_progress--; spin_unlock(&resv->lock); - return 0; + VM_BUG_ON(add < 0); + return add; } +/* + * Examine the existing reserve map and determine how many + * huge pages in the specified range [f, t) are NOT currently + * represented. This routine is called before a subsequent + * call to region_add that will actually modify the reserve + * map to add the specified range [f, t). region_chg does + * not change the number of huge pages represented by the + * map. However, if the existing regions in the map can not + * be expanded to represent the new range, a new file_region + * structure is added to the map as a placeholder. This is + * so that the subsequent region_add call will have all the + * regions it needs and will not fail. + * + * Upon entry, region_chg will also examine the cache of region descriptors + * associated with the map. If there are not enough descriptors cached, one + * will be allocated for the in progress add operation. + * + * Returns the number of huge pages that need to be added to the existing + * reservation map for the range [f, t). This number is greater or equal to + * zero. -ENOMEM is returned if a new file_region structure or cache entry + * is needed and can not be allocated. + */ static long region_chg(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; @@ -273,6 +356,33 @@ static long region_chg(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); +retry_locked: + resv->adds_in_progress++; + + /* + * Check for sufficient descriptors in the cache to accommodate + * the number of in progress add operations. 
+ */ + if (resv->adds_in_progress > resv->region_cache_count) { + struct file_region *trg; + + VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1); + /* Must drop lock to allocate a new descriptor. */ + resv->adds_in_progress--; + spin_unlock(&resv->lock); + + trg = kmalloc(sizeof(*trg), GFP_KERNEL); + if (!trg) { + kfree(nrg); + return -ENOMEM; + } + + spin_lock(&resv->lock); + list_add(&trg->link, &resv->region_cache); + resv->region_cache_count++; + goto retry_locked; + } + /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -283,6 +393,7 @@ retry: * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { if (!nrg) { + resv->adds_in_progress--; spin_unlock(&resv->lock); nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); if (!nrg) @@ -331,41 +442,146 @@ out_nrg: return chg; } -static long region_truncate(struct resv_map *resv, long end) +/* + * Abort the in progress add operation. The adds_in_progress field + * of the resv_map keeps track of the operations in progress between + * calls to region_chg and region_add. Operations are sometimes + * aborted after the call to region_chg. In such cases, region_abort + * is called to decrement the adds_in_progress counter. + * + * NOTE: The range arguments [f, t) are not needed or used in this + * routine. They are kept to make reading the calling code easier as + * arguments will match the associated region_chg call. + */ +static void region_abort(struct resv_map *resv, long f, long t) +{ + spin_lock(&resv->lock); + VM_BUG_ON(!resv->region_cache_count); + resv->adds_in_progress--; + spin_unlock(&resv->lock); +} + +/* + * Delete the specified range [f, t) from the reserve map. If the + * t parameter is LONG_MAX, this indicates that ALL regions after f + * should be deleted. Locate the regions which intersect [f, t) + * and either trim, delete or split the existing regions. 
+ * + * Returns the number of huge pages deleted from the reserve map. + * In the normal case, the return value is zero or more. In the + * case where a region must be split, a new region descriptor must + * be allocated. If the allocation fails, -ENOMEM will be returned. + * NOTE: If the parameter t == LONG_MAX, then we will never split + * a region and possibly return -ENOMEM. Callers specifying + * t == LONG_MAX do not need to check for -ENOMEM error. + */ +static long region_del(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; struct file_region *rg, *trg; - long chg = 0; + struct file_region *nrg = NULL; + long del = 0; +retry: spin_lock(&resv->lock); - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (end <= rg->to) + list_for_each_entry_safe(rg, trg, head, link) { + /* + * Skip regions before the range to be deleted. file_region + * ranges are normally of the form [from, to). However, there + * may be a "placeholder" entry in the map which is of the form + * (from, to) with from == to. Check for placeholder entries + * at the beginning of the range to be deleted. + */ + if (rg->to <= f && (rg->to != rg->from || rg->to != f)) + continue; + + if (rg->from >= t) break; - if (&rg->link == head) - goto out; - /* If we are in the middle of a region then adjust it. */ - if (end > rg->from) { - chg = rg->to - end; - rg->to = end; - rg = list_entry(rg->link.next, typeof(*rg), link); - } + if (f > rg->from && t < rg->to) { /* Must split region */ + /* + * Check for an entry in the cache before dropping + * lock and attempting allocation. + */ + if (!nrg && + resv->region_cache_count > resv->adds_in_progress) { + nrg = list_first_entry(&resv->region_cache, + struct file_region, + link); + list_del(&nrg->link); + resv->region_cache_count--; + } - /* Drop any remaining regions. 
*/ - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + goto retry; + } + + del += t - f; + + /* New entry for end of split region */ + nrg->from = t; + nrg->to = rg->to; + INIT_LIST_HEAD(&nrg->link); + + /* Original entry is trimmed */ + rg->to = f; + + list_add(&nrg->link, &rg->link); + nrg = NULL; break; - chg += rg->to - rg->from; - list_del(&rg->link); - kfree(rg); + } + + if (f <= rg->from && t >= rg->to) { /* Remove entire region */ + del += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + continue; + } + + if (f <= rg->from) { /* Trim beginning of region */ + del += t - rg->from; + rg->from = t; + } else { /* Trim end of region */ + del += rg->to - f; + rg->to = f; + } } -out: spin_unlock(&resv->lock); - return chg; + kfree(nrg); + return del; } +/* + * A rare out of memory error was encountered which prevented removal of + * the reserve map region for a page. The huge page itself was free'ed + * and removed from the page cache. This routine will adjust the subpool + * usage count, and the global reserve count if needed. By incrementing + * these counts, the reserve map entry which could not be deleted will + * appear as a "reserved" entry instead of simply dangling with incorrect + * counts. + */ +void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve) +{ + struct hugepage_subpool *spool = subpool_inode(inode); + long rsv_adjust; + + rsv_adjust = hugepage_subpool_get_pages(spool, 1); + if (restore_reserve && rsv_adjust) { + struct hstate *h = hstate_inode(inode); + + hugetlb_acct_memory(h, 1); + } +} + +/* + * Count and return the number of huge pages in the reserve map + * that intersect with the range [f, t). 
+ */ static long region_count(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; @@ -482,22 +698,44 @@ static void set_vma_private_data(struct vm_area_struct *vma, struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); - if (!resv_map) + struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL); + + if (!resv_map || !rg) { + kfree(resv_map); + kfree(rg); return NULL; + } kref_init(&resv_map->refs); spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); + resv_map->adds_in_progress = 0; + + INIT_LIST_HEAD(&resv_map->region_cache); + list_add(&rg->link, &resv_map->region_cache); + resv_map->region_cache_count = 1; + return resv_map; } void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + struct list_head *head = &resv_map->region_cache; + struct file_region *rg, *trg; /* Clear out any active regions before we release the map. */ - region_truncate(resv_map, 0); + region_del(resv_map, 0, LONG_MAX); + + /* ... and any entries left in the cache */ + list_for_each_entry_safe(rg, trg, head, link) { + list_del(&rg->link); + kfree(rg); + } + + VM_BUG_ON(resv_map->adds_in_progress); + kfree(resv_map); } @@ -554,7 +792,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma, long chg) +static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { /* @@ -567,23 +805,34 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) * properly, so add work-around here. */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return 1; + return true; else - return 0; + return false; } /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) - return 1; + if (vma->vm_flags & VM_MAYSHARE) { + /* + * We know VM_NORESERVE is not set. 
Therefore, there SHOULD + * be a region map for all pages. The only situation where + * there is no region map is if a hole was punched via + * fallocate. In this case, there really are no reverves to + * use. This situation is indicated if chg != 0. + */ + if (chg) + return false; + else + return true; + } /* * Only the process that called mmap() has reserves for * private mappings. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; + return true; - return 0; + return false; } static void enqueue_huge_page(struct hstate *h, struct page *page) @@ -755,23 +1004,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #if defined(CONFIG_CMA) && defined(CONFIG_X86_64) static void destroy_compound_gigantic_page(struct page *page, - unsigned long order) + unsigned int order) { int i; int nr_pages = 1 << order; struct page *p = page + 1; for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __ClearPageTail(p); + clear_compound_head(p); set_page_refcounted(p); - p->first_page = NULL; } set_compound_order(page, 0); __ClearPageHead(page); } -static void free_gigantic_page(struct page *page, unsigned order) +static void free_gigantic_page(struct page *page, unsigned int order) { free_contig_range(page_to_pfn(page), 1 << order); } @@ -815,7 +1063,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned order) +static struct page *alloc_gigantic_page(int nid, unsigned int order) { unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; @@ -851,7 +1099,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order) } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned long order); +static void prep_compound_gigantic_page(struct page *page, unsigned int order); static struct page 
*alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { @@ -884,9 +1132,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h, static inline bool gigantic_page_supported(void) { return true; } #else static inline bool gigantic_page_supported(void) { return false; } -static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, - unsigned long order) { } + unsigned int order) { } static inline int alloc_fresh_gigantic_page(struct hstate *h, nodemask_t *nodes_allowed) { return 0; } #endif @@ -907,13 +1155,12 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - set_compound_page_dtor(page, NULL); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); } else { - arch_release_hugepage(page); __free_pages(page, huge_page_order(h)); } } @@ -1004,7 +1251,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; @@ -1013,7 +1260,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; @@ -1038,10 +1285,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ 
__ClearPageReserved(p); set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } @@ -1056,7 +1300,7 @@ int PageHuge(struct page *page) return 0; page = compound_head(page); - return get_compound_page_dtor(page) == free_huge_page; + return page[1].compound_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -1093,15 +1337,11 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - page = alloc_pages_exact_node(nid, + page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { - if (arch_prepare_hugepage(page)) { - __free_pages(page, huge_page_order(h)); - return NULL; - } prep_new_huge_page(h, page, nid); } @@ -1203,7 +1443,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) dissolve_free_huge_page(pfn_to_page(pfn)); } -static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) +/* + * There are 3 ways this can get called: + * 1. With vma+addr: we use the VMA's memory policy + * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge + * page from any node, and let the buddy allocator itself figure + * it out. + * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page + * strictly from 'nid' + */ +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) +{ + int order = huge_page_order(h); + gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; + unsigned int cpuset_mems_cookie; + + /* + * We need a VMA to get a memory policy. If we do not + * have one, we use the 'nid' argument. + * + * The mempolicy stuff below has some non-inlined bits + * and calls ->vm_ops. That makes it hard to optimize at + * compile-time, even when NUMA is off and it does + * nothing. 
This helps the compiler optimize it out. + */ + if (!IS_ENABLED(CONFIG_NUMA) || !vma) { + /* + * If a specific node is requested, make sure to + * get memory from there, but only when a node + * is explicitly specified. + */ + if (nid != NUMA_NO_NODE) + gfp |= __GFP_THISNODE; + /* + * Make sure to call something that can handle + * nid=NUMA_NO_NODE + */ + return alloc_pages_node(nid, gfp, order); + } + + /* + * OK, so we have a VMA. Fetch the mempolicy and try to + * allocate a huge page with it. We will only reach this + * when CONFIG_NUMA=y. + */ + do { + struct page *page; + struct mempolicy *mpol; + struct zonelist *zl; + nodemask_t *nodemask; + + cpuset_mems_cookie = read_mems_allowed_begin(); + zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + mpol_cond_put(mpol); + page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + if (page) + return page; + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return NULL; +} + +/* + * There are two ways to allocate a huge page: + * 1. When you have a VMA and an address (like a fault) + * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) + * + * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in + * this case which signifies that the allocation should be done with + * respect for the VMA's memory policy. + * + * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This + * implies that memory policies will not be taken in to account. + */ +static struct page *__alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, int nid) { struct page *page; unsigned int r_nid; @@ -1212,6 +1527,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) return NULL; /* + * Make sure that anyone specifying 'nid' is not also specifying a VMA. + * This makes sure the caller is picking _one_ of the modes with which + * we can call this function, not both. 
+ */ + if (vma || (addr != -1)) { + VM_WARN_ON_ONCE(addr == -1); + VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); + } + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed * overcommit @@ -1244,25 +1568,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } spin_unlock(&hugetlb_lock); - if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); - else - page = alloc_pages_exact_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); - - if (page && arch_prepare_hugepage(page)) { - __free_pages(page, huge_page_order(h)); - page = NULL; - } + page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); spin_lock(&hugetlb_lock); if (page) { INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already @@ -1281,6 +1593,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) } /* + * Allocate a huge page from 'nid'. Note, 'nid' may be + * NUMA_NO_NODE, which means that it may be allocated + * anywhere. + */ +static +struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) +{ + unsigned long addr = -1; + + return __alloc_buddy_huge_page(h, NULL, addr, nid); +} + +/* + * Use the VMA's mpolicy to allocate a huge page from the buddy. + */ +static +struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); +} + +/* * This allocation function is useful in the context where vma is irrelevant. * E.g. soft-offlining uses this function because it only cares physical * address of error page. 
@@ -1295,7 +1630,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = alloc_buddy_huge_page(h, nid); + page = __alloc_buddy_huge_page_no_mpol(h, nid); return page; } @@ -1325,7 +1660,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); if (!page) { alloc_ok = false; break; @@ -1421,87 +1756,150 @@ static void return_unused_surplus_pages(struct hstate *h, } } + /* - * Determine if the huge page at addr within the vma has an associated - * reservation. Where it does not we will need to logically increase - * reservation and actually increase subpool usage before an allocation - * can occur. Where any new reservation would be required the - * reservation change is prepared, but not committed. Once the page - * has been allocated from the subpool and instantiated the change should - * be committed via vma_commit_reservation. No action is required on - * failure. + * vma_needs_reservation, vma_commit_reservation and vma_end_reservation + * are used by the huge page allocation routines to manage reservations. + * + * vma_needs_reservation is called to determine if the huge page at addr + * within the vma has an associated reservation. If a reservation is + * needed, the value 1 is returned. The caller is then responsible for + * managing the global reservation and subpool usage counts. After + * the huge page has been allocated, vma_commit_reservation is called + * to add the page to the reservation map. If the page allocation fails, + * the reservation must be ended instead of committed. vma_end_reservation + * is called in such cases. + * + * In the normal case, vma_commit_reservation returns the same value + * as the preceding vma_needs_reservation call. 
The only time this + * is not the case is if a reserve map was changed between calls. It + * is the responsibility of the caller to notice the difference and + * take appropriate action. */ -static long vma_needs_reservation(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr) +enum vma_resv_mode { + VMA_NEEDS_RESV, + VMA_COMMIT_RESV, + VMA_END_RESV, +}; +static long __vma_reservation_common(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr, + enum vma_resv_mode mode) { struct resv_map *resv; pgoff_t idx; - long chg; + long ret; resv = vma_resv_map(vma); if (!resv) return 1; idx = vma_hugecache_offset(h, vma, addr); - chg = region_chg(resv, idx, idx + 1); + switch (mode) { + case VMA_NEEDS_RESV: + ret = region_chg(resv, idx, idx + 1); + break; + case VMA_COMMIT_RESV: + ret = region_add(resv, idx, idx + 1); + break; + case VMA_END_RESV: + region_abort(resv, idx, idx + 1); + ret = 0; + break; + default: + BUG(); + } if (vma->vm_flags & VM_MAYSHARE) - return chg; + return ret; else - return chg < 0 ? chg : 0; + return ret < 0 ? 
ret : 0; } -static void vma_commit_reservation(struct hstate *h, + +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct resv_map *resv; - pgoff_t idx; + return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV); +} - resv = vma_resv_map(vma); - if (!resv) - return; +static long vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV); +} - idx = vma_hugecache_offset(h, vma, addr); - region_add(resv, idx, idx + 1); +static void vma_end_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV); } -static struct page *alloc_huge_page(struct vm_area_struct *vma, +struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; - long chg; + long map_chg, map_commit; + long gbl_chg; int ret, idx; struct hugetlb_cgroup *h_cg; idx = hstate_index(h); /* - * Processes that did not create the mapping will have no - * reserves and will not have accounted against subpool - * limit. Check that the subpool limit can be made before - * satisfying the allocation MAP_NORESERVE mappings may also - * need pages and subpool limit allocated allocated if no reserve - * mapping overlaps. + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). 
*/ - chg = vma_needs_reservation(h, vma, addr); - if (chg < 0) + map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); + if (map_chg < 0) return ERR_PTR(-ENOMEM); - if (chg || avoid_reserve) - if (hugepage_subpool_get_pages(spool, 1) < 0) + + /* + * Processes that did not create the mapping will have no + * reserves as indicated by the region/reserve map. Check + * that the allocation will not exceed the subpool limit. + * Allocations for MAP_NORESERVE mappings also need to be + * checked against any subpool limit. + */ + if (map_chg || avoid_reserve) { + gbl_chg = hugepage_subpool_get_pages(spool, 1); + if (gbl_chg < 0) { + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); + } + + /* + * Even though there was no reservation in the region/reserve + * map, there could be reservations associated with the + * subpool that can be used. This would be indicated if the + * return value of hugepage_subpool_get_pages() is zero. + * However, if avoid_reserve is specified we still avoid even + * the subpool reservations. + */ + if (avoid_reserve) + gbl_chg = 1; + } ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) goto out_subpool_put; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); + /* + * glb_chg is passed to indicate whether or not a page must be taken + * from the global free pool (global change). gbl_chg == 0 indicates + * a reservation exists for the allocation. 
+ */ + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; - + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; + } spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -1511,14 +1909,30 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); - vma_commit_reservation(h, vma, addr); + map_commit = vma_commit_reservation(h, vma, addr); + if (unlikely(map_chg > map_commit)) { + /* + * The page was added to the reservation map between + * vma_needs_reservation and vma_commit_reservation. + * This indicates a race with hugetlb_reserve_pages. + * Adjust for the subpool count incremented above AND + * in hugetlb_reserve_pages for the same page. Also, + * the reservation count added in hugetlb_reserve_pages + * no longer applies. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + } return page; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (chg || avoid_reserve) + if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -1567,7 +1981,8 @@ found: return 1; } -static void __init prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, + unsigned int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -1736,7 +2151,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. 
* - * We might race with alloc_buddy_huge_page() here and be unable + * We might race with __alloc_buddy_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -1778,7 +2193,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_buddy_huge_page() is checking the global counter, + * __alloc_buddy_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -2071,7 +2486,7 @@ struct node_hstate { struct kobject *hugepages_kobj; struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; }; -struct node_hstate node_hstates[MAX_NUMNODES]; +static struct node_hstate node_hstates[MAX_NUMNODES]; /* * A subset of global hstate attributes for node devices @@ -2234,7 +2649,7 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); - kfree(htlb_fault_mutex_table); + kfree(hugetlb_fault_mutex_table); } module_exit(hugetlb_exit); @@ -2267,18 +2682,18 @@ static int __init hugetlb_init(void) #else num_fault_mutexes = 1; #endif - htlb_fault_mutex_table = + hugetlb_fault_mutex_table = kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); - BUG_ON(!htlb_fault_mutex_table); + BUG_ON(!hugetlb_fault_mutex_table); for (i = 0; i < num_fault_mutexes; i++) - mutex_init(&htlb_fault_mutex_table[i]); + mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); /* Should be called on processing a hugepagesz=... 
option */ -void __init hugetlb_add_hstate(unsigned order) +void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; @@ -2485,6 +2900,12 @@ void hugetlb_show_meminfo(void) 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } +void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "HugetlbPages:\t%8lu kB\n", + atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); +} + /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { @@ -2720,6 +3141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, get_page(ptepage); page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); + hugetlb_count_add(pages_per_huge_page(h), dst); } spin_unlock(src_ptl); spin_unlock(dst_ptl); @@ -2800,6 +3222,7 @@ again: if (huge_pte_dirty(pte)) set_page_dirty(page); + hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { @@ -2897,6 +3320,14 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, continue; /* + * Shared VMAs have their own reserves and do not affect + * MAP_PRIVATE accounting but it is possible that a shared + * VMA is using the same page so check and skip such VMAs. + */ + if (iter_vma->vm_flags & VM_MAYSHARE) + continue; + + /* * Unmap the page from other VMAs without their own reserves. * They get marked to be SIGKILLed if they fault in these * areas. 
This is because a future no-page fault on this VMA @@ -3070,6 +3501,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } +int huge_add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t idx) +{ + struct inode *inode = mapping->host; + struct hstate *h = hstate_inode(inode); + int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + + if (err) + return err; + ClearPagePrivate(page); + + spin_lock(&inode->i_lock); + inode->i_blocks += blocks_per_huge_page(h); + spin_unlock(&inode->i_lock); + return 0; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, unsigned int flags) @@ -3117,21 +3565,13 @@ retry: set_page_huge_active(page); if (vma->vm_flags & VM_MAYSHARE) { - int err; - struct inode *inode = mapping->host; - - err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + int err = huge_add_to_page_cache(page, mapping, idx); if (err) { put_page(page); if (err == -EEXIST) goto retry; goto out; } - ClearPagePrivate(page); - - spin_lock(&inode->i_lock); - inode->i_blocks += blocks_per_huge_page(h); - spin_unlock(&inode->i_lock); } else { lock_page(page); if (unlikely(anon_vma_prepare(vma))) { @@ -3159,11 +3599,14 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. 
*/ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } + /* Just decrements count, does not deallocate */ + vma_end_reservation(h, vma, address); + } ptl = huge_pte_lockptr(h, mm, ptep); spin_lock(ptl); @@ -3184,6 +3627,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); + hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); @@ -3203,7 +3647,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address) @@ -3228,7 +3672,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. 
*/ -static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, unsigned long address) @@ -3262,12 +3706,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - mapping = vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, vma, address); @@ -3276,8 +3720,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); - mutex_lock(&htlb_fault_mutex_table[hash]); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { @@ -3310,6 +3754,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out_mutex; } + /* Just decrements count, does not deallocate */ + vma_end_reservation(h, vma, address); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, @@ -3360,7 +3806,7 @@ out_ptl: put_page(pagecache_page); } out_mutex: - mutex_unlock(&htlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -3629,16 +4075,35 @@ int hugetlb_reserve_pages(struct inode *inode, * consumed reservations are stored in the map. 
Hence, nothing * else has to be done for private mappings here */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(resv_map, from, to); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + long add = region_add(resv_map, from, to); + + if (unlikely(chg > add)) { + /* + * pages in this range were added to the reserve + * map between region_chg and region_add. This + * indicates a race with alloc_huge_page. Adjust + * the subpool and reserve counts modified above + * based on the difference. + */ + long rsv_adjust; + + rsv_adjust = hugepage_subpool_put_pages(spool, + chg - add); + hugetlb_acct_memory(h, -rsv_adjust); + } + } return 0; out_err: + if (!vma || vma->vm_flags & VM_MAYSHARE) + region_abort(resv_map, from, to); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); return ret; } -void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) +long hugetlb_unreserve_pages(struct inode *inode, long start, long end, + long freed) { struct hstate *h = hstate_inode(inode); struct resv_map *resv_map = inode_resv_map(inode); @@ -3646,8 +4111,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; - if (resv_map) - chg = region_truncate(resv_map, offset); + if (resv_map) { + chg = region_del(resv_map, start, end); + /* + * region_del() can fail in the rare case where a region + * must be split and another region descriptor can not be + * allocated. If end == LONG_MAX, it will not fail. 
+ */ + if (chg < 0) + return chg; + } + spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); @@ -3658,6 +4132,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); + + return 0; } #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -3671,8 +4147,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED; + unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK; /* * match the virtual addresses, permission and the alignment of the @@ -3686,7 +4162,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) { unsigned long base = addr & PUD_MASK; unsigned long end = base + PUD_SIZE; @@ -3696,8 +4172,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) */ if (vma->vm_flags & VM_MAYSHARE && vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; + return true; + return false; } /* @@ -3792,6 +4268,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) { return NULL; } + +int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) +{ + return 0; +} #define want_pmd_share() (0) #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ diff --git a/kernel/mm/hugetlb_cgroup.c b/kernel/mm/hugetlb_cgroup.c index 6e0057439..d8fb10de0 100644 --- a/kernel/mm/hugetlb_cgroup.c +++ b/kernel/mm/hugetlb_cgroup.c @@ -186,7 +186,8 @@ again: } rcu_read_unlock(); - ret = 
page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); + if (!page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter)) + ret = -ENOMEM; css_put(&h_cg->css); done: *ptr = h_cg; @@ -384,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void) /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgroup details. + * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); diff --git a/kernel/mm/hwpoison-inject.c b/kernel/mm/hwpoison-inject.c index 4ca5fe004..9d26fd9fe 100644 --- a/kernel/mm/hwpoison-inject.c +++ b/kernel/mm/hwpoison-inject.c @@ -28,7 +28,7 @@ static int hwpoison_inject(void *data, u64 val) /* * This implies unable to support free buddy pages. */ - if (!get_page_unless_zero(hpage)) + if (!get_hwpoison_page(p)) return 0; if (!hwpoison_filter_enable) @@ -45,12 +45,9 @@ static int hwpoison_inject(void *data, u64 val) /* * do a racy check with elevated page count, to make sure PG_hwpoison * will only be set for the targeted owner (or on a free page). - * We temporarily take page lock for try_get_mem_cgroup_from_page(). * memory_failure() will redo the check reliably inside page lock. 
*/ - lock_page(hpage); err = hwpoison_filter(hpage); - unlock_page(hpage); if (err) goto put_out; @@ -58,7 +55,7 @@ inject: pr_info("Injecting memory failure at pfn %#lx\n", pfn); return memory_failure(pfn, 18, MF_COUNT_INCREASED); put_out: - put_page(hpage); + put_hwpoison_page(p); return 0; } @@ -126,7 +123,7 @@ static int pfn_inject_init(void) if (!dentry) goto fail; -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_MEMCG dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, hwpoison_dir, &hwpoison_filter_memcg); if (!dentry) diff --git a/kernel/mm/internal.h b/kernel/mm/internal.h index a25e359a4..38e24b89e 100644 --- a/kernel/mm/internal.h +++ b/kernel/mm/internal.h @@ -14,6 +14,25 @@ #include <linux/fs.h> #include <linux/mm.h> +/* + * The set of flags that only affect watermark checking and reclaim + * behaviour. This is used by the MM to obey the caller constraints + * about IO, FS and watermark checking while ignoring placement + * hints such as HIGHMEM usage. + */ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + +/* The GFP flags allowed during early boot */ +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) + +/* Control allocation cpuset and node placement constraints */ +#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) + +/* Do not use these with a slab allocator */ +#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -61,9 +80,9 @@ static inline void __get_page_tail_foll(struct page *page, * speculative page access (like in * page_cache_get_speculative()) on tail pages. 
*/ - VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); + VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); if (get_page_head) - atomic_inc(&page->first_page->_count); + atomic_inc(&compound_head(page)->_count); get_huge_page_tail(page); } @@ -129,6 +148,7 @@ struct alloc_context { int classzone_idx; int migratetype; enum zone_type high_zoneidx; + bool spread_dirty_pages; }; /* @@ -155,8 +175,9 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) } extern int __isolate_free_page(struct page *page, unsigned int order); -extern void __free_pages_bootmem(struct page *page, unsigned int order); -extern void prep_compound_page(struct page *page, unsigned long order); +extern void __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order); +extern void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif @@ -181,6 +202,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ int order; /* order a direct compactor needs */ @@ -213,7 +235,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use page_order_unsafe() below. 
*/ -static inline unsigned long page_order(struct page *page) +static inline unsigned int page_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); @@ -269,20 +291,19 @@ extern unsigned int munlock_vma_page(struct page *page); extern void clear_page_mlock(struct page *page); /* - * mlock_migrate_page - called only from migrate_page_copy() to - * migrate the Mlocked page flag; update statistics. + * mlock_migrate_page - called only from migrate_misplaced_transhuge_page() + * (because that does not go through the full procedure of migration ptes): + * to migrate the Mlocked page flag; update statistics. */ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { if (TestClearPageMlocked(page)) { - unsigned long flags; int nr_pages = hpage_nr_pages(page); - local_irq_save(flags); + /* Holding pmd lock, no change in irq context: __mod is safe */ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); SetPageMlocked(newpage); __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); - local_irq_restore(flags); } } @@ -361,10 +382,7 @@ do { \ } while (0) extern void mminit_verify_pageflags_layout(void); -extern void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn); extern void mminit_verify_zonelist(void); - #else static inline void mminit_dprintk(enum mminit_level level, @@ -376,11 +394,6 @@ static inline void mminit_verify_pageflags_layout(void) { } -static inline void mminit_verify_page_links(struct page *page, - enum zone_type zone, unsigned long nid, unsigned long pfn) -{ -} - static inline void mminit_verify_zonelist(void) { } @@ -433,4 +446,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_FAIR 0x100 /* fair zone allocation */ +enum ttu_flags; +struct tlbflush_unmap_batch; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void 
try_to_unmap_flush(void); +void try_to_unmap_flush_dirty(void); +#else +static inline void try_to_unmap_flush(void) +{ +} +static inline void try_to_unmap_flush_dirty(void) +{ +} + +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/kernel/mm/kasan/Makefile b/kernel/mm/kasan/Makefile index bd837b8c2..647101489 100644 --- a/kernel/mm/kasan/Makefile +++ b/kernel/mm/kasan/Makefile @@ -5,4 +5,4 @@ CFLAGS_REMOVE_kasan.o = -pg # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o +obj-y := kasan.o report.o kasan_init.o diff --git a/kernel/mm/kasan/kasan.c b/kernel/mm/kasan/kasan.c index 6c513a63e..bc0a8d8b8 100644 --- a/kernel/mm/kasan/kasan.c +++ b/kernel/mm/kasan/kasan.c @@ -2,9 +2,9 @@ * This file contains shadow memory manipulation code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov <adech.fo@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -19,6 +19,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/kmemleak.h> #include <linux/memblock.h> #include <linux/memory.h> #include <linux/mm.h> @@ -86,6 +87,11 @@ static __always_inline bool memory_is_poisoned_2(unsigned long addr) if (memory_is_poisoned_1(addr + 1)) return true; + /* + * If single shadow byte covers 2-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. 
+ */ if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) return false; @@ -103,6 +109,11 @@ static __always_inline bool memory_is_poisoned_4(unsigned long addr) if (memory_is_poisoned_1(addr + 3)) return true; + /* + * If single shadow byte covers 4-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) return false; @@ -120,7 +131,12 @@ static __always_inline bool memory_is_poisoned_8(unsigned long addr) if (memory_is_poisoned_1(addr + 7)) return true; - if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) + /* + * If single shadow byte covers 8-byte access, we don't + * need to do anything more. Otherwise, test the first + * shadow byte. + */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return unlikely(*(u8 *)shadow_addr); @@ -135,12 +151,16 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr) if (unlikely(*shadow_addr)) { u16 shadow_first_bytes = *(u16 *)shadow_addr; - s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; if (unlikely(shadow_first_bytes)) return true; - if (likely(!last_byte)) + /* + * If two shadow bytes covers 16-byte access, we don't + * need to do anything more. Otherwise, test the last + * shadow byte. 
+ */ + if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) return false; return memory_is_poisoned_1(addr + 15); @@ -204,7 +224,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) return true; } return false; @@ -236,18 +256,12 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) static __always_inline void check_memory_region(unsigned long addr, size_t size, bool write) { - struct kasan_access_info info; - if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - info.access_addr = (void *)addr; - info.access_size = size; - info.is_write = write; - info.ip = _RET_IP_; - kasan_report_user_access(&info); + kasan_report(addr, size, write, _RET_IP_); return; } @@ -431,6 +445,7 @@ int kasan_module_alloc(void *addr, size_t size) if (ret) { find_vm_area(addr)->flags |= VM_KASAN; + kmemleak_ignore(ret); return 0; } @@ -525,7 +540,7 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASan doesn't support memory hot-add\n"); + pr_err("WARNING: KASAN doesn't support memory hot-add\n"); pr_err("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/kernel/mm/kasan/kasan.h b/kernel/mm/kasan/kasan.h index 4986b0aca..4f6c62e5c 100644 --- a/kernel/mm/kasan/kasan.h +++ b/kernel/mm/kasan/kasan.h @@ -7,7 +7,6 @@ #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) #define KASAN_FREE_PAGE 0xFF /* page was freed */ -#define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ #define 
KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ @@ -55,16 +54,13 @@ struct kasan_global { #endif }; -void kasan_report_error(struct kasan_access_info *info); -void kasan_report_user_access(struct kasan_access_info *info); - static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool kasan_enabled(void) +static inline bool kasan_report_enabled(void) { return !current->kasan_depth; } diff --git a/kernel/mm/kasan/kasan_init.c b/kernel/mm/kasan/kasan_init.c new file mode 100644 index 000000000..3f9a41cf0 --- /dev/null +++ b/kernel/mm/kasan/kasan_init.c @@ -0,0 +1,152 @@ +/* + * This file contains some kasan initialization code. + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/pfn.h> + +#include <asm/page.h> +#include <asm/pgalloc.h> + +/* + * This page serves two purposes: + * - It used as early shadow memory. The entire shadow region populated + * with this page, before we will be able to setup normal shadow memory. + * - Latter it reused it as zero shadow to cover large ranges of memory + * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...). 
+ */ +unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; + +#if CONFIG_PGTABLE_LEVELS > 3 +pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +#endif +#if CONFIG_PGTABLE_LEVELS > 2 +pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +#endif +pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; + +static __init void *early_alloc(size_t size, int node) +{ + return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, node); +} + +static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_t zero_pte; + + zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL); + zero_pte = pte_wrprotect(zero_pte); + + while (addr + PAGE_SIZE <= end) { + set_pte_at(&init_mm, addr, pte, zero_pte); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } +} + +static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, addr); + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pmd_none(*pmd)) { + pmd_populate_kernel(&init_mm, pmd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pte_populate(pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + pud_t *pud = pud_offset(pgd, addr); + unsigned long next; + + do { + next = pud_addr_end(addr, end); + if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pmd_t *pmd; + + pud_populate(&init_mm, pud, kasan_zero_pmd); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pud_none(*pud)) { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pmd_populate(pud, addr, next); + } 
while (pud++, addr = next, addr != end); +} + +/** + * kasan_populate_zero_shadow - populate shadow memory region with + * kasan_zero_page + * @shadow_start - start of the memory range to populate + * @shadow_end - end of the memory range to populate + */ +void __init kasan_populate_zero_shadow(const void *shadow_start, + const void *shadow_end) +{ + unsigned long addr = (unsigned long)shadow_start; + unsigned long end = (unsigned long)shadow_end; + pgd_t *pgd = pgd_offset_k(addr); + unsigned long next; + + do { + next = pgd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pud_t *pud; + pmd_t *pmd; + + /* + * kasan_zero_pud should be populated with pmds + * at this moment. + * [pud,pmd]_populate*() below needed only for + * 3,2 - level page tables where we don't have + * puds,pmds, so pgd_populate(), pud_populate() + * is noops. + */ + pgd_populate(&init_mm, pgd, kasan_zero_pud); + pud = pud_offset(pgd, addr); + pud_populate(&init_mm, pud, kasan_zero_pmd); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pgd_none(*pgd)) { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pud_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} diff --git a/kernel/mm/kasan/report.c b/kernel/mm/kasan/report.c index 680ceedf8..12f222d02 100644 --- a/kernel/mm/kasan/report.c +++ b/kernel/mm/kasan/report.c @@ -2,9 +2,9 @@ * This file contains error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * - * Some of code borrowed from https://github.com/xairy/linux by + * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov <adech.fo@gmail.com> * * This program is free software; you can redistribute it and/or modify @@ -22,6 +22,7 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/kasan.h> +#include <linux/module.h> #include <asm/sections.h> @@ -48,34 +49,49 @@ static const void *find_first_bad_addr(const void *addr, size_t size) static void print_error_description(struct kasan_access_info *info) { - const char *bug_type = "unknown crash"; - u8 shadow_val; + const char *bug_type = "unknown-crash"; + u8 *shadow_addr; info->first_bad_addr = find_first_bad_addr(info->access_addr, info->access_size); - shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - switch (shadow_val) { - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - bug_type = "use after free"; + /* + * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { + case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. + */ + bug_type = "out-of-bounds"; break; case KASAN_PAGE_REDZONE: case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; case KASAN_GLOBAL_REDZONE: - case 0 ... 
KASAN_SHADOW_SCALE_SIZE - 1: - bug_type = "out of bounds access"; + bug_type = "global-out-of-bounds"; break; case KASAN_STACK_LEFT: case KASAN_STACK_MID: case KASAN_STACK_RIGHT: case KASAN_STACK_PARTIAL: - bug_type = "out of bounds on stack"; + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + bug_type = "use-after-free"; break; } - pr_err("BUG: KASan: %s in %pS at addr %p\n", + pr_err("BUG: KASAN: %s in %pS at addr %p\n", bug_type, (void *)info->ip, info->access_addr); pr_err("%s of size %zu by task %s/%d\n", @@ -85,9 +101,11 @@ static void print_error_description(struct kasan_access_info *info) static inline bool kernel_or_module_addr(const void *addr) { - return (addr >= (void *)_stext && addr < (void *)_end) - || (addr >= (void *)MODULES_VADDR - && addr < (void *)MODULES_END); + if (addr >= (void *)_stext && addr < (void *)_end) + return true; + if (is_module_address((unsigned long)addr)) + return true; + return false; } static inline bool init_task_stack_addr(const void *addr) @@ -161,15 +179,19 @@ static void print_shadow_for_address(const void *addr) for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { const void *kaddr = kasan_shadow_to_mem(shadow_row); char buffer[4 + (BITS_PER_LONG/8)*2]; + char shadow_buf[SHADOW_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), (i == 0) ? ">%p: " : " %p: ", kaddr); - - kasan_disable_current(); + /* + * We should not pass a shadow pointer to generic + * function, because generic functions may try to + * access kasan mapping for the passed address. 
+ */ + memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, - shadow_row, SHADOW_BYTES_PER_ROW, 0); - kasan_enable_current(); + shadow_buf, SHADOW_BYTES_PER_ROW, 0); if (row_is_guilty(shadow_row, shadow)) pr_err("%*c\n", @@ -182,37 +204,43 @@ static void print_shadow_for_address(const void *addr) static DEFINE_SPINLOCK(report_lock); -void kasan_report_error(struct kasan_access_info *info) -{ - unsigned long flags; - - spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); - print_error_description(info); - print_address_description(info); - print_shadow_for_address(info->first_bad_addr); - pr_err("=================================" - "=================================\n"); - spin_unlock_irqrestore(&report_lock, flags); -} - -void kasan_report_user_access(struct kasan_access_info *info) +static void kasan_report_error(struct kasan_access_info *info) { unsigned long flags; + const char *bug_type; + /* + * Make sure we don't end up in loop. + */ + kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); pr_err("=================================" "=================================\n"); - pr_err("BUG: KASan: user-memory-access on address %p\n", - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? "Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); - dump_stack(); + if (info->access_addr < + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + pr_err("BUG: KASAN: %s on address %p\n", + bug_type, info->access_addr); + pr_err("%s of size %zu by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_size, current->comm, + task_pid_nr(current)); + dump_stack(); + } else { + print_error_description(info); + print_address_description(info); + print_shadow_for_address(info->first_bad_addr); + } pr_err("=================================" "=================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); + kasan_enable_current(); } void kasan_report(unsigned long addr, size_t size, @@ -220,13 +248,14 @@ void kasan_report(unsigned long addr, size_t size, { struct kasan_access_info info; - if (likely(!kasan_enabled())) + if (likely(!kasan_report_enabled())) return; info.access_addr = (void *)addr; info.access_size = size; info.is_write = is_write; info.ip = ip; + kasan_report_error(&info); } diff --git a/kernel/mm/kmemleak.c b/kernel/mm/kmemleak.c index 3716cdb8b..19423a45d 100644 --- a/kernel/mm/kmemleak.c +++ b/kernel/mm/kmemleak.c @@ -53,6 +53,13 @@ * modifications to the memory scanning parameters including the scan_thread * pointer * + * Locks and mutexes are acquired/nested in the following order: + * + * scan_mutex [-> object->lock] -> kmemleak_lock -> other_object->lock (SINGLE_DEPTH_NESTING) + * + * No kmemleak_lock and object->lock nesting is allowed outside scan_mutex + * regions. + * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. 
When the use_count becomes * 0, this count can no longer be incremented and put_object() schedules the @@ -295,23 +302,14 @@ static void hex_dump_object(struct seq_file *seq, struct kmemleak_object *object) { const u8 *ptr = (const u8 *)object->pointer; - int i, len, remaining; - unsigned char linebuf[HEX_ROW_SIZE * 5]; + size_t len; /* limit the number of lines to HEX_MAX_LINES */ - remaining = len = - min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); - - seq_printf(seq, " hex dump (first %d bytes):\n", len); - for (i = 0; i < len; i += HEX_ROW_SIZE) { - int linelen = min(remaining, HEX_ROW_SIZE); - - remaining -= HEX_ROW_SIZE; - hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, - HEX_GROUP_SIZE, linebuf, sizeof(linebuf), - HEX_ASCII); - seq_printf(seq, " %s\n", linebuf); - } + len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); + + seq_printf(seq, " hex dump (first %zu bytes):\n", len); + seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, + HEX_GROUP_SIZE, ptr, len, HEX_ASCII); } /* @@ -481,12 +479,11 @@ static void put_object(struct kmemleak_object *object) static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) { unsigned long flags; - struct kmemleak_object *object = NULL; + struct kmemleak_object *object; rcu_read_lock(); read_lock_irqsave(&kmemleak_lock, flags); - if (ptr >= min_addr && ptr < max_addr) - object = lookup_object(ptr, alias); + object = lookup_object(ptr, alias); read_unlock_irqrestore(&kmemleak_lock, flags); /* check whether the object is still available */ @@ -498,6 +495,27 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Look up an object in the object search tree and remove it from both + * object_tree_root and object_list. The returned object's use_count should be + * at least 1, as initially set by create_object(). 
+ */ +static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias) +{ + unsigned long flags; + struct kmemleak_object *object; + + write_lock_irqsave(&kmemleak_lock, flags); + object = lookup_object(ptr, alias); + if (object) { + rb_erase(&object->rb_node, &object_tree_root); + list_del_rcu(&object->object_list); + } + write_unlock_irqrestore(&kmemleak_lock, flags); + + return object; +} + +/* * Save stack trace to the given array of MAX_TRACE size. */ static int __save_stack_trace(unsigned long *trace) @@ -582,11 +600,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, kmemleak_stop("Cannot insert 0x%lx into the object " "search tree (overlaps existing)\n", ptr); + /* + * No need for parent->lock here since "parent" cannot + * be freed while the kmemleak_lock is held. + */ + dump_object_info(parent); kmem_cache_free(object_cache, object); - object = parent; - spin_lock(&object->lock); - dump_object_info(object); - spin_unlock(&object->lock); + object = NULL; goto out; } } @@ -600,20 +620,14 @@ out: } /* - * Remove the metadata (struct kmemleak_object) for a memory block from the - * object_list and object_tree_root and decrement its use_count. + * Mark the object as not allocated and schedule RCU freeing via put_object(). 
*/ static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - write_lock_irqsave(&kmemleak_lock, flags); - rb_erase(&object->rb_node, &object_tree_root); - list_del_rcu(&object->object_list); - write_unlock_irqrestore(&kmemleak_lock, flags); - WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 2); + WARN_ON(atomic_read(&object->use_count) < 1); /* * Locking here also ensures that the corresponding memory block @@ -633,7 +647,7 @@ static void delete_object_full(unsigned long ptr) { struct kmemleak_object *object; - object = find_and_get_object(ptr, 0); + object = find_and_remove_object(ptr, 0); if (!object) { #ifdef DEBUG kmemleak_warn("Freeing unknown object at 0x%08lx\n", @@ -642,7 +656,6 @@ static void delete_object_full(unsigned long ptr) return; } __delete_object(object); - put_object(object); } /* @@ -655,7 +668,7 @@ static void delete_object_part(unsigned long ptr, size_t size) struct kmemleak_object *object; unsigned long start, end; - object = find_and_get_object(ptr, 1); + object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object at 0x%08lx " @@ -663,7 +676,6 @@ static void delete_object_part(unsigned long ptr, size_t size) #endif return; } - __delete_object(object); /* * Create one or two objects that may result from the memory block @@ -681,7 +693,7 @@ static void delete_object_part(unsigned long ptr, size_t size) create_object(ptr + size, end - ptr - size, object->min_count, GFP_KERNEL); - put_object(object); + __delete_object(object); } static void __paint_it(struct kmemleak_object *object, int color) @@ -817,6 +829,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, } if (crt_early_log >= ARRAY_SIZE(early_log)) { + crt_early_log++; kmemleak_disable(); return; } @@ -1151,19 +1164,18 @@ static int scan_should_stop(void) * found to the gray list. 
*/ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned, int allow_resched) + struct kmemleak_object *scanned) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); + unsigned long flags; + read_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; - unsigned long flags; unsigned long pointer; - if (allow_resched) - cond_resched(); if (scan_should_stop()) break; @@ -1176,26 +1188,31 @@ static void scan_block(void *_start, void *_end, pointer = *ptr; kasan_enable_current(); - object = find_and_get_object(pointer, 1); + if (pointer < min_addr || pointer >= max_addr) + continue; + + /* + * No need for get_object() here since we hold kmemleak_lock. + * object->use_count cannot be dropped to 0 while the object + * is still present in object_tree_root and object_list + * (with updates protected by kmemleak_lock). + */ + object = lookup_object(pointer, 1); if (!object) continue; - if (object == scanned) { + if (object == scanned) /* self referenced, ignore */ - put_object(object); continue; - } /* * Avoid the lockdep recursive warning on object->lock being * previously acquired in scan_object(). These locks are * enclosed by scan_mutex. 
*/ - spin_lock_irqsave_nested(&object->lock, flags, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); if (!color_white(object)) { /* non-orphan, ignored or new */ - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); + spin_unlock(&object->lock); continue; } @@ -1207,13 +1224,27 @@ static void scan_block(void *_start, void *_end, */ object->count++; if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); list_add_tail(&object->gray_list, &gray_list); - spin_unlock_irqrestore(&object->lock, flags); - continue; } + spin_unlock(&object->lock); + } + read_unlock_irqrestore(&kmemleak_lock, flags); +} - spin_unlock_irqrestore(&object->lock, flags); - put_object(object); +/* + * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. + */ +static void scan_large_block(void *start, void *end) +{ + void *next; + + while (start < end) { + next = min(start + MAX_SCAN_SIZE, end); + scan_block(start, next, NULL); + start = next; + cond_resched(); } } @@ -1239,22 +1270,25 @@ static void scan_object(struct kmemleak_object *object) if (hlist_empty(&object->area_list)) { void *start = (void *)object->pointer; void *end = (void *)(object->pointer + object->size); + void *next; + + do { + next = min(start + MAX_SCAN_SIZE, end); + scan_block(start, next, object); - while (start < end && (object->flags & OBJECT_ALLOCATED) && - !(object->flags & OBJECT_NO_SCAN)) { - scan_block(start, min(start + MAX_SCAN_SIZE, end), - object, 0); - start += MAX_SCAN_SIZE; + start = next; + if (start >= end) + break; spin_unlock_irqrestore(&object->lock, flags); cond_resched(); spin_lock_irqsave(&object->lock, flags); - } + } while (object->flags & OBJECT_ALLOCATED); } else hlist_for_each_entry(area, &object->area_list, node) scan_block((void *)area->start, (void *)(area->start + area->size), - object, 0); + object); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -1331,14 
+1365,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL, 1); - scan_block(__bss_start, __bss_stop, NULL, 1); + scan_large_block(_sdata, _edata); + scan_large_block(__bss_start, __bss_stop); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) - scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL, 1); + scan_large_block(__per_cpu_start + per_cpu_offset(i), + __per_cpu_end + per_cpu_offset(i)); #endif /* @@ -1359,7 +1393,7 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL, 1); + scan_block(page, page + 1, NULL); } } put_online_mems(); @@ -1373,7 +1407,7 @@ static void kmemleak_scan(void) read_lock(&tasklist_lock); do_each_thread(g, p) { scan_block(task_stack_page(p), task_stack_page(p) + - THREAD_SIZE, NULL, 0); + THREAD_SIZE, NULL); } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -1750,7 +1784,6 @@ static void __kmemleak_do_cleanup(void) */ static void kmemleak_do_cleanup(struct work_struct *work) { - mutex_lock(&scan_mutex); stop_scan_thread(); /* @@ -1765,7 +1798,6 @@ static void kmemleak_do_cleanup(struct work_struct *work) else pr_info("Kmemleak disabled without freeing internal data. 
" "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); - mutex_unlock(&scan_mutex); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); @@ -1842,7 +1874,7 @@ void __init kmemleak_init(void) object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); - if (crt_early_log >= ARRAY_SIZE(early_log)) + if (crt_early_log > ARRAY_SIZE(early_log)) pr_warning("Early log buffer exceeded (%d), please increase " "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); diff --git a/kernel/mm/ksm.c b/kernel/mm/ksm.c index 7ee101eaa..b5cd647da 100644 --- a/kernel/mm/ksm.c +++ b/kernel/mm/ksm.c @@ -475,7 +475,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) flush_dcache_page(page); } else { put_page(page); -out: page = NULL; +out: + page = NULL; } up_read(&mm->mmap_sem); return page; @@ -625,7 +626,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) unlock_page(page); put_page(page); - if (stable_node->hlist.first) + if (!hlist_empty(&stable_node->hlist)) ksm_pages_sharing--; else ksm_pages_shared--; @@ -1021,8 +1022,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (!(vma->vm_flags & VM_MERGEABLE)) - goto out; if (PageTransCompound(page) && page_trans_compound_anon_split(page)) goto out; BUG_ON(PageTransCompound(page)); @@ -1087,10 +1086,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, int err = -EFAULT; down_read(&mm->mmap_sem); - if (ksm_test_exit(mm)) - goto out; - vma = find_vma(mm, rmap_item->address); - if (!vma || vma->vm_start > rmap_item->address) + vma = find_mergeable_vma(mm, rmap_item->address); + if (!vma) goto out; err = try_to_merge_one_page(vma, page, kpage); @@ -1177,8 +1174,18 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + 
if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. + */ + goto again; + } ret = memcmp_pages(page, tree_page); put_page(tree_page); @@ -1254,12 +1261,14 @@ static struct stable_node *stable_tree_insert(struct page *kpage) unsigned long kpfn; struct rb_root *root; struct rb_node **new; - struct rb_node *parent = NULL; + struct rb_node *parent; struct stable_node *stable_node; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); root = root_stable_tree + nid; +again: + parent = NULL; new = &root->rb_node; while (*new) { @@ -1269,8 +1278,18 @@ static struct stable_node *stable_tree_insert(struct page *kpage) cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); tree_page = get_ksm_page(stable_node, false); - if (!tree_page) - return NULL; + if (!tree_page) { + /* + * If we walked over a stale stable_node, + * get_ksm_page() will call rb_erase() and it + * may rebalance the tree from under us. So + * restart the search from scratch. Returning + * NULL would be safe too, but we'd generate + * false negative insertions just because some + * stable_node was stale. 
+ */ + goto again; + } ret = memcmp_pages(kpage, tree_page); put_page(tree_page); @@ -1340,7 +1359,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, cond_resched(); tree_rmap_item = rb_entry(*new, struct rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); - if (IS_ERR_OR_NULL(tree_page)) + if (!tree_page) return NULL; /* @@ -1914,9 +1933,11 @@ again: struct anon_vma_chain *vmac; struct vm_area_struct *vma; + cond_resched(); anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + cond_resched(); vma = vmac->vma; if (rmap_item->address < vma->vm_start || rmap_item->address >= vma->vm_end) diff --git a/kernel/mm/list_lru.c b/kernel/mm/list_lru.c index 909eca2c8..afc71ea9a 100644 --- a/kernel/mm/list_lru.c +++ b/kernel/mm/list_lru.c @@ -42,6 +42,10 @@ static void list_lru_unregister(struct list_lru *lru) #ifdef CONFIG_MEMCG_KMEM static inline bool list_lru_memcg_aware(struct list_lru *lru) { + /* + * This needs node 0 to be always present, even + * in the systems supporting sparse numa ids. 
+ */ return !!lru->node[0].memcg_lrus; } @@ -59,6 +63,16 @@ list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) return &nlru->lru; } +static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) +{ + struct page *page; + + if (!memcg_kmem_enabled()) + return NULL; + page = virt_to_head_page(ptr); + return page->mem_cgroup; +} + static inline struct list_lru_one * list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { @@ -99,8 +113,8 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) struct list_lru_one *l; spin_lock(&nlru->lock); - l = list_lru_from_kmem(nlru, item); if (list_empty(item)) { + l = list_lru_from_kmem(nlru, item); list_add_tail(item, &l->list); l->nr_items++; spin_unlock(&nlru->lock); @@ -118,8 +132,8 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) struct list_lru_one *l; spin_lock(&nlru->lock); - l = list_lru_from_kmem(nlru, item); if (!list_empty(item)) { + l = list_lru_from_kmem(nlru, item); list_del_init(item); l->nr_items--; spin_unlock(&nlru->lock); @@ -377,16 +391,20 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { int i; - for (i = 0; i < nr_node_ids; i++) { - if (!memcg_aware) - lru->node[i].memcg_lrus = NULL; - else if (memcg_init_list_lru_node(&lru->node[i])) + if (!memcg_aware) + return 0; + + for_each_node(i) { + if (memcg_init_list_lru_node(&lru->node[i])) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; memcg_destroy_list_lru_node(&lru->node[i]); + } return -ENOMEM; } @@ -397,7 +415,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_destroy_list_lru_node(&lru->node[i]); } @@ -409,16 +427,20 @@ static int memcg_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return 0; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { if 
(memcg_update_list_lru_node(&lru->node[i], old_size, new_size)) goto fail; } return 0; fail: - for (i = i - 1; i >= 0; i--) + for (i = i - 1; i >= 0; i--) { + if (!lru->node[i].memcg_lrus) + continue; + memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); + } return -ENOMEM; } @@ -430,7 +452,7 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_cancel_update_list_lru_node(&lru->node[i], old_size, new_size); } @@ -485,7 +507,7 @@ static void memcg_drain_list_lru(struct list_lru *lru, if (!list_lru_memcg_aware(lru)) return; - for (i = 0; i < nr_node_ids; i++) + for_each_node(i) memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); } @@ -522,7 +544,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, if (!lru->node) goto out; - for (i = 0; i < nr_node_ids; i++) { + for_each_node(i) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); diff --git a/kernel/mm/maccess.c b/kernel/mm/maccess.c index d53adf9ba..d159b1c96 100644 --- a/kernel/mm/maccess.c +++ b/kernel/mm/maccess.c @@ -13,6 +13,11 @@ * * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. + * + * We ensure that the copy_from_user is executed in atomic context so that + * do_page_fault() doesn't attempt to take mmap_sem. This makes + * probe_kernel_read() suitable for use within regions where the caller + * already holds mmap_sem, or other locks which nest inside mmap_sem. */ long __weak probe_kernel_read(void *dst, const void *src, size_t size) @@ -60,3 +65,44 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) return ret ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(probe_kernel_write); + +/** + * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. + * @dst: Destination address, in kernel space. 
This buffer must be at + * least @count bytes long. + * @src: Unsafe address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + const void *src = unsafe_addr; + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(dst++, + (const void __user __force *)src++, 1); + } while (dst[-1] && ret == 0 && src - unsafe_addr < count); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + return ret ? 
-EFAULT : src - unsafe_addr; +} diff --git a/kernel/mm/madvise.c b/kernel/mm/madvise.c index d55147551..c889fcbb5 100644 --- a/kernel/mm/madvise.c +++ b/kernel/mm/madvise.c @@ -17,6 +17,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -102,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; @@ -299,7 +301,7 @@ static long madvise_remove(struct vm_area_struct *vma, *prev = NULL; /* tell sys_madvise we drop mmap_sem */ - if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) + if (vma->vm_flags & VM_LOCKED) return -EINVAL; f = vma->vm_file; @@ -384,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, } } -static int +static bool madvise_behavior_valid(int behavior) { switch (behavior) { @@ -406,10 +408,10 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: - return 1; + return true; default: - return 0; + return false; } } diff --git a/kernel/mm/memblock.c b/kernel/mm/memblock.c index 9318b567e..d300f1329 100644 --- a/kernel/mm/memblock.c +++ b/kernel/mm/memblock.c @@ -54,10 +54,16 @@ int memblock_debug __initdata_memblock; #ifdef CONFIG_MOVABLE_NODE bool movable_node_enabled __initdata_memblock = false; #endif +static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; static int memblock_reserved_in_slab __initdata_memblock = 0; +ulong __init_memblock choose_memblock_flags(void) +{ + return system_has_some_mirror ? 
MEMBLOCK_MIRROR : MEMBLOCK_NONE; +} + /* inline so we don't get a warning when pr_debug is compiled out */ static __init_memblock const char * memblock_type_name(struct memblock_type *type) @@ -85,7 +91,7 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } -static long __init_memblock memblock_overlaps_region(struct memblock_type *type, +bool __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { unsigned long i; @@ -97,7 +103,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, break; } - return (i < type->cnt) ? i : -1; + return i < type->cnt; } /* @@ -107,6 +113,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area bottom-up. 
* @@ -115,12 +122,13 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, */ static phys_addr_t __init_memblock __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) + phys_addr_t size, phys_addr_t align, int nid, + ulong flags) { phys_addr_t this_start, this_end, cand; u64 i; - for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + for_each_free_mem_range(i, nid, flags, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -139,6 +147,7 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Utility called from memblock_find_in_range_node(), find free area top-down. * @@ -147,12 +156,14 @@ __memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) + phys_addr_t size, phys_addr_t align, int nid, + ulong flags) { phys_addr_t this_start, this_end, cand; u64 i; - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + for_each_free_mem_range_reverse(i, nid, flags, &this_start, &this_end, + NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -174,6 +185,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @flags: pick from blocks based on memory attributes * * Find @size free area aligned to @align in the specified range and node. 
* @@ -190,7 +202,7 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, ulong flags) { phys_addr_t kernel_end, ret; @@ -215,7 +227,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /* ok, try bottom-up allocation first */ ret = __memblock_find_range_bottom_up(bottom_up_start, end, - size, align, nid); + size, align, nid, flags); if (ret) return ret; @@ -233,7 +245,8 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, "memory hotunplug may be affected\n"); } - return __memblock_find_range_top_down(start, end, size, align, nid); + return __memblock_find_range_top_down(start, end, size, align, nid, + flags); } /** @@ -252,8 +265,21 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align) { - return memblock_find_in_range_node(size, align, start, end, - NUMA_NO_NODE); + phys_addr_t ret; + ulong flags = choose_memblock_flags(); + +again: + ret = memblock_find_in_range_node(size, align, start, end, + NUMA_NO_NODE, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + + return ret; } static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) @@ -540,6 +566,10 @@ repeat: * area, insert that portion. 
*/ if (rbase > base) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + WARN_ON(nid != memblock_get_region_node(rgn)); +#endif + WARN_ON(flags != rgn->flags); nr_new++; if (insert) memblock_insert_region(type, i++, base, @@ -585,14 +615,14 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *_rgn = &memblock.memory; + struct memblock_type *type = &memblock.memory; memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(_rgn, base, size, nid, flags); + return memblock_add_range(type, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -676,7 +706,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, return 0; } -int __init_memblock memblock_remove_range(struct memblock_type *type, +static int __init_memblock memblock_remove_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size) { int start_rgn, end_rgn; @@ -732,7 +762,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) * * This function isolates region [@base, @base + @size), and sets/clears flag * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. */ static int __init_memblock memblock_setclr_flag(phys_addr_t base, phys_addr_t size, int set, int flag) @@ -759,7 +789,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. 
*/ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { @@ -771,7 +801,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) * @base: the base phys addr of the region * @size: the size of the region * - * Return 0 on succees, -errno on failure. + * Return 0 on success, -errno on failure. */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { @@ -779,9 +809,57 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) } /** + * memblock_mark_mirror - Mark mirrored memory with flag MEMBLOCK_MIRROR. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) +{ + system_has_some_mirror = true; + + return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); +} + + +/** + * __next_reserved_mem_region - next function for for_each_reserved_region() + * @idx: pointer to u64 loop variable + * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL + * @out_end: ptr to phys_addr_t for end address of the region, can be %NULL + * + * Iterate over all reserved memory regions. + */ +void __init_memblock __next_reserved_mem_region(u64 *idx, + phys_addr_t *out_start, + phys_addr_t *out_end) +{ + struct memblock_type *type = &memblock.reserved; + + if (*idx >= 0 && *idx < type->cnt) { + struct memblock_region *r = &type->regions[*idx]; + phys_addr_t base = r->base; + phys_addr_t size = r->size; + + if (out_start) + *out_start = base; + if (out_end) + *out_end = base + size - 1; + + *idx += 1; + return; + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; +} + +/** * __next__mem_range - next function for for_each_free_mem_range() etc. 
* @idx: pointer to u64 loop variable * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL @@ -803,7 +881,7 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) * As both region arrays are sorted, the function advances the two indices * in lockstep and returns each intersection. */ -void __init_memblock __next_mem_range(u64 *idx, int nid, +void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -831,6 +909,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -894,7 +976,8 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, * in type_b. * * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %NUMA_NO_NODE for all nodes + * @nid: node selector, %NUMA_NO_NODE for all nodes + * @flags: pick from blocks based on memory attributes * @type_a: pointer to memblock_type from where the range is taken * @type_b: pointer to memblock_type which excludes memory from being taken * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL @@ -903,7 +986,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, * * Reverse of __next_mem_range(). 
*/ -void __init_memblock __next_mem_range_rev(u64 *idx, int nid, +void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, @@ -935,6 +1018,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) continue; + /* if we want mirror memory skip non-mirror memory regions */ + if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1050,14 +1137,15 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, ulong flags) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, start, end, nid); + found = memblock_find_in_range_node(size, align, start, end, nid, + flags); if (found && !memblock_reserve(found, size)) { /* * The min_count is set to 0 so that memblock allocations are @@ -1070,26 +1158,40 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, } phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, - phys_addr_t start, phys_addr_t end) + phys_addr_t start, phys_addr_t end, + ulong flags) { - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + flags); } static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, - int nid) + int nid, ulong flags) { - return memblock_alloc_range_nid(size, align, 0, max_addr, nid); + return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags); } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - return memblock_alloc_base_nid(size, 
align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + ulong flags = choose_memblock_flags(); + phys_addr_t ret; + +again: + ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, + nid, flags); + + if (!ret && (flags & MEMBLOCK_MIRROR)) { + flags &= ~MEMBLOCK_MIRROR; + goto again; + } + return ret; } phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) { - return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE); + return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE, + MEMBLOCK_NONE); } phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) @@ -1153,6 +1255,7 @@ static void * __init memblock_virt_alloc_internal( { phys_addr_t alloc; void *ptr; + ulong flags = choose_memblock_flags(); if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n")) nid = NUMA_NO_NODE; @@ -1173,13 +1276,14 @@ static void * __init memblock_virt_alloc_internal( again: alloc = memblock_find_in_range_node(size, align, min_addr, max_addr, - nid); + nid, flags); if (alloc) goto done; if (nid != NUMA_NO_NODE) { alloc = memblock_find_in_range_node(size, align, min_addr, - max_addr, NUMA_NO_NODE); + max_addr, NUMA_NO_NODE, + flags); if (alloc) goto done; } @@ -1187,10 +1291,16 @@ again: if (min_addr) { min_addr = 0; goto again; - } else { - goto error; } + if (flags & MEMBLOCK_MIRROR) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } + + return NULL; done: memblock_reserve(alloc, size); ptr = phys_to_virt(alloc); @@ -1205,9 +1315,6 @@ done: kmemleak_alloc(ptr, size, 0, 0); return ptr; - -error: - return NULL; } /** @@ -1316,7 +1423,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) end = PFN_DOWN(base + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); 
totalram_pages++; } } @@ -1459,12 +1566,12 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size * Check if the region [@base, @base+@size) intersects a reserved memory block. * * RETURNS: - * 0 if false, non-zero if true + * True if they intersect, false if not. */ -int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { memblock_cap_size(base, &size); - return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; + return memblock_overlaps_region(&memblock.reserved, base, size); } void __init_memblock memblock_trim_memory(phys_addr_t align) diff --git a/kernel/mm/memcontrol.c b/kernel/mm/memcontrol.c index 8bd68b5ec..095d20f60 100644 --- a/kernel/mm/memcontrol.c +++ b/kernel/mm/memcontrol.c @@ -62,6 +62,7 @@ #include <linux/oom.h> #include <linux/lockdep.h> #include <linux/file.h> +#include <linux/tracehook.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -79,6 +80,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; +struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -93,6 +95,7 @@ static const char * const mem_cgroup_stat_names[] = { "rss", "rss_huge", "mapped_file", + "dirty", "writeback", "swap", }; @@ -112,56 +115,10 @@ static const char * const mem_cgroup_lru_names[] = { "unevictable", }; -/* - * Per memcg event counter is incremented at every pagein/pageout. With THP, - * it will be incremated by the number of pages. This counter is used for - * for trigger some periodic events. This is straightforward and better - * than using jiffies etc. to handle periodic memcg event. 
- */ -enum mem_cgroup_events_target { - MEM_CGROUP_TARGET_THRESH, - MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_TARGET_NUMAINFO, - MEM_CGROUP_NTARGETS, -}; #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 #define NUMAINFO_EVENTS_TARGET 1024 -struct mem_cgroup_stat_cpu { - long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEMCG_NR_EVENTS]; - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct reclaim_iter { - struct mem_cgroup *position; - /* scan generation, increased every round-trip */ - unsigned int generation; -}; - -/* - * per-zone information in memory controller. - */ -struct mem_cgroup_per_zone { - struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; - - struct reclaim_iter iter[DEF_PRIORITY + 1]; - - struct rb_node tree_node; /* RB tree node */ - unsigned long usage_in_excess;/* Set to the value by which */ - /* the soft limit is exceeded*/ - bool on_tree; - struct mem_cgroup *memcg; /* Back pointer, we cannot */ - /* use container_of */ -}; - -struct mem_cgroup_per_node { - struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -182,32 +139,6 @@ struct mem_cgroup_tree { static struct mem_cgroup_tree soft_limit_tree __read_mostly; -struct mem_cgroup_threshold { - struct eventfd_ctx *eventfd; - unsigned long threshold; -}; - -/* For threshold */ -struct mem_cgroup_threshold_ary { - /* An array index points to threshold just below or equal to usage. */ - int current_threshold; - /* Size of entries[] */ - unsigned int size; - /* Array of thresholds */ - struct mem_cgroup_threshold entries[0]; -}; - -struct mem_cgroup_thresholds { - /* Primary thresholds array */ - struct mem_cgroup_threshold_ary *primary; - /* - * Spare threshold array. - * This is needed to make mem_cgroup_unregister_event() "never fail". - * It must be able to store at least primary->size - 1 entries. 
- */ - struct mem_cgroup_threshold_ary *spare; -}; - /* for OOM */ struct mem_cgroup_eventfd_list { struct list_head list; @@ -257,113 +188,6 @@ struct mem_cgroup_event { static void mem_cgroup_threshold(struct mem_cgroup *memcg); static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); -/* - * The memory controller data structure. The memory controller controls both - * page cache and RSS per cgroup. We would eventually like to provide - * statistics based on the statistics developed by Rik Van Riel for clock-pro, - * to help the administrator determine what knobs to tune. - */ -struct mem_cgroup { - struct cgroup_subsys_state css; - - /* Accounted resources */ - struct page_counter memory; - struct page_counter memsw; - struct page_counter kmem; - - /* Normal memory consumption range */ - unsigned long low; - unsigned long high; - - unsigned long soft_limit; - - /* vmpressure notifications */ - struct vmpressure vmpressure; - - /* css_online() has been completed */ - int initialized; - - /* - * Should the accounting and control be hierarchical, per subtree? - */ - bool use_hierarchy; - - bool oom_lock; - atomic_t under_oom; - atomic_t oom_wakeups; - - int swappiness; - /* OOM-Killer disable */ - int oom_kill_disable; - - /* protect arrays of thresholds */ - struct mutex thresholds_lock; - - /* thresholds for memory usage. RCU-protected */ - struct mem_cgroup_thresholds thresholds; - - /* thresholds for mem+swap usage. RCU-protected */ - struct mem_cgroup_thresholds memsw_thresholds; - - /* For oom notifier event fd */ - struct list_head oom_notify; - - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; - /* - * set > 0 if pages under this cgroup are moving to other cgroup. 
- */ - atomic_t moving_account; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - struct task_struct *move_lock_task; - unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; - /* - * used when a cpu is offlined or other synchronizations - * See mem_cgroup_read_stat(). - */ - struct mem_cgroup_stat_cpu nocpu_base; - spinlock_t pcp_counter_lock; - -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) - struct cg_proto tcp_mem; -#endif -#if defined(CONFIG_MEMCG_KMEM) - /* Index in the kmem_cache->memcg_params.memcg_caches array */ - int kmemcg_id; - bool kmem_acct_activated; - bool kmem_acct_active; -#endif - - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - - /* List of events which userspace want to receive */ - struct list_head event_list; - spinlock_t event_list_lock; - - struct mem_cgroup_per_node *nodeinfo[0]; - /* WARNING: nodeinfo must be the last member here */ -}; - -#ifdef CONFIG_MEMCG_KMEM -bool memcg_kmem_is_active(struct mem_cgroup *memcg) -{ - return memcg->kmem_acct_active; -} -#endif - /* Stuffs for move charges at task migration. */ /* * Types of charges to be moved. @@ -424,11 +248,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) -{ - return s ? container_of(s, struct mem_cgroup, css) : NULL; -} - /* Some nice accessors for the vmpressure. 
*/ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -500,8 +319,7 @@ void sock_update_memcg(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && - memcg_proto_active(cg_proto) && + if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && css_tryget_online(&memcg->css)) { sk->sk_cgrp = cg_proto; } @@ -594,11 +412,67 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) return &memcg->nodeinfo[nid]->zoneinfo[zid]; } -struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) +/** + * mem_cgroup_css_from_page - css of the memcg associated with a page + * @page: page of interest + * + * If memcg is bound to the default hierarchy, css of the memcg associated + * with @page is returned. The returned css remains associated with @page + * until it is released. + * + * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup + * is returned. + * + * XXX: The above description of behavior on the default hierarchy isn't + * strictly true yet as replace_page_cache_page() can modify the + * association before @page is released even on the default hierarchy; + * however, the current and planned usages don't mix the the two functions + * and replace_page_cache_page() will soon be updated to make the invariant + * actually true. + */ +struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { + struct mem_cgroup *memcg; + + rcu_read_lock(); + + memcg = page->mem_cgroup; + + if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg = root_mem_cgroup; + + rcu_read_unlock(); return &memcg->css; } +/** + * page_cgroup_ino - return inode number of the memcg a page is charged to + * @page: the page + * + * Look up the closest online ancestor of the memory cgroup @page is charged to + * and return its inode number or 0 if @page is not charged to any cgroup. 
It + * is safe to call this function without holding a reference to @page. + * + * Note, this function is inherently racy, because there is nothing to prevent + * the cgroup inode from getting torn down and potentially reallocated a moment + * after page_cgroup_ino() returns, so it only should be used by callers that + * do not care (such as procfs interfaces). + */ +ino_t page_cgroup_ino(struct page *page) +{ + struct mem_cgroup *memcg; + unsigned long ino = 0; + + rcu_read_lock(); + memcg = READ_ONCE(page->mem_cgroup); + while (memcg && !(memcg->css.flags & CSS_ONLINE)) + memcg = parent_mem_cgroup(memcg); + if (memcg) + ino = cgroup_ino(memcg->css.cgroup); + rcu_read_unlock(); + return ino; +} + static struct mem_cgroup_per_zone * mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) { @@ -774,12 +648,14 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) } /* + * Return page count for single (non recursive) @memcg. + * * Implementation Note: reading percpu statistics for memcg. * * Both of vmstat[] and percpu_counter has threshold and do periodic * synchronization to implement "quick" read. There are trade-off between * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronizion of counter in memcg's counter. + * a periodic synchronization of counter in memcg's counter. * * But this _read() function is used for user interface now. The user accounts * memory usage by memory cgroup and he _always_ requires exact value because @@ -789,24 +665,24 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * * If there are kernel internal actions which can make use of some not-exact * value, and reading all cpu value can be performance bottleneck in some - * common workload, threashold and synchonization as vmstat[] should be + * common workload, threshold and synchronization as vmstat[] should be * implemented. 
*/ -static long mem_cgroup_read_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static unsigned long +mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + /* Per-cpu values can be negative, use a signed accumulator */ + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->count[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.count[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); + /* + * Summing races with updates, so val may be negative. Avoid exposing + * transient negative values. + */ + if (val < 0) + val = 0; return val; } @@ -816,15 +692,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, unsigned long val = 0; int cpu; - get_online_cpus(); - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) val += per_cpu(memcg->stat->events[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.events[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); return val; } @@ -858,14 +727,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - struct mem_cgroup_per_zone *mz; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - return mz->lru_size[lru]; -} - static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) @@ -968,6 +829,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } +EXPORT_SYMBOL(mem_cgroup_from_task); static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { @@ -1013,7 +875,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup 
*prev, struct mem_cgroup_reclaim_cookie *reclaim) { - struct reclaim_iter *uninitialized_var(iter); + struct mem_cgroup_reclaim_iter *uninitialized_var(iter); struct cgroup_subsys_state *css = NULL; struct mem_cgroup *memcg = NULL; struct mem_cgroup *pos = NULL; @@ -1044,14 +906,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && reclaim->generation != iter->generation) goto out_unlock; - do { + while (1) { pos = READ_ONCE(iter->position); + if (!pos || css_tryget(&pos->css)) + break; /* - * A racing update may change the position and - * put the last reference, hence css_tryget(), - * or retry to see the updated position. + * css reference reached zero, so iter->position will + * be cleared by ->css_released. However, we should not + * rely on this happening soon, because ->css_released + * is called from a work queue, and by busy-waiting we + * might block it. So we clear iter->position right + * away. */ - } while (pos && !css_tryget(&pos->css)); + (void)cmpxchg(&iter->position, pos, NULL); + } } if (pos) @@ -1097,17 +965,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } if (reclaim) { - if (cmpxchg(&iter->position, pos, memcg) == pos) { - if (memcg) - css_get(&memcg->css); - if (pos) - css_put(&pos->css); - } - /* - * pairs with css_tryget when dereferencing iter->position - * above. + * The position could have already been updated by a competing + * thread, so check that the value hasn't changed since we read + * it to avoid reclaiming from the same cgroup twice. 
*/ + (void)cmpxchg(&iter->position, pos, memcg); + if (pos) css_put(&pos->css); @@ -1140,6 +1004,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup_reclaim_iter *iter; + struct mem_cgroup_per_zone *mz; + int nid, zid; + int i; + + while ((memcg = parent_mem_cgroup(memcg))) { + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); + } + } + } + } +} + /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -1155,30 +1041,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, iter != NULL; \ iter = mem_cgroup_iter(NULL, iter, NULL)) -void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - goto out; - - switch (idx) { - case PGFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); - break; - case PGMAJFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); - break; - default: - BUG(); - } -out: - rcu_read_unlock(); -} -EXPORT_SYMBOL(__mem_cgroup_count_vm_event); - /** * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg * @zone: zone of the wanted lruvec @@ -1277,15 +1139,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, VM_BUG_ON((long)(*lru_size) < 0); } -bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) -{ - if (root == memcg) - return true; - if (!root->use_hierarchy) - return false; - return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); -} - bool 
task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; @@ -1312,39 +1165,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; - unsigned long gb; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); - - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - - return inactive * inactive_ratio < active; -} - -bool mem_cgroup_lruvec_online(struct lruvec *lruvec) -{ - struct mem_cgroup_per_zone *mz; - struct mem_cgroup *memcg; - - if (mem_cgroup_disabled()) - return true; - - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); - memcg = mz->memcg; - - return !!(memcg->css.flags & CSS_ONLINE); -} - #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) @@ -1376,15 +1196,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) return margin; } -int mem_cgroup_swappiness(struct mem_cgroup *memcg) -{ - /* root ? */ - if (mem_cgroup_disabled() || !memcg->css.parent) - return vm_swappiness; - - return memcg->swappiness; -} - /* * A routine for checking "mem" is under move_account() or not. 
* @@ -1480,7 +1291,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; - pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], + pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], K(mem_cgroup_read_stat(iter, i))); } @@ -1527,23 +1338,31 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { + struct oom_control oc = { + .zonelist = NULL, + .nodemask = NULL, + .gfp_mask = gfp_mask, + .order = order, + }; struct mem_cgroup *iter; unsigned long chosen_points = 0; unsigned long totalpages; unsigned int points = 0; struct task_struct *chosen = NULL; + mutex_lock(&oom_lock); + /* * If current has a pending SIGKILL or is exiting, then automatically * select it. The goal is to allow it to allocate so that it may * quickly exit and free its memory. */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { - mark_tsk_oom_victim(current); - return; + mark_oom_victim(current); + goto unlock; } - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); + check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); totalpages = mem_cgroup_get_limit(memcg) ? 
: 1; for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1551,8 +1370,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_start(&iter->css, &it); while ((task = css_task_iter_next(&it))) { - switch (oom_scan_process_thread(task, totalpages, NULL, - false)) { + switch (oom_scan_process_thread(&oc, task, totalpages)) { case OOM_SCAN_SELECT: if (chosen) put_task_struct(chosen); @@ -1567,7 +1385,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); - return; + goto unlock; case OOM_SCAN_OK: break; }; @@ -1588,11 +1406,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, css_task_iter_end(&it); } - if (!chosen) - return; - points = chosen_points * 1000 / totalpages; - oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, - NULL, "Memory cgroup out of memory"); + if (chosen) { + points = chosen_points * 1000 / totalpages; + oom_kill_process(&oc, chosen, points, totalpages, memcg, + "Memory cgroup out of memory"); + } +unlock: + mutex_unlock(&oom_lock); } #if MAX_NUMNODES > 1 @@ -1809,8 +1629,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_inc(&iter->under_oom); + iter->under_oom++; + spin_unlock(&memcg_oom_lock); } static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) @@ -1819,11 +1641,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) /* * When a new child is created while the hierarchy is under oom, - * mem_cgroup_oom_lock() may not be called. We have to use - * atomic_add_unless() here. + * mem_cgroup_oom_lock() may not be called. Watch for underflow. 
*/ + spin_lock(&memcg_oom_lock); for_each_mem_cgroup_tree(iter, memcg) - atomic_add_unless(&iter->under_oom, -1, 0); + if (iter->under_oom > 0) + iter->under_oom--; + spin_unlock(&memcg_oom_lock); } static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); @@ -1849,22 +1673,23 @@ static int memcg_oom_wake_function(wait_queue_t *wait, return autoremove_wake_function(wait, mode, sync, arg); } -static void memcg_wakeup_oom(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->oom_wakeups); - /* for filtering, pass "memcg" as argument. */ - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); -} - static void memcg_oom_recover(struct mem_cgroup *memcg) { - if (memcg && atomic_read(&memcg->under_oom)) - memcg_wakeup_oom(memcg); + /* + * For the following lockless ->under_oom test, the only required + * guarantee is that it must see the state asserted by an OOM when + * this function is called as a result of userland actions + * triggered by the notification of the OOM. This is trivially + * achieved by invoking mem_cgroup_mark_under_oom() before + * triggering notification. + */ + if (memcg && memcg->under_oom) + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_oom.may_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -1881,9 +1706,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) * and when we know whether the fault was overall successful. 
*/ css_get(&memcg->css); - current->memcg_oom.memcg = memcg; - current->memcg_oom.gfp_mask = mask; - current->memcg_oom.order = order; + current->memcg_in_oom = memcg; + current->memcg_oom_gfp_mask = mask; + current->memcg_oom_order = order; } /** @@ -1905,7 +1730,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) */ bool mem_cgroup_oom_synchronize(bool handle) { - struct mem_cgroup *memcg = current->memcg_oom.memcg; + struct mem_cgroup *memcg = current->memcg_in_oom; struct oom_wait_info owait; bool locked; @@ -1933,8 +1758,8 @@ bool mem_cgroup_oom_synchronize(bool handle) if (locked && !memcg->oom_kill_disable) { mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, - current->memcg_oom.order); + mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, + current->memcg_oom_order); } else { schedule(); mem_cgroup_unmark_under_oom(memcg); @@ -1951,7 +1776,7 @@ bool mem_cgroup_oom_synchronize(bool handle) memcg_oom_recover(memcg); } cleanup: - current->memcg_oom.memcg = NULL; + current->memcg_in_oom = NULL; css_put(&memcg->css); return true; } @@ -2014,6 +1839,7 @@ again: return memcg; } +EXPORT_SYMBOL(mem_cgroup_begin_page_stat); /** * mem_cgroup_end_page_stat - finish a page state statistics transaction @@ -2032,23 +1858,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } - -/** - * mem_cgroup_update_page_stat - update page state statistics - * @memcg: memcg to account against - * @idx: page state item to account - * @val: number of pages (positive or negative) - * - * See mem_cgroup_begin_page_stat() for locking requirements. - */ -void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx, int val) -{ - VM_BUG_ON(!rcu_read_lock_held()); - - if (memcg) - this_cpu_add(memcg->stat->count[idx], val); -} +EXPORT_SYMBOL(mem_cgroup_end_page_stat); /* * size of first charge trial. 
"32" comes from vmscan.c's magic value. @@ -2175,37 +1985,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -/* - * This function drains percpu counter value from DEAD cpu and - * move it to local cpu. Note that this function can be preempted. - */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) -{ - int i; - - spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long x = per_cpu(memcg->stat->count[i], cpu); - - per_cpu(memcg->stat->count[i], cpu) = 0; - memcg->nocpu_base.count[i] += x; - } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(memcg->stat->events[i], cpu); - - per_cpu(memcg->stat->events[i], cpu) = 0; - memcg->nocpu_base.events[i] += x; - } - spin_unlock(&memcg->pcp_counter_lock); -} - static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - struct mem_cgroup *iter; if (action == CPU_ONLINE) return NOTIFY_OK; @@ -2213,14 +1998,36 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup(iter) - mem_cgroup_drain_pcp_counter(iter, cpu); - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; } +/* + * Scheduled by try_charge() to be executed from the userland return path + * and reclaims memory over the high limit. 
+ */ +void mem_cgroup_handle_over_high(void) +{ + unsigned int nr_pages = current->memcg_nr_pages_over_high; + struct mem_cgroup *memcg, *pos; + + if (likely(!nr_pages)) + return; + + pos = memcg = get_mem_cgroup_from_mm(current->mm); + + do { + if (page_counter_read(&pos->memory) <= pos->high) + continue; + mem_cgroup_events(pos, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); + } while ((pos = parent_mem_cgroup(pos))); + + css_put(&memcg->css); + current->memcg_nr_pages_over_high = 0; +} + static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { @@ -2231,17 +2038,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long nr_reclaimed; bool may_swap = true; bool drained = false; - int ret = 0; if (mem_cgroup_is_root(memcg)) - goto done; + return 0; retry: if (consume_stock(memcg, nr_pages)) - goto done; + return 0; if (!do_swap_account || - !page_counter_try_charge(&memcg->memsw, batch, &counter)) { - if (!page_counter_try_charge(&memcg->memory, batch, &counter)) + page_counter_try_charge(&memcg->memsw, batch, &counter)) { + if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; if (do_swap_account) page_counter_uncharge(&memcg->memsw, batch); @@ -2265,12 +2071,12 @@ retry: if (unlikely(test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current) || current->flags & PF_EXITING)) - goto bypass; + goto force; if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -2311,38 +2117,54 @@ retry: goto retry; if (gfp_mask & __GFP_NOFAIL) - goto bypass; + goto force; if (fatal_signal_pending(current)) - goto bypass; + goto force; mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); + mem_cgroup_oom(mem_over_limit, gfp_mask, + get_order(nr_pages * PAGE_SIZE)); 
nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; -bypass: - return -EINTR; +force: + /* + * The allocation either can't fail or will lead to more memory + * being freed very soon. Allow memory usage go over the limit + * temporarily by force charging it. + */ + page_counter_charge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + return 0; done_restock: css_get_many(&memcg->css, batch); if (batch > nr_pages) refill_stock(memcg, batch - nr_pages); - if (!(gfp_mask & __GFP_WAIT)) - goto done; + /* - * If the hierarchy is above the normal consumption range, - * make the charging task trim their excess contribution. + * If the hierarchy is above the normal consumption range, schedule + * reclaim on returning to userland. We can perform reclaim here + * if __GFP_RECLAIM but let's always punt for simplicity and so that + * GFP_KERNEL can consistently be used during reclaim. @memcg is + * not recorded as it most likely matches current's and won't + * change in the meantime. As high limit is checked again before + * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) <= memcg->high) - continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + if (page_counter_read(&memcg->memory) > memcg->high) { + current->memcg_nr_pages_over_high += batch; + set_notify_resume(current); + break; + } } while ((memcg = parent_mem_cgroup(memcg))); -done: - return ret; + + return 0; } static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2357,40 +2179,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) css_put_many(&memcg->css, nr_pages); } -/* - * try_get_mem_cgroup_from_page - look up page's memcg association - * @page: the page - * - * Look up, get a css reference, and return the memcg that owns @page. 
- * - * The page must be locked to prevent racing with swap-in and page - * cache charges. If coming from an unlocked page table, the caller - * must ensure the page is on the LRU or this can race with charging. - */ -struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) -{ - struct mem_cgroup *memcg; - unsigned short id; - swp_entry_t ent; - - VM_BUG_ON_PAGE(!PageLocked(page), page); - - memcg = page->mem_cgroup; - if (memcg) { - if (!css_tryget_online(&memcg->css)) - memcg = NULL; - } else if (PageSwapCache(page)) { - ent.val = page_private(page); - id = lookup_swap_cgroup_id(ent); - rcu_read_lock(); - memcg = mem_cgroup_from_id(id); - if (memcg && !css_tryget_online(&memcg->css)) - memcg = NULL; - rcu_read_unlock(); - } - return memcg; -} - static void lock_page_lru(struct page *page, int *isolated) { struct zone *zone = page_zone(page); @@ -2457,65 +2245,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, } #ifdef CONFIG_MEMCG_KMEM -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, - unsigned long nr_pages) -{ - struct page_counter *counter; - int ret = 0; - - ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); - if (ret < 0) - return ret; - - ret = try_charge(memcg, gfp, nr_pages); - if (ret == -EINTR) { - /* - * try_charge() chose to bypass to root due to OOM kill or - * fatal signal. Since our only options are to either fail - * the allocation or charge it to this cgroup, do it as a - * temporary condition. But we can't fail. From a kmem/slab - * perspective, the cache has already been selected, by - * mem_cgroup_kmem_get_cache(), so it is too late to change - * our minds. - * - * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed - * during try_charge() above. 
Tasks that were already dying - * when the allocation triggers should have been already - * directed to the root cgroup in memcontrol.h - */ - page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_charge(&memcg->memsw, nr_pages); - css_get_many(&memcg->css, nr_pages); - ret = 0; - } else if (ret) - page_counter_uncharge(&memcg->kmem, nr_pages); - - return ret; -} - -void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) -{ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) - page_counter_uncharge(&memcg->memsw, nr_pages); - - page_counter_uncharge(&memcg->kmem, nr_pages); - - css_put_many(&memcg->css, nr_pages); -} - -/* - * helper for acessing a memcg's index. It will be used as an index in the - * child cache array in kmem_cache, and also to derive its name. This function - * will return -1 when this is not a kmem-limited memcg. - */ -int memcg_cache_id(struct mem_cgroup *memcg) -{ - return memcg ? memcg->kmemcg_id : -1; -} - static int memcg_alloc_cache_id(void) { int id, size; @@ -2677,85 +2406,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep) css_put(&cachep->memcg_params.memcg->css); } -/* - * We need to verify if the allocation against current->mm->owner's memcg is - * possible for the given order. But the page is not allocated yet, so we'll - * need a further commit step to do the final arrangements. - * - * It is possible for the task to switch cgroups in this mean time, so at - * commit time, we can't rely on task conversion any longer. We'll then use - * the handle argument to return to the caller which cgroup we should commit - * against. We could also return the memcg directly and avoid the pointer - * passing, but a boolean return value gives better semantics considering - * the compiled-out case as well. - * - * Returning true means the allocation is possible. 
- */ -bool -__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) +int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, + struct mem_cgroup *memcg) { - struct mem_cgroup *memcg; + unsigned int nr_pages = 1 << order; + struct page_counter *counter; int ret; - *_memcg = NULL; + if (!memcg_kmem_is_active(memcg)) + return 0; - memcg = get_mem_cgroup_from_mm(current->mm); + if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) + return -ENOMEM; - if (!memcg_kmem_is_active(memcg)) { - css_put(&memcg->css); - return true; + ret = try_charge(memcg, gfp, nr_pages); + if (ret) { + page_counter_uncharge(&memcg->kmem, nr_pages); + return ret; } - ret = memcg_charge_kmem(memcg, gfp, 1 << order); - if (!ret) - *_memcg = memcg; + page->mem_cgroup = memcg; - css_put(&memcg->css); - return (ret == 0); + return 0; } -void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, - int order) +int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { - VM_BUG_ON(mem_cgroup_is_root(memcg)); + struct mem_cgroup *memcg; + int ret; - /* The page allocation failed. 
Revert */ - if (!page) { - memcg_uncharge_kmem(memcg, 1 << order); - return; - } - page->mem_cgroup = memcg; + memcg = get_mem_cgroup_from_mm(current->mm); + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + css_put(&memcg->css); + return ret; } -void __memcg_kmem_uncharge_pages(struct page *page, int order) +void __memcg_kmem_uncharge(struct page *page, int order) { struct mem_cgroup *memcg = page->mem_cgroup; + unsigned int nr_pages = 1 << order; if (!memcg) return; VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - memcg_uncharge_kmem(memcg, 1 << order); - page->mem_cgroup = NULL; -} - -struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) -{ - struct mem_cgroup *memcg = NULL; - struct kmem_cache *cachep; - struct page *page; - - page = virt_to_head_page(ptr); - if (PageSlab(page)) { - cachep = page->slab_cache; - if (!is_root_cache(cachep)) - memcg = cachep->memcg_params.memcg; - } else - /* page allocated by alloc_kmem_pages */ - memcg = page->mem_cgroup; + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_swap_account) + page_counter_uncharge(&memcg->memsw, nr_pages); - return memcg; + page->mem_cgroup = NULL; + css_put_many(&memcg->css, nr_pages); } #endif /* CONFIG_MEMCG_KMEM */ @@ -3121,20 +2823,17 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; - long val = 0; + unsigned long val = 0; - /* Per-cpu values can be negative, use a signed accumulator */ for_each_mem_cgroup_tree(iter, memcg) val += mem_cgroup_read_stat(iter, idx); - if (val < 0) /* race ? 
*/ - val = 0; return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - u64 val; + unsigned long val; if (mem_cgroup_is_root(memcg)) { val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); @@ -3147,7 +2846,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) else val = page_counter_read(&memcg->memsw); } - return val << PAGE_SHIFT; + return val; } enum { @@ -3181,9 +2880,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: if (counter == &memcg->memory) - return mem_cgroup_usage(memcg, false); + return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; if (counter == &memcg->memsw) - return mem_cgroup_usage(memcg, true); + return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->limit * PAGE_SIZE; @@ -3222,7 +2921,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. 
*/ mutex_lock(&memcg_create_mutex); - if (cgroup_has_tasks(memcg->css.cgroup) || + if (cgroup_is_populated(memcg->css.cgroup) || (memcg->use_hierarchy && memcg_has_children(memcg))) err = -EBUSY; mutex_unlock(&memcg_create_mutex); @@ -3471,7 +3170,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; - seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], + seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); } @@ -3496,13 +3195,13 @@ static int memcg_stat_show(struct seq_file *m, void *v) (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long long val = 0; + unsigned long long val = 0; if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; - seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); + seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); } for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { @@ -3829,16 +3528,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, swap_buffers: /* Swap primary and spare array */ thresholds->spare = thresholds->primary; - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -3870,7 +3570,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, list_add(&event->list, &memcg->oom_notify); /* already in OOM ? 
*/ - if (atomic_read(&memcg->under_oom)) + if (memcg->under_oom) eventfd_signal(eventfd, 1); spin_unlock(&memcg_oom_lock); @@ -3899,7 +3599,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); - seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); + seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); return 0; } @@ -4001,6 +3701,97 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) +{ + return &memcg->cgwb_list; +} + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ + wb_domain_size_changed(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +/** + * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg + * @wb: bdi_writeback in question + * @pfilepages: out parameter for number of file pages + * @pheadroom: out parameter for number of allocatable pages according to memcg + * @pdirty: out parameter for number of dirty pages + * @pwriteback: out parameter for number of pages under writeback + * + * Determine the numbers of file, headroom, dirty, and writeback pages in + * @wb's memcg. File, dirty and writeback are self-explanatory. Headroom + * is a bit more involved. + * + * A memcg's headroom is "min(max, high) - used". In the hierarchy, the + * headroom is calculated as the lowest headroom of itself and the + * ancestors. 
Note that this doesn't consider the actual amount of + * available memory in the system. The caller should further cap + * *@pheadroom accordingly. + */ +void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, + unsigned long *pheadroom, unsigned long *pdirty, + unsigned long *pwriteback) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + struct mem_cgroup *parent; + + *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); + + /* this should eventually include NR_UNSTABLE_NFS */ + *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | + (1 << LRU_ACTIVE_FILE)); + *pheadroom = PAGE_COUNTER_MAX; + + while ((parent = parent_mem_cgroup(memcg))) { + unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long used = page_counter_read(&memcg->memory); + + *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); + memcg = parent; + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + +static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * DO NOT USE IN NEW FILES. 
* @@ -4270,8 +4061,7 @@ static struct cftype mem_cgroup_legacy_files[] = { { .name = "cgroup.event_control", /* XXX: for compat */ .write = memcg_write_event_control, - .flags = CFTYPE_NO_PREFIX, - .mode = S_IWUGO, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, }, { .name = "swappiness", @@ -4385,9 +4175,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto out_free; - spin_lock_init(&memcg->pcp_counter_lock); + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + return memcg; +out_free_stat: + free_percpu(memcg->stat); out_free: kfree(memcg); return NULL; @@ -4414,6 +4209,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); + memcg_wb_domain_exit(memcg); kfree(memcg); } @@ -4446,6 +4242,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) /* root ? */ if (parent_css == NULL) { root_mem_cgroup = memcg; + mem_cgroup_root_css = &memcg->css; page_counter_init(&memcg->memory, NULL); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4464,7 +4261,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); +#endif return &memcg->css; free_out: @@ -4552,6 +4351,15 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); +} + +static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + invalidate_reclaim_iterators(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) @@ -4585,6 +4393,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = 
PAGE_COUNTER_MAX; + memcg_wb_domain_size_changed(memcg); } #ifdef CONFIG_MMU @@ -4593,28 +4402,16 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; } - if (ret == -EINTR) { - cancel_charge(root_mem_cgroup, count); - return ret; - } /* Try charges one by one with reclaim */ while (count--) { ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); - /* - * In case of failure, any residual charges against - * mc.to will be dropped by mem_cgroup_clear_mc() - * later on. However, cancel any charges that are - * bypassed to root right away or they'll be lost. - */ - if (ret == -EINTR) - cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; @@ -4754,6 +4551,7 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; + bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -4768,9 +4566,8 @@ static int mem_cgroup_move_account(struct page *page, goto out; /* - * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup - * of its source page while we change it: page migration takes - * both pages off the LRU, but page cache replacement doesn't. + * Prevent mem_cgroup_replace_page() from looking at + * page->mem_cgroup of its source page while we change it. 
*/ if (!trylock_page(page)) goto out; @@ -4779,15 +4576,33 @@ static int mem_cgroup_move_account(struct page *page, if (page->mem_cgroup != from) goto out_unlock; + anon = PageAnon(page); + spin_lock_irqsave(&from->move_lock, flags); - if (!PageAnon(page) && page_mapped(page)) { + if (!anon && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); } + /* + * move_lock grabbed above and caller set from->moving_account, so + * mem_cgroup_update_page_stat() will serialize updates to PageDirty. + * So mapping should be stable for dirty pages. + */ + if (!anon && PageDirty(page)) { + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], + nr_pages); + } + } + if (PageWriteback(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], nr_pages); @@ -5002,13 +4817,34 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); - int ret = 0; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; + struct mem_cgroup *from; + struct task_struct *leader, *p; + struct mm_struct *mm; unsigned long move_flags; + int ret = 0; + + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return 0; + + /* + * Multi-process migrations only happen on the default hierarchy + * where charge immigration is not used. Perform charge + * immigration if @tset contains a leader and whine if there are + * multiple. 
+ */ + p = NULL; + cgroup_taskset_for_each_leader(leader, css, tset) { + WARN_ON_ONCE(p); + p = leader; + memcg = mem_cgroup_from_css(css); + } + if (!p) + return 0; /* * We are now commited to this value whatever it is. Changes in this @@ -5016,41 +4852,40 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * So we need to save it, and keep it going. */ move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (move_flags) { - struct mm_struct *mm; - struct mem_cgroup *from = mem_cgroup_from_task(p); + if (!move_flags) + return 0; - VM_BUG_ON(from == memcg); + from = mem_cgroup_from_task(p); - mm = get_task_mm(p); - if (!mm) - return 0; - /* We move charges only when we move a owner of the mm */ - if (mm->owner == p) { - VM_BUG_ON(mc.from); - VM_BUG_ON(mc.to); - VM_BUG_ON(mc.precharge); - VM_BUG_ON(mc.moved_charge); - VM_BUG_ON(mc.moved_swap); - - spin_lock(&mc.lock); - mc.from = from; - mc.to = memcg; - mc.flags = move_flags; - spin_unlock(&mc.lock); - /* We set mc.moving_task later */ - - ret = mem_cgroup_precharge_mc(mm); - if (ret) - mem_cgroup_clear_mc(); - } - mmput(mm); + VM_BUG_ON(from == memcg); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + + spin_lock(&mc.lock); + mc.from = from; + mc.to = memcg; + mc.flags = move_flags; + spin_unlock(&mc.lock); + /* We set mc.moving_task later */ + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); } + mmput(mm); return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -5192,10 +5027,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - 
struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5207,17 +5042,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif @@ -5234,7 +5066,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. 
*/ - if (cgroup_on_dfl(root_css->cgroup)) + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) root_mem_cgroup->use_hierarchy = true; else root_mem_cgroup->use_hierarchy = false; @@ -5243,7 +5075,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) static u64 memory_current_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_usage(mem_cgroup_from_css(css), false); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } static int memory_low_show(struct seq_file *m, void *v) @@ -5303,6 +5137,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5335,6 +5170,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5353,6 +5189,7 @@ static int memory_events_show(struct seq_file *m, void *v) static struct cftype memory_files[] = { { .name = "current", + .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, { @@ -5376,6 +5213,7 @@ static struct cftype memory_files[] = { { .name = "events", .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, { } /* terminate */ @@ -5385,6 +5223,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, + .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, @@ -5397,19 +5236,6 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_events - count memory events against a cgroup - * @memcg: the memory cgroup - * @idx: the event index - * @nr: the number of events to account for - */ -void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - 
unsigned int nr) -{ - this_cpu_add(memcg->stat->events[idx], nr); -} - -/** * mem_cgroup_low - check if memory consumption is below the normal range * @root: the highest ancestor to consider * @memcg: the memory cgroup to check @@ -5481,8 +5307,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, * the page lock, which serializes swap cache removal, which * in turn serializes uncharging. */ + VM_BUG_ON_PAGE(!PageLocked(page), page); if (page->mem_cgroup) goto out; + + if (do_swap_account) { + swp_entry_t ent = { .val = page_private(page), }; + unsigned short id = lookup_swap_cgroup_id(ent); + + rcu_read_lock(); + memcg = mem_cgroup_from_id(id); + if (memcg && !css_tryget_online(&memcg->css)) + memcg = NULL; + rcu_read_unlock(); + } } if (PageTransHuge(page)) { @@ -5490,19 +5328,12 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, VM_BUG_ON_PAGE(!PageTransHuge(page), page); } - if (do_swap_account && PageSwapCache(page)) - memcg = try_get_mem_cgroup_from_page(page); if (!memcg) memcg = get_mem_cgroup_from_mm(mm); ret = try_charge(memcg, gfp_mask, nr_pages); css_put(&memcg->css); - - if (ret == -EINTR) { - memcg = root_mem_cgroup; - ret = 0; - } out: *memcgp = memcg; return ret; @@ -5717,25 +5548,22 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - migrate a charge to another page + * mem_cgroup_replace_page - migrate a charge to another page * @oldpage: currently charged page * @newpage: page to transfer the charge to - * @lrucare: either or both pages might be on the LRU already * * Migrate the charge from @oldpage to @newpage. * * Both pages must be locked, @newpage->mapping must be set up. + * Either or both pages might be on the LRU already. 
*/ -void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, - bool lrucare) +void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; int isolated; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); - VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), newpage); @@ -5747,25 +5575,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, if (newpage->mem_cgroup) return; - /* - * Swapcache readahead pages can get migrated before being - * charged, and migration from compaction can happen to an - * uncharged page when the PFN walker finds a page that - * reclaim just put back on the LRU but has not released yet. - */ + /* Swapcache readahead pages can get replaced before being charged */ memcg = oldpage->mem_cgroup; if (!memcg) return; - if (lrucare) - lock_page_lru(oldpage, &isolated); - + lock_page_lru(oldpage, &isolated); oldpage->mem_cgroup = NULL; + unlock_page_lru(oldpage, isolated); - if (lrucare) - unlock_page_lru(oldpage, isolated); - - commit_charge(newpage, memcg, lrucare); + commit_charge(newpage, memcg, true); } /* @@ -5842,8 +5661,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + /* + * Interrupts should be disabled here because the caller holds the + * mapping->tree_lock lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for udpating the per-CPU variables. 
+ */ local_lock_irqsave(event_lock, flags); - /* Caller disabled preemption with mapping->tree_lock */ +#ifndef CONFIG_PREEMPT_RT_BASE + VM_BUG_ON(!irqs_disabled()); +#endif mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); local_unlock_irqrestore(event_lock, flags); diff --git a/kernel/mm/memory-failure.c b/kernel/mm/memory-failure.c index 9f48145c8..750b7893e 100644 --- a/kernel/mm/memory-failure.c +++ b/kernel/mm/memory-failure.c @@ -20,6 +20,14 @@ * this code has to be extremely careful. Generally it tries to use * normal locking rules, as in get the standard locks, even if that means * the error handling takes potentially a long time. + * + * It can be very tempting to add handling for obscure cases here. + * In general any code for handling new cases should only be added iff: + * - You know how to test it. + * - You have a test that can be added to mce-test + * https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/ + * - The case actually shows up as a frequent (top 10) page state in + * tools/vm/page-types when running a real workload. * * There are several operations here with exponential complexity because * of unsuitable VM data structures. For example the operation to map back @@ -28,13 +36,6 @@ * are rare we hope to get away with this. This avoids impacting the core * VM. */ - -/* - * Notebook: - * - hugetlb needs more code - * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages - * - pass bad pages to kdump next kernel - */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/page-flags.h> @@ -55,7 +56,9 @@ #include <linux/memory_hotplug.h> #include <linux/mm_inline.h> #include <linux/kfifo.h> +#include <linux/ratelimit.h> #include "internal.h" +#include "ras/ras_event.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -128,27 +131,15 @@ static int hwpoison_filter_flags(struct page *p) * can only guarantee that the page either belongs to the memcg tasks, or is * a freed page. 
*/ -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_MEMCG u64 hwpoison_filter_memcg; EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); static int hwpoison_filter_task(struct page *p) { - struct mem_cgroup *mem; - struct cgroup_subsys_state *css; - unsigned long ino; - if (!hwpoison_filter_memcg) return 0; - mem = try_get_mem_cgroup_from_page(p); - if (!mem) - return -EINVAL; - - css = mem_cgroup_css(mem); - ino = cgroup_ino(css->cgroup); - css_put(css); - - if (ino != hwpoison_filter_memcg) + if (page_cgroup_ino(p) != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -503,68 +494,34 @@ static void collect_procs(struct page *page, struct list_head *tokill, kfree(tk); } -/* - * Error handlers for various types of pages. - */ - -enum outcome { - IGNORED, /* Error: cannot be handled */ - FAILED, /* Error: handling failed */ - DELAYED, /* Will be handled later */ - RECOVERED, /* Successfully recovered */ -}; - static const char *action_name[] = { - [IGNORED] = "Ignored", - [FAILED] = "Failed", - [DELAYED] = "Delayed", - [RECOVERED] = "Recovered", -}; - -enum action_page_type { - MSG_KERNEL, - MSG_KERNEL_HIGH_ORDER, - MSG_SLAB, - MSG_DIFFERENT_COMPOUND, - MSG_POISONED_HUGE, - MSG_HUGE, - MSG_FREE_HUGE, - MSG_UNMAP_FAILED, - MSG_DIRTY_SWAPCACHE, - MSG_CLEAN_SWAPCACHE, - MSG_DIRTY_MLOCKED_LRU, - MSG_CLEAN_MLOCKED_LRU, - MSG_DIRTY_UNEVICTABLE_LRU, - MSG_CLEAN_UNEVICTABLE_LRU, - MSG_DIRTY_LRU, - MSG_CLEAN_LRU, - MSG_TRUNCATED_LRU, - MSG_BUDDY, - MSG_BUDDY_2ND, - MSG_UNKNOWN, + [MF_IGNORED] = "Ignored", + [MF_FAILED] = "Failed", + [MF_DELAYED] = "Delayed", + [MF_RECOVERED] = "Recovered", }; static const char * const action_page_types[] = { - [MSG_KERNEL] = "reserved kernel page", - [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", - [MSG_SLAB] = "kernel slab page", - [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MSG_POISONED_HUGE] = "huge page already hardware poisoned", - [MSG_HUGE] = "huge page", - [MSG_FREE_HUGE] = "free huge page", - [MSG_UNMAP_FAILED] = 
"unmapping failed page", - [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", - [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", - [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", - [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", - [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", - [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", - [MSG_DIRTY_LRU] = "dirty LRU page", - [MSG_CLEAN_LRU] = "clean LRU page", - [MSG_TRUNCATED_LRU] = "already truncated LRU page", - [MSG_BUDDY] = "free buddy page", - [MSG_BUDDY_2ND] = "free buddy page (2nd try)", - [MSG_UNKNOWN] = "unknown page", + [MF_MSG_KERNEL] = "reserved kernel page", + [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", + [MF_MSG_SLAB] = "kernel slab page", + [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", + [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", + [MF_MSG_HUGE] = "huge page", + [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_UNMAP_FAILED] = "unmapping failed page", + [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", + [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", + [MF_MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", + [MF_MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", + [MF_MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", + [MF_MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", + [MF_MSG_DIRTY_LRU] = "dirty LRU page", + [MF_MSG_CLEAN_LRU] = "clean LRU page", + [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", + [MF_MSG_BUDDY] = "free buddy page", + [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", + [MF_MSG_UNKNOWN] = "unknown page", }; /* @@ -598,7 +555,7 @@ static int delete_from_lru_cache(struct page *p) */ static int me_kernel(struct page *p, unsigned long pfn) { - return IGNORED; + return MF_IGNORED; } /* @@ -607,7 +564,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); - 
return FAILED; + return MF_FAILED; } /* @@ -616,7 +573,7 @@ static int me_unknown(struct page *p, unsigned long pfn) static int me_pagecache_clean(struct page *p, unsigned long pfn) { int err; - int ret = FAILED; + int ret = MF_FAILED; struct address_space *mapping; delete_from_lru_cache(p); @@ -626,7 +583,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * should be the one m_f() holds. */ if (PageAnon(p)) - return RECOVERED; + return MF_RECOVERED; /* * Now truncate the page in the page cache. This is really @@ -640,7 +597,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return FAILED; + return MF_FAILED; } /* @@ -657,7 +614,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { - ret = RECOVERED; + ret = MF_RECOVERED; } } else { /* @@ -665,7 +622,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * This fails on dirty or anything with private pages */ if (invalidate_inode_page(p)) - ret = RECOVERED; + ret = MF_RECOVERED; else printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", pfn); @@ -751,9 +708,9 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) ClearPageUptodate(p); if (!delete_from_lru_cache(p)) - return DELAYED; + return MF_DELAYED; else - return FAILED; + return MF_FAILED; } static int me_swapcache_clean(struct page *p, unsigned long pfn) @@ -761,9 +718,9 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) delete_from_swap_cache(p); if (!delete_from_lru_cache(p)) - return RECOVERED; + return MF_RECOVERED; else - return FAILED; + return MF_FAILED; } /* @@ -776,6 +733,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + + if (!PageHuge(hpage)) + return MF_DELAYED; + /* * We can safely recover from error on free or reserved (i.e. 
* not in-use) hugepage by dequeuing it from freelist. @@ -789,9 +750,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) if (!(page_mapping(hpage) || PageAnon(hpage))) { res = dequeue_hwpoisoned_huge_page(hpage); if (!res) - return RECOVERED; + return MF_RECOVERED; } - return DELAYED; + return MF_DELAYED; } /* @@ -815,18 +776,16 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define lru (1UL << PG_lru) #define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) static struct page_state { unsigned long mask; unsigned long res; - enum action_page_type type; + enum mf_action_page_type type; int (*action)(struct page *p, unsigned long pfn); } error_states[] = { - { reserved, reserved, MSG_KERNEL, me_kernel }, + { reserved, reserved, MF_MSG_KERNEL, me_kernel }, /* * free pages are specially detected outside this table: * PG_buddy pages only make a small fraction of all free pages. @@ -837,31 +796,26 @@ static struct page_state { * currently unused objects without touching them. But just * treat it as standard kernel for now. 
*/ - { slab, slab, MSG_SLAB, me_kernel }, + { slab, slab, MF_MSG_SLAB, me_kernel }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED - { head, head, MSG_HUGE, me_huge_page }, - { tail, tail, MSG_HUGE, me_huge_page }, -#else - { compound, compound, MSG_HUGE, me_huge_page }, -#endif + { head, head, MF_MSG_HUGE, me_huge_page }, - { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, - { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, + { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, + { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, - { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, - { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, + { mlock|dirty, mlock|dirty, MF_MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, + { mlock|dirty, mlock, MF_MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, - { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, - { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, + { unevict|dirty, unevict|dirty, MF_MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, + { unevict|dirty, unevict, MF_MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, - { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, - { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, + { lru|dirty, lru|dirty, MF_MSG_DIRTY_LRU, me_pagecache_dirty }, + { lru|dirty, lru, MF_MSG_CLEAN_LRU, me_pagecache_clean }, /* * Catchall entry: must be at end. */ - { 0, 0, MSG_UNKNOWN, me_unknown }, + { 0, 0, MF_MSG_UNKNOWN, me_unknown }, }; #undef dirty @@ -881,8 +835,11 @@ static struct page_state { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 
*/ -static void action_result(unsigned long pfn, enum action_page_type type, int result) +static void action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { + trace_memory_failure_event(pfn, type, result); + pr_err("MCE %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -896,13 +853,13 @@ static int page_action(struct page_state *ps, struct page *p, result = ps->action(p, pfn); count = page_count(p) - 1; - if (ps->action == me_swapcache_dirty && result == DELAYED) + if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { printk(KERN_ERR "MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); - result = FAILED; + result = MF_FAILED; } action_result(pfn, ps->type, result); @@ -911,8 +868,74 @@ static int page_action(struct page_state *ps, struct page *p, * Could adjust zone counters here to correct for the missing page. */ - return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; +} + +/** + * get_hwpoison_page() - Get refcount for memory error handling: + * @page: raw error page (hit by memory error) + * + * Return: return 0 if failed to grab the refcount, otherwise true (some + * non-zero value.) + */ +int get_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) + return get_page_unless_zero(head); + + /* + * Thp tail page has special refcounting rule (refcount of tail pages + * is stored in ->_mapcount,) so we can't call get_page_unless_zero() + * directly for tail pages. + */ + if (PageTransHuge(head)) { + /* + * Non anonymous thp exists only in allocation/free time. We + * can't handle such a case correctly, so let's give it up. + * This should be better than triggering BUG_ON when kernel + * tries to touch the "partially handled" page. 
+ */ + if (!PageAnon(head)) { + pr_err("MCE: %#lx: non anonymous thp\n", + page_to_pfn(page)); + return 0; + } + + if (get_page_unless_zero(head)) { + if (PageTail(page)) + get_page(page); + return 1; + } else { + return 0; + } + } + + return get_page_unless_zero(page); +} +EXPORT_SYMBOL_GPL(get_hwpoison_page); + +/** + * put_hwpoison_page() - Put refcount for memory error handling: + * @page: raw error page (hit by memory error) + */ +void put_hwpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + + if (PageHuge(head)) { + put_page(head); + return; + } + + if (PageTransHuge(head)) + if (page != head) + put_page(head); + + put_page(page); } +EXPORT_SYMBOL_GPL(put_hwpoison_page); /* * Do all that is necessary to remove user space mappings. Unmap @@ -927,7 +950,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1, forcekill; struct page *hpage = *hpagep; - struct page *ppage; /* * Here we are interested only in user-mapped pages, so skip any @@ -977,59 +999,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } /* - * ppage: poisoned page - * if p is regular page(4k page) - * ppage == real poisoned page; - * else p is hugetlb or THP, ppage == head page. - */ - ppage = hpage; - - if (PageTransHuge(hpage)) { - /* - * Verify that this isn't a hugetlbfs head page, the check for - * PageAnon is just for avoid tripping a split_huge_page - * internal debug check, as split_huge_page refuses to deal with - * anything that isn't an anon page. PageAnon can't go away fro - * under us because we hold a refcount on the hpage, without a - * refcount on the hpage. split_huge_page can't be safely called - * in the first place, having a refcount on the tail isn't - * enough * to be safe. 
- */ - if (!PageHuge(hpage) && PageAnon(hpage)) { - if (unlikely(split_huge_page(hpage))) { - /* - * FIXME: if splitting THP is failed, it is - * better to stop the following operation rather - * than causing panic by unmapping. System might - * survive if the page is freed later. - */ - printk(KERN_INFO - "MCE %#lx: failed to split THP\n", pfn); - - BUG_ON(!PageHWPoison(p)); - return SWAP_FAIL; - } - /* - * We pinned the head page for hwpoison handling, - * now we split the thp and we are interested in - * the hwpoisoned raw page, so move the refcount - * to it. Similarly, page lock is shifted. - */ - if (hpage != p) { - if (!(flags & MF_COUNT_INCREASED)) { - put_page(hpage); - get_page(p); - } - lock_page(p); - unlock_page(hpage); - *hpagep = p; - } - /* THP is split, so ppage should be the real poisoned page. */ - ppage = p; - } - } - - /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. @@ -1038,12 +1007,12 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); - ret = try_to_unmap(ppage, ttu); + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(ppage)); + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1055,7 +1024,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. 
*/ - forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); kill_procs(&tokill, forcekill, trapno, ret != SWAP_SUCCESS, p, pfn, flags); @@ -1101,6 +1070,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) struct page_state *ps; struct page *p; struct page *hpage; + struct page *orig_head; int res; unsigned int nr_pages; unsigned long page_flags; @@ -1116,7 +1086,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - hpage = compound_head(p); + orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); return 0; @@ -1133,7 +1103,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) nr_pages = 1 << compound_order(hpage); else /* normal page or thp */ nr_pages = 1; - atomic_long_add(nr_pages, &num_poisoned_pages); + num_poisoned_pages_add(nr_pages); /* * We need/can do nothing about count=0 pages. @@ -1149,10 +1119,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * In fact it's dangerous to directly bump up page count from 0, * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. */ - if (!(flags & MF_COUNT_INCREASED) && - !get_page_unless_zero(hpage)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { - action_result(pfn, MSG_BUDDY, DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; } else if (PageHuge(hpage)) { /* @@ -1162,23 +1131,38 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHWPoison(hpage)) { if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) || (p != hpage && TestSetPageHWPoison(hpage))) { - atomic_long_sub(nr_pages, &num_poisoned_pages); + num_poisoned_pages_sub(nr_pages); unlock_page(hpage); return 0; } } set_page_hwpoison_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, MSG_FREE_HUGE, - res ? 
IGNORED : DELAYED); + action_result(pfn, MF_MSG_FREE_HUGE, + res ? MF_IGNORED : MF_DELAYED); unlock_page(hpage); return res; } else { - action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); + action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; } } + if (!PageHuge(p) && PageTransHuge(hpage)) { + if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + if (!PageAnon(hpage)) + pr_err("MCE: %#lx: non anonymous thp\n", pfn); + else + pr_err("MCE: %#lx: thp split failed\n", pfn); + if (TestClearPageHWPoison(p)) + num_poisoned_pages_sub(nr_pages); + put_hwpoison_page(p); + return -EBUSY; + } + VM_BUG_ON_PAGE(!page_count(p), p); + hpage = compound_head(p); + } + /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others @@ -1188,18 +1172,18 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * walked by the page reclaim code, however that's not a big loss. */ if (!PageHuge(p)) { - if (!PageLRU(hpage)) - shake_page(hpage, 0); - if (!PageLRU(hpage)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { /* * shake_page could have turned it free. */ if (is_free_buddy_page(p)) { if (flags & MF_COUNT_INCREASED) - action_result(pfn, MSG_BUDDY, DELAYED); + action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); else - action_result(pfn, MSG_BUDDY_2ND, - DELAYED); + action_result(pfn, MF_MSG_BUDDY_2ND, + MF_DELAYED); return 0; } } @@ -1211,8 +1195,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * The page could have changed compound pages during the locking. * If this happens just bail out. 
*/ - if (compound_head(p) != hpage) { - action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); + if (PageCompound(p) && compound_head(p) != orig_head) { + action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; goto out; } @@ -1231,16 +1215,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); - atomic_long_sub(nr_pages, &num_poisoned_pages); - put_page(hpage); - res = 0; - goto out; + num_poisoned_pages_sub(nr_pages); + unlock_page(hpage); + put_hwpoison_page(hpage); + return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); + num_poisoned_pages_sub(nr_pages); unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); return 0; } @@ -1252,9 +1236,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * on the head page to show that the hugepage is hwpoisoned */ if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, MSG_POISONED_HUGE, IGNORED); + action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); return 0; } /* @@ -1281,7 +1265,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) != SWAP_SUCCESS) { - action_result(pfn, MSG_UNMAP_FAILED, IGNORED); + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; goto out; } @@ -1290,7 +1274,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * Torn down by someone else? 
*/ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); + action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; goto out; } @@ -1413,6 +1397,12 @@ static int __init memory_failure_init(void) } core_initcall(memory_failure_init); +#define unpoison_pr_info(fmt, pfn, rs) \ +({ \ + if (__ratelimit(rs)) \ + pr_info(fmt, pfn); \ +}) + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1431,6 +1421,8 @@ int unpoison_memory(unsigned long pfn) struct page *p; int freeit = 0; unsigned int nr_pages; + static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); if (!pfn_valid(pfn)) return -ENXIO; @@ -1439,7 +1431,26 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", + pfn, &unpoison_rs); + return 0; + } + + if (page_count(page) > 1) { + unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", + pfn, &unpoison_rs); + return 0; + } + + if (page_mapped(page)) { + unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", + pfn, &unpoison_rs); + return 0; + } + + if (page_mapping(page)) { + unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1449,13 +1460,14 @@ int unpoison_memory(unsigned long pfn) * In such case, we yield to memory_failure() and make unpoison fail. 
*/ if (!PageHuge(page) && PageTransHuge(page)) { - pr_info("MCE: Memory failure is now running on %#lx\n", pfn); - return 0; + unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", + pfn, &unpoison_rs); + return 0; } nr_pages = 1 << compound_order(page); - if (!get_page_unless_zero(page)) { + if (!get_hwpoison_page(p)) { /* * Since HWPoisoned hugepage should have non-zero refcount, * race between memory failure and unpoison seems to happen. @@ -1463,12 +1475,14 @@ int unpoison_memory(unsigned long pfn) * to the end. */ if (PageHuge(page)) { - pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", + pfn, &unpoison_rs); return 0; } if (TestClearPageHWPoison(p)) - atomic_long_dec(&num_poisoned_pages); - pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + num_poisoned_pages_dec(); + unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", + pfn, &unpoison_rs); return 0; } @@ -1480,17 +1494,18 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. 
*/ if (TestClearPageHWPoison(page)) { - pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); - atomic_long_sub(nr_pages, &num_poisoned_pages); + unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", + pfn, &unpoison_rs); + num_poisoned_pages_sub(nr_pages); freeit = 1; if (PageHuge(page)) clear_page_hwpoison_huge_page(page); } unlock_page(page); - put_page(page); + put_hwpoison_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) - put_page(page); + put_hwpoison_page(page); return 0; } @@ -1503,7 +1518,7 @@ static struct page *new_page(struct page *p, unsigned long private, int **x) return alloc_huge_page_node(page_hstate(compound_head(p)), nid); else - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1523,7 +1538,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * When the target page is a free hugepage, just remove it * from free hugepage list. */ - if (!get_page_unless_zero(compound_head(p))) { + if (!get_hwpoison_page(p)) { if (PageHuge(p)) { pr_info("%s: %#lx free huge page\n", __func__, pfn); ret = 0; @@ -1550,16 +1565,16 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) /* * Try to free it. */ - put_page(page); + put_hwpoison_page(page); shake_page(page, 1); /* * Did it turn free? 
*/ ret = __get_any_page(page, pfn, 0); - if (!PageLRU(page)) { + if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ - put_page(page); + put_hwpoison_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; @@ -1582,7 +1597,7 @@ static int soft_offline_huge_page(struct page *page, int flags) lock_page(hpage); if (PageHWPoison(hpage)) { unlock_page(hpage); - put_page(hpage); + put_hwpoison_page(hpage); pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } @@ -1593,7 +1608,7 @@ static int soft_offline_huge_page(struct page *page, int flags) * get_any_page() and isolate_huge_page() takes a refcount each, * so need to drop one here. */ - put_page(hpage); + put_hwpoison_page(hpage); if (!ret) { pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); return -EBUSY; @@ -1617,11 +1632,10 @@ static int soft_offline_huge_page(struct page *page, int flags) if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); + num_poisoned_pages_add(1 << compound_order(hpage)); } else { SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + num_poisoned_pages_inc(); } } return ret; @@ -1642,7 +1656,7 @@ static int __soft_offline_page(struct page *page, int flags) wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); - put_page(page); + put_hwpoison_page(page); pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1657,10 +1671,10 @@ static int __soft_offline_page(struct page *page, int flags) * would need to fix isolation locking first. 
*/ if (ret == 1) { - put_page(page); + put_hwpoison_page(page); pr_info("soft_offline: %#lx: invalidated\n", pfn); SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + num_poisoned_pages_inc(); return 0; } @@ -1674,7 +1688,7 @@ static int __soft_offline_page(struct page *page, int flags) * Drop page reference which is came from get_any_page() * successful isolate_lru_page() already took another one. */ - put_page(page); + put_hwpoison_page(page); if (!ret) { LIST_HEAD(pagelist); inc_zone_page_state(page, NR_ISOLATED_ANON + @@ -1694,22 +1708,6 @@ static int __soft_offline_page(struct page *page, int flags) pfn, ret, page->flags); if (ret > 0) ret = -EIO; - } else { - /* - * After page migration succeeds, the source page can - * be trapped in pagevec and actual freeing is delayed. - * Freeing code works differently based on PG_hwpoison, - * so there's a race. We need to make sure that the - * source page should be freed back to buddy before - * setting PG_hwpoison. - */ - if (!is_free_buddy_page(page)) - drain_all_pages(page_zone(page)); - SetPageHWPoison(page); - if (!is_free_buddy_page(page)) - pr_info("soft offline: %#lx: page leaked\n", - pfn); - atomic_long_inc(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", @@ -1748,26 +1746,22 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); + if (flags & MF_COUNT_INCREASED) + put_hwpoison_page(page); return -EBUSY; } if (!PageHuge(page) && PageTransHuge(hpage)) { if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { pr_info("soft offline: %#lx: failed to split THP\n", pfn); + if (flags & MF_COUNT_INCREASED) + put_hwpoison_page(page); return -EBUSY; } } get_online_mems(); - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. 
- */ - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - set_migratetype_isolate(page, true); - ret = get_any_page(page, pfn, flags); put_online_mems(); if (ret > 0) { /* for in-use pages */ @@ -1779,13 +1773,11 @@ int soft_offline_page(struct page *page, int flags) if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); if (!dequeue_hwpoisoned_huge_page(hpage)) - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); + num_poisoned_pages_add(1 << compound_order(hpage)); } else { if (!TestSetPageHWPoison(page)) - atomic_long_inc(&num_poisoned_pages); + num_poisoned_pages_inc(); } } - unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } diff --git a/kernel/mm/memory.c b/kernel/mm/memory.c index 3fc6efd10..b80bf4746 100644 --- a/kernel/mm/memory.c +++ b/kernel/mm/memory.c @@ -61,6 +61,7 @@ #include <linux/string.h> #include <linux/dma-debug.h> #include <linux/debugfs.h> +#include <linux/userfaultfd_k.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) #ifdef HAVE_GENERIC_MMU_GATHER -static int tlb_next_batch(struct mmu_gather *tlb) +static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; - return 1; + return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return 0; + return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) - return 0; + return false; tlb->batch_count++; batch->next = NULL; @@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb) tlb->active->next = batch; tlb->active = batch; - return 1; + return true; } /* tlb_gather_mmu @@ -2081,11 +2082,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; cow_user_page(new_page, old_page, address, vma); } - __SetPageUptodate(new_page); if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; + 
__SetPageUptodate(new_page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* @@ -2684,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } goto setpte; } @@ -2693,6 +2701,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + goto oom_free_page; + /* * The memory barrier inside __SetPageUptodate makes sure that * preceeding stores to the page contents become visible before @@ -2700,9 +2712,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) - goto oom_free_page; - entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -2711,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); mem_cgroup_commit_charge(page, memcg, false); @@ -3214,6 +3232,27 @@ out: return 0; } +static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + if (vma_is_anonymous(vma)) + 
return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd, + unsigned int flags) +{ + if (vma_is_anonymous(vma)) + return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3249,12 +3288,12 @@ static int handle_pte_fault(struct mm_struct *mm, barrier(); if (!pte_present(entry)) { if (pte_none(entry)) { - if (vma->vm_ops) + if (vma_is_anonymous(vma)) + return do_anonymous_page(mm, vma, address, + pte, pmd, flags); + else return do_fault(mm, vma, address, pte, pmd, flags, entry); - - return do_anonymous_page(mm, vma, address, pte, pmd, - flags); } return do_swap_page(mm, vma, address, pte, pmd, flags, entry); @@ -3316,10 +3355,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - int ret = VM_FAULT_FALLBACK; - if (!vma->vm_ops) - ret = do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + int ret = create_huge_pmd(mm, vma, address, pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { @@ -3343,8 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, orig_pmd, pmd); if (dirty && !pmd_write(orig_pmd)) { - ret = do_huge_pmd_wp_page(mm, vma, address, pmd, - orig_pmd); + ret = wp_huge_pmd(mm, vma, address, pmd, + orig_pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { @@ -3363,8 +3399,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 
if (unlikely(pmd_none(*pmd)) && unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd + * didn't become pmd_trans_huge under us and then back to pmd_none, as + * a result of MADV_DONTNEED running immediately after a huge pmd fault + * in a different thread of this mm, in turn leading to a misleading + * pmd_trans_huge() retval. All we have to ensure is that it is a + * regular pmd that we can walk with pte_offset_map() and we can do that + * through an atomic read in C, which is what pmd_trans_unstable() + * provides. + */ + if (unlikely(pmd_trans_unstable(pmd))) return 0; /* * A regular pmd is established and it can't morph into a huge pmd @@ -3730,7 +3776,7 @@ void print_vma_addr(char *prefix, unsigned long ip) if (buf) { char *p; - p = d_path(&f->f_path, buf, PAGE_SIZE); + p = file_path(f, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; printk("%s%s[%lx+%lx]", prefix, kbasename(p), diff --git a/kernel/mm/memory_hotplug.c b/kernel/mm/memory_hotplug.c index 9e88f749a..a042a9d53 100644 --- a/kernel/mm/memory_hotplug.c +++ b/kernel/mm/memory_hotplug.c @@ -339,8 +339,8 @@ static int __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages, - MEMMAP_HOTPLUG); + return init_currently_empty_zone(zone, start_pfn, num_pages); + return 0; } @@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int nid = pgdat->node_id; int zone_type; - unsigned long flags; + unsigned long flags, pfn; int ret; zone_type = zone - pgdat->node_zones; @@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned 
long phys_start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn, MEMMAP_HOTPLUG); + + /* online_page_range is called later and expects pages reserved */ + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + if (!pfn_valid(pfn)) + continue; + + SetPageReserved(pfn_to_page(pfn)); + } return 0; } @@ -513,6 +521,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, break; err = 0; } + vmemmap_populate_print_last(); return err; } @@ -769,7 +778,10 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, start = phys_start_pfn << PAGE_SHIFT; size = nr_pages * PAGE_SIZE; - ret = release_mem_region_adjustable(&iomem_resource, start, size); + + /* in the ZONE_DEVICE case device driver owns the memory region */ + if (!is_dev_zone(zone)) + ret = release_mem_region_adjustable(&iomem_resource, start, size); if (ret) { resource_size_t endres = start + size - 1; @@ -1206,8 +1218,13 @@ static int should_add_memory_movable(int nid, u64 start, u64 size) return 0; } -int zone_for_memory(int nid, u64 start, u64 size, int zone_default) +int zone_for_memory(int nid, u64 start, u64 size, int zone_default, + bool for_device) { +#ifdef CONFIG_ZONE_DEVICE + if (for_device) + return ZONE_DEVICE; +#endif if (should_add_memory_movable(nid, start, size)) return ZONE_MOVABLE; @@ -1215,23 +1232,21 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory(int nid, u64 start, u64 size) +int __ref add_memory_resource(int nid, struct resource *res) { + u64 start, size; pg_data_t *pgdat = NULL; bool new_pgdat; bool new_node; - struct resource *res; int ret; + start = res->start; + size = resource_size(res); + ret = check_hotplug_memory_range(start, size); if (ret) return ret; - res = register_memory_resource(start, size); - ret = -EEXIST; - if (!res) - return ret; - 
{ /* Stupid hack to suppress address-never-null warning */ void *p = NODE_DATA(nid); new_pgdat = !p; @@ -1239,6 +1254,14 @@ int __ref add_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); + /* + * Add new range to memblock so that when hotadd_new_pgdat() is called + * to allocate new pgdat, get_pfn_range_for_nid() will be able to find + * this new range and calculate total pages correctly. The range will + * be removed at hot-remove time. + */ + memblock_add_node(start, size, nid); + new_node = !node_online(nid); if (new_node) { pgdat = hotadd_new_pgdat(nid, start); @@ -1248,7 +1271,7 @@ int __ref add_memory(int nid, u64 start, u64 size) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size); + ret = arch_add_memory(nid, start, size, false); if (ret < 0) goto error; @@ -1275,12 +1298,28 @@ error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); - release_memory_resource(res); + memblock_remove(start, size); out: mem_hotplug_done(); return ret; } +EXPORT_SYMBOL_GPL(add_memory_resource); + +int __ref add_memory(int nid, u64 start, u64 size) +{ + struct resource *res; + int ret; + + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + + ret = add_memory_resource(nid, res); + if (ret < 0) + release_memory_resource(res); + return ret; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE @@ -1336,23 +1375,30 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) */ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, sec_end_pfn; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); pfn < end_pfn; - pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) - i++; - if (i == MAX_ORDER_NR_PAGES) + 
pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + /* Make sure the memory section is present first */ + if (!present_section_nr(pfn_to_section_nr(pfn))) continue; - page = pfn_to_page(pfn + i); - if (zone && page_zone(page) != zone) - return 0; - zone = page_zone(page); + for (; pfn < sec_end_pfn && pfn < end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && + !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } } return 1; } @@ -2004,6 +2050,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); + memblock_free(start, size); + memblock_remove(start, size); arch_remove_memory(start, size); diff --git a/kernel/mm/mempolicy.c b/kernel/mm/mempolicy.c index 99d4c1d0b..87a177917 100644 --- a/kernel/mm/mempolicy.c +++ b/kernel/mm/mempolicy.c @@ -608,9 +608,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, qp->prev = vma; - if (vma->vm_flags & VM_PFNMAP) - return 1; - if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) @@ -722,8 +719,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; @@ -945,7 +942,7 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x return alloc_huge_page_node(page_hstate(compound_head(page)), node); else - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | + return 
__alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -2001,7 +1998,7 @@ retry_cpuset: nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); - page = alloc_pages_exact_node(hpage_node, + page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE, order); goto out; } diff --git a/kernel/mm/mempool.c b/kernel/mm/mempool.c index 2cc08de8b..004d42b1d 100644 --- a/kernel/mm/mempool.c +++ b/kernel/mm/mempool.c @@ -150,6 +150,9 @@ static void *remove_element(mempool_t *pool) */ void mempool_destroy(mempool_t *pool) { + if (unlikely(!pool)) + return; + while (pool->curr_nr) { void *element = remove_element(pool); pool->free(element, pool->pool_data); @@ -317,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: @@ -346,7 +349,7 @@ repeat_alloc: } /* - * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. 
*/ if (gfp_temp != gfp_mask) { @@ -355,8 +358,8 @@ repeat_alloc: goto repeat_alloc; } - /* We must not sleep if !__GFP_WAIT */ - if (!(gfp_mask & __GFP_WAIT)) { + /* We must not sleep if !__GFP_DIRECT_RECLAIM */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } diff --git a/kernel/mm/memtest.c b/kernel/mm/memtest.c index 1997d934b..8eaa4c3a5 100644 --- a/kernel/mm/memtest.c +++ b/kernel/mm/memtest.c @@ -1,11 +1,6 @@ #include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> #include <linux/types.h> -#include <linux/mm.h> -#include <linux/smp.h> #include <linux/init.h> -#include <linux/pfn.h> #include <linux/memblock.h> static u64 patterns[] __initdata = { @@ -31,10 +26,8 @@ static u64 patterns[] __initdata = { static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) { - printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", - (unsigned long long) pattern, - (unsigned long long) start_bad, - (unsigned long long) end_bad); + pr_info(" %016llx bad mem addr %pa - %pa reserved\n", + cpu_to_be64(pattern), &start_bad, &end_bad); memblock_reserve(start_bad, end_bad - start_bad); } @@ -74,30 +67,31 @@ static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) u64 i; phys_addr_t this_start, this_end; - for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &this_start, + &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); if (this_start < this_end) { - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long)this_start, - (unsigned long long)this_end, - (unsigned long long)cpu_to_be64(pattern)); + pr_info(" %pa - %pa pattern %016llx\n", + &this_start, &this_end, cpu_to_be64(pattern)); memtest(pattern, this_start, this_end - this_start); } } } /* default is disabled */ -static int 
memtest_pattern __initdata; +static unsigned int memtest_pattern __initdata; static int __init parse_memtest(char *arg) { + int ret = 0; + if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0); + ret = kstrtouint(arg, 0, &memtest_pattern); else memtest_pattern = ARRAY_SIZE(patterns); - return 0; + return ret; } early_param("memtest", parse_memtest); @@ -110,7 +104,7 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end) if (!memtest_pattern) return; - printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); + pr_info("early_memtest: # of tests: %u\n", memtest_pattern); for (i = memtest_pattern-1; i < UINT_MAX; --i) { idx = i % ARRAY_SIZE(patterns); do_one_pass(patterns[idx], start, end); diff --git a/kernel/mm/migrate.c b/kernel/mm/migrate.c index f53838fe3..6d17e0ab4 100644 --- a/kernel/mm/migrate.c +++ b/kernel/mm/migrate.c @@ -1,5 +1,5 @@ /* - * Memory Migration functionality - linux/mm/migration.c + * Memory Migration functionality - linux/mm/migrate.c * * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter * @@ -30,13 +30,14 @@ #include <linux/mempolicy.h> #include <linux/vmalloc.h> #include <linux/security.h> -#include <linux/memcontrol.h> +#include <linux/backing-dev.h> #include <linux/syscalls.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <asm/tlbflush.h> @@ -170,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); + if (vma->vm_flags & VM_LOCKED) + mlock_vma_page(new); + /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, ptep); unlock: @@ -310,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping, struct buffer_head *head, enum migrate_mode mode, int extra_count) { + struct zone *oldzone, *newzone; + int dirty; int expected_count = 1 + extra_count; void 
**pslot; @@ -317,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != expected_count) return -EAGAIN; + + /* No turning back from here */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + return MIGRATEPAGE_SUCCESS; } + oldzone = page_zone(page); + newzone = page_zone(newpage); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -352,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping, } /* - * Now we know that no one else is looking at the page. + * Now we know that no one else is looking at the page: + * no turning back from here. */ + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } + /* Move dirty while page refs frozen and newpage not yet exposed */ + dirty = PageDirty(page); + if (dirty) { + ClearPageDirty(page); + SetPageDirty(newpage); + } + radix_tree_replace_slot(pslot, newpage); /* @@ -369,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_unfreeze_refs(page, expected_count - 1); + spin_unlock(&mapping->tree_lock); + /* Leave irq disabled to prevent preemption while updating stats */ + /* * If moved to a different zone then also account * the page for that zone. Other VM counters will be @@ -379,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping, * via NR_FILE_PAGES and NR_ANON_PAGES if they * are mapped to swap space. 
*/ - __dec_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(newpage, NR_FILE_PAGES); - if (!PageSwapCache(page) && PageSwapBacked(page)) { - __dec_zone_page_state(page, NR_SHMEM); - __inc_zone_page_state(newpage, NR_SHMEM); + if (newzone != oldzone) { + __dec_zone_state(oldzone, NR_FILE_PAGES); + __inc_zone_state(newzone, NR_FILE_PAGES); + if (PageSwapBacked(page) && !PageSwapCache(page)) { + __dec_zone_state(oldzone, NR_SHMEM); + __inc_zone_state(newzone, NR_SHMEM); + } + if (dirty && mapping_cap_account_dirty(mapping)) { + __dec_zone_state(oldzone, NR_FILE_DIRTY); + __inc_zone_state(newzone, NR_FILE_DIRTY); + } } - spin_unlock_irq(&mapping->tree_lock); + local_irq_enable(); return MIGRATEPAGE_SUCCESS; } @@ -400,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - if (!mapping) { - if (page_count(page) != 1) - return -EAGAIN; - return MIGRATEPAGE_SUCCESS; - } - spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, @@ -423,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } + set_page_memcg(newpage, page_memcg(page)); + newpage->index = page->index; + newpage->mapping = page->mapping; get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -509,20 +546,14 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageMappedToDisk(page)) SetPageMappedToDisk(newpage); - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - /* - * Want to mark the page and the radix tree as dirty, and - * redo the accounting that clear_page_dirty_for_io undid, - * but we can't use set_page_dirty because that function - * is actually a signal that all of the page has become dirty. - * Whereas only part of our page may be dirty. 
- */ - if (PageSwapBacked(page)) - SetPageDirty(newpage); - else - __set_page_dirty_nobuffers(newpage); - } + /* Move dirty on pages not done by migrate_page_move_mapping() */ + if (PageDirty(page)) + SetPageDirty(newpage); + + if (page_is_young(page)) + set_page_young(newpage); + if (page_is_idle(page)) + set_page_idle(newpage); /* * Copy NUMA information to the new page, to prevent over-eager @@ -531,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) cpupid = page_cpupid_xchg_last(page, -1); page_cpupid_xchg_last(newpage, cpupid); - mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* * Please do not reorder this without considering how mm/ksm.c's @@ -715,24 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping, * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, - int page_was_mapped, enum migrate_mode mode) + enum migrate_mode mode) { struct address_space *mapping; int rc; - /* - * Block others from accessing the page when we get around to - * establishing additional references. We are the only one - * holding a reference to the new page at this point. - */ - if (!trylock_page(newpage)) - BUG(); - - /* Prepare mapping for the new page.*/ - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); mapping = page_mapping(page); if (!mapping) @@ -744,22 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page, * space which also has its own migratepage callback. This * is the most common path for page migration. 
*/ - rc = mapping->a_ops->migratepage(mapping, - newpage, page, mode); + rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc != MIGRATEPAGE_SUCCESS) { - newpage->mapping = NULL; - } else { - mem_cgroup_migrate(page, newpage, false); - if (page_was_mapped) - remove_migration_ptes(page, newpage); - page->mapping = NULL; + /* + * When successful, old pagecache page->mapping must be cleared before + * page is freed; but stats require that PageAnon be left as PageAnon. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + set_page_memcg(page, NULL); + if (!PageAnon(page)) + page->mapping = NULL; } - - unlock_page(newpage); - return rc; } @@ -808,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, goto out_unlock; wait_on_page_writeback(page); } + /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. @@ -815,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. + * + * Only page_get_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + * But if we cannot get anon_vma, then we won't need it anyway, + * because that implies that the anon page is no longer mapped + * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) { - /* - * Only page_lock_anon_vma_read() understands the subtleties of - * getting a hold on an anon_vma from outside one of its mms. 
- */ + if (PageAnon(page) && !PageKsm(page)) anon_vma = page_get_anon_vma(page); - if (anon_vma) { - /* - * Anon page - */ - } else if (PageSwapCache(page)) { - /* - * We cannot be sure that the anon_vma of an unmapped - * swapcache page is safe to use because we don't - * know in advance if the VMA that this page belonged - * to still exists. If the VMA and others sharing the - * data have been freed, then the anon_vma could - * already be invalid. - * - * To avoid this possibility, swapcache pages get - * migrated but are not remapped when migration - * completes - */ - } else { - goto out_unlock; - } - } + + /* + * Block others from accessing the new page when we get around to + * establishing additional references. We are usually the only one + * holding a reference to newpage at this point. We used to have a BUG + * here if trylock_page(newpage) fails, but would like to allow for + * cases where there might be a race with the previous use of newpage. + * This is much like races on refcount of oldpage: just don't BUG(). + */ + if (unlikely(!trylock_page(newpage))) + goto out_unlock; if (unlikely(isolated_balloon_page(page))) { /* @@ -853,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the page migration right away (proteced by page lock). 
*/ rc = balloon_page_migrate(newpage, page, mode); - goto out_unlock; + goto out_unlock_both; } /* @@ -872,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto out_unlock; + goto out_unlock_both; } - goto skip_unmap; - } - - /* Establish migration ptes or remove ptes */ - if (page_mapped(page)) { + } else if (page_mapped(page)) { + /* Establish migration ptes */ + VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, + page); try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); page_was_mapped = 1; } -skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page, page_was_mapped, mode); + rc = move_to_new_page(newpage, page, mode); - if (rc && page_was_mapped) - remove_migration_ptes(page, page); + if (page_was_mapped) + remove_migration_ptes(page, + rc == MIGRATEPAGE_SUCCESS ? newpage : page); +out_unlock_both: + unlock_page(newpage); +out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - -out_unlock: unlock_page(page); out: return rc; @@ -918,12 +927,14 @@ out: static ICE_noinline int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, - int force, enum migrate_mode mode) + int force, enum migrate_mode mode, + enum migrate_reason reason) { - int rc = 0; + int rc = MIGRATEPAGE_SUCCESS; int *result = NULL; - struct page *newpage = get_new_page(page, private, &result); + struct page *newpage; + newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -937,6 +948,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; rc = __unmap_and_move(page, newpage, force, mode); + if (rc == MIGRATEPAGE_SUCCESS) + put_new_page = NULL; out: if (rc != -EAGAIN) { @@ -949,7 +962,13 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + 
page_is_file_cache(page)); - putback_lru_page(page); + /* Soft-offlined page shouldn't go through lru cache list */ + if (reason == MR_MEMORY_FAILURE) { + put_page(page); + if (!test_set_page_hwpoison(page)) + num_poisoned_pages_inc(); + } else + putback_lru_page(page); } /* @@ -957,10 +976,9 @@ out: * it. Otherwise, putback_lru_page() will drop the reference grabbed * during isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { - ClearPageSwapBacked(newpage); + if (put_new_page) put_new_page(newpage, private); - } else if (unlikely(__is_movable_balloon_page(newpage))) { + else if (unlikely(__is_movable_balloon_page(newpage))) { /* drop our reference, page already in the balloon */ put_page(newpage); } else @@ -998,7 +1016,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *hpage, int force, enum migrate_mode mode) { - int rc = 0; + int rc = -EAGAIN; int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; @@ -1020,8 +1038,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!new_hpage) return -ENOMEM; - rc = -EAGAIN; - if (!trylock_page(hpage)) { if (!force || mode != MIGRATE_SYNC) goto out; @@ -1031,6 +1047,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); + if (unlikely(!trylock_page(new_hpage))) + goto put_anon; + if (page_mapped(hpage)) { try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); @@ -1038,16 +1057,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (!page_mapped(hpage)) - rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode); + rc = move_to_new_page(new_hpage, hpage, mode); - if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped) - remove_migration_ptes(hpage, hpage); + if (page_was_mapped) + remove_migration_ptes(hpage, + rc == MIGRATEPAGE_SUCCESS ? 
new_hpage : hpage); + unlock_page(new_hpage); + +put_anon: if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); + put_new_page = NULL; + } unlock_page(hpage); out: @@ -1059,10 +1084,10 @@ out: * it. Otherwise, put_page() will drop the reference grabbed during * isolation. */ - if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + if (put_new_page) put_new_page(new_hpage, private); else - put_page(new_hpage); + putback_active_hugepage(new_hpage); if (result) { if (rc) @@ -1089,7 +1114,7 @@ out: * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. - * The caller should call putback_lru_pages() to return pages to the LRU + * The caller should call putback_movable_pages() to return pages to the LRU * or free list only if ret != 0. * * Returns the number of pages that were not migrated, or an error code. @@ -1122,7 +1147,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, pass > 2, mode); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode); + private, page, pass > 2, mode, + reason); switch(rc) { case -ENOMEM: @@ -1145,7 +1171,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, } } } - rc = nr_failed + retry; + nr_failed += retry; + rc = nr_failed; out: if (nr_succeeded) count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); @@ -1187,7 +1214,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, return alloc_huge_page_node(page_hstate(compound_head(p)), pm->node); else - return alloc_pages_exact_node(pm->node, + return __alloc_pages_node(pm->node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1219,7 +1246,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, 
FOLL_GET|FOLL_SPLIT); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, pp->addr, + FOLL_GET | FOLL_SPLIT | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1229,10 +1258,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - /* Use PageReserved to check for zero page */ - if (PageReserved(page)) - goto put_and_set; - pp->page = page; err = page_to_nid(page); @@ -1389,18 +1414,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma || addr < vma->vm_start) goto set_status; - page = follow_page(vma, addr, 0); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, addr, FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) goto set_status; - err = -ENOENT; - /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) - goto set_status; - - err = page_to_nid(page); + err = page ? page_to_nid(page) : -ENOENT; set_status: *status = err; @@ -1553,11 +1574,11 @@ static struct page *alloc_misplaced_dst_page(struct page *page, int nid = (int) data; struct page *newpage; - newpage = alloc_pages_exact_node(nid, + newpage = __alloc_pages_node(nid, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & - ~GFP_IOFS, 0); + ~__GFP_RECLAIM, 0); return newpage; } @@ -1731,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; @@ -1768,7 +1789,6 @@ fail_putback: SetPageActive(page); if (TestClearPageUnevictable(new_page)) SetPageUnevictable(page); - mlock_migrate_page(page, new_page); unlock_page(new_page); put_page(new_page); /* Free it */ @@ -1796,7 +1816,7 @@ fail_putback: */ flush_cache_range(vma, mmun_start, mmun_end); page_add_anon_rmap(new_page, vma, mmun_start); - 
pmdp_clear_flush_notify(vma, mmun_start, pmd); + pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); @@ -1810,8 +1830,9 @@ fail_putback: goto fail_putback; } - mem_cgroup_migrate(page, new_page, false); - + mlock_migrate_page(new_page, page); + set_page_memcg(new_page, page_memcg(page)); + set_page_memcg(page, NULL); page_remove_rmap(page); spin_unlock(ptl); diff --git a/kernel/mm/mincore.c b/kernel/mm/mincore.c index be25efde6..14bb9fb37 100644 --- a/kernel/mm/mincore.c +++ b/kernel/mm/mincore.c @@ -234,7 +234,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, /* This also avoids any overflows on PAGE_CACHE_ALIGN */ pages = len >> PAGE_SHIFT; - pages += (len & ~PAGE_MASK) != 0; + pages += (offset_in_page(len)) != 0; if (!access_ok(VERIFY_WRITE, vec, pages)) return -EFAULT; diff --git a/kernel/mm/mlock.c b/kernel/mm/mlock.c index 6fd2cf15e..d6006b146 100644 --- a/kernel/mm/mlock.c +++ b/kernel/mm/mlock.c @@ -172,7 +172,7 @@ static void __munlock_isolation_failed(struct page *page) */ unsigned int munlock_vma_page(struct page *page) { - unsigned int nr_pages; + int nr_pages; struct zone *zone = page_zone(page); /* For try_to_munlock() and to serialize with page migration */ @@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; while (start < end) { struct page *page = NULL; @@ -506,11 +506,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - goto out; /* don't set VM_LOCKED, don't count */ + /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ + goto out; pgoff 
= vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; @@ -553,13 +555,14 @@ out: return ret; } -static int do_mlock(unsigned long start, size_t len, int on) +static int apply_vma_lock_flags(unsigned long start, size_t len, + vm_flags_t flags) { unsigned long nstart, end, tmp; struct vm_area_struct * vma, * prev; int error; - VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) @@ -575,14 +578,11 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - vm_flags_t newflags; - - /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ + vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; - newflags = vma->vm_flags & ~VM_LOCKED; - if (on) - newflags |= VM_LOCKED; + newflags |= flags; + /* Here we know that vma->vm_start <= nstart < vma->vm_end. 
*/ tmp = vma->vm_end; if (tmp > end) tmp = end; @@ -604,7 +604,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; @@ -615,7 +615,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; lock_limit = rlimit(RLIMIT_MEMLOCK); @@ -628,7 +628,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) /* check against resource limits */ if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) - error = do_mlock(start, len, 1); + error = apply_vma_lock_flags(start, len, flags); up_write(&current->mm->mmap_sem); if (error) @@ -640,37 +640,75 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) return 0; } +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) +{ + return do_mlock(start, len, VM_LOCKED); +} + +SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags) +{ + vm_flags_t vm_flags = VM_LOCKED; + + if (flags & ~MLOCK_ONFAULT) + return -EINVAL; + + if (flags & MLOCK_ONFAULT) + vm_flags |= VM_LOCKONFAULT; + + return do_mlock(start, len, vm_flags); +} + SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; - len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); + len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; down_write(&current->mm->mmap_sem); - ret = do_mlock(start, len, 0); + ret = apply_vma_lock_flags(start, len, 0); up_write(&current->mm->mmap_sem); return ret; } -static int do_mlockall(int flags) +/* + * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall) + * and translate into the appropriate modifications to mm->def_flags and/or the + * flags for all current VMAs. + * + * There are a couple of subtleties with this.
If mlockall() is called multiple + * times with different flags, the values do not necessarily stack. If mlockall + * is called once including the MCL_FUTURE flag and then a second time without + * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags. + */ +static int apply_mlockall_flags(int flags) { struct vm_area_struct * vma, * prev = NULL; + vm_flags_t to_add = 0; - if (flags & MCL_FUTURE) + current->mm->def_flags &= VM_LOCKED_CLEAR_MASK; + if (flags & MCL_FUTURE) { current->mm->def_flags |= VM_LOCKED; - else - current->mm->def_flags &= ~VM_LOCKED; - if (flags == MCL_FUTURE) - goto out; + + if (flags & MCL_ONFAULT) + current->mm->def_flags |= VM_LOCKONFAULT; + + if (!(flags & MCL_CURRENT)) + goto out; + } + + if (flags & MCL_CURRENT) { + to_add |= VM_LOCKED; + if (flags & MCL_ONFAULT) + to_add |= VM_LOCKONFAULT; + } for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { vm_flags_t newflags; - newflags = vma->vm_flags & ~VM_LOCKED; - if (flags & MCL_CURRENT) - newflags |= VM_LOCKED; + newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; + newflags |= to_add; /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); @@ -683,14 +721,13 @@ out: SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; - int ret = -EINVAL; + int ret; - if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE))) - goto out; + if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT))) + return -EINVAL; - ret = -EPERM; if (!can_do_mlock()) - goto out; + return -EPERM; if (flags & MCL_CURRENT) lru_add_drain_all(); /* flush pagevec */ @@ -703,11 +740,11 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) - ret = do_mlockall(flags); + ret = apply_mlockall_flags(flags); up_write(&current->mm->mmap_sem); if (!ret && (flags & MCL_CURRENT)) mm_populate(0, TASK_SIZE); -out: + return ret; } @@ -716,7 +753,7 @@ SYSCALL_DEFINE0(munlockall) int ret;
down_write(&current->mm->mmap_sem); - ret = do_mlockall(0); + ret = apply_mlockall_flags(0); up_write(&current->mm->mmap_sem); return ret; } diff --git a/kernel/mm/mm_init.c b/kernel/mm/mm_init.c index 5f420f7fa..fdadf918d 100644 --- a/kernel/mm/mm_init.c +++ b/kernel/mm/mm_init.c @@ -11,6 +11,7 @@ #include <linux/export.h> #include <linux/memory.h> #include <linux/notifier.h> +#include <linux/sched.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -130,14 +131,6 @@ void __init mminit_verify_pageflags_layout(void) BUG_ON(or_mask != add_mask); } -void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, - unsigned long nid, unsigned long pfn) -{ - BUG_ON(page_to_nid(page) != nid); - BUG_ON(page_zonenum(page) != zone); - BUG_ON(page_to_pfn(page) != pfn); -} - static __init int set_mminit_loglevel(char *str) { get_option(&str, &mminit_loglevel); diff --git a/kernel/mm/mmap.c b/kernel/mm/mmap.c index bb50cacc3..455772a05 100644 --- a/kernel/mm/mmap.c +++ b/kernel/mm/mmap.c @@ -41,6 +41,7 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> +#include <linux/userfaultfd_k.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -440,12 +441,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -919,7 +924,8 @@ again: remove_next = 1 + (end > next->vm_end); * per-vma resources, so we don't attempt to merge those.
*/ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -935,6 +941,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma->vm_ops && vma->vm_ops->close) return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; return 1; } @@ -965,9 +973,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -984,9 +994,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1029,7 +1041,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { 
pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1056,14 +1069,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { + anon_vma, file, pgoff, + vm_userfaultfd_ctx)) { /* * OK, it can. Can we now merge in the successor as well? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && + anon_vma, file, + pgoff+pglen, + vm_userfaultfd_ctx) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1084,7 +1100,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1247,17 +1264,18 @@ static inline int mlock_future_check(struct mm_struct *mm, /* * The caller must hold down_write(&current->mm->mmap_sem). */ - -unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long pgoff, - unsigned long *populate) + unsigned long flags, vm_flags_t vm_flags, + unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; - vm_flags_t vm_flags; *populate = 0; + if (!len) + return -EINVAL; + /* * Does the application expect PROT_READ to imply PROT_EXEC? * @@ -1265,12 +1283,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * mounted, in which case we dont add PROT_EXEC.)
*/ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) + if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; - if (!len) - return -EINVAL; - if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1291,14 +1306,14 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return addr; /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -1337,7 +1352,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; @@ -1401,13 +1416,13 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, unsigned long, fd, unsigned long, pgoff) { struct file *file = NULL; - unsigned long retval = -EBADF; + unsigned long retval; if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd); if (!file) - goto out; + return -EBADF; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); retval = -EINVAL; @@ -1442,7 +1457,6 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, out_fput: if (file) fput(file); -out: return retval; } @@ -1462,7 +1476,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if 
(a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1479,13 +1493,14 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) int vma_wants_writenotify(struct vm_area_struct *vma) { vm_flags_t vm_flags = vma->vm_flags; + const struct vm_operations_struct *vm_ops = vma->vm_ops; /* If it was private or non-writable, the write bit is already clear */ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) return 0; /* The backer wishes to know when pages are first written to? */ - if (vma->vm_ops && vma->vm_ops->page_mkwrite) + if (vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite)) return 1; /* The open routine did something to the protections that pgprot_modify @@ -1550,7 +1565,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Clear old maps */ - error = -ENOMEM; while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { if (do_munmap(mm, addr, len)) @@ -1570,8 +1584,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Can we just expand an old mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, - NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -1651,7 +1665,7 @@ out: vma == get_gate_vma(current->mm))) mm->locked_vm += (len >> PAGE_SHIFT); else - vma->vm_flags &= ~VM_LOCKED; + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; } if (file) @@ -1977,7 +1991,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, * can happen with large stack limits and large mmap() * allocations. 
*/ - if (addr & ~PAGE_MASK) { + if (offset_in_page(addr)) { VM_BUG_ON(addr != -ENOMEM); info.flags = 0; info.low_limit = TASK_UNMAPPED_BASE; @@ -2013,7 +2027,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (addr > TASK_SIZE - len) return -ENOMEM; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; addr = arch_rebalance_pgtables(addr, len); @@ -2035,7 +2049,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) return vma; rb_node = mm->mm_rb.rb_node; - vma = NULL; while (rb_node) { struct vm_area_struct *tmp; @@ -2127,10 +2140,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; - /* Ok, everything looks good - let it rip */ - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } @@ -2141,32 +2150,28 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { - int error; + struct mm_struct *mm = vma->vm_mm; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. 
*/ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2184,29 +2189,33 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. */ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); if (vma->vm_next) vma_gap_update(vma->vm_next); else - vma->vm_mm->highest_vm_end = address; - spin_unlock(&vma->vm_mm->page_table_lock); + mm->highest_vm_end = address; + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2217,27 +2226,24 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { + struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. 
- */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2255,27 +2261,31 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard * against concurrent vma expansions. 
*/ - spin_lock(&vma->vm_mm->page_table_lock); + spin_lock(&mm->page_table_lock); + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm += grow; + vm_stat_account(mm, vma->vm_flags, + vma->vm_file, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); - validate_mm(vma->vm_mm); + validate_mm(mm); return error; } @@ -2442,7 +2452,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vm_area_struct *new; - int err = -ENOMEM; + int err; if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) @@ -2450,7 +2460,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) - goto out_err; + return -ENOMEM; /* most fields are the same, copy all, and then fixup */ *new = *vma; @@ -2498,7 +2508,6 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); - out_err: return err; } @@ -2525,7 +2534,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; struct vm_area_struct *vma, *prev, *last; - if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; len = PAGE_ALIGN(len); @@ -2659,12 +2668,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (start < vma->vm_start || start + size > vma->vm_end) + if (start < vma->vm_start) goto out; - if (pgoff == 
linear_page_index(vma, start)) { - ret = 0; - goto out; + if (start + size > vma->vm_end) { + struct vm_area_struct *next; + + for (next = vma->vm_next; next; next = next->vm_next) { + /* hole between vmas ? */ + if (next->vm_start != next->vm_prev->vm_end) + goto out; + + if (next->vm_file != vma->vm_file) + goto out; + + if (next->vm_flags != vma->vm_flags) + goto out; + + if (start + size <= next->vm_end) + break; + } + + if (!next) + goto out; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; @@ -2674,9 +2700,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) { + struct vm_area_struct *tmp; flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ - munlock_vma_pages_range(vma, start, start + size); + for (tmp = vma; tmp->vm_start >= start + size; + tmp = tmp->vm_next) { + munlock_vma_pages_range(tmp, + max(tmp->vm_start, start), + min(tmp->vm_end, start + size)); + } } file = get_file(vma->vm_file); @@ -2723,7 +2756,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (error & ~PAGE_MASK) + if (offset_in_page(error)) return error; error = mlock_future_check(mm, mm->def_flags, len); @@ -2757,7 +2790,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? 
*/ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2859,6 +2892,13 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; + if (find_vma_links(mm, vma->vm_start, vma->vm_end, + &prev, &rb_link, &rb_parent)) + return -ENOMEM; + if ((vma->vm_flags & VM_ACCOUNT) && + security_vm_enough_memory_mm(mm, vma_pages(vma))) + return -ENOMEM; + /* * The vm_pgoff of a purely anonymous vma should be irrelevant * until its first write fault, when page's anon_vma and index @@ -2871,16 +2911,10 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) * using the existing file pgoff checks and manipulations. * Similarly in do_mmap_pgoff and in do_brk. */ - if (!vma->vm_file) { + if (vma_is_anonymous(vma)) { BUG_ON(vma->anon_vma); vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - return -ENOMEM; - if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, vma_pages(vma))) - return -ENOMEM; vma_link(mm, vma, prev, rb_link, rb_parent); return 0; @@ -2905,7 +2939,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. 
*/ - if (unlikely(!vma->vm_file && !vma->anon_vma)) { + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } @@ -2913,7 +2947,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (new_vma) { /* * Source vma may have been merged into new_vma @@ -2938,30 +2973,31 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (new_vma) { - *new_vma = *vma; - new_vma->vm_start = addr; - new_vma->vm_end = addr + len; - new_vma->vm_pgoff = pgoff; - if (vma_dup_policy(vma, new_vma)) - goto out_free_vma; - INIT_LIST_HEAD(&new_vma->anon_vma_chain); - if (anon_vma_clone(new_vma, vma)) - goto out_free_mempol; - if (new_vma->vm_file) - get_file(new_vma->vm_file); - if (new_vma->vm_ops && new_vma->vm_ops->open) - new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); - *need_rmap_locks = false; - } + if (!new_vma) + goto out; + *new_vma = *vma; + new_vma->vm_start = addr; + new_vma->vm_end = addr + len; + new_vma->vm_pgoff = pgoff; + if (vma_dup_policy(vma, new_vma)) + goto out_free_vma; + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; + if (new_vma->vm_file) + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; } return new_vma; - out_free_mempol: +out_free_mempol: mpol_put(vma_policy(new_vma)); - out_free_vma: +out_free_vma: 
kmem_cache_free(vm_area_cachep, new_vma); +out: return NULL; } @@ -3013,21 +3049,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - /* - * special mappings have no vm_file, and in that case, the mm - * uses vm_pgoff internally. So we have to subtract it from here. - * We are allowed to do this because we are the mm; do not copy - * this code into drivers! - */ - pgoff = vmf->pgoff - vma->vm_pgoff; - if (vma->vm_ops == &legacy_special_mapping_vmops) pages = vma->vm_private_data; else pages = ((struct vm_special_mapping *)vma->vm_private_data)-> pages; - for (; pgoff && *pages; ++pages) + for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -3043,8 +3071,8 @@ static int special_mapping_fault(struct vm_area_struct *vma, static struct vm_area_struct *__install_special_mapping( struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long vm_flags, const struct vm_operations_struct *ops, - void *priv) + unsigned long vm_flags, void *priv, + const struct vm_operations_struct *ops) { int ret; struct vm_area_struct *vma; @@ -3093,8 +3121,8 @@ struct vm_area_struct *_install_special_mapping( unsigned long addr, unsigned long len, unsigned long vm_flags, const struct vm_special_mapping *spec) { - return __install_special_mapping(mm, addr, len, vm_flags, - &special_mapping_vmops, (void *)spec); + return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec, + &special_mapping_vmops); } int install_special_mapping(struct mm_struct *mm, @@ -3102,8 +3130,8 @@ int install_special_mapping(struct mm_struct *mm, unsigned long vm_flags, struct page **pages) { struct vm_area_struct *vma = __install_special_mapping( - mm, addr, len, vm_flags, &legacy_special_mapping_vmops, - (void *)pages); + mm, addr, len, vm_flags, (void *)pages, + &legacy_special_mapping_vmops); return PTR_ERR_OR_ZERO(vma); } diff --git a/kernel/mm/mmu_notifier.c b/kernel/mm/mmu_notifier.c index 
3b9b3d074..5fbdd367b 100644 --- a/kernel/mm/mmu_notifier.c +++ b/kernel/mm/mmu_notifier.c @@ -123,6 +123,23 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct mmu_notifier *mn; + int young = 0, id; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->clear_young) + young |= mn->ops->clear_young(mn, mm, start, end); + } + srcu_read_unlock(&srcu, id); + + return young; +} + int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { diff --git a/kernel/mm/mprotect.c b/kernel/mm/mprotect.c index 88584838e..ef5be8eaa 100644 --- a/kernel/mm/mprotect.c +++ b/kernel/mm/mprotect.c @@ -29,6 +29,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -290,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; goto success; @@ -322,6 +325,15 @@ success: change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); + /* + * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major + * fault on access. 
+ */ + if ((oldflags & (VM_WRITE | VM_SHARED | VM_LOCKED)) == VM_LOCKED && + (newflags & VM_WRITE)) { + populate_vma_page_range(vma, start, end, NULL); + } + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); perf_event_mmap(vma); diff --git a/kernel/mm/mremap.c b/kernel/mm/mremap.c index 034e2d360..c25bc6268 100644 --- a/kernel/mm/mremap.c +++ b/kernel/mm/mremap.c @@ -22,6 +22,7 @@ #include <linux/mmu_notifier.h> #include <linux/sched/sysctl.h> #include <linux/uaccess.h> +#include <linux/mm-arch-hooks.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -275,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { + err = -ENOMEM; + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); + } + + if (unlikely(err)) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, @@ -285,14 +292,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma = new_vma; old_len = new_len; old_addr = new_addr; - new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, old_addr, - moved_len, true); - return err; - } + new_addr = err; + } else { + arch_remap(mm, old_addr, old_addr + old_len, + new_addr, new_addr + new_len); } /* Conceal VM_ACCOUNT so old reservation is not undone */ @@ -343,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); + unsigned long pgoff; if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); @@ -354,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) return 
ERR_PTR(-EFAULT); + if (new_len == old_len) + return vma; + /* Need to be careful about a growing mapping */ - if (new_len > old_len) { - unsigned long pgoff; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); - } + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -397,19 +401,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long charged = 0; unsigned long map_flags; - if (new_addr & ~PAGE_MASK) + if (offset_in_page(new_addr)) goto out; if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) goto out; - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. 
- */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) + /* Ensure the old/new locations do not overlap */ + if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; ret = do_munmap(mm, new_addr, new_len); @@ -436,11 +435,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (ret & ~PAGE_MASK) + if (offset_in_page(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked); - if (!(ret & ~PAGE_MASK)) + if (!(offset_in_page(ret))) goto out; out1: vm_unacct_memory(charged); @@ -485,7 +484,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) return ret; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return ret; old_len = PAGE_ALIGN(old_len); @@ -567,7 +566,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (new_addr & ~PAGE_MASK) { + if (offset_in_page(new_addr)) { ret = new_addr; goto out; } @@ -575,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) + if (offset_in_page(ret)) { vm_unacct_memory(charged); + locked = 0; + } up_write(&current->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); diff --git a/kernel/mm/msync.c b/kernel/mm/msync.c index bb04d53ae..24e612fef 100644 --- a/kernel/mm/msync.c +++ b/kernel/mm/msync.c @@ -38,7 +38,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) goto out; - if (start & ~PAGE_MASK) + if (offset_in_page(start)) goto out; if ((flags & MS_ASYNC) && (flags & 
MS_SYNC)) goto out; diff --git a/kernel/mm/nobootmem.c b/kernel/mm/nobootmem.c index 90b504683..e57cf24ba 100644 --- a/kernel/mm/nobootmem.c +++ b/kernel/mm/nobootmem.c @@ -37,11 +37,20 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, { void *ptr; u64 addr; + ulong flags = choose_memblock_flags(); if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = memblock_find_in_range_node(size, align, goal, limit, nid); +again: + addr = memblock_find_in_range_node(size, align, goal, limit, nid, + flags); + if (!addr && (flags & MEMBLOCK_MIRROR)) { + flags &= ~MEMBLOCK_MIRROR; + pr_warn("Could not allocate %pap bytes of mirrored memory\n", + &size); + goto again; + } if (!addr) return NULL; @@ -77,7 +86,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); for (; cursor < end; cursor++) { - __free_pages_bootmem(pfn_to_page(cursor), 0); + __free_pages_bootmem(pfn_to_page(cursor), cursor, 0); totalram_pages++; } } @@ -92,7 +101,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) while (start + (1UL << order) > end) order--; - __free_pages_bootmem(pfn_to_page(start), order); + __free_pages_bootmem(pfn_to_page(start), start, order); start += (1UL << order); } @@ -121,7 +130,11 @@ static unsigned long __init free_low_memory_core_early(void) memblock_clear_hotplug(0, -1); - for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) + for_each_reserved_mem_region(i, &start, &end) + reserve_bootmem_region(start, end); + + for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, + NULL) count += __free_memory_core(start, end); #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK diff --git a/kernel/mm/nommu.c b/kernel/mm/nommu.c index e544508e2..92be862c8 100644 --- a/kernel/mm/nommu.c +++ b/kernel/mm/nommu.c @@ -42,22 +42,6 @@ #include <asm/mmu_context.h> #include "internal.h" -#if 0 -#define kenter(FMT, ...) 
\ - printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) \ - printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) \ - printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__) -#else -#define kenter(FMT, ...) \ - no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) -#define kleave(FMT, ...) \ - no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) -#define kdebug(FMT, ...) \ - no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) -#endif - void *high_memory; EXPORT_SYMBOL(high_memory); struct page *mem_map; @@ -340,12 +324,12 @@ long vwrite(char *buf, char *addr, unsigned long count) } /* - * vmalloc - allocate virtually continguos memory + * vmalloc - allocate virtually contiguous memory * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. @@ -357,12 +341,12 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /* - * vzalloc - allocate virtually continguos memory with zero fill + * vzalloc - allocate virtually contiguous memory with zero fill * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags @@ -436,7 +420,7 @@ void *vmalloc_exec(unsigned long size) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into continguos kernel virtual space. + * page level allocator and map them into contiguous kernel virtual space. 
*/ void *vmalloc_32(unsigned long size) { @@ -594,16 +578,16 @@ static noinline void validate_nommu_regions(void) return; last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(last->vm_end <= last->vm_start)); - BUG_ON(unlikely(last->vm_top < last->vm_end)); + BUG_ON(last->vm_end <= last->vm_start); + BUG_ON(last->vm_top < last->vm_end); while ((p = rb_next(lastp))) { region = rb_entry(p, struct vm_region, vm_rb); last = rb_entry(lastp, struct vm_region, vm_rb); - BUG_ON(unlikely(region->vm_end <= region->vm_start)); - BUG_ON(unlikely(region->vm_top < region->vm_end)); - BUG_ON(unlikely(region->vm_start < last->vm_top)); + BUG_ON(region->vm_end <= region->vm_start); + BUG_ON(region->vm_top < region->vm_end); + BUG_ON(region->vm_start < last->vm_top); lastp = p; } @@ -665,11 +649,7 @@ static void free_page_series(unsigned long from, unsigned long to) for (; from < to; from += PAGE_SIZE) { struct page *page = virt_to_page(from); - kdebug("- free %lx", from); atomic_long_dec(&mmap_pages_allocated); - if (page_count(page) != 1) - kdebug("free page %p: refcount not one: %d", - page, page_count(page)); put_page(page); } } @@ -683,8 +663,6 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, region->vm_usage); - BUG_ON(!nommu_region_tree.rb_node); if (--region->vm_usage == 0) { @@ -697,10 +675,8 @@ static void __put_nommu_region(struct vm_region *region) /* IO memory and memory shared directly out of the pagecache * from ramfs/tmpfs mustn't be released here */ - if (region->vm_flags & VM_MAPPED_COPY) { - kdebug("free series"); + if (region->vm_flags & VM_MAPPED_COPY) free_page_series(region->vm_start, region->vm_top); - } kmem_cache_free(vm_region_jar, region); } else { up_write(&nommu_region_sem); @@ -744,8 +720,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) struct address_space *mapping; struct 
rb_node **p, *parent, *rb_prev; - kenter(",%p", vma); - BUG_ON(!vma->vm_region); mm->map_count++; @@ -813,8 +787,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; - kenter("%p", vma); - protect_vma(vma, 0); mm->map_count--; @@ -854,7 +826,6 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) */ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - kenter("%p", vma); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); if (vma->vm_file) @@ -957,12 +928,8 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED) { - printk(KERN_DEBUG - "%d: Can't do fixed-address/overlay mmap of RAM\n", - current->pid); + if (flags & MAP_FIXED) return -EINVAL; - } if ((flags & MAP_TYPE) != MAP_PRIVATE && (flags & MAP_TYPE) != MAP_SHARED) @@ -1060,8 +1027,7 @@ static int validate_mmap_request(struct file *file, ) { capabilities &= ~NOMMU_MAP_DIRECT; if (flags & MAP_SHARED) { - printk(KERN_WARNING - "MAP_SHARED not completely supported on !MMU\n"); + pr_warn("MAP_SHARED not completely supported on !MMU\n"); return -EINVAL; } } @@ -1069,7 +1035,7 @@ static int validate_mmap_request(struct file *file, /* handle executable mappings and implied executable * mappings */ - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (prot & PROT_EXEC) return -EPERM; } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { @@ -1205,16 +1171,12 @@ static int do_mmap_private(struct vm_area_struct *vma, * we're allocating is smaller than a page */ order = get_order(len); - kdebug("alloc order %d for %lx", order, len); - total = 1 << order; point = len >> PAGE_SHIFT; /* we don't want to allocate a power-of-2 sized page set */ - if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { + if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) total = point; - 
kdebug("try to alloc exact %lu pages", total); - } base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); if (!base) @@ -1271,32 +1233,29 @@ enomem: /* * handle mapping creation for uClinux */ -unsigned long do_mmap_pgoff(struct file *file, - unsigned long addr, - unsigned long len, - unsigned long prot, - unsigned long flags, - unsigned long pgoff, - unsigned long *populate) +unsigned long do_mmap(struct file *file, + unsigned long addr, + unsigned long len, + unsigned long prot, + unsigned long flags, + vm_flags_t vm_flags, + unsigned long pgoff, + unsigned long *populate) { struct vm_area_struct *vma; struct vm_region *region; struct rb_node *rb; - unsigned long capabilities, vm_flags, result; + unsigned long capabilities, result; int ret; - kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - *populate = 0; /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, &capabilities); - if (ret < 0) { - kleave(" = %d [val]", ret); + if (ret < 0) return ret; - } /* we ignore the address hint */ addr = 0; @@ -1304,7 +1263,7 @@ unsigned long do_mmap_pgoff(struct file *file, /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ - vm_flags = determine_vm_flags(file, prot, flags, capabilities); + vm_flags |= determine_vm_flags(file, prot, flags, capabilities); /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); @@ -1383,11 +1342,9 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_start = start; vma->vm_end = start + len; - if (pregion->vm_flags & VM_MAPPED_COPY) { - kdebug("share copy"); + if (pregion->vm_flags & VM_MAPPED_COPY) vma->vm_flags |= VM_MAPPED_COPY; - } else { - kdebug("share mmap"); + else { ret = do_mmap_shared_file(vma); if (ret < 0) { vma->vm_region = NULL; @@ -1467,7 +1424,6 @@ share: up_write(&nommu_region_sem); - kleave(" = %lx", result); 
return result; error_just_free: @@ -1479,27 +1435,24 @@ error: if (vma->vm_file) fput(vma->vm_file); kmem_cache_free(vm_area_cachep, vma); - kleave(" = %d", ret); return ret; sharing_violation: up_write(&nommu_region_sem); - printk(KERN_WARNING "Attempt to share mismatched mappings\n"); + pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; error_getting_vma: kmem_cache_free(vm_region_jar, region); - printk(KERN_WARNING "Allocation of vma for %lu byte allocation" - " from process %d failed\n", - len, current->pid); + pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", + len, current->pid); show_free_areas(0); return -ENOMEM; error_getting_region: - printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" - " from process %d failed\n", - len, current->pid); + pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", + len, current->pid); show_free_areas(0); return -ENOMEM; } @@ -1544,7 +1497,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - if (a.offset & ~PAGE_MASK) + if (offset_in_page(a.offset)) return -EINVAL; return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, @@ -1563,8 +1516,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_region *region; unsigned long npages; - kenter(""); - /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ if (vma->vm_file) @@ -1628,8 +1579,6 @@ static int shrink_vma(struct mm_struct *mm, { struct vm_region *region; - kenter(""); - /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ delete_vma_from_mm(vma); @@ -1669,8 +1618,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unsigned long end; int ret; - kenter(",%lx,%zx", start, len); - len = PAGE_ALIGN(len); if (len == 0) return -EINVAL; @@ -1682,11 +1629,9 @@ int 
do_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (!vma) { static int limit; if (limit < 5) { - printk(KERN_WARNING - "munmap of memory not mmapped by process %d" - " (%s): 0x%lx-0x%lx\n", - current->pid, current->comm, - start, start + len - 1); + pr_warn("munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n", + current->pid, current->comm, + start, start + len - 1); limit++; } return -EINVAL; @@ -1695,38 +1640,27 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* we're allowed to split an anonymous VMA but not a file-backed one */ if (vma->vm_file) { do { - if (start > vma->vm_start) { - kleave(" = -EINVAL [miss]"); + if (start > vma->vm_start) return -EINVAL; - } if (end == vma->vm_end) goto erase_whole_vma; vma = vma->vm_next; } while (vma); - kleave(" = -EINVAL [split file]"); return -EINVAL; } else { /* the chunk must be a subset of the VMA found */ if (start == vma->vm_start && end == vma->vm_end) goto erase_whole_vma; - if (start < vma->vm_start || end > vma->vm_end) { - kleave(" = -EINVAL [superset]"); + if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; - } - if (start & ~PAGE_MASK) { - kleave(" = -EINVAL [unaligned start]"); + if (offset_in_page(start)) return -EINVAL; - } - if (end != vma->vm_end && end & ~PAGE_MASK) { - kleave(" = -EINVAL [unaligned split]"); + if (end != vma->vm_end && offset_in_page(end)) return -EINVAL; - } if (start != vma->vm_start && end != vma->vm_end) { ret = split_vma(mm, vma, start, 1); - if (ret < 0) { - kleave(" = %d [split]", ret); + if (ret < 0) return ret; - } } return shrink_vma(mm, vma, start, end); } @@ -1734,7 +1668,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) erase_whole_vma: delete_vma_from_mm(vma); delete_vma(mm, vma); - kleave(" = 0"); return 0; } EXPORT_SYMBOL(do_munmap); @@ -1766,8 +1699,6 @@ void exit_mmap(struct mm_struct *mm) if (!mm) return; - kenter(""); - mm->total_vm = 0; while ((vma = mm->mmap)) { @@ -1776,8 
+1707,6 @@ void exit_mmap(struct mm_struct *mm) delete_vma(mm, vma); cond_resched(); } - - kleave(""); } unsigned long vm_brk(unsigned long addr, unsigned long len) @@ -1807,7 +1736,7 @@ static unsigned long do_mremap(unsigned long addr, if (old_len == 0 || new_len == 0) return (unsigned long) -EINVAL; - if (addr & ~PAGE_MASK) + if (offset_in_page(addr)) return -EINVAL; if (flags & MREMAP_FIXED && new_addr != addr) @@ -2157,7 +2086,7 @@ static int __meminit init_user_reserve(void) sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; } -module_init(init_user_reserve) +subsys_initcall(init_user_reserve); /* * Initialise sysctl_admin_reserve_kbytes. @@ -2178,4 +2107,4 @@ static int __meminit init_admin_reserve(void) sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; } -module_init(init_admin_reserve) +subsys_initcall(init_admin_reserve); diff --git a/kernel/mm/oom_kill.c b/kernel/mm/oom_kill.c index 2b665da1b..c12680993 100644 --- a/kernel/mm/oom_kill.c +++ b/kernel/mm/oom_kill.c @@ -42,7 +42,8 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; -static DEFINE_SPINLOCK(zone_scan_lock); + +DEFINE_MUTEX(oom_lock); #ifdef CONFIG_NUMA /** @@ -117,6 +118,15 @@ found: return t; } +/* + * order == -1 means the oom kill is required by sysrq, otherwise only + * for display purposes. + */ +static inline bool is_sysrq_oom(struct oom_control *oc) +{ + return oc->order == -1; +} + /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -195,27 +205,26 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * Determine the type of allocation constraint. 
*/ #ifdef CONFIG_NUMA -static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc, + unsigned long *totalpages) { struct zone *zone; struct zoneref *z; - enum zone_type high_zoneidx = gfp_zone(gfp_mask); + enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask); bool cpuset_limited = false; int nid; /* Default to all available memory */ *totalpages = totalram_pages + total_swap_pages; - if (!zonelist) + if (!oc->zonelist) return CONSTRAINT_NONE; /* * Reach here only when __GFP_NOFAIL is used. So, we should avoid * to kill current.We have to random task kill in this case. * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. */ - if (gfp_mask & __GFP_THISNODE) + if (oc->gfp_mask & __GFP_THISNODE) return CONSTRAINT_NONE; /* @@ -223,17 +232,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, * the page allocator means a mempolicy is in effect. Cpuset policy * is enforced in get_page_from_freelist(). 
*/ - if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { + if (oc->nodemask && + !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { *totalpages = total_swap_pages; - for_each_node_mask(nid, *nodemask) + for_each_node_mask(nid, *oc->nodemask) *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; } /* Check this allocation failure is caused by cpuset's wall function */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) - if (!cpuset_zone_allowed(zone, gfp_mask)) + for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, + high_zoneidx, oc->nodemask) + if (!cpuset_zone_allowed(zone, oc->gfp_mask)) cpuset_limited = true; if (cpuset_limited) { @@ -245,20 +255,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, return CONSTRAINT_NONE; } #else -static enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask, nodemask_t *nodemask, - unsigned long *totalpages) +static enum oom_constraint constrained_alloc(struct oom_control *oc, + unsigned long *totalpages) { *totalpages = totalram_pages + total_swap_pages; return CONSTRAINT_NONE; } #endif -enum oom_scan_t oom_scan_process_thread(struct task_struct *task, - unsigned long totalpages, const nodemask_t *nodemask, - bool force_kill) +enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, + struct task_struct *task, unsigned long totalpages) { - if (oom_unkillable_task(task, NULL, nodemask)) + if (oom_unkillable_task(task, NULL, oc->nodemask)) return OOM_SCAN_CONTINUE; /* @@ -266,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, * Don't allow any other task to have access to the reserves. 
*/ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!force_kill) + if (!is_sysrq_oom(oc)) return OOM_SCAN_ABORT; } if (!task->mm) @@ -279,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !force_kill) + if (task_will_free_mem(task) && !is_sysrq_oom(oc)) return OOM_SCAN_ABORT; return OOM_SCAN_OK; @@ -288,12 +296,9 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, /* * Simple selection loop. We chose the process with the highest * number of 'points'. Returns -1 on scan abort. - * - * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, const nodemask_t *nodemask, - bool force_kill) +static struct task_struct *select_bad_process(struct oom_control *oc, + unsigned int *ppoints, unsigned long totalpages) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -303,8 +308,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, for_each_process_thread(g, p) { unsigned int points; - switch (oom_scan_process_thread(p, totalpages, nodemask, - force_kill)) { + switch (oom_scan_process_thread(oc, p, totalpages)) { case OOM_SCAN_SELECT: chosen = p; chosen_points = ULONG_MAX; @@ -317,7 +321,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, case OOM_SCAN_OK: break; }; - points = oom_badness(p, NULL, nodemask, totalpages); + points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; /* Prefer thread group leaders for display purposes */ @@ -379,23 +383,21 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) rcu_read_unlock(); } -static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, - struct mem_cgroup *memcg, const nodemask_t *nodemask) +static void dump_header(struct oom_control *oc, struct 
task_struct *p, + struct mem_cgroup *memcg) { - task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " "oom_score_adj=%hd\n", - current->comm, gfp_mask, order, + current->comm, oc->gfp_mask, oc->order, current->signal->oom_score_adj); - cpuset_print_task_mems_allowed(current); - task_unlock(current); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) mem_cgroup_print_oom_info(memcg, p); else show_mem(SHOW_MEM_FILTER_NODES); if (sysctl_oom_dump_tasks) - dump_tasks(memcg, nodemask); + dump_tasks(memcg, oc->nodemask); } /* @@ -405,16 +407,15 @@ static atomic_t oom_victims = ATOMIC_INIT(0); static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; -static DECLARE_RWSEM(oom_sem); /** - * mark_tsk_oom_victim - marks the given task as OOM victim. + * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark * - * Has to be called with oom_sem taken for read and never after + * Has to be called with oom_lock held and never after * oom has been disabled already. */ -void mark_tsk_oom_victim(struct task_struct *tsk) +void mark_oom_victim(struct task_struct *tsk) { WARN_ON(oom_killer_disabled); /* OOM killer might race with memcg OOM */ @@ -431,23 +432,14 @@ void mark_tsk_oom_victim(struct task_struct *tsk) } /** - * unmark_oom_victim - unmarks the current task as OOM victim. - * - * Wakes up all waiters in oom_killer_disable() + * exit_oom_victim - note the exit of an OOM victim */ -void unmark_oom_victim(void) +void exit_oom_victim(void) { - if (!test_and_clear_thread_flag(TIF_MEMDIE)) - return; + clear_thread_flag(TIF_MEMDIE); - down_read(&oom_sem); - /* - * There is no need to signal the lasst oom_victim if there - * is nobody who cares. 
- */ - if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) + if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); - up_read(&oom_sem); } /** @@ -469,14 +461,14 @@ bool oom_killer_disable(void) * Make sure to not race with an ongoing OOM killer * and that the current is not the victim. */ - down_write(&oom_sem); + mutex_lock(&oom_lock); if (test_thread_flag(TIF_MEMDIE)) { - up_write(&oom_sem); + mutex_unlock(&oom_lock); return false; } oom_killer_disabled = true; - up_write(&oom_sem); + mutex_unlock(&oom_lock); wait_event(oom_victims_wait, !atomic_read(&oom_victims)); @@ -488,9 +480,25 @@ bool oom_killer_disable(void) */ void oom_killer_enable(void) { - down_write(&oom_sem); oom_killer_disabled = false; - up_write(&oom_sem); +} + +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -498,10 +506,9 @@ void oom_killer_enable(void) * Must be called while holding a reference to p, which will be released upon * returning. 
*/ -void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, +void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, nodemask_t *nodemask, - const char *message) + struct mem_cgroup *memcg, const char *message) { struct task_struct *victim = p; struct task_struct *child; @@ -517,7 +524,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ task_lock(p); if (p->mm && task_will_free_mem(p)) { - mark_tsk_oom_victim(p); + mark_oom_victim(p); task_unlock(p); put_task_struct(p); return; @@ -525,12 +532,10 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(p); if (__ratelimit(&oom_rs)) - dump_header(p, gfp_mask, order, memcg, nodemask); + dump_header(oc, p, memcg); - task_lock(p); - pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", message, task_pid_nr(p), p->comm, points); - task_unlock(p); /* * If any of p's children has a different mm and is eligible for kill, @@ -543,12 +548,12 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; - if (child->mm == p->mm) + if (process_shares_mm(child, p->mm)) continue; /* * oom_badness() returns 0 if the thread is unkillable */ - child_points = oom_badness(child, memcg, nodemask, + child_points = oom_badness(child, memcg, oc->nodemask, totalpages); if (child_points > victim_points) { put_task_struct(victim); @@ -570,9 +575,16 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, victim = p; } - /* mm cannot safely be dereferenced after task_unlock(victim) */ + /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; - mark_tsk_oom_victim(victim); + atomic_inc(&mm->mm_count); + /* + * We should send SIGKILL before setting TIF_MEMDIE in order to 
prevent + * the OOM victim from depleting the memory reserves from the user + * space under its control. + */ + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + mark_oom_victim(victim); pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), @@ -589,21 +601,23 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * pending fatal signal. */ rcu_read_lock(); - for_each_process(p) - if (p->mm == mm && !same_thread_group(p, victim) && - !(p->flags & PF_KTHREAD)) { - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) - continue; + for_each_process(p) { + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, victim)) + continue; + if (unlikely(p->flags & PF_KTHREAD)) + continue; + if (is_global_init(p)) + continue; + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; - task_lock(p); /* Protect ->comm from prctl() */ - pr_err("Kill process %d (%s) sharing same memory\n", - task_pid_nr(p), p->comm); - task_unlock(p); - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); - } + do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); + } rcu_read_unlock(); - do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + mmdrop(mm); put_task_struct(victim); } #undef K @@ -611,8 +625,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. 
*/ -void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask, +void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, struct mem_cgroup *memcg) { if (likely(!sysctl_panic_on_oom)) @@ -626,7 +639,10 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - dump_header(NULL, gfp_mask, order, memcg, nodemask); + /* Do not panic for oom kills triggered by sysrq */ + if (is_sysrq_oom(oc)) + return; + dump_header(oc, NULL, memcg); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -645,80 +661,30 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier); -/* - * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero - * if a parallel OOM killing is already taking place that includes a zone in - * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. - */ -bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) -{ - struct zoneref *z; - struct zone *zone; - bool ret = true; - - spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { - ret = false; - goto out; - } - - /* - * Lock each zone in the zonelist under zone_scan_lock so a parallel - * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. - */ - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - set_bit(ZONE_OOM_LOCKED, &zone->flags); - -out: - spin_unlock(&zone_scan_lock); - return ret; -} - -/* - * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed - * allocation attempts with zonelists containing them may now recall the OOM - * killer, if necessary. 
- */ -void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) -{ - struct zoneref *z; - struct zone *zone; - - spin_lock(&zone_scan_lock); - for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) - clear_bit(ZONE_OOM_LOCKED, &zone->flags); - spin_unlock(&zone_scan_lock); -} - /** - * __out_of_memory - kill the "best" process when we run out of memory - * @zonelist: zonelist pointer - * @gfp_mask: memory allocation flags - * @order: amount of memory being requested as a power of 2 - * @nodemask: nodemask passed to page allocator - * @force_kill: true if a task must be killed, even if others are exiting + * out_of_memory - kill the "best" process when we run out of memory + * @oc: pointer to struct oom_control * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) +bool out_of_memory(struct oom_control *oc) { - const nodemask_t *mpol_mask; struct task_struct *p; unsigned long totalpages; unsigned long freed = 0; unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; - int killed = 0; + + if (oom_killer_disabled) + return false; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. 
*/ - return; + return true; /* * If current has a pending SIGKILL or is exiting, then automatically @@ -730,73 +696,44 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { - mark_tsk_oom_victim(current); - return; + mark_oom_victim(current); + return true; } /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask, nodemask, - &totalpages); - mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; - check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); + constraint = constrained_alloc(oc, &totalpages); + if (constraint != CONSTRAINT_MEMORY_POLICY) + oc->nodemask = NULL; + check_panic_on_oom(oc, constraint, NULL); if (sysctl_oom_kill_allocating_task && current->mm && - !oom_unkillable_task(current, NULL, nodemask) && + !oom_unkillable_task(current, NULL, oc->nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { get_task_struct(current); - oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, - nodemask, + oom_kill_process(oc, current, 0, totalpages, NULL, "Out of memory (oom_kill_allocating_task)"); - goto out; + return true; } - p = select_bad_process(&points, totalpages, mpol_mask, force_kill); + p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + if (!p && !is_sysrq_oom(oc)) { + dump_header(oc, NULL, NULL); panic("Out of memory and no killable processes...\n"); } - if (p != (void *)-1UL) { - oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, - nodemask, "Out of memory"); - killed = 1; - } -out: - /* - * Give the killed threads a good chance of exiting before trying to - * allocate memory again. 
- */ - if (killed) + if (p && p != (void *)-1UL) { + oom_kill_process(oc, p, points, totalpages, NULL, + "Out of memory"); + /* + * Give the killed process a good chance to exit before trying + * to allocate memory again. + */ schedule_timeout_killable(1); -} - -/** - * out_of_memory - tries to invoke OOM killer. - * @zonelist: zonelist pointer - * @gfp_mask: memory allocation flags - * @order: amount of memory being requested as a power of 2 - * @nodemask: nodemask passed to page allocator - * @force_kill: true if a task must be killed, even if others are exiting - * - * invokes __out_of_memory if the OOM is not disabled by oom_killer_disable() - * when it returns false. Otherwise returns true. - */ -bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, - int order, nodemask_t *nodemask, bool force_kill) -{ - bool ret = false; - - down_read(&oom_sem); - if (!oom_killer_disabled) { - __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); - ret = true; } - up_read(&oom_sem); - - return ret; + return true; } /* @@ -806,27 +743,28 @@ bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, */ void pagefault_out_of_memory(void) { - struct zonelist *zonelist; + struct oom_control oc = { + .zonelist = NULL, + .nodemask = NULL, + .gfp_mask = 0, + .order = 0, + }; - down_read(&oom_sem); if (mem_cgroup_oom_synchronize(true)) - goto unlock; + return; - zonelist = node_zonelist(first_memory_node, GFP_KERNEL); - if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { - if (!oom_killer_disabled) - __out_of_memory(NULL, 0, 0, NULL, false); - else - /* - * There shouldn't be any user tasks runable while the - * OOM killer is disabled so the current task has to - * be a racing OOM victim for which oom_killer_disable() - * is waiting for. 
- */ - WARN_ON(test_thread_flag(TIF_MEMDIE)); + if (!mutex_trylock(&oom_lock)) + return; - oom_zonelist_unlock(zonelist, GFP_KERNEL); + if (!out_of_memory(&oc)) { + /* + * There shouldn't be any user tasks runnable while the + * OOM killer is disabled, so the current task has to + * be a racing OOM victim for which oom_killer_disable() + * is waiting for. + */ + WARN_ON(test_thread_flag(TIF_MEMDIE)); } -unlock: - up_read(&oom_sem); + + mutex_unlock(&oom_lock); } diff --git a/kernel/mm/page-writeback.c b/kernel/mm/page-writeback.c index eb59f7eea..d15d88c8e 100644 --- a/kernel/mm/page-writeback.c +++ b/kernel/mm/page-writeback.c @@ -2,7 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. @@ -122,31 +122,28 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -unsigned long global_dirty_limit; +struct wb_domain global_wb_domain; -/* - * Scale the writeback cache size proportional to the relative writeout speeds. - * - * We do this by keeping a floating proportion between BDIs, based on page - * writeback completions [end_page_writeback()]. Those devices that write out - * pages fastest will get the larger share, while the slower will get a smaller - * share. - * - * We use page writeout completions because we are interested in getting rid of - * dirty pages. Having them written out is the primary goal. - * - * We introduce a concept of time, a period over which we measure these events, - * because demand can/will vary over time. The length of this period itself is - * measured in page writeback completions. 
- * - */ -static struct fprop_global writeout_completions; +/* consolidated parameters for balance_dirty_pages() and its subroutines */ +struct dirty_throttle_control { +#ifdef CONFIG_CGROUP_WRITEBACK + struct wb_domain *dom; + struct dirty_throttle_control *gdtc; /* only set in memcg dtc's */ +#endif + struct bdi_writeback *wb; + struct fprop_local_percpu *wb_completions; -static void writeout_period(unsigned long t); -/* Timer for aging of writeout_completions */ -static struct timer_list writeout_period_timer = - TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); -static unsigned long writeout_period_time = 0; + unsigned long avail; /* dirtyable */ + unsigned long dirty; /* file_dirty + write + nfs */ + unsigned long thresh; /* dirty threshold */ + unsigned long bg_thresh; /* dirty background threshold */ + + unsigned long wb_dirty; /* per-wb counterparts */ + unsigned long wb_thresh; + unsigned long wb_bg_thresh; + + unsigned long pos_ratio; +}; /* * Length of period for aging writeout fractions of bdis. 
This is an @@ -155,6 +152,102 @@ static unsigned long writeout_period_time = 0; */ #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) +#ifdef CONFIG_CGROUP_WRITEBACK + +#define GDTC_INIT(__wb) .wb = (__wb), \ + .dom = &global_wb_domain, \ + .wb_completions = &(__wb)->completions + +#define GDTC_INIT_NO_WB .dom = &global_wb_domain + +#define MDTC_INIT(__wb, __gdtc) .wb = (__wb), \ + .dom = mem_cgroup_wb_domain(__wb), \ + .wb_completions = &(__wb)->memcg_completions, \ + .gdtc = __gdtc + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return dtc->dom; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return mdtc->gdtc; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + unsigned long long min = wb->bdi->min_ratio; + unsigned long long max = wb->bdi->max_ratio; + + /* + * @wb may already be clean by the time control reaches here and + * the total may not include its bw. 
+ */ + if (this_bw < tot_bw) { + if (min) { + min *= this_bw; + do_div(min, tot_bw); + } + if (max < 100) { + max *= this_bw; + do_div(max, tot_bw); + } + } + + *minp = min; + *maxp = max; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +#define GDTC_INIT(__wb) .wb = (__wb), \ + .wb_completions = &(__wb)->completions +#define GDTC_INIT_NO_WB +#define MDTC_INIT(__wb, __gdtc) + +static bool mdtc_valid(struct dirty_throttle_control *dtc) +{ + return false; +} + +static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc) +{ + return &global_wb_domain; +} + +static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc) +{ + return NULL; +} + +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + +static void wb_min_max_ratio(struct bdi_writeback *wb, + unsigned long *minp, unsigned long *maxp) +{ + *minp = wb->bdi->min_ratio; + *maxp = wb->bdi->max_ratio; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of @@ -250,42 +343,88 @@ static unsigned long global_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds +/** + * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain + * @dtc: dirty_throttle_control of interest * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * Calculate @dtc->thresh and ->bg_thresh considering + * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller + * must ensure that @dtc->avail is set before calling this function. The + * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and * real-time tasks. 
*/ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +static void domain_dirty_limits(struct dirty_throttle_control *dtc) { - const unsigned long available_memory = global_dirtyable_memory(); - unsigned long background; - unsigned long dirty; + const unsigned long available_memory = dtc->avail; + struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); + unsigned long bytes = vm_dirty_bytes; + unsigned long bg_bytes = dirty_background_bytes; + unsigned long ratio = vm_dirty_ratio; + unsigned long bg_ratio = dirty_background_ratio; + unsigned long thresh; + unsigned long bg_thresh; struct task_struct *tsk; - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + /* gdtc is !NULL iff @dtc is for memcg domain */ + if (gdtc) { + unsigned long global_avail = gdtc->avail; + + /* + * The byte settings can't be applied directly to memcg + * domains. Convert them to ratios by scaling against + * globally available memory. + */ + if (bytes) + ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / + global_avail, 100UL); + if (bg_bytes) + bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 / + global_avail, 100UL); + bytes = bg_bytes = 0; + } + + if (bytes) + thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - dirty = (vm_dirty_ratio * available_memory) / 100; + thresh = (ratio * available_memory) / 100; - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + if (bg_bytes) + bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - background = (dirty_background_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / 100; - if (background >= dirty) - background = dirty / 2; + if (bg_thresh >= thresh) + bg_thresh = thresh / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; + bg_thresh += bg_thresh / 4; + thresh += thresh / 4; } - *pbackground = background; - *pdirty = dirty; - 
trace_global_dirty_state(background, dirty); + dtc->thresh = thresh; + dtc->bg_thresh = bg_thresh; + + /* we should eventually report the domain in the TP */ + if (!gdtc) + trace_global_dirty_state(bg_thresh, thresh); +} + +/** + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * @pbackground: out parameter for bg_thresh + * @pdirty: out parameter for thresh + * + * Calculate bg_thresh and thresh for global_wb_domain. See + * domain_dirty_limits() for details. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; + + gdtc.avail = global_dirtyable_memory(); + domain_dirty_limits(&gdtc); + + *pbackground = gdtc.bg_thresh; + *pdirty = gdtc.thresh; } /** @@ -392,47 +531,52 @@ static unsigned long wp_next_time(unsigned long cur_time) return cur_time; } -/* - * Increment the BDI's writeout completion count and the global writeout - * completion count. Called from test_clear_page_writeback(). - */ -static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +static void wb_domain_writeout_inc(struct wb_domain *dom, + struct fprop_local_percpu *completions, + unsigned int max_prop_frac) { - __inc_bdi_stat(bdi, BDI_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, - bdi->max_prop_frac); + __fprop_inc_percpu_max(&dom->completions, completions, + max_prop_frac); /* First event after period switching was turned off? */ - if (!unlikely(writeout_period_time)) { + if (!unlikely(dom->period_time)) { /* * We can race with other __bdi_writeout_inc calls here but * it does not cause any harm since the resulting time when * timer will fire and what is in writeout_period_time will be * roughly the same. 
*/ - writeout_period_time = wp_next_time(jiffies); - mod_timer(&writeout_period_timer, writeout_period_time); + dom->period_time = wp_next_time(jiffies); + mod_timer(&dom->period_timer, dom->period_time); } } -void bdi_writeout_inc(struct backing_dev_info *bdi) +/* + * Increment @wb's writeout completion count and the global writeout + * completion count. Called from test_clear_page_writeback(). + */ +static inline void __wb_writeout_inc(struct bdi_writeback *wb) { - unsigned long flags; + struct wb_domain *cgdom; - local_irq_save(flags); - __bdi_writeout_inc(bdi); - local_irq_restore(flags); + __inc_wb_stat(wb, WB_WRITTEN); + wb_domain_writeout_inc(&global_wb_domain, &wb->completions, + wb->bdi->max_prop_frac); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac); } -EXPORT_SYMBOL_GPL(bdi_writeout_inc); -/* - * Obtain an accurate fraction of the BDI's portion. - */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, - long *numerator, long *denominator) +void wb_writeout_inc(struct bdi_writeback *wb) { - fprop_fraction_percpu(&writeout_completions, &bdi->completions, - numerator, denominator); + unsigned long flags; + + local_irq_save(flags); + __wb_writeout_inc(wb); + local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * On idle system, we can be called long after we scheduled because we use @@ -440,22 +584,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, */ static void writeout_period(unsigned long t) { - int miss_periods = (jiffies - writeout_period_time) / + struct wb_domain *dom = (void *)t; + int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; - if (fprop_new_period(&writeout_completions, miss_periods + 1)) { - writeout_period_time = wp_next_time(writeout_period_time + + if (fprop_new_period(&dom->completions, miss_periods + 1)) { + dom->period_time = wp_next_time(dom->period_time + miss_periods * 
VM_COMPLETIONS_PERIOD_LEN); - mod_timer(&writeout_period_timer, writeout_period_time); + mod_timer(&dom->period_timer, dom->period_time); } else { /* * Aging has zeroed all fractions. Stop wasting CPU on period * updates. */ - writeout_period_time = 0; + dom->period_time = 0; } } +int wb_domain_init(struct wb_domain *dom, gfp_t gfp) +{ + memset(dom, 0, sizeof(*dom)); + + spin_lock_init(&dom->lock); + + init_timer_deferrable(&dom->period_timer); + dom->period_timer.function = writeout_period; + dom->period_timer.data = (unsigned long)dom; + + dom->dirty_limit_tstamp = jiffies; + + return fprop_global_init(&dom->completions, gfp); +} + +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + del_timer_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not @@ -510,17 +678,32 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh, return (thresh + bg_thresh) / 2; } -static unsigned long hard_dirty_limit(unsigned long thresh) +static unsigned long hard_dirty_limit(struct wb_domain *dom, + unsigned long thresh) +{ + return max(thresh, dom->dirty_limit); +} + +/* + * Memory which can be further allocated to a memcg domain is capped by + * system-wide clean memory excluding the amount being used in the domain. 
+ */ +static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, + unsigned long filepages, unsigned long headroom) { - return max(thresh, global_dirty_limit); + struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc); + unsigned long clean = filepages - min(filepages, mdtc->dirty); + unsigned long global_clean = gdtc->avail - min(gdtc->avail, gdtc->dirty); + unsigned long other_clean = global_clean - min(global_clean, clean); + + mdtc->avail = filepages + min(headroom, other_clean); } /** - * bdi_dirty_limit - @bdi's share of dirty throttling threshold - * @bdi: the backing_dev_info to query - * @dirty: global dirty limit in pages + * __wb_calc_thresh - @wb's share of dirty throttling threshold + * @dtc: dirty_throttle_context of interest * - * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * Returns @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. * * Note that balance_dirty_pages() will only seriously take it as a hard limit @@ -528,34 +711,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh) * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks - * more (rather than completely block them) when the bdi dirty pages go high. + * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * - * The bdi's share of dirty limit will be adapting to its throughput and + * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. 
*/ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { - u64 bdi_dirty; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + u64 wb_thresh; long numerator, denominator; + unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the dirty ratio. + * Calculate this BDI's share of the thresh ratio. */ - bdi_writeout_fraction(bdi, &numerator, &denominator); + fprop_fraction_percpu(&dom->completions, dtc->wb_completions, + &numerator, &denominator); + + wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh *= numerator; + do_div(wb_thresh, denominator); - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); + wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / 100; + if (wb_thresh > (thresh * wb_max_ratio) / 100) + wb_thresh = thresh * wb_max_ratio / 100; - return bdi_dirty; + return wb_thresh; +} + +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) +{ + struct dirty_throttle_control gdtc = { GDTC_INIT(wb), + .thresh = thresh }; + return __wb_calc_thresh(&gdtc); } /* @@ -594,7 +790,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * * (o) global/bdi setpoints * - * We want the dirty pages be balanced around the global/bdi setpoints. + * We want the dirty pages be balanced around the global/wb setpoints. * When the number of dirty pages is higher/lower than the setpoint, the * dirty position control ratio (and hence task dirty ratelimit) will be * decreased/increased to bring the dirty pages back to the setpoint. 
@@ -604,8 +800,8 @@ static long long pos_ratio_polynom(unsigned long setpoint, * if (dirty < setpoint) scale up pos_ratio * if (dirty > setpoint) scale down pos_ratio * - * if (bdi_dirty < bdi_setpoint) scale up pos_ratio - * if (bdi_dirty > bdi_setpoint) scale down pos_ratio + * if (wb_dirty < wb_setpoint) scale up pos_ratio + * if (wb_dirty > wb_setpoint) scale down pos_ratio * * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT * @@ -630,7 +826,7 @@ static long long pos_ratio_polynom(unsigned long setpoint, * 0 +------------.------------------.----------------------*-------------> * freerun^ setpoint^ limit^ dirty pages * - * (o) bdi control line + * (o) wb control line * * ^ pos_ratio * | @@ -656,33 +852,32 @@ static long long pos_ratio_polynom(unsigned long setpoint, * | . . * | . . * 0 +----------------------.-------------------------------.-------------> - * bdi_setpoint^ x_intercept^ + * wb_setpoint^ x_intercept^ * - * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can + * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can * be smoothly throttled down to normal if it starts high in situations like * - start writing to a slow SD card and a fast disk at the same time. The SD - * card's bdi_dirty may rush to many times higher than bdi_setpoint. - * - the bdi dirty thresh drops quickly due to change of JBOD workload + * card's wb_dirty may rush to many times higher than wb_setpoint. 
+ * - the wb dirty thresh drops quickly due to change of JBOD workload */ -static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty) -{ - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_position_ratio(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); + unsigned long wb_thresh = dtc->wb_thresh; unsigned long x_intercept; unsigned long setpoint; /* dirty pages' target balance point */ - unsigned long bdi_setpoint; + unsigned long wb_setpoint; unsigned long span; long long pos_ratio; /* for scaling up/down the rate limit */ long x; - if (unlikely(dirty >= limit)) - return 0; + dtc->pos_ratio = 0; + + if (unlikely(dtc->dirty >= limit)) + return; /* * global setpoint @@ -690,165 +885,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; - pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit); /* * The strictlimit feature is a tool preventing mistrusted filesystems * from growing a large number of dirty pages before throttling. For - * such filesystems balance_dirty_pages always checks bdi counters - * against bdi limits. Even if global "nr_dirty" is under "freerun". + * such filesystems balance_dirty_pages always checks wb counters + * against wb limits. Even if global "nr_dirty" is under "freerun". * This is especially important for fuse which sets bdi->max_ratio to * 1% by default. 
Without strictlimit feature, fuse writeback may * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * Here, in bdi_position_ratio(), we calculate pos_ratio based on - * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * Here, in wb_position_ratio(), we calculate pos_ratio based on + * two values: wb_dirty and wb_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). - * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. - * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is - * about ~6K pages (as the average of background and throttle bdi + * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if - * bdi_dirty is under bdi_setpoint and vice versa. + * wb_dirty is under wb_setpoint and vice versa. * * Note, that we cannot use global counters in these calculations - * because we want to throttle process writing to a strictlimit BDI + * because we want to throttle process writing to a strictlimit wb * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). 
*/ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - long long bdi_pos_ratio; - unsigned long bdi_bg_thresh; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long wb_pos_ratio; - if (bdi_dirty < 8) - return min_t(long long, pos_ratio * 2, - 2 << RATELIMIT_CALC_SHIFT); + if (dtc->wb_dirty < 8) { + dtc->pos_ratio = min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + return; + } - if (bdi_dirty >= bdi_thresh) - return 0; + if (dtc->wb_dirty >= wb_thresh) + return; - bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); - bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, - bdi_bg_thresh); + wb_setpoint = dirty_freerun_ceiling(wb_thresh, + dtc->wb_bg_thresh); - if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) - return 0; + if (wb_setpoint == 0 || wb_setpoint == wb_thresh) + return; - bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, - bdi_thresh); + wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty, + wb_thresh); /* - * Typically, for strictlimit case, bdi_setpoint << setpoint - * and pos_ratio >> bdi_pos_ratio. In the other words global + * Typically, for strictlimit case, wb_setpoint << setpoint + * and pos_ratio >> wb_pos_ratio. In the other words global * state ("dirty") is not limiting factor and we have to - * make decision based on bdi counters. But there is an + * make decision based on wb counters. But there is an * important case when global pos_ratio should get precedence: * global limits are exceeded (e.g. due to activities on other - * BDIs) while given strictlimit BDI is below limit. + * wb's) while given strictlimit wb is below limit. * - * "pos_ratio * bdi_pos_ratio" would work for the case above, + * "pos_ratio * wb_pos_ratio" would work for the case above, * but it would look too non-natural for the case of all - * activity in the system coming from a single strictlimit BDI + * activity in the system coming from a single strictlimit wb * with bdi->max_ratio == 100%. 
* * Note that min() below somewhat changes the dynamics of the * control system. Normally, pos_ratio value can be well over 3 - * (when globally we are at freerun and bdi is well below bdi + * (when globally we are at freerun and wb is well below wb * setpoint). Now the maximum pos_ratio in the same situation * is 2. We might want to tweak this if we observe the control * system is too slow to adapt. */ - return min(pos_ratio, bdi_pos_ratio); + dtc->pos_ratio = min(pos_ratio, wb_pos_ratio); + return; } /* * We have computed basic pos_ratio above based on global situation. If - * the bdi is over/under its share of dirty pages, we want to scale + * the wb is over/under its share of dirty pages, we want to scale * pos_ratio further down/up. That is done by the following mechanism. */ /* - * bdi setpoint + * wb setpoint * - * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint) + * f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint) * - * x_intercept - bdi_dirty + * x_intercept - wb_dirty * := -------------------------- - * x_intercept - bdi_setpoint + * x_intercept - wb_setpoint * - * The main bdi control line is a linear function that subjects to + * The main wb control line is a linear function that subjects to * - * (1) f(bdi_setpoint) = 1.0 - * (2) k = - 1 / (8 * write_bw) (in single bdi case) - * or equally: x_intercept = bdi_setpoint + 8 * write_bw + * (1) f(wb_setpoint) = 1.0 + * (2) k = - 1 / (8 * write_bw) (in single wb case) + * or equally: x_intercept = wb_setpoint + 8 * write_bw * - * For single bdi case, the dirty pages are observed to fluctuate + * For single wb case, the dirty pages are observed to fluctuate * regularly within range - * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2] + * [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2] * for various filesystems, where (2) can yield in a reasonable 12.5% * fluctuation range for pos_ratio. * - * For JBOD case, bdi_thresh (not bdi_dirty!) 
could fluctuate up to its + * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its * own size, so move the slope over accordingly and choose a slope that - * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh. + * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh. */ - if (unlikely(bdi_thresh > thresh)) - bdi_thresh = thresh; + if (unlikely(wb_thresh > dtc->thresh)) + wb_thresh = dtc->thresh; /* - * It's very possible that bdi_thresh is close to 0 not because the + * It's very possible that wb_thresh is close to 0 not because the * device is slow, but that it has remained inactive for long time. * Honour such devices a reasonable good (hopefully IO efficient) * threshold, so that the occasional writes won't be blocked and active * writes can rampup the threshold quickly. */ - bdi_thresh = max(bdi_thresh, (limit - dirty) / 8); + wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8); /* - * scale global setpoint to bdi's: - * bdi_setpoint = setpoint * bdi_thresh / thresh + * scale global setpoint to wb's: + * wb_setpoint = setpoint * wb_thresh / thresh */ - x = div_u64((u64)bdi_thresh << 16, thresh | 1); - bdi_setpoint = setpoint * (u64)x >> 16; + x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1); + wb_setpoint = setpoint * (u64)x >> 16; /* - * Use span=(8*write_bw) in single bdi case as indicated by - * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case. + * Use span=(8*write_bw) in single wb case as indicated by + * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case. 
* - * bdi_thresh thresh - bdi_thresh - * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh - * thresh thresh + * wb_thresh thresh - wb_thresh + * span = --------- * (8 * write_bw) + ------------------ * wb_thresh + * thresh thresh */ - span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16; - x_intercept = bdi_setpoint + span; + span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16; + x_intercept = wb_setpoint + span; - if (bdi_dirty < x_intercept - span / 4) { - pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty), - (x_intercept - bdi_setpoint) | 1); + if (dtc->wb_dirty < x_intercept - span / 4) { + pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty), + (x_intercept - wb_setpoint) | 1); } else pos_ratio /= 4; /* - * bdi reserve area, safeguard against dirty pool underrun and disk idle + * wb reserve area, safeguard against dirty pool underrun and disk idle * It may push the desired control point of global dirty pages higher * than setpoint. 
*/ - x_intercept = bdi_thresh / 2; - if (bdi_dirty < x_intercept) { - if (bdi_dirty > x_intercept / 8) - pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty); + x_intercept = wb_thresh / 2; + if (dtc->wb_dirty < x_intercept) { + if (dtc->wb_dirty > x_intercept / 8) + pos_ratio = div_u64(pos_ratio * x_intercept, + dtc->wb_dirty); else pos_ratio *= 8; } - return pos_ratio; + dtc->pos_ratio = pos_ratio; } -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, - unsigned long elapsed, - unsigned long written) +static void wb_update_write_bandwidth(struct bdi_writeback *wb, + unsigned long elapsed, + unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); - unsigned long avg = bdi->avg_write_bandwidth; - unsigned long old = bdi->write_bandwidth; + unsigned long avg = wb->avg_write_bandwidth; + unsigned long old = wb->write_bandwidth; u64 bw; /* @@ -861,14 +1058,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. 
*/ - bw = written - min(written, bdi->written_stamp); + bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); avg = bw; goto out; } - bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* @@ -881,21 +1078,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, avg += (old - avg) >> 3; out: - bdi->write_bandwidth = bw; - bdi->avg_write_bandwidth = avg; + /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ + avg = max(avg, 1LU); + if (wb_has_dirty_io(wb)) { + long delta = avg - wb->avg_write_bandwidth; + WARN_ON_ONCE(atomic_long_add_return(delta, + &wb->bdi->tot_write_bandwidth) <= 0); + } + wb->write_bandwidth = bw; + wb->avg_write_bandwidth = avg; } -/* - * The global dirtyable memory and dirty threshold could be suddenly knocked - * down by a large amount (eg. on the startup of KVM in a swapless system). - * This may throw the system into deep dirty exceeded state and throttle - * heavy/light dirtiers alike. To retain good responsiveness, maintain - * global_dirty_limit for tracking slowly down to the knocked down dirty - * threshold. - */ -static void update_dirty_limit(unsigned long thresh, unsigned long dirty) +static void update_dirty_limit(struct dirty_throttle_control *dtc) { - unsigned long limit = global_dirty_limit; + struct wb_domain *dom = dtc_dom(dtc); + unsigned long thresh = dtc->thresh; + unsigned long limit = dom->dirty_limit; /* * Follow up in one step. @@ -908,63 +1106,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty) /* * Follow down slowly. Use the higher one as the target, because thresh * may drop below dirty. This is exactly the reason to introduce - * global_dirty_limit which is guaranteed to lie above the dirty pages. + * dom->dirty_limit which is guaranteed to lie above the dirty pages. 
*/ - thresh = max(thresh, dirty); + thresh = max(thresh, dtc->dirty); if (limit > thresh) { limit -= (limit - thresh) >> 5; goto update; } return; update: - global_dirty_limit = limit; + dom->dirty_limit = limit; } -static void global_update_bandwidth(unsigned long thresh, - unsigned long dirty, +static void domain_update_bandwidth(struct dirty_throttle_control *dtc, unsigned long now) { - static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time = INITIAL_JIFFIES; + struct wb_domain *dom = dtc_dom(dtc); /* * check locklessly first to optimize away locking for the most time */ - if (time_before(now, update_time + BANDWIDTH_INTERVAL)) + if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) return; - spin_lock(&dirty_lock); - if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { - update_dirty_limit(thresh, dirty); - update_time = now; + spin_lock(&dom->lock); + if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) { + update_dirty_limit(dtc); + dom->dirty_limit_tstamp = now; } - spin_unlock(&dirty_lock); + spin_unlock(&dom->lock); } /* - * Maintain bdi->dirty_ratelimit, the base dirty throttle rate. + * Maintain wb->dirty_ratelimit, the base dirty throttle rate. * - * Normal bdi tasks will be curbed at or below it in long term. + * Normal wb tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. 
*/ -static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long dirtied, - unsigned long elapsed) -{ - unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); - unsigned long limit = hard_dirty_limit(thresh); +static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, + unsigned long dirtied, + unsigned long elapsed) +{ + struct bdi_writeback *wb = dtc->wb; + unsigned long dirty = dtc->dirty; + unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh); + unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh); unsigned long setpoint = (freerun + limit) / 2; - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long dirty_ratelimit = bdi->dirty_ratelimit; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; - unsigned long pos_ratio; unsigned long step; unsigned long x; @@ -972,20 +1164,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ - dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; - pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty); /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ task_ratelimit = (u64)dirty_ratelimit * - pos_ratio >> RATELIMIT_CALC_SHIFT; + dtc->pos_ratio >> RATELIMIT_CALC_SHIFT; task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */ /* * A linear estimation of the "balanced" throttle rate. 
The theory is, - * if there are N dd tasks, each throttled at task_ratelimit, the bdi's + * if there are N dd tasks, each throttled at task_ratelimit, the wb's * dirty_rate will be measured to be (N * task_ratelimit). So the below * formula will yield the balanced rate limit (write_bw / N). * @@ -1024,7 +1214,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, /* * We could safely do this and return immediately: * - * bdi->dirty_ratelimit = balanced_dirty_ratelimit; + * wb->dirty_ratelimit = balanced_dirty_ratelimit; * * However to get a more stable dirty_ratelimit, the below elaborated * code makes use of task_ratelimit to filter out singular points and @@ -1058,32 +1248,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, step = 0; /* - * For strictlimit case, calculations above were based on bdi counters - * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * For strictlimit case, calculations above were based on wb counters + * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). - * Hence, to calculate "step" properly, we have to use bdi_dirty as - * "dirty" and bdi_setpoint as "setpoint". + * Hence, to calculate "step" properly, we have to use wb_dirty as + * "dirty" and wb_setpoint as "setpoint". * - * We rampup dirty_ratelimit forcibly if bdi_dirty is low because - * it's possible that bdi_thresh is close to zero due to inactivity - * of backing device (see the implementation of bdi_dirty_limit()). + * We rampup dirty_ratelimit forcibly if wb_dirty is low because + * it's possible that wb_thresh is close to zero due to inactivity + * of backing device. 
*/ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { - dirty = bdi_dirty; - if (bdi_dirty < 8) - setpoint = bdi_dirty + 1; + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = dtc->wb_dirty; + if (dtc->wb_dirty < 8) + setpoint = dtc->wb_dirty + 1; else - setpoint = (bdi_thresh + - bdi_dirty_limit(bdi, bg_thresh)) / 2; + setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2; } if (dirty < setpoint) { - x = min3(bdi->balanced_dirty_ratelimit, + x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max3(bdi->balanced_dirty_ratelimit, + x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; @@ -1105,69 +1294,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, else dirty_ratelimit -= step; - bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); - bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit); } -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc, + struct dirty_throttle_control *mdtc, + unsigned long start_time, + bool update_ratelimit) { + struct bdi_writeback *wb = gdtc->wb; unsigned long now = jiffies; - unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long elapsed = now - wb->bw_time_stamp; unsigned long dirtied; unsigned long written; + lockdep_assert_held(&wb->list_lock); + /* * rate-limit, only update once every 200ms. 
*/ if (elapsed < BANDWIDTH_INTERVAL) return; - dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); - written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); + written = percpu_counter_read(&wb->stat[WB_WRITTEN]); /* * Skip quiet periods when disk bandwidth is under-utilized. * (at least 1s idle time between two flusher runs) */ - if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) goto snapshot; - if (thresh) { - global_update_bandwidth(thresh, dirty, now); - bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, - dirtied, elapsed); + if (update_ratelimit) { + domain_update_bandwidth(gdtc, now); + wb_update_dirty_ratelimit(gdtc, dirtied, elapsed); + + /* + * @mdtc is always NULL if !CGROUP_WRITEBACK but the + * compiler has no way to figure that out. Help it. + */ + if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) { + domain_update_bandwidth(mdtc, now); + wb_update_dirty_ratelimit(mdtc, dirtied, elapsed); + } } - bdi_update_write_bandwidth(bdi, elapsed, written); + wb_update_write_bandwidth(wb, elapsed, written); snapshot: - bdi->dirtied_stamp = dirtied; - bdi->written_stamp = written; - bdi->bw_time_stamp = now; + wb->dirtied_stamp = dirtied; + wb->written_stamp = written; + wb->bw_time_stamp = now; } -static void bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) - return; - spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, start_time); - spin_unlock(&bdi->wb.list_lock); + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) 
}; + + __wb_update_bandwidth(&gdtc, NULL, start_time, false); } /* @@ -1187,10 +1374,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long wb_max_pause(struct bdi_writeback *wb, + unsigned long wb_dirty) { - unsigned long bw = bdi->avg_write_bandwidth; + unsigned long bw = wb->avg_write_bandwidth; unsigned long t; /* @@ -1200,20 +1387,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, * * 8 serves as the safety ratio. */ - t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); + t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); t++; return min_t(unsigned long, t, MAX_PAUSE); } -static long bdi_min_pause(struct backing_dev_info *bdi, - long max_pause, - unsigned long task_ratelimit, - unsigned long dirty_ratelimit, - int *nr_dirtied_pause) +static long wb_min_pause(struct bdi_writeback *wb, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) { - long hi = ilog2(bdi->avg_write_bandwidth); - long lo = ilog2(bdi->dirty_ratelimit); + long hi = ilog2(wb->avg_write_bandwidth); + long lo = ilog2(wb->dirty_ratelimit); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1281,34 +1468,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 
1 + t / 2 : t; } -static inline void bdi_dirty_limits(struct backing_dev_info *bdi, - unsigned long dirty_thresh, - unsigned long background_thresh, - unsigned long *bdi_dirty, - unsigned long *bdi_thresh, - unsigned long *bdi_bg_thresh) +static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) { - unsigned long bdi_reclaimable; + struct bdi_writeback *wb = dtc->wb; + unsigned long wb_reclaimable; /* - * bdi_thresh is not treated as some limiting factor as + * wb_thresh is not treated as some limiting factor as * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in JBOD setup, wb_thresh can fluctuate a lot * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. + * go into state (wb_dirty >> wb_thresh) either because + * wb_dirty starts high, or because wb_thresh drops low. * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + * dirtiers for 100 seconds until wb_dirty drops under + * wb_thresh. Instead the auxiliary wb control line in + * wb_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - if (bdi_bg_thresh) - *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * - background_thresh, - dirty_thresh) : 0; + dtc->wb_thresh = __wb_calc_thresh(dtc); + dtc->wb_bg_thresh = dtc->thresh ? 
+ div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need @@ -1320,14 +1500,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); + if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); + wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); + dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } @@ -1339,12 +1517,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * perform some writeout. */ static void balance_dirty_pages(struct address_space *mapping, + struct bdi_writeback *wb, unsigned long pages_dirtied) { + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? 
+ &mdtc_stor : NULL; + struct dirty_throttle_control *sdtc; unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long background_thresh; - unsigned long dirty_thresh; long period; long pause; long max_pause; @@ -1353,18 +1535,16 @@ static void balance_dirty_pages(struct address_space *mapping, bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; - unsigned long pos_ratio; - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; - unsigned long uninitialized_var(bdi_thresh); - unsigned long thresh; - unsigned long uninitialized_var(bdi_dirty); - unsigned long dirty; - unsigned long bg_thresh; + unsigned long dirty, thresh, bg_thresh; + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ + unsigned long m_thresh = 0; + unsigned long m_bg_thresh = 0; /* * Unstable writes are a feature of certain networked @@ -1374,65 +1554,127 @@ static void balance_dirty_pages(struct address_space *mapping, */ nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); - nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); - global_dirty_limits(&background_thresh, &dirty_thresh); + domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, &bg_thresh); + wb_dirty_limits(gdtc); - dirty = bdi_dirty; - thresh = bdi_thresh; + dirty = gdtc->wb_dirty; + thresh = gdtc->wb_thresh; + bg_thresh = gdtc->wb_bg_thresh; } else { - dirty = nr_dirty; - thresh = dirty_thresh; - bg_thresh = background_thresh; + dirty = gdtc->dirty; + thresh = gdtc->thresh; + 
bg_thresh = gdtc->bg_thresh; + } + + if (mdtc) { + unsigned long filepages, headroom, writeback; + + /* + * If @wb belongs to !root memcg, repeat the same + * basic calculations for the memcg domain. + */ + mem_cgroup_wb_stats(wb, &filepages, &headroom, + &mdtc->dirty, &writeback); + mdtc->dirty += writeback; + mdtc_calc_avail(mdtc, filepages, headroom); + + domain_dirty_limits(mdtc); + + if (unlikely(strictlimit)) { + wb_dirty_limits(mdtc); + m_dirty = mdtc->wb_dirty; + m_thresh = mdtc->wb_thresh; + m_bg_thresh = mdtc->wb_bg_thresh; + } else { + m_dirty = mdtc->dirty; + m_thresh = mdtc->thresh; + m_bg_thresh = mdtc->bg_thresh; + } } /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up in case of !strictlimit. + * when the wb limits are ramping up in case of !strictlimit. * - * In strictlimit case make decision based on the bdi counters - * and limits. Small writeouts when the bdi limits are ramping + * In strictlimit case make decision based on the wb counters + * and limits. Small writeouts when the wb limits are ramping * up are the price we consciously pay for strictlimit-ing. + * + * If memcg domain is in effect, @dirty should be under + * both global and memcg freerun ceilings. 
*/ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && + (!mdtc || + m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + unsigned long intv = dirty_poll_interval(dirty, thresh); + unsigned long m_intv = ULONG_MAX; + current->dirty_paused_when = now; current->nr_dirtied = 0; - current->nr_dirtied_pause = - dirty_poll_interval(dirty, thresh); + if (mdtc) + m_intv = dirty_poll_interval(m_dirty, m_thresh); + current->nr_dirtied_pause = min(intv, m_intv); break; } - if (unlikely(!writeback_in_progress(bdi))) - bdi_start_background_writeback(bdi); + if (unlikely(!writeback_in_progress(wb))) + wb_start_background_writeback(wb); + /* + * Calculate global domain's pos_ratio and select the + * global dtc by default. + */ if (!strictlimit) - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, NULL); - - dirty_exceeded = (bdi_dirty > bdi_thresh) && - ((nr_dirty > dirty_thresh) || strictlimit); - if (dirty_exceeded && !bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, - nr_dirty, bdi_thresh, bdi_dirty, - start_time); - - dirty_ratelimit = bdi->dirty_ratelimit; - pos_ratio = bdi_position_ratio(bdi, dirty_thresh, - background_thresh, nr_dirty, - bdi_thresh, bdi_dirty); - task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> + wb_dirty_limits(gdtc); + + dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && + ((gdtc->dirty > gdtc->thresh) || strictlimit); + + wb_position_ratio(gdtc); + sdtc = gdtc; + + if (mdtc) { + /* + * If memcg domain is in effect, calculate its + * pos_ratio. @wb should satisfy constraints from + * both global and memcg domains. Choose the one + * w/ lower pos_ratio. 
+ */ + if (!strictlimit) + wb_dirty_limits(mdtc); + + dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && + ((mdtc->dirty > mdtc->thresh) || strictlimit); + + wb_position_ratio(mdtc); + if (mdtc->pos_ratio < gdtc->pos_ratio) + sdtc = mdtc; + } + + if (dirty_exceeded && !wb->dirty_exceeded) + wb->dirty_exceeded = 1; + + if (time_is_before_jiffies(wb->bw_time_stamp + + BANDWIDTH_INTERVAL)) { + spin_lock(&wb->list_lock); + __wb_update_bandwidth(gdtc, mdtc, start_time, true); + spin_unlock(&wb->list_lock); + } + + /* throttle according to the chosen dtc */ + dirty_ratelimit = wb->dirty_ratelimit; + task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >> RATELIMIT_CALC_SHIFT; - max_pause = bdi_max_pause(bdi, bdi_dirty); - min_pause = bdi_min_pause(bdi, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); + max_pause = wb_max_pause(wb, sdtc->wb_dirty); + min_pause = wb_min_pause(wb, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; @@ -1451,12 +1693,12 @@ static void balance_dirty_pages(struct address_space *mapping, * do a reset, as it may be a light dirtier. 
*/ if (pause < min_pause) { - trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + trace_balance_dirty_pages(wb, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1480,12 +1722,12 @@ static void balance_dirty_pages(struct address_space *mapping, } pause: - trace_balance_dirty_pages(bdi, - dirty_thresh, - background_thresh, - nr_dirty, - bdi_thresh, - bdi_dirty, + trace_balance_dirty_pages(wb, + sdtc->thresh, + sdtc->bg_thresh, + sdtc->dirty, + sdtc->wb_thresh, + sdtc->wb_dirty, dirty_ratelimit, task_ratelimit, pages_dirtied, @@ -1500,33 +1742,33 @@ pause: current->nr_dirtied_pause = nr_dirtied_pause; /* - * This is typically equal to (nr_dirty < dirty_thresh) and can - * also keep "1000+ dd on a slow USB stick" under control. + * This is typically equal to (dirty < thresh) and can also + * keep "1000+ dd on a slow USB stick" under control. */ if (task_ratelimit) break; /* * In the case of an unresponding NFS server and the NFS dirty - * pages exceeds dirty_thresh, give the other good bdi's a pipe + * pages exceeds dirty_thresh, give the other good wb's a pipe * to go through, so that tasks on them still remain responsive. * * In theory 1 page is enough to keep the comsumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 - * more page. However bdi_dirty has accounting errors. So use - * the larger and more IO friendly bdi_stat_error. + * more page. However wb_dirty has accounting errors. So use + * the larger and more IO friendly wb_stat_error. 
*/ - if (bdi_dirty <= bdi_stat_error(bdi)) + if (sdtc->wb_dirty <= wb_stat_error(wb)) break; if (fatal_signal_pending(current)) break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (!dirty_exceeded && wb->dirty_exceeded) + wb->dirty_exceeded = 0; - if (writeback_in_progress(bdi)) + if (writeback_in_progress(wb)) return; /* @@ -1540,8 +1782,8 @@ pause: if (laptop_mode) return; - if (nr_reclaimable > background_thresh) - bdi_start_background_writeback(bdi); + if (nr_reclaimable > gdtc->bg_thresh) + wb_start_background_writeback(wb); } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1577,15 +1819,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; */ void balance_dirty_pages_ratelimited(struct address_space *mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; int ratelimit; int *p; if (!bdi_cap_account_dirty(bdi)) return; + if (inode_cgwb_enabled(inode)) + wb = wb_get_create_current(bdi, GFP_KERNEL); + if (!wb) + wb = &bdi->wb; + ratelimit = current->nr_dirtied_pause; - if (bdi->dirty_exceeded) + if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); @@ -1617,10 +1866,60 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, current->nr_dirtied); + balance_dirty_pages(mapping, wb, current->nr_dirtied); + + wb_put(wb); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +/** + * wb_over_bg_thresh - does @wb need to be written back? + * @wb: bdi_writeback of interest + * + * Determines whether background writeback should keep writing @wb or it's + * clean enough. Returns %true if writeback should continue. 
+ */ +bool wb_over_bg_thresh(struct bdi_writeback *wb) +{ + struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; + struct dirty_throttle_control * const gdtc = &gdtc_stor; + struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? + &mdtc_stor : NULL; + + /* + * Similar to balance_dirty_pages() but ignores pages being written + * as we're trying to decide whether to put more under writeback. + */ + gdtc->avail = global_dirtyable_memory(); + gdtc->dirty = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + domain_dirty_limits(gdtc); + + if (gdtc->dirty > gdtc->bg_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + return true; + + if (mdtc) { + unsigned long filepages, headroom, writeback; + + mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, + &writeback); + mdtc_calc_avail(mdtc, filepages, headroom); + domain_dirty_limits(mdtc); /* ditto, ignore writeback */ + + if (mdtc->dirty > mdtc->bg_thresh) + return true; + + if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + return true; + } + + return false; +} + void throttle_vm_writeout(gfp_t gfp_mask) { unsigned long background_thresh; @@ -1628,7 +1927,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) for ( ; ; ) { global_dirty_limits(&background_thresh, &dirty_thresh); - dirty_thresh = hard_dirty_limit(dirty_thresh); + dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -1667,14 +1966,21 @@ void laptop_mode_timer_fn(unsigned long data) struct request_queue *q = (struct request_queue *)data; int nr_pages = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); + struct bdi_writeback *wb; /* * We want to write everything out, not just down to the dirty * threshold */ - if (bdi_has_dirty_io(&q->backing_dev_info)) - bdi_start_writeback(&q->backing_dev_info, nr_pages, 
- WB_REASON_LAPTOP_TIMER); + if (!bdi_has_dirty_io(&q->backing_dev_info)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node) + if (wb_has_dirty_io(wb)) + wb_start_writeback(wb, nr_pages, true, + WB_REASON_LAPTOP_TIMER); + rcu_read_unlock(); } /* @@ -1718,10 +2024,12 @@ void laptop_sync_completion(void) void writeback_set_ratelimit(void) { + struct wb_domain *dom = &global_wb_domain; unsigned long background_thresh; unsigned long dirty_thresh; + global_dirty_limits(&background_thresh, &dirty_thresh); - global_dirty_limit = dirty_thresh; + dom->dirty_limit = dirty_thresh; ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); if (ratelimit_pages < 16) ratelimit_pages = 16; @@ -1767,10 +2075,10 @@ static struct notifier_block ratelimit_nb = { */ void __init page_writeback_init(void) { + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - - fprop_global_init(&writeout_completions, GFP_KERNEL); } /** @@ -2090,19 +2398,29 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. + * + * Caller must hold mem_cgroup_begin_page_stat(). + * * NOTE: This relies on being atomic wrt interrupts. 
*/ -void account_page_dirtied(struct page *page, struct address_space *mapping) +void account_page_dirtied(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg) { + struct inode *inode = mapping->host; + trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb; + + inode_attach_wb(inode, page); + wb = inode_to_wb(inode); + mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(bdi, BDI_RECLAIMABLE); - __inc_bdi_stat(bdi, BDI_DIRTIED); + __inc_wb_stat(wb, WB_RECLAIMABLE); + __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2113,21 +2431,18 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Doing this should *normally* only ever be done when a page - * is truncated, and is not actually mapped anywhere at all. However, - * fs/buffer.c does this when it notices that somebody has cleaned - * out all the buffers on a page without actually doing it through - * the VM. Can you say "ext3 is horribly ugly"? Thought you could. + * Caller must hold mem_cgroup_begin_page_stat(). */ -void account_page_cleaned(struct page *page, struct address_space *mapping) +void account_page_cleaned(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); } } -EXPORT_SYMBOL(account_page_cleaned); /* * For address_spaces which do not use buffers. 
Just tag the page as dirty in @@ -2143,26 +2458,34 @@ EXPORT_SYMBOL(account_page_cleaned); */ int __set_page_dirty_nobuffers(struct page *page) { + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; - if (!mapping) + if (!mapping) { + mem_cgroup_end_page_stat(memcg); return 1; + } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); + if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } return 1; } + mem_cgroup_end_page_stat(memcg); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2177,10 +2500,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers); void account_page_redirty(struct page *page) { struct address_space *mapping = page->mapping; + if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + bool locked; + + wb = unlocked_inode_to_wb_begin(inode, &locked); current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); + dec_wb_stat(wb, WB_DIRTIED); + unlocked_inode_to_wb_end(inode, locked); } } EXPORT_SYMBOL(account_page_redirty); @@ -2266,6 +2596,43 @@ int set_page_dirty_lock(struct page *page) EXPORT_SYMBOL(set_page_dirty_lock); /* + * This cancels just the dirty bit on the kernel page itself, it does NOT + * actually remove dirty bits on any mmap's that may be around. 
It also + * leaves the page tagged dirty, so any sync activity will still find it on + * the dirty lists, and in particular, clear_page_dirty_for_io() will still + * look at the dirty bits in the VM. + * + * Doing this should *normally* only ever be done when a page is truncated, + * and is not actually mapped anywhere at all. However, fs/buffer.c does + * this when it notices that somebody has cleaned out all the buffers on a + * page without actually doing it through the VM. Can you say "ext3 is + * horribly ugly"? Thought you could. + */ +void cancel_dirty_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + + if (mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + + memcg = mem_cgroup_begin_page_stat(page); + wb = unlocked_inode_to_wb_begin(inode, &locked); + + if (TestClearPageDirty(page)) + account_page_cleaned(page, mapping, memcg, wb); + + unlocked_inode_to_wb_end(inode, locked); + mem_cgroup_end_page_stat(memcg); + } else { + ClearPageDirty(page); + } +} +EXPORT_SYMBOL(cancel_dirty_page); + +/* * Clear a page's dirty flag, while caring for dirty memory accounting. * Returns true if the page was previously dirty. * @@ -2282,10 +2649,16 @@ EXPORT_SYMBOL(set_page_dirty_lock); int clear_page_dirty_for_io(struct page *page) { struct address_space *mapping = page_mapping(page); + int ret = 0; BUG_ON(!PageLocked(page)); if (mapping && mapping_cap_account_dirty(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct mem_cgroup *memcg; + bool locked; + /* * Yes, Virginia, this is indeed insane. * @@ -2321,13 +2694,17 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. 
*/ + memcg = mem_cgroup_begin_page_stat(page); + wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { + mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); - return 1; + dec_wb_stat(wb, WB_RECLAIMABLE); + ret = 1; } - return 0; + unlocked_inode_to_wb_end(inode, locked); + mem_cgroup_end_page_stat(memcg); + return ret; } return TestClearPageDirty(page); } @@ -2341,7 +2718,8 @@ int test_clear_page_writeback(struct page *page) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2351,8 +2729,10 @@ int test_clear_page_writeback(struct page *page) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { - __dec_bdi_stat(bdi, BDI_WRITEBACK); - __bdi_writeout_inc(bdi); + struct bdi_writeback *wb = inode_to_wb(inode); + + __dec_wb_stat(wb, WB_WRITEBACK); + __wb_writeout_inc(wb); } } spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -2376,7 +2756,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write) memcg = mem_cgroup_begin_page_stat(page); if (mapping) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct inode *inode = mapping->host; + struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); @@ -2386,7 +2767,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_bdi_stat(bdi, BDI_WRITEBACK); + __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, diff --git a/kernel/mm/page_alloc.c b/kernel/mm/page_alloc.c index 
41bd90d60..d002418fc 100644 --- a/kernel/mm/page_alloc.c +++ b/kernel/mm/page_alloc.c @@ -62,6 +62,7 @@ #include <linux/sched/rt.h> #include <linux/locallock.h> #include <linux/page_owner.h> +#include <linux/kthread.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -125,6 +126,24 @@ unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +/* + * A cached value of the page's pageblock's migratetype, used when the page is + * put on a pcplist. Used to avoid the pageblock migratetype lookup when + * freeing from pcplists in most cases, at the cost of possibly becoming stale. + * Also the migratetype set in the page does not necessarily match the pcplist + * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any + * other index - this ensures that it will be put on the correct CMA freelist. + */ +static inline int get_pcppage_migratetype(struct page *page) +{ + return page->index; +} + +static inline void set_pcppage_migratetype(struct page *page, int migratetype) +{ + page->index = migratetype; +} + #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily @@ -151,19 +170,19 @@ void pm_restrict_gfp_mask(void) WARN_ON(!mutex_is_locked(&pm_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~GFP_IOFS; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } bool pm_suspended_storage(void) { - if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) return false; return true; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -int pageblock_order __read_mostly; +unsigned int pageblock_order __read_mostly; #endif static void __free_pages_ok(struct page *page, unsigned int order); @@ -206,6 +225,18 @@ static char * const zone_names[MAX_NR_ZONES] = { "HighMem", #endif "Movable", +#ifdef CONFIG_ZONE_DEVICE + 
"Device", +#endif +}; + +static void free_compound_page(struct page *page); +compound_page_dtor * const compound_page_dtors[] = { + NULL, + free_compound_page, +#ifdef CONFIG_HUGETLB_PAGE + free_huge_page, +#endif }; int min_free_kbytes = 1024; @@ -248,6 +279,75 @@ static DEFINE_LOCAL_IRQ_LOCK(pa_lock); int page_group_by_mobility_disabled __read_mostly; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ + pgdat->first_deferred_pfn = ULONG_MAX; +} + +/* Returns true if the struct page for the pfn is uninitialised */ +static inline bool __meminit early_page_uninitialised(unsigned long pfn) +{ + if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) + return true; + + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + if (pfn >= NODE_DATA(nid)->first_deferred_pfn) + return true; + + return false; +} + +/* + * Returns false when the remaining initialisation should be deferred until + * later in the boot cycle when it can be parallelised. 
+ */ +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + /* Always populate low zones for address-contrained allocations */ + if (zone_end < pgdat_end_pfn(pgdat)) + return true; + + /* Initialise at least 2G of the highest zone */ + (*nr_initialised)++; + if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + (pfn & (PAGES_PER_SECTION - 1)) == 0) { + pgdat->first_deferred_pfn = pfn; + return false; + } + + return true; +} +#else +static inline void reset_deferred_meminit(pg_data_t *pgdat) +{ +} + +static inline bool early_page_uninitialised(unsigned long pfn) +{ + return false; +} + +static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid) +{ + return false; +} + +static inline bool update_defer_init(pg_data_t *pgdat, + unsigned long pfn, unsigned long zone_end, + unsigned long *nr_initialised) +{ + return true; +} +#endif + + void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled && @@ -358,15 +458,15 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. * - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. 
* This usage means that zero-order pages may not be compound. */ @@ -375,38 +475,21 @@ static void free_compound_page(struct page *page) __free_pages_ok(page, compound_order(page)); } -void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; - set_compound_page_dtor(page, free_compound_page); + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } } -static inline void prep_zero_page(struct page *page, unsigned int order, - gfp_t gfp_flags) -{ - int i; - - /* - * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO - * and __GFP_HIGHMEM from hard or soft interrupt context. - */ - VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly; @@ -592,7 +675,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - int max_order = MAX_ORDER; + unsigned int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -605,7 +688,7 @@ static inline void __free_one_page(struct page *page, * pageblock. Without this, pageblock isolation * could cause incorrect freepage accounting. 
*/ - max_order = min(MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); } else { __mod_zone_freepage_state(zone, 1 << order, migratetype); } @@ -724,11 +807,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); + mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); to_free--; @@ -775,6 +860,7 @@ static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src, do { page = list_last_entry(list, struct page, lru); list_del(&page->lru); + list_add(&page->lru, dst); } while (--to_free && --batch_free && !list_empty(list)); } @@ -803,17 +889,103 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. 
+ */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; + } + ret = 0; +out: + clear_compound_head(page); + return ret; +} + +static void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid) +{ + set_page_links(page, zone, nid, pfn); + init_page_count(page); + page_mapcount_reset(page); + page_cpupid_reset_last(page); + + INIT_LIST_HEAD(&page->lru); +#ifdef WANT_PAGE_VIRTUAL + /* The shift won't overflow because ZONE_NORMAL is below 4G. */ + if (!is_highmem_idx(zone)) + set_page_address(page, __va(pfn << PAGE_SHIFT)); +#endif +} + +static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, + int nid) +{ + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void init_reserved_page(unsigned long pfn) +{ + pg_data_t *pgdat; + int nid, zid; + + if (!early_page_uninitialised(pfn)) + return; + + nid = early_pfn_to_nid(pfn); + pgdat = NODE_DATA(nid); + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) + break; + } + __init_single_pfn(pfn, zid, nid); +} +#else +static inline void init_reserved_page(unsigned long pfn) +{ +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + +/* + * Initialised pages do not have PageReserved set. This function is + * called for each range allocated by the bootmem allocator and + * marks the pages PageReserved. The remaining valid pages are later + * sent to the buddy page allocator. 
+ */ +void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = PFN_DOWN(start); + unsigned long end_pfn = PFN_UP(end); + + for (; start_pfn < end_pfn; start_pfn++) { + if (pfn_valid(start_pfn)) { + struct page *page = pfn_to_page(start_pfn); + + init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + + SetPageReserved(page); + } } - return 0; } static bool free_pages_prepare(struct page *page, unsigned int order) @@ -865,12 +1037,12 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_lock_irqsave(pa_lock, flags); __count_vm_events(PGFREE, 1 << order); - set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_unlock_irqrestore(pa_lock, flags); } -void __init __free_pages_bootmem(struct page *page, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, + unsigned long pfn, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -890,6 +1062,235 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order) __free_pages(page, order); } +#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ + defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) + +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + static DEFINE_SPINLOCK(early_pfn_lock); + int nid; + + spin_lock(&early_pfn_lock); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); + if (nid < 0) + nid = 0; + spin_unlock(&early_pfn_lock); + + return nid; +} +#endif + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + int nid; + + nid = __early_pfn_to_nid(pfn, state); + if (nid >= 0 && nid != node) + return false; + return true; +} + +/* Only safe to use early 
in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); +} + +#else + +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + return true; +} +static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) +{ + return true; +} +#endif + + +void __init __free_pages_bootmem(struct page *page, unsigned long pfn, + unsigned int order) +{ + if (early_page_uninitialised(pfn)) + return; + return __free_pages_boot_core(page, pfn, order); +} + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static void __init deferred_free_range(struct page *page, + unsigned long pfn, int nr_pages) +{ + int i; + + if (!page) + return; + + /* Free a large naturally-aligned chunk if possible */ + if (nr_pages == MAX_ORDER_NR_PAGES && + (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + __free_pages_boot_core(page, pfn, MAX_ORDER-1); + return; + } + + for (i = 0; i < nr_pages; i++, page++, pfn++) + __free_pages_boot_core(page, pfn, 0); +} + +/* Completion tracking for deferred_init_memmap() threads */ +static atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); + +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} + +/* Initialise remaining memory on a node */ +static int __init deferred_init_memmap(void *data) +{ + pg_data_t *pgdat = data; + int nid = pgdat->node_id; + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long start = jiffies; + unsigned long nr_pages = 0; + unsigned long walk_start, walk_end; + int i, zid; + struct zone *zone; + unsigned long first_init_pfn = pgdat->first_deferred_pfn; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (first_init_pfn 
== ULONG_MAX) { + pgdat_init_report_one_done(); + return 0; + } + + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + /* Sanity check boundaries */ + BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); + BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); + pgdat->first_deferred_pfn = ULONG_MAX; + + /* Only the highest zone is deferred so find it */ + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + zone = pgdat->node_zones + zid; + if (first_init_pfn < zone_end_pfn(zone)) + break; + } + + for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { + unsigned long pfn, end_pfn; + struct page *page = NULL; + struct page *free_base_page = NULL; + unsigned long free_base_pfn = 0; + int nr_to_free = 0; + + end_pfn = min(walk_end, zone_end_pfn(zone)); + pfn = first_init_pfn; + if (pfn < walk_start) + pfn = walk_start; + if (pfn < zone->zone_start_pfn) + pfn = zone->zone_start_pfn; + + for (; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + goto free_range; + + /* + * Ensure pfn_valid is checked every + * MAX_ORDER_NR_PAGES for memory holes + */ + if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { + if (!pfn_valid(pfn)) { + page = NULL; + goto free_range; + } + } + + if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + page = NULL; + goto free_range; + } + + /* Minimise pfn page lookups and scheduler checks */ + if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) { + page++; + } else { + nr_pages += nr_to_free; + deferred_free_range(free_base_page, + free_base_pfn, nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + + page = pfn_to_page(pfn); + cond_resched(); + } + + if (page->flags) { + VM_BUG_ON(page_zone(page) != zone); + goto free_range; + } + + __init_single_page(page, pfn, zid, nid); + if (!free_base_page) { + free_base_page = page; + free_base_pfn = pfn; + nr_to_free = 0; + } + nr_to_free++; + + /* Where possible, batch up pages for 
a single free */ + continue; +free_range: + /* Free the current block of pages to allocator */ + nr_pages += nr_to_free; + deferred_free_range(free_base_page, free_base_pfn, + nr_to_free); + free_base_page = NULL; + free_base_pfn = nr_to_free = 0; + } + + first_init_pfn = max(end_pfn, first_init_pfn); + } + + /* Sanity check that the next zone really is unpopulated */ + WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); + + pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, + jiffies_to_msecs(jiffies - start)); + + pgdat_init_report_one_done(); + return 0; +} + +void __init page_alloc_init_late(void) +{ + int nid; + + /* There will be num_node_state(N_MEMORY) threads */ + atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); + for_each_node_state(nid, N_MEMORY) { + kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); + } + + /* Block until all are initialised */ + wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); +} +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. 
*/ void __init init_cma_reserved_pageblock(struct page *page) @@ -979,6 +1380,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; @@ -1013,7 +1418,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -1058,7 +1464,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); return page; } @@ -1071,15 +1477,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * the free lists for the desirable migrate type are depleted */ static int fallbacks[MIGRATE_TYPES][4] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif - 
[MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ #endif }; @@ -1104,7 +1509,7 @@ int move_freepages(struct zone *zone, int migratetype) { struct page *page; - unsigned long order; + unsigned int order; int pages_moved = 0; #ifndef CONFIG_HOLES_IN_ZONE @@ -1135,7 +1540,6 @@ int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); - set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -1218,7 +1622,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type) { - int current_order = page_order(page); + unsigned int current_order = page_order(page); int pages; /* Take ownership for orders >= pageblock_order */ @@ -1253,7 +1657,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, *can_steal = false; for (i = 0;; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_RESERVE) + if (fallback_mt == MIGRATE_TYPES) break; if (list_empty(&area->free_list[fallback_mt])) @@ -1272,6 +1676,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; } +/* + * Reserve a pageblock for exclusive use of high-order atomic allocations if + * there are no empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, + unsigned int alloc_order) +{ + int mt; + unsigned long max_managed, flags; + + /* + * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * Check is race-prone but harmless. 
+ */ + max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_HIGHATOMIC && + !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + zone->nr_reserved_highatomic += pageblock_nr_pages; + set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + */ +static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { + /* Preserve at least one pageblock */ + if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct free_area *area = &(zone->free_area[order]); + + if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + continue; + + page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next, + struct page, lru); + + /* + * It should never happen but changes to locking could + * inadvertently allow a per-cpu drain to add pages + * to MIGRATE_HIGHATOMIC while unreserving so be safe + * and watch for underflows. 
+ */ + zone->nr_reserved_highatomic -= min(pageblock_nr_pages, + zone->nr_reserved_highatomic); + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + set_pageblock_migratetype(page, ac->migratetype); + move_freepages_block(zone, page, ac->migratetype); + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + spin_unlock_irqrestore(&zone->lock, flags); + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) @@ -1305,14 +1804,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) expand(zone, page, order, current_order, area, start_migratetype); /* - * The freepage_migratetype may differ from pageblock's + * The pcppage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. + * find_suitable_fallback(). This is OK as long as it does not + * differ for MIGRATE_CMA pageblocks. Those can be used as + * fallback only via special __rmqueue_cma_fallback() function */ - set_freepage_migratetype(page, start_migratetype); + set_pcppage_migratetype(page, start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -1328,29 +1826,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. 
*/ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) + int migratetype, gfp_t gfp_flags) { struct page *page; -retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - - if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { + if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); if (!page) page = __rmqueue_fallback(zone, order, migratetype); - - /* - * Use MIGRATE_RESERVE rather than fail an allocation. goto - * is used because __rmqueue_smallest is an inline function - * and we want just one call site - */ - if (!page) { - migratetype = MIGRATE_RESERVE; - goto retry_reserve; - } } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -1370,7 +1856,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype); + struct page *page = __rmqueue(zone, order, migratetype, 0); if (unlikely(page == NULL)) break; @@ -1388,7 +1874,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; - if (is_migrate_cma(get_freepage_migratetype(page))) + if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1601,7 +2087,7 @@ void free_hot_cold_page(struct page *page, bool cold) return; migratetype = get_pfnblock_migratetype(page, pfn); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); local_lock_irqsave(pa_lock, flags); __count_vm_event(PGFREE); @@ -1665,6 +2151,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) void split_page(struct page *page, unsigned int order) { int i; + gfp_t gfp_mask; VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); @@ -1678,10 +2165,11 @@ void split_page(struct page *page, unsigned int order) 
split_page(virt_to_page(page[0].shadow), order); #endif - set_page_owner(page, 0, 0); + gfp_mask = get_page_owner_gfp(page); + set_page_owner(page, 0, gfp_mask); for (i = 1; i < (1 << order); i++) { set_page_refcounted(page + i); - set_page_owner(page + i, 0, 0); + set_page_owner(page + i, 0, gfp_mask); } } EXPORT_SYMBOL_GPL(split_page); @@ -1711,6 +2199,8 @@ int __isolate_free_page(struct page *page, unsigned int order) zone->free_area[order].nr_free--; rmv_page_order(page); + set_page_owner(page, order, __GFP_MOVABLE); + /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { struct page *endpage = page + (1 << order) - 1; @@ -1722,7 +2212,7 @@ int __isolate_free_page(struct page *page, unsigned int order) } } - set_page_owner(page, order, 0); + return 1UL << order; } @@ -1759,7 +2249,7 @@ int split_free_page(struct page *page) static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int alloc_flags, int migratetype) { unsigned long flags; struct page *page; @@ -1802,13 +2292,21 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE(order > 1); } local_spin_lock_irqsave(pa_lock, &zone->lock, flags); - page = __rmqueue(zone, order, migratetype); + + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype, gfp_flags); if (!page) { spin_unlock(&zone->lock); goto failed; } __mod_zone_freepage_state(zone, -(1 << order), - get_freepage_migratetype(page)); + get_pcppage_migratetype(page)); spin_unlock(&zone->lock); } @@ -1834,13 +2332,13 @@ failed: static struct { struct fault_attr attr; - u32 ignore_gfp_highmem; - u32 ignore_gfp_wait; + bool ignore_gfp_highmem; + bool ignore_gfp_reclaim; u32 min_order; } 
fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = 1, - .ignore_gfp_highmem = 1, + .ignore_gfp_reclaim = true, + .ignore_gfp_highmem = true, .min_order = 1, }; @@ -1858,7 +2356,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -1877,7 +2376,7 @@ static int __init fail_page_alloc_debugfs(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait)) + &fail_page_alloc.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem)) @@ -1907,42 +2406,77 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return true if free pages are above 'mark'. This takes into account the order - * of the allocation. + * Return true if free base pages are above 'mark'. For high-order checks it + * will return true of the order-0 watermark is reached and there is at least + * one free page of a suitable size. Checking now avoids taking the zone lock + * to check in the allocation paths if no pages are free. 
*/ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages may go negative - that's OK */ long min = mark; int o; - long free_cma = 0; + const int alloc_harder = (alloc_flags & ALLOC_HARDER); + /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (alloc_flags & ALLOC_HARDER) + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. + */ + if (likely(!alloc_harder)) + free_pages -= z->nr_reserved_highatomic; + else min -= min / 4; + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + /* + * Check watermarks for an order-0 allocation request. If these + * are not met, then a high-order request also cannot go ahead + * even if a suitable page happened to be free. 
+ */ + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return false; - for (o = 0; o < order; o++) { - /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; - /* Require fewer higher order pages to be free */ - min >>= 1; + /* If this is an order-0 request then the watermark is fine */ + if (!order) + return true; + + /* For a high-order request, check at least one suitable page is free */ + for (o = order; o < MAX_ORDER; o++) { + struct free_area *area = &z->free_area[o]; + int mt; + + if (!area->nr_free) + continue; + + if (alloc_harder) + return true; + + for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { + if (!list_empty(&area->free_list[mt])) + return true; + } - if (free_pages <= min) - return false; +#ifdef CONFIG_CMA + if ((alloc_flags & ALLOC_CMA) && + !list_empty(&area->free_list[MIGRATE_CMA])) { + return true; + } +#endif } - return true; + return false; } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, @@ -1953,134 +2487,18 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags) + unsigned long mark, int classzone_idx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, classzone_idx, 0, free_pages); } #ifdef CONFIG_NUMA -/* - * zlc_setup - Setup for "zonelist cache". Uses cached zone data to - * skip over zones that are not allowed by the cpuset, or that have - * been recently (in last second) found to be nearly full. See further - * comments in mmzone.h. Reduces cache footprint of zonelist scans - * that have to skip over a lot of full or unallowed zones. 
- * - * If the zonelist cache is present in the passed zonelist, then - * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_MEMORY].) - * - * If the zonelist cache is not available for this zonelist, does - * nothing and returns NULL. - * - * If the fullzones BITMAP in the zonelist cache is stale (more than - * a second since last zap'd) then we zap it out (clear its bits.) - * - * We hold off even calling zlc_setup, until after we've checked the - * first zone in the zonelist, on the theory that most allocations will - * be satisfied from that first zone, so best to examine that zone as - * quickly as we can. - */ -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - nodemask_t *allowednodes; /* zonelist_cache approximation */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return NULL; - - if (time_after(jiffies, zlc->last_full_zap + HZ)) { - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - zlc->last_full_zap = jiffies; - } - - allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? - &cpuset_current_mems_allowed : - &node_states[N_MEMORY]; - return allowednodes; -} - -/* - * Given 'z' scanning a zonelist, run a couple of quick checks to see - * if it is worth looking at further for free memory: - * 1) Check that the zone isn't thought to be full (doesn't have its - * bit set in the zonelist_cache fullzones BITMAP). - * 2) Check that the zones node (obtained from the zonelist_cache - * z_to_n[] mapping) is allowed in the passed in allowednodes mask. - * Return true (non-zero) if zone is worth looking at further, or - * else return false (zero) if it is not. - * - * This check -ignores- the distinction between various watermarks, - * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... 
If a zone is - * found to be full for any variation of these watermarks, it will - * be considered full for up to one second by all requests, unless - * we are so low on memory on all allowed nodes that we are forced - * into the second scan of the zonelist. - * - * In the second scan we ignore this zonelist cache and exactly - * apply the watermarks to all zones, even it is slower to do so. - * We are low on memory in the second scan, and should leave no stone - * unturned looking for a free page. - */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - int n; /* node that zone *z is on */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return 1; - - i = z - zonelist->_zonerefs; - n = zlc->z_to_n[i]; - - /* This zone is worth trying if it is allowed but not full */ - return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); -} - -/* - * Given 'z' scanning a zonelist, set the corresponding bit in - * zlc->fullzones, so that subsequent attempts to allocate a page - * from that zone don't waste time re-examining it. 
- */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - i = z - zonelist->_zonerefs; - - set_bit(i, zlc->fullzones); -} - -/* - * clear all zones full, called after direct reclaim makes progress so that - * a zone that was recently full is not skipped over for up to a second - */ -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return local_zone->node == zone->node; @@ -2091,28 +2509,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } - #else /* CONFIG_NUMA */ - -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - return NULL; -} - -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - return 1; -} - -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ -} - -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return true; @@ -2122,7 +2519,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } - #endif /* CONFIG_NUMA */ static void reset_alloc_batches(struct zone *preferred_zone) @@ -2149,11 +2545,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ - int zlc_active = 0; /* set if using zonelist_cache 
*/ - int did_zlc_setup = 0; /* just call zlc_setup() one time */ - bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE); int nr_fair_skipped = 0; bool zonelist_rescan; @@ -2168,9 +2559,6 @@ zonelist_scan: ac->nodemask) { unsigned long mark; - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) - continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed(zone, gfp_mask)) @@ -2208,14 +2596,14 @@ zonelist_scan: * * XXX: For now, allow allocations to potentially * exceed the per-zone dirty limit in the slowpath - * (ALLOC_WMARK_LOW unset) before going into reclaim, + * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed * zones are together not big enough to reach the * global limit. The proper fix for these situations * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if (consider_zone_dirty && !zone_dirty_ok(zone)) + if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; @@ -2228,28 +2616,8 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && - !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup if there are multiple nodes - * and before considering the first zone allowed - * by the cpuset. - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } - if (zone_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zone, zone)) - goto this_zone_full; - - /* - * As we may have just activated ZLC, check if the first - * eligible zone has failed zone_reclaim recently. 
- */ - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2266,34 +2634,26 @@ zonelist_scan: ac->classzone_idx, alloc_flags)) goto try_this_zone; - /* - * Failed to reclaim enough to meet watermark. - * Only mark the zone full if checking the min - * watermark or if we failed to reclaim just - * 1<<order pages or else the page allocator - * fastpath will prematurely mark zones full - * when the watermark is between the low and - * min watermarks. - */ - if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || - ret == ZONE_RECLAIM_SOME) - goto this_zone_full; - continue; } } try_this_zone: page = buffered_rmqueue(ac->preferred_zone, zone, order, - gfp_mask, ac->migratetype); + gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; + + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + reserve_highatomic_pageblock(page, zone, order); + return page; } -this_zone_full: - if (IS_ENABLED(CONFIG_NUMA) && zlc_active) - zlc_mark_zone_full(zonelist, z); } /* @@ -2314,12 +2674,6 @@ this_zone_full: zonelist_rescan = true; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - zonelist_rescan = true; - } - if (zonelist_rescan) goto zonelist_scan; @@ -2344,7 +2698,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; @@ -2361,7 +2715,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 
if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { @@ -2378,7 +2732,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", current->comm, order, gfp_mask); dump_stack(); @@ -2386,61 +2740,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) show_mem(filter); } -static inline int -should_alloc_retry(gfp_t gfp_mask, unsigned int order, - unsigned long did_some_progress, - unsigned long pages_reclaimed) -{ - /* Do not loop if specifically requested */ - if (gfp_mask & __GFP_NORETRY) - return 0; - - /* Always retry if specifically requested */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - - /* - * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim - * making forward progress without invoking OOM. Suspend also disables - * storage devices so kswapd will not help. Bail if we are suspending. - */ - if (!did_some_progress && pm_suspended_storage()) - return 0; - - /* - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - */ - if (order <= PAGE_ALLOC_COSTLY_ORDER) - return 1; - - /* - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. 
- */ - if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) - return 1; - - return 0; -} - static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) { + struct oom_control oc = { + .zonelist = ac->zonelist, + .nodemask = ac->nodemask, + .gfp_mask = gfp_mask, + .order = order, + }; struct page *page; *did_some_progress = 0; /* - * Acquire the per-zone oom lock for each zone. If that - * fails, somebody else is making progress for us. + * Acquire the oom lock. If that fails, somebody else is + * making progress for us. */ - if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { + if (!mutex_trylock(&oom_lock)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; @@ -2466,26 +2784,27 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for light reclaim */ + /* The OOM killer does not compensate for IO-less reclaim */ if (!(gfp_mask & __GFP_FS)) { /* * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but - * keep looping as per should_alloc_retry(). + * keep looping as per tradition. 
*/ *did_some_progress = 1; goto out; } + if (pm_suspended_storage()) + goto out; /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) - || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: - oom_zonelist_unlock(ac->zonelist, gfp_mask); + mutex_unlock(&oom_lock); return page; } @@ -2599,19 +2918,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; - /* After successful reclaim, reconsider all zones for allocation */ - if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(ac->zonelist); - retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because - * pages are pinned on the per-cpu lists. Drain them and try again + * pages are pinned on the per-cpu lists or in high alloc reserves. + * Shrink them them and try again */ if (!page && !drained) { + unreserve_highatomic_pageblock(ac); drain_all_pages(NULL); drained = true; goto retry; @@ -2656,7 +2973,6 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2665,11 +2981,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). 
+ * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (atomic) { + if (gfp_mask & __GFP_ATOMIC) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -2706,11 +3022,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +{ + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { - const gfp_t wait = gfp_mask & __GFP_WAIT; + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; @@ -2731,15 +3052,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + + /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. 
*/ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) goto nopage; retry: - if (!(gfp_mask & __GFP_NO_KSWAPD)) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* @@ -2782,8 +3111,8 @@ retry: } } - /* Atomic allocations - we can't balance anything */ - if (!wait) { + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) { /* * All existing users of the deprecated __GFP_NOFAIL are * blockable, so warn of any new users that actually allow this @@ -2813,7 +3142,7 @@ retry: goto got_pg; /* Checks for THP-specific high-order allocations */ - if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + if (is_thp_gfp_mask(gfp_mask)) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -2848,8 +3177,7 @@ retry: * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || - (current->flags & PF_KTHREAD)) + if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ @@ -2858,40 +3186,40 @@ retry: if (page) goto got_pg; - /* Check if we should retry the allocation */ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + goto noretry; + + /* Keep reclaiming pages as long as there is reasonable progress */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, did_some_progress, - pages_reclaimed)) { - /* - * If we fail to make progress by freeing individual - * pages, but the allocation wants us to keep going, - * start OOM killing tasks. 
- */ - if (!did_some_progress) { - page = __alloc_pages_may_oom(gfp_mask, order, ac, - &did_some_progress); - if (page) - goto got_pg; - if (!did_some_progress) - goto nopage; - } + if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || + ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); goto retry; - } else { - /* - * High-order allocations do not necessarily loop after - * direct reclaim and reclaim/compaction depends on compaction - * being called after reclaim so call directly if necessary - */ - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); - if (page) - goto got_pg; } + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); + if (page) + goto got_pg; + + /* Retry as long as the OOM killer is making progress */ + if (did_some_progress) + goto retry; + +noretry: + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, + ac, migration_mode, + &contended_compaction, + &deferred_compaction); + if (page) + goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: @@ -2920,7 +3248,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, lockdep_trace_alloc(gfp_mask); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return NULL; @@ -2941,6 +3269,10 @@ retry_cpuset: /* We set it here, as __alloc_pages_slowpath might have changed it */ ac.zonelist = zonelist; + + /* Dirty zone balancing only done in the fast path */ + ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); 
+ /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask ? : &cpuset_current_mems_allowed, @@ -2959,6 +3291,7 @@ retry_cpuset: * complete. */ alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; page = __alloc_pages_slowpath(alloc_mask, order, &ac); } @@ -3031,6 +3364,104 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* + * Page Fragment: + * An arbitrary-length arbitrary-offset area of memory which resides + * within a 0 or higher order page. Multiple fragments within that page + * are individually refcounted, in the page's reference counter. + * + * The page_frag functions below provide a simple allocation framework for + * page fragments. This is used by the network stack and network device + * drivers to provide a backing region of memory for use as either an + * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. + */ +static struct page *__page_frag_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) +{ + struct page *page = NULL; + gfp_t gfp = gfp_mask; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + PAGE_FRAG_CACHE_MAX_ORDER); + nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; +#endif + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->va = page ? 
page_address(page) : NULL; + + return page; +} + +void *__alloc_page_frag(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned int size = PAGE_SIZE; + struct page *page; + int offset; + + if (unlikely(!nc->va)) { +refill: + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. + */ + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pfmemalloc = page_is_pfmemalloc(page); + nc->pagecnt_bias = size; + nc->offset = size; + } + + offset = nc->offset - fragsz; + if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; + } + + nc->pagecnt_bias--; + nc->offset = offset; + + return nc->va + offset; +} +EXPORT_SYMBOL(__alloc_page_frag); + +/* + * Frees a page fragment allocated out of either a compound or order 0 page. + */ +void __free_page_frag(void *addr) +{ + struct page *page = virt_to_head_page(addr); + + if (unlikely(put_page_testzero(page))) + __free_pages_ok(page, compound_order(page)); +} +EXPORT_SYMBOL(__free_page_frag); + +/* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter * of the current memory cgroup. 
* @@ -3040,24 +3471,24 @@ EXPORT_SYMBOL(free_pages); struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages(gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages_node(nid, gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -3067,7 +3498,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) */ void __free_kmem_pages(struct page *page, unsigned int order) { - memcg_kmem_uncharge_pages(page, order); + memcg_kmem_uncharge(page, order); __free_pages(page, order); } @@ -3079,7 +3510,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order) } } -static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +static void *make_alloc_exact(unsigned long addr, unsigned int order, + size_t size) { if (addr) { unsigned long alloc_end = addr + (PAGE_SIZE << order); @@ -3126,12 +3558,10 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. - * Note this is not alloc_pages_exact_node() which allocates on a specific node, - * but is not exact. 
*/ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { - unsigned order = get_order(size); + unsigned int order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; @@ -3278,9 +3708,9 @@ static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RESERVE] = 'R', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif @@ -3433,7 +3863,8 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned int order; + unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -3782,7 +4213,7 @@ static void build_zonelists(pg_data_t *pgdat) nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - int order = current_zonelist_order; + unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -3826,20 +4257,6 @@ static void build_zonelists(pg_data_t *pgdat) build_thisnode_zonelists(pgdat); } -/* Construct the zonelist performance cache - see further mmzone.h */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zoneref *z; - - zonelist = &pgdat->node_zonelists[0]; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->_zonerefs; z->zone; z++) - zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); -} - #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. 
@@ -3900,12 +4317,6 @@ static void build_zonelists(pg_data_t *pgdat) zonelist->_zonerefs[j].zone_idx = 0; } -/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - pgdat->node_zonelists[0].zlcache_ptr = NULL; -} - #endif /* CONFIG_NUMA */ /* @@ -3946,14 +4357,12 @@ static int __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); - build_zonelist_cache(self); } for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); - build_zonelist_cache(pgdat); } /* @@ -4113,117 +4522,6 @@ static inline unsigned long wait_table_bits(unsigned long size) } /* - * Check if a pageblock contains reserved pages - */ -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) - return 1; - } - return 0; -} - -/* - * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on min_wmark_pages(zone). The memory within - * the reserve will tend to store contiguous free pages. Setting min_free_kbytes - * higher will lead to a bigger reserve which will get freed as contiguous - * blocks as reclaim kicks in - */ -static void setup_zone_migrate_reserve(struct zone *zone) -{ - unsigned long start_pfn, pfn, end_pfn, block_end_pfn; - struct page *page; - unsigned long block_migratetype; - int reserve; - int old_reserve; - - /* - * Get the start pfn, end pfn and the number of blocks to reserve - * We have to be careful to be aligned to pageblock_nr_pages to - * make sure that we always check pfn_valid for the first page in - * the block. 
- */ - start_pfn = zone->zone_start_pfn; - end_pfn = zone_end_pfn(zone); - start_pfn = roundup(start_pfn, pageblock_nr_pages); - reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> - pageblock_order; - - /* - * Reserve blocks are generally in place to help high-order atomic - * allocations that are short-lived. A min_free_kbytes value that - * would result in more than 2 reserve blocks for atomic allocations - * is assumed to be in place to help anti-fragmentation for the - * future allocation of hugepages at runtime. - */ - reserve = min(2, reserve); - old_reserve = zone->nr_migrate_reserve_block; - - /* When memory hot-add, we almost always need to do nothing */ - if (reserve == old_reserve) - return; - zone->nr_migrate_reserve_block = reserve; - - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - - /* Watch out for overlapping nodes */ - if (page_to_nid(page) != zone_to_nid(zone)) - continue; - - block_migratetype = get_pageblock_migratetype(page); - - /* Only test what is necessary when the reserves are not met */ - if (reserve > 0) { - /* - * Blocks with reserved pages will never free, skip - * them. - */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - - /* If this block is reserved, account for it */ - if (block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } - - /* Suitable for reserving if this block is movable */ - if (block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, - MIGRATE_RESERVE); - move_freepages_block(zone, page, - MIGRATE_RESERVE); - reserve--; - continue; - } - } else if (!old_reserve) { - /* - * At boot time we don't need to scan the whole zone - * for turning off MIGRATE_RESERVE. 
- */ - break; - } - - /* - * If the reserve is met and this is a previous reserved block, - * take it back - */ - if (block_migratetype == MIGRATE_RESERVE) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); - } - } -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -4231,15 +4529,16 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - struct page *page; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long end_pfn = start_pfn + size; unsigned long pfn; struct zone *z; + unsigned long nr_initialised = 0; if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &NODE_DATA(nid)->node_zones[zone]; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* * There can be holes in boot-time mem_map[]s @@ -4251,39 +4550,31 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; if (!early_pfn_in_nid(pfn, nid)) continue; + if (!update_defer_init(pgdat, pfn, end_pfn, + &nr_initialised)) + break; } - page = pfn_to_page(pfn); - set_page_links(page, zone, nid, pfn); - mminit_verify_page_links(page, zone, nid, pfn); - init_page_count(page); - page_mapcount_reset(page); - page_cpupid_reset_last(page); - SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() + * kernel allocations are made. * * bitmap is created for zone's valid pfn range. 
but memmap * can be created for invalid pages (for alignment) * check here not to call set_pageblock_migratetype() against * pfn out of zone. */ - if ((z->zone_start_pfn <= pfn) - && (pfn < zone_end_pfn(z)) - && !(pfn & (pageblock_nr_pages - 1))) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); + if (!(pfn & (pageblock_nr_pages - 1))) { + struct page *page = pfn_to_page(pfn); - INIT_LIST_HEAD(&page->lru); -#ifdef WANT_PAGE_VIRTUAL - /* The shift won't overflow because ZONE_NORMAL is below 4G. */ - if (!is_highmem_idx(zone)) - set_page_address(page, __va(pfn << PAGE_SHIFT)); -#endif + __init_single_page(page, pfn, zone, nid); + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } else { + __init_single_pfn(pfn, zone, nid); + } } } @@ -4516,8 +4807,7 @@ static __meminit void zone_pcp_init(struct zone *zone) int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size, - enum memmap_context context) + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -4541,57 +4831,30 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID + /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. */ -int __meminit __early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn, + struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; int nid; - /* - * NOTE: The following SMP-unsafe globals are only used early in boot - * when the kernel is running single-threaded. 
- */ - static unsigned long __meminitdata last_start_pfn, last_end_pfn; - static int __meminitdata last_nid; - if (last_start_pfn <= pfn && pfn < last_end_pfn) - return last_nid; + if (state->last_start <= pfn && pfn < state->last_end) + return state->last_nid; nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); if (nid != -1) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; + state->last_start = start_pfn; + state->last_end = end_pfn; + state->last_nid = nid; } return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ -int __meminit early_pfn_to_nid(unsigned long pfn) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; -} - -#ifdef CONFIG_NODES_SPAN_OTHER_NODES -bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - int nid; - - nid = __early_pfn_to_nid(pfn); - if (nid >= 0 && nid != node) - return false; - return true; -} -#endif - /** * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 
@@ -4731,6 +4994,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, { unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; @@ -4794,6 +5061,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4833,22 +5104,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long realtotalpages, totalpages = 0; + unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - zones_size); - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= - zone_absent_pages_in_node(pgdat->node_id, i, + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long size, real_size; + + size = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zones_size); + real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + zone->spanned_pages = size; + zone->present_pages = real_size; + + totalpages += size; + realtotalpages += real_size; + } + + pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = 
realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4957,9 +5234,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * * NOTE: pgdat should get zeroed by caller. */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn, - unsigned long *zones_size, unsigned long *zholes_size) +static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; @@ -4980,12 +5255,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, node_start_pfn, - node_end_pfn, zones_size); - realsize = freesize = size - zone_absent_pages_in_node(nid, j, - node_start_pfn, - node_end_pfn, - zholes_size); + size = zone->spanned_pages; + realsize = freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory @@ -5020,8 +5291,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; - zone->spanned_pages = size; - zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. 
@@ -5050,8 +5319,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, - size, MEMMAP_EARLY); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; @@ -5060,14 +5328,19 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { + unsigned long __maybe_unused start = 0; + unsigned long __maybe_unused offset = 0; + /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; #ifdef CONFIG_FLAT_NODE_MEM_MAP + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + offset = pgdat->node_start_pfn - start; /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size, start, end; + unsigned long size, end; struct page *map; /* @@ -5075,7 +5348,6 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. 
*/ - start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); @@ -5083,7 +5355,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) if (!map) map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); - pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + pgdat->node_mem_map = map + offset; } #ifndef CONFIG_NEED_MULTIPLE_NODES /* @@ -5091,9 +5363,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); + mem_map -= offset; #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif @@ -5110,12 +5382,14 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + reset_deferred_meminit(pgdat); pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? 
((u64)end_pfn << PAGE_SHIFT) - 1 : 0); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5127,8 +5401,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + free_area_init_core(pgdat); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5139,11 +5412,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, */ void __init setup_nr_node_ids(void) { - unsigned int node; - unsigned int highest = 0; + unsigned int highest; - for_each_node_mask(node, node_possible_map) - highest = node; + highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); nr_node_ids = highest + 1; } #endif @@ -5306,13 +5577,17 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ required_movablecore = roundup(required_movablecore, MAX_ORDER_NR_PAGES); + required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; required_kernelcore = max(required_kernelcore, corepages); } - /* If kernelcore was not specified, there is no ZONE_MOVABLE */ - if (!required_kernelcore) + /* + * If kernelcore was not specified or kernelcore size is larger + * than totalpages, there is no ZONE_MOVABLE. + */ + if (!required_kernelcore || required_kernelcore >= totalpages) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ @@ -5664,7 +5939,7 @@ void __init mem_init_print_info(const char *str) * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved * - * The per-cpu batchsize and zone watermarks are determined by present_pages. + * The per-cpu batchsize and zone watermarks are determined by managed_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. 
This * function may optionally be used to account for unfreeable pages in the @@ -5718,7 +5993,7 @@ void __init page_alloc_init(void) } /* - * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio * or min_free_kbytes changes. */ static void calculate_totalreserve_pages(void) @@ -5762,7 +6037,7 @@ static void calculate_totalreserve_pages(void) /* * setup_per_zone_lowmem_reserve - called whenever - * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * sysctl_lowmem_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of * pages are left in the zone after a successful __alloc_pages(). */ @@ -5848,7 +6123,6 @@ static void __setup_per_zone_wmarks(void) high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -6078,9 +6352,9 @@ out: return ret; } +#ifdef CONFIG_NUMA int hashdist = HASHDIST_DEFAULT; -#ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) @@ -6470,7 +6744,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { unsigned long outer_start, outer_end; - int ret = 0, order; + unsigned int order; + int ret = 0; struct compact_control cc = { .nr_migratepages = 0, diff --git a/kernel/mm/page_counter.c b/kernel/mm/page_counter.c index 11b4beda1..7c6a63d2c 100644 --- a/kernel/mm/page_counter.c +++ b/kernel/mm/page_counter.c @@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) * @nr_pages: number of pages to charge * @fail: points first counter to hit its limit, if any * - * Returns 0 on success, or -ENOMEM and @fail if the counter or one of - * its ancestors has hit its configured limit. 
+ * Returns %true on success, or %false and @fail if the counter or one + * of its ancestors has hit its configured limit. */ -int page_counter_try_charge(struct page_counter *counter, - unsigned long nr_pages, - struct page_counter **fail) +bool page_counter_try_charge(struct page_counter *counter, + unsigned long nr_pages, + struct page_counter **fail) { struct page_counter *c; @@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter, if (new > c->watermark) c->watermark = new; } - return 0; + return true; failed: for (c = counter; c != *fail; c = c->parent) page_counter_cancel(c, nr_pages); - return -ENOMEM; + return false; } /** diff --git a/kernel/mm/page_ext.c b/kernel/mm/page_ext.c index d86fd2f53..292ca7b8d 100644 --- a/kernel/mm/page_ext.c +++ b/kernel/mm/page_ext.c @@ -6,6 +6,7 @@ #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> +#include <linux/page_idle.h> /* * struct page extension @@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) + &page_idle_ops, +#endif }; static unsigned long total_usage; diff --git a/kernel/mm/page_idle.c b/kernel/mm/page_idle.c new file mode 100644 index 000000000..d5dd79041 --- /dev/null +++ b/kernel/mm/page_idle.c @@ -0,0 +1,232 @@ +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> +#include <linux/page_ext.h> +#include <linux/page_idle.h> + +#define BITMAP_CHUNK_SIZE sizeof(u64) +#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) + +/* + * Idle page tracking only considers user memory pages, for other types of + * pages the idle flag is always unset and an attempt to set it is silently + * ignored. 
+ * + * We treat a page as a user memory page if it is on an LRU list, because it is + * always safe to pass such a page to rmap_walk(), which is essential for idle + * page tracking. With such an indicator of user pages we can skip isolated + * pages, but since there are not usually many of them, it will hardly affect + * the overall result. + * + * This function tries to get a user memory page by pfn as described above. + */ +static struct page *page_idle_get_page(unsigned long pfn) +{ + struct page *page; + struct zone *zone; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (!page || !PageLRU(page) || + !get_page_unless_zero(page)) + return NULL; + + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + spin_unlock_irq(&zone->lru_lock); + return page; +} + +static int page_idle_clear_pte_refs_one(struct page *page, + struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pmd_t *pmd; + pte_t *pte; + bool referenced = false; + + if (unlikely(PageTransHuge(page))) { + pmd = page_check_address_pmd(page, mm, addr, + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (pmd) { + referenced = pmdp_clear_young_notify(vma, addr, pmd); + spin_unlock(ptl); + } + } else { + pte = page_check_address(page, mm, addr, &ptl, 0); + if (pte) { + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap_unlock(pte, ptl); + } + } + if (referenced) { + clear_page_idle(page); + /* + * We cleared the referenced bit in a mapping to this page. To + * avoid interference with page reclaim, mark it young so that + * page_referenced() will return > 0. + */ + set_page_young(page); + } + return SWAP_AGAIN; +} + +static void page_idle_clear_pte_refs(struct page *page) +{ + /* + * Since rwc.arg is unused, rwc is effectively immutable, so we + * can make it static const to save some cycles and stack. 
+ */ + static const struct rmap_walk_control rwc = { + .rmap_one = page_idle_clear_pte_refs_one, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page_mapped(page) || + !page_rmapping(page)) + return; + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + return; + + rmap_walk(page, (struct rmap_walk_control *)&rwc); + + if (need_lock) + unlock_page(page); +} + +static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + u64 *out = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return 0; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if (!bit) + *out = 0ULL; + page = page_idle_get_page(pfn); + if (page) { + if (page_is_idle(page)) { + /* + * The page might have been referenced via a + * pte, in which case it is not idle. Clear + * refs and recheck. 
+ */ + page_idle_clear_pte_refs(page); + if (page_is_idle(page)) + *out |= 1ULL << bit; + } + put_page(page); + } + if (bit == BITMAP_CHUNK_BITS - 1) + out++; + cond_resched(); + } + return (char *)out - buf; +} + +static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + const u64 *in = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return -ENXIO; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if ((*in >> bit) & 1) { + page = page_idle_get_page(pfn); + if (page) { + page_idle_clear_pte_refs(page); + set_page_idle(page); + put_page(page); + } + } + if (bit == BITMAP_CHUNK_BITS - 1) + in++; + cond_resched(); + } + return (char *)in - buf; +} + +static struct bin_attribute page_idle_bitmap_attr = + __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR, + page_idle_bitmap_read, page_idle_bitmap_write, 0); + +static struct bin_attribute *page_idle_bin_attrs[] = { + &page_idle_bitmap_attr, + NULL, +}; + +static struct attribute_group page_idle_attr_group = { + .bin_attrs = page_idle_bin_attrs, + .name = "page_idle", +}; + +#ifndef CONFIG_64BIT +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + +static int __init page_idle_init(void) +{ + int err; + + err = sysfs_create_group(mm_kobj, &page_idle_attr_group); + if (err) { + pr_err("page_idle: register sysfs failed\n"); + return err; + } + return 0; +} +subsys_initcall(page_idle_init); diff --git a/kernel/mm/page_io.c b/kernel/mm/page_io.c index 6424869e2..b995a5ba5 100644 --- a/kernel/mm/page_io.c +++ b/kernel/mm/page_io.c @@ -33,22 +33,19 @@ static struct bio 
*get_swap_bio(gfp_t gfp_flags, if (bio) { bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_io_vec[0].bv_page = page; - bio->bi_io_vec[0].bv_len = PAGE_SIZE; - bio->bi_io_vec[0].bv_offset = 0; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; + + bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); } return bio; } -void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate) { + if (bio->bi_error) { SetPageError(page); /* * We failed to write the page out to swap-space. @@ -69,12 +66,11 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -void end_swap_bio_read(struct bio *bio, int err) +static void end_swap_bio_read(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate) { + if (bio->bi_error) { SetPageError(page); ClearPageUptodate(page); printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", @@ -254,7 +250,7 @@ static sector_t swap_page_sector(struct page *page) } int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)) + bio_end_io_t end_write_func) { struct bio *bio; int ret, rw = WRITE; diff --git a/kernel/mm/page_isolation.c b/kernel/mm/page_isolation.c index 303c90879..4568fd58f 100644 --- a/kernel/mm/page_isolation.c +++ b/kernel/mm/page_isolation.c @@ -9,7 +9,8 @@ #include <linux/hugetlb.h> #include "internal.h" -int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) +static int set_migratetype_isolate(struct page *page, + bool skip_hwpoisoned_pages) { struct zone *zone; unsigned long flags, pfn; @@ -72,7 +73,7 @@ out: return ret; } -void unset_migratetype_isolate(struct page *page, 
unsigned migratetype) +static void unset_migratetype_isolate(struct page *page, unsigned migratetype) { struct zone *zone; unsigned long flags, nr_pages; @@ -223,34 +224,16 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, continue; } page = pfn_to_page(pfn); - if (PageBuddy(page)) { + if (PageBuddy(page)) /* - * If race between isolatation and allocation happens, - * some free pages could be in MIGRATE_MOVABLE list - * although pageblock's migratation type of the page - * is MIGRATE_ISOLATE. Catch it and move the page into - * MIGRATE_ISOLATE list. + * If the page is on a free list, it has to be on + * the correct MIGRATE_ISOLATE freelist. There is no + * simple way to verify that as VM_BUG_ON(), though. */ - if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { - struct page *end_page; - - end_page = page + (1 << page_order(page)) - 1; - move_freepages(page_zone(page), page, end_page, - MIGRATE_ISOLATE); - } pfn += 1 << page_order(page); - } - else if (page_count(page) == 0 && - get_freepage_migratetype(page) == MIGRATE_ISOLATE) - pfn += 1; - else if (skip_hwpoisoned_pages && PageHWPoison(page)) { - /* - * The HWPoisoned page may be not in buddy - * system, and page_count() is not 0. 
- */ + else if (skip_hwpoisoned_pages && PageHWPoison(page)) + /* A HWPoisoned page cannot be also PageBuddy */ pfn++; - continue; - } else break; } diff --git a/kernel/mm/page_owner.c b/kernel/mm/page_owner.c index 0993f5f36..983c3a10f 100644 --- a/kernel/mm/page_owner.c +++ b/kernel/mm/page_owner.c @@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +gfp_t __get_page_owner_gfp(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + return page_ext->gfp_mask; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) @@ -310,4 +317,4 @@ static int __init pageowner_init(void) return 0; } -module_init(pageowner_init) +late_initcall(pageowner_init) diff --git a/kernel/mm/percpu.c b/kernel/mm/percpu.c index 2dd74487a..8a943b97a 100644 --- a/kernel/mm/percpu.c +++ b/kernel/mm/percpu.c @@ -1554,12 +1554,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); - PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); - PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); - PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); @@ -1668,9 +1668,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map[1] = ai->static_size; schunk->map_used = 1; if (schunk->free_size) - schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); - 
else - schunk->map[1] |= 1; + schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; + schunk->map[schunk->map_used] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1807,7 +1806,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; - while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; @@ -1839,7 +1838,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; - if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { diff --git a/kernel/mm/pgtable-generic.c b/kernel/mm/pgtable-generic.c index c25f94b33..1ba58213a 100644 --- a/kernel/mm/pgtable-generic.c +++ b/kernel/mm/pgtable-generic.c @@ -57,26 +57,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, } #endif -#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS -int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty) -{ -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - int changed = !pmd_same(*pmdp, entry); - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - if (changed) { - set_pmd_at(vma->vm_mm, address, pmdp, entry); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } - return changed; -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ - BUG(); - return 0; -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -} -#endif - #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) @@ -89,23 +69,6 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, } #endif -#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - int 
young; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(address & ~HPAGE_PMD_MASK); -#else - BUG(); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - young = pmdp_test_and_clear_young(vma, address, pmdp); - if (young) - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - return young; -} -#endif - #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) @@ -119,22 +82,64 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, } #endif -#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE -pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) + +#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE + +/* + * ARCHes with special requirements for evicting THP backing TLB entries can + * implement this. Otherwise also, it can help optimize normal TLB flush in + * THP regime. stock flush_tlb_range() typically has optimization to nuke the + * entire TLB if flush span is greater than a threshold, which will + * likely be true for a single huge page. Thus a single thp flush will + * invalidate the entire TLB which is not desirable. + * e.g.
see arch/arc: flush_pmd_tlb_range + */ +#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH +pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + VM_BUG_ON(!pmd_trans_huge(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH -#ifdef CONFIG_TRANSPARENT_HUGEPAGE void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { @@ -142,13 +147,11 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, VM_BUG_ON(address & ~HPAGE_PMD_MASK); set_pmd_at(vma->vm_mm, address, pmdp, pmd); /* tlb flush only to serialize against gup-fast */ - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef 
__HAVE_ARCH_PGTABLE_DEPOSIT -#ifdef CONFIG_TRANSPARENT_HUGEPAGE void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { @@ -161,11 +164,9 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); pmd_huge_pte(mm, pmdp) = pgtable; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW -#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* no "address" argument so destroys page coloring of some arch */ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { @@ -184,17 +185,35 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) } return pgtable; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -#ifdef CONFIG_TRANSPARENT_HUGEPAGE void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t entry = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); + flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +} +#endif + +#ifndef pmdp_collapse_flush +pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * pmd and hugepage pte format are same. So we could + * use the same function. 
+ */ + pmd_t pmd; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(pmd_trans_huge(*pmdp)); + pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); + + /* collapse entails shooting down ptes not pmd */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/kernel/mm/process_vm_access.c b/kernel/mm/process_vm_access.c index e88d07164..5d453e58d 100644 --- a/kernel/mm/process_vm_access.c +++ b/kernel/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/kernel/mm/readahead.c b/kernel/mm/readahead.c index 935675844..ba22d7fe0 100644 --- a/kernel/mm/readahead.c +++ b/kernel/mm/readahead.c @@ -89,8 +89,8 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, while (!list_empty(pages)) { page = list_to_page(pages); list_del(&page->lru); - if (add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { + if (add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -127,8 +127,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, - page->index, GFP_KERNEL)) { + if (!add_to_page_cache_lru(page, mapping, page->index, + mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); @@ -213,7 +213,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, if (unlikely(!mapping->a_ops->readpage && 
!mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = max_sane_readahead(nr_to_read); + nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); while (nr_to_read) { int err; @@ -232,16 +232,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } -#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) -/* - * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a - * sensible upper limit. - */ -unsigned long max_sane_readahead(unsigned long nr) -{ - return min(nr, MAX_READAHEAD); -} - /* * Set the initial window size, round to next power of 2 and square * for small size, x 4 for medium, and x 2 for large @@ -380,7 +370,7 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = max_sane_readahead(ra->ra_pages); + unsigned long max = ra->ra_pages; pgoff_t prev_offset; /* @@ -541,7 +531,7 @@ page_cache_async_readahead(struct address_space *mapping, /* * Defer asynchronous read-ahead on IO congestion. 
*/ - if (bdi_read_congested(inode_to_bdi(mapping->host))) + if (inode_read_congested(mapping->host)) return; /* do read-ahead */ diff --git a/kernel/mm/rmap.c b/kernel/mm/rmap.c index 24dd3f9fe..b577fbb98 100644 --- a/kernel/mm/rmap.c +++ b/kernel/mm/rmap.c @@ -30,6 +30,8 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) + * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * mapping->tree_lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) @@ -57,9 +59,12 @@ #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/backing-dev.h> +#include <linux/page_idle.h> #include <asm/tlbflush.h> +#include <trace/events/tlb.h> + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -581,6 +586,107 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void percpu_flush_tlb_batch_pages(void *data) +{ + /* + * All TLB entries are flushed on the assumption that it is + * cheaper to flush all TLBs and let them be refilled than + * flushing individual PFNs. Note that we do not track mm's + * to flush as that might simply be multiple full TLB flushes + * for no gain. + */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_local(); +} + +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage. 
+ */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + int cpu; + + if (!tlb_ubc->flush_required) + return; + + cpu = get_cpu(); + + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); + + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { + smp_call_function_many(&tlb_ubc->cpumask, + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + } + cpumask_clear(&tlb_ubc->cpumask); + tlb_ubc->flush_required = false; + tlb_ubc->writable = false; + put_cpu(); +} + +/* Flush iff there are potentially writable TLB entries that can race with IO */ +void try_to_unmap_flush_dirty(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + if (tlb_ubc->writable) + try_to_unmap_flush(); +} + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + tlb_ubc->flush_required = true; + + /* + * If the PTE was dirty then it's best to assume it's writable. The + * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() + * before the page is queued for IO. + */ + if (writable) + tlb_ubc->writable = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs.
+ */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * At what user virtual address is page expected in vma? * Caller should check the page is actually part of the vma. @@ -625,7 +731,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) pmd = pmd_offset(pud, address); /* - * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at() + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() * without holding anon_vma lock for write. So when looking for a * genuine pmde (in which to find pte), test present and !THP together. */ @@ -781,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + if (referenced) + clear_page_idle(page); + if (test_and_clear_page_young(page)) + referenced++; + if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags; @@ -950,7 +1061,12 @@ void page_move_anon_rmap(struct page *page, VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; + /* + * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written + * simultaneously, so a concurrent reader (eg page_referenced()'s + * PageAnon()) will not see one without the other. 
+ */ + WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); } /** @@ -1188,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, int ret = SWAP_AGAIN; enum ttu_flags flags = (enum ttu_flags)arg; + /* munlock has nothing to gain from examining un-locked vmas */ + if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) + goto out; + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1198,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * skipped over this mm) then we should reactivate it. */ if (!(flags & TTU_IGNORE_MLOCK)) { - if (vma->vm_flags & VM_LOCKED) - goto out_mlock; - + if (vma->vm_flags & VM_LOCKED) { + /* Holding pte lock, we do *not* need mmap_sem here */ + mlock_vma_page(page); + ret = SWAP_MLOCK; + goto out_unmap; + } if (flags & TTU_MUNLOCK) goto out_unmap; } @@ -1213,7 +1336,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially a remote + * CPU could still be writing to the page. If the entry was + * previously clean then the architecture must guarantee that + * a clear->dirty transition on a cached TLB entry is written + * through and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pte); + + set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pte); + } /* Move the dirty bit to the physical page now the pte is gone. 
*/ if (pte_dirty(pteval)) @@ -1223,7 +1359,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { - if (!PageHuge(page)) { + if (PageHuge(page)) { + hugetlb_count_sub(1 << compound_order(page), mm); + } else { if (PageAnon(page)) dec_mm_counter(mm, MM_ANONPAGES); else @@ -1241,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, dec_mm_counter(mm, MM_ANONPAGES); else dec_mm_counter(mm, MM_FILEPAGES); + } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { + swp_entry_t entry; + pte_t swp_pte; + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + entry = make_migration_entry(page, pte_write(pteval)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; - - if (PageSwapCache(page)) { - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pte, pteval); - ret = SWAP_FAIL; - goto out_unmap; - } - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); - } - dec_mm_counter(mm, MM_ANONPAGES); - inc_mm_counter(mm, MM_SWAPENTS); - } else if (IS_ENABLED(CONFIG_MIGRATION)) { - /* - * Store the pfn of the page in a special migration - * pte. do_swap_page() will wait until the migration - * pte is removed and then restart fault handling. - */ - BUG_ON(!(flags & TTU_MIGRATION)); - entry = make_migration_entry(page, pte_write(pteval)); + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... 
+ */ + VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); } + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); - } else if (IS_ENABLED(CONFIG_MIGRATION) && - (flags & TTU_MIGRATION)) { - /* Establish migration entry for a file page */ - swp_entry_t entry; - entry = make_migration_entry(page, pte_write(pteval)); - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else dec_mm_counter(mm, MM_FILEPAGES); @@ -1290,31 +1425,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) + if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; - -out_mlock: - pte_unmap_unlock(pte, ptl); - - - /* - * We need mmap_sem locking, Otherwise VM_LOCKED check makes - * unstable result and race. Plus, We can't wait here because - * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem. - * if trylock failed, the page remain in evictable lru and later - * vmscan could retry to move the page to unevictable lru if the - * page is actually mlocked. 
- */ - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - ret = SWAP_MLOCK; - } - up_read(&vma->vm_mm->mmap_sem); - } - return ret; } bool is_vma_temporary_stack(struct vm_area_struct *vma) @@ -1478,6 +1592,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; @@ -1527,6 +1643,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); + cond_resched(); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; diff --git a/kernel/mm/shmem.c b/kernel/mm/shmem.c index 47d536e59..ea5a70cfc 100644 --- a/kernel/mm/shmem.c +++ b/kernel/mm/shmem.c @@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt; #include <asm/uaccess.h> #include <asm/pgtable.h> +#include "internal.h" + #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) @@ -542,6 +544,21 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); +static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct shmem_inode_info *info = SHMEM_I(inode); + + if (info->alloced - info->swapped != inode->i_mapping->nrpages) { + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } + generic_fillattr(inode, stat); + return 0; +} + static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); @@ -569,12 +586,18 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } - 
if (newsize < oldsize) { + if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - shmem_truncate_range(inode, newsize, (loff_t)-1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); + if (info->alloced) + shmem_truncate_range(inode, + newsize, (loff_t)-1); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + if (oldsize > holebegin) + unmap_mapping_range(inode->i_mapping, + holebegin, 0, 1); } } @@ -597,8 +620,7 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } else - kfree(info->symlink); + } simple_xattrs_free(&info->xattrs); WARN_ON(inode->i_blocks); @@ -820,14 +842,14 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add_tail(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); - spin_lock(&info->lock); - info->swapped++; shmem_recalc_inode(inode); + info->swapped++; spin_unlock(&info->lock); + swap_shmem_alloc(swap); + shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); @@ -1008,7 +1030,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_migrate(oldpage, newpage, true); + mem_cgroup_replace_page(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } @@ -1055,7 +1077,7 @@ repeat: if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto failed; + goto unlock; } if (page && sgp == SGP_WRITE) @@ -1223,11 +1245,15 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && sgp 
!= SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + if (alloced) { + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } error = -EINVAL; - if (alloced) - goto trunc; - else - goto failed; + goto unlock; } *pagep = page; return 0; @@ -1235,23 +1261,13 @@ clear: /* * Error recovery. */ -trunc: - info = SHMEM_I(inode); - ClearPageDirty(page); - delete_from_page_cache(page); - spin_lock(&info->lock); - info->alloced--; - inode->i_blocks -= BLOCKS_PER_PAGE; - spin_unlock(&info->lock); decused: - sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL && - !shmem_confirm_swap(mapping, index, swap)) + if (swap.val && !shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; unlock: if (page) { @@ -2445,8 +2461,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - info->symlink = kmemdup(symname, len, GFP_KERNEL); - if (!info->symlink) { + inode->i_link = kmemdup(symname, len, GFP_KERNEL); + if (!inode->i_link) { iput(inode); return -ENOMEM; } @@ -2474,30 +2490,23 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink); - return NULL; -} - -static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *shmem_follow_link(struct dentry *dentry, void **cookie) { struct page *page = NULL; int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); - nd_set_link(nd, error ? 
ERR_PTR(error) : kmap(page)); - if (page) - unlock_page(page); - return page; + if (error) + return ERR_PTR(error); + unlock_page(page); + *cookie = page; + return kmap(page); } -static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void shmem_put_link(struct inode *unused, void *cookie) { - if (!IS_ERR(nd_get_link(nd))) { - struct page *page = cookie; - kunmap(page); - mark_page_accessed(page); - page_cache_release(page); - } + struct page *page = cookie; + kunmap(page); + mark_page_accessed(page); + page_cache_release(page); } #ifdef CONFIG_TMPFS_XATTR @@ -2642,7 +2651,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_short_symlink, + .follow_link = simple_follow_link, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, @@ -3072,6 +3081,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -3128,6 +3138,7 @@ static const struct file_operations shmem_file_operations = { }; static const struct inode_operations shmem_inode_operations = { + .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, @@ -3369,8 +3380,8 @@ put_path: * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a - * higher layer. The one user is the big_key implementation. LSM checks - * are provided at the key level rather than the inode level. + * higher layer. The users are the big_key and shm implementations. 
LSM + * checks are provided at the key or shm level rather than the inode. * @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size diff --git a/kernel/mm/slab.c b/kernel/mm/slab.c index 3dd2d1ff9..4765c97ce 100644 --- a/kernel/mm/slab.c +++ b/kernel/mm/slab.c @@ -282,6 +282,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define CFLGS_OFF_SLAB (0x80000000UL) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) +#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -1030,12 +1031,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not invoke reclaim - * or warn about failures. + * Construct gfp mask to allocate from a specific node but do not direct reclaim + * or warn about failures. kswapd may still wake to reclaim in the background. 
*/ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; } #endif @@ -1454,6 +1455,7 @@ void __init kmem_cache_init(void) kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); slab_state = PARTIAL_NODE; + setup_kmalloc_cache_index_table(); slab_early_init = 0; @@ -1591,16 +1593,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - if (memcg_charge_slab(cachep, flags, cachep->gfporder)) - return NULL; - - page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } + if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) { + __free_pages(page, cachep->gfporder); + return NULL; + } + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (page_is_pfmemalloc(page)) pfmemalloc_active = true; @@ -1652,8 +1655,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_pages(page, cachep->gfporder); - memcg_uncharge_slab(cachep, cachep->gfporder); + __free_kmem_pages(page, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) @@ -1887,21 +1889,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - - /* - * RCU free overloads the RCU head over the LRU. 
- * slab_page has been overloeaded over the LRU, - * however it is not used from now on so that - * we can use it safely. - */ - head = (void *)&page->rcu_head; - call_rcu(head, kmem_rcu_free); - - } else { + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + call_rcu(&page->rcu_head, kmem_rcu_free); + else kmem_freepages(cachep, page); - } /* * From now on, we don't use freelist @@ -2189,9 +2180,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= kmalloc_size(INDEX_NODE + 1) - && cachep->object_size > cache_line_size() - && ALIGN(size, cachep->align) < PAGE_SIZE) { + /* + * To activate debug pagealloc, off-slab management is necessary + * requirement. In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. + */ + if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && + size >= 256 && cachep->object_size > cache_line_size() && + ALIGN(size, cachep->align) < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); size = PAGE_SIZE; } @@ -2204,7 +2202,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * it too early on. Always use on-slab management when * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 5)) && !slab_early_init && + if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj @@ -2268,7 +2266,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) /* * This is a possibility for one of the kmalloc_{dma,}_caches. 
* But since we go off slab only for object size greater than - * PAGE_SIZE/8, and kmalloc_{dma,}_caches get created + * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created * in ascending order,this should not happen at all. * But leave a BUG_ON for some lucky dude. */ @@ -2624,7 +2622,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* @@ -2654,7 +2652,7 @@ static int cache_grow(struct kmem_cache *cachep, cache_init_objs(cachep, page); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); check_irq_off(); spin_lock(&n->list_lock); @@ -2668,7 +2666,7 @@ static int cache_grow(struct kmem_cache *cachep, opps1: kmem_freepages(cachep, page); failed: - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return 0; } @@ -2860,7 +2858,7 @@ force_grow: static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); #if DEBUG kmem_flagcheck(cachep, flags); #endif @@ -3048,11 +3046,11 @@ retry: */ struct page *page; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); kmem_flagcheck(cache, flags); page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); if (page) { /* @@ -3415,6 +3413,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} 
+EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/kernel/mm/slab.h b/kernel/mm/slab.h index 0c9bda0eb..afdc57941 100644 --- a/kernel/mm/slab.h +++ b/kernel/mm/slab.h @@ -71,6 +71,7 @@ unsigned long calculate_alignment(unsigned long flags, #ifndef CONFIG_SLOB /* Kmalloc array related functions */ +void setup_kmalloc_cache_index_table(void); void create_kmalloc_caches(unsigned long); /* Find the kmalloc slab corresponding for a certain size */ @@ -162,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objecct listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. 
The caller must hold @@ -171,10 +181,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, list_for_each_entry(iter, &(root)->memcg_params.list, \ memcg_params.list) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ - memcg_params.list) - static inline bool is_root_cache(struct kmem_cache *s) { return s->memcg_params.is_root_cache; @@ -230,23 +236,16 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s->memcg_params.root_cache; } -static __always_inline int memcg_charge_slab(struct kmem_cache *s, - gfp_t gfp, int order) +static __always_inline int memcg_charge_slab(struct page *page, + gfp_t gfp, int order, + struct kmem_cache *s) { if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); -} - -static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ - if (!memcg_kmem_enabled()) - return; - if (is_root_cache(s)) - return; - memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); + return __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -255,8 +254,6 @@ extern void slab_init_memcg_params(struct kmem_cache *); #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) -#define for_each_memcg_cache_safe(iter, tmp, root) \ - for ((void)(iter), (void)(tmp), (void)(root); 0; ) static inline bool is_root_cache(struct kmem_cache *s) { @@ -285,15 +282,12 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; } -static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, + struct kmem_cache *s) { return 0; } -static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) -{ -} - static inline void 
slab_init_memcg_params(struct kmem_cache *s) { } @@ -320,7 +314,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, cachep->name, s->name); + __func__, s->name, cachep->name); WARN_ON_ONCE(1); return s; } diff --git a/kernel/mm/slab_common.c b/kernel/mm/slab_common.c index 999bb3424..3c6a86b4e 100644 --- a/kernel/mm/slab_common.c +++ b/kernel/mm/slab_common.c @@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) /* * Merge control. If this is set then no merging of slab caches will occur. @@ -105,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return 0; + } + } + return i; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { @@ -294,10 +316,10 @@ unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static struct kmem_cache * -do_kmem_cache_create(const char *name, size_t object_size, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *), - struct mem_cgroup *memcg, struct kmem_cache *root_cache) +static struct kmem_cache *create_cache(const char *name, + size_t object_size, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *), + struct mem_cgroup *memcg, struct kmem_cache *root_cache) 
{ struct kmem_cache *s; int err; @@ -362,7 +384,7 @@ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; const char *cache_name; int err; @@ -374,7 +396,6 @@ kmem_cache_create(const char *name, size_t size, size_t align, err = kmem_cache_sanity_check(name, size); if (err) { - s = NULL; /* suppress uninit var warning */ goto out_unlock; } @@ -396,9 +417,9 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } - s = do_kmem_cache_create(cache_name, size, size, - calculate_alignment(flags, align, size), - flags, ctor, NULL, NULL); + s = create_cache(cache_name, size, size, + calculate_alignment(flags, align, size), + flags, ctor, NULL, NULL); if (IS_ERR(s)) { err = PTR_ERR(s); kfree_const(cache_name); @@ -426,29 +447,20 @@ out_unlock: } EXPORT_SYMBOL(kmem_cache_create); -static int do_kmem_cache_shutdown(struct kmem_cache *s, +static int shutdown_cache(struct kmem_cache *s, struct list_head *release, bool *need_rcu_barrier) { - if (__kmem_cache_shutdown(s) != 0) { - printk(KERN_ERR "kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); - dump_stack(); + if (__kmem_cache_shutdown(s) != 0) return -EBUSY; - } if (s->flags & SLAB_DESTROY_BY_RCU) *need_rcu_barrier = true; -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - list_del(&s->memcg_params.list); -#endif list_move(&s->list, release); return 0; } -static void do_kmem_cache_release(struct list_head *release, - bool need_rcu_barrier) +static void release_caches(struct list_head *release, bool need_rcu_barrier) { struct kmem_cache *s, *s2; @@ -478,7 +490,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ - struct cgroup_subsys_state *css = mem_cgroup_css(memcg); + struct cgroup_subsys_state *css = &memcg->css; struct 
memcg_cache_array *arr; struct kmem_cache *s = NULL; char *cache_name; @@ -514,10 +526,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, if (!cache_name) goto out_unlock; - s = do_kmem_cache_create(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, - root_cache->flags, root_cache->ctor, - memcg, root_cache); + s = create_cache(cache_name, root_cache->object_size, + root_cache->size, root_cache->align, + root_cache->flags, root_cache->ctor, + memcg, root_cache); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -576,6 +588,18 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) put_online_cpus(); } +static int __shutdown_memcg_cache(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + BUG_ON(is_root_cache(s)); + + if (shutdown_cache(s, release, need_rcu_barrier)) + return -EBUSY; + + list_del(&s->memcg_params.list); + return 0; +} + void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) { LIST_HEAD(release); @@ -593,14 +617,76 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) * The cgroup is about to be freed and therefore has no charges * left. Hence, all its caches must be empty by now. */ - BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); + BUG_ON(__shutdown_memcg_cache(s, &release, &need_rcu_barrier)); } mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); +} + +static int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + struct memcg_cache_array *arr; + struct kmem_cache *c, *c2; + LIST_HEAD(busy); + int i; + + BUG_ON(!is_root_cache(s)); + + /* + * First, shutdown active caches, i.e. caches that belong to online + * memory cgroups. 
+ */ + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, + lockdep_is_held(&slab_mutex)); + for_each_memcg_cache_index(i) { + c = arr->entries[i]; + if (!c) + continue; + if (__shutdown_memcg_cache(c, release, need_rcu_barrier)) + /* + * The cache still has objects. Move it to a temporary + * list so as not to try to destroy it for a second + * time while iterating over inactive caches below. + */ + list_move(&c->memcg_params.list, &busy); + else + /* + * The cache is empty and will be destroyed soon. Clear + * the pointer to it in the memcg_caches array so that + * it will never be accessed even if the root cache + * stays alive. + */ + arr->entries[i] = NULL; + } + + /* + * Second, shutdown all caches left from memory cgroups that are now + * offline. + */ + list_for_each_entry_safe(c, c2, &s->memcg_params.list, + memcg_params.list) + __shutdown_memcg_cache(c, release, need_rcu_barrier); + + list_splice(&busy, &s->memcg_params.list); + + /* + * A cache being destroyed must be empty. In particular, this means + * that all per memcg caches attached to it must be empty too. 
+ */ + if (!list_empty(&s->memcg_params.list)) + return -EBUSY; + return 0; +} +#else +static inline int shutdown_memcg_caches(struct kmem_cache *s, + struct list_head *release, bool *need_rcu_barrier) +{ + return 0; } #endif /* CONFIG_MEMCG_KMEM */ @@ -613,12 +699,12 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - struct kmem_cache *c, *c2; LIST_HEAD(release); bool need_rcu_barrier = false; - bool busy = false; + int err; - BUG_ON(!is_root_cache(s)); + if (unlikely(!s)) + return; get_online_cpus(); get_online_mems(); @@ -629,21 +715,22 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - for_each_memcg_cache_safe(c, c2, s) { - if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) - busy = true; - } - - if (!busy) - do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); + err = shutdown_memcg_caches(s, &release, &need_rcu_barrier); + if (!err) + err = shutdown_cache(s, &release, &need_rcu_barrier); + if (err) { + pr_err("kmem_cache_destroy %s: " + "Slab cache still has objects\n", s->name); + dump_stack(); + } out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); - do_kmem_cache_release(&release, need_rcu_barrier); + release_caches(&release, need_rcu_barrier); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -667,7 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) } EXPORT_SYMBOL(kmem_cache_shrink); -int slab_is_available(void) +bool slab_is_available(void) { return slab_state >= UP; } @@ -784,25 +871,45 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) } /* - * Create the kmalloc array. Some of the regular kmalloc arrays - * may already have been created because they were needed to - * enable allocations for slab creation. + * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. + * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is + * kmalloc-67108864. 
*/ -void __init create_kmalloc_caches(unsigned long flags) +static struct { + const char *name; + unsigned long size; +} const kmalloc_info[] __initconst = { + {NULL, 0}, {"kmalloc-96", 96}, + {"kmalloc-192", 192}, {"kmalloc-8", 8}, + {"kmalloc-16", 16}, {"kmalloc-32", 32}, + {"kmalloc-64", 64}, {"kmalloc-128", 128}, + {"kmalloc-256", 256}, {"kmalloc-512", 512}, + {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048}, + {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192}, + {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768}, + {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072}, + {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288}, + {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152}, + {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608}, + {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432}, + {"kmalloc-67108864", 67108864} +}; + +/* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ +void __init setup_kmalloc_cache_index_table(void) { int i; - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. 
- * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); @@ -833,11 +940,26 @@ void __init create_kmalloc_caches(unsigned long flags) for (i = 128 + 8; i <= 192; i += 8) size_index[size_index_elem(i)] = 8; } +} + +static void __init new_kmalloc_cache(int idx, unsigned long flags) +{ + kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, + kmalloc_info[idx].size, flags); +} + +/* + * Create the kmalloc array. Some of the regular kmalloc arrays + * may already have been created because they were needed to + * enable allocations for slab creation. + */ +void __init create_kmalloc_caches(unsigned long flags) +{ + int i; + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { - if (!kmalloc_caches[i]) { - kmalloc_caches[i] = create_kmalloc_cache(NULL, - 1 << i, flags); - } + if (!kmalloc_caches[i]) + new_kmalloc_cache(i, flags); /* * Caches that are not of the two-to-the-power-of size. 
@@ -845,27 +967,14 @@ void __init create_kmalloc_caches(unsigned long flags) * earlier power of two caches */ if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) - kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); - + new_kmalloc_cache(1, flags); if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) - kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); + new_kmalloc_cache(2, flags); } /* Kmalloc array is now usable */ slab_state = UP; - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[i]; - char *n; - - if (s) { - n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); - - BUG_ON(!n); - s->name = n; - } - } - #ifdef CONFIG_ZONE_DMA for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache *s = kmalloc_caches[i]; diff --git a/kernel/mm/slob.c b/kernel/mm/slob.c index 4765f6501..17e8f8cc7 100644 --- a/kernel/mm/slob.c +++ b/kernel/mm/slob.c @@ -45,7 +45,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. In the event that a node id is explicitly - * provided, alloc_pages_exact_node() with the specified node id is used + * provided, __alloc_pages_node() with the specified node id is used * instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). 
* @@ -193,7 +193,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != NUMA_NO_NODE) - page = alloc_pages_exact_node(node, gfp, order); + page = __alloc_pages_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/kernel/mm/slub.c b/kernel/mm/slub.c index 905e283d7..d304d8802 100644 --- a/kernel/mm/slub.c +++ b/kernel/mm/slub.c @@ -459,8 +459,10 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) /* * Debug settings: */ -#ifdef CONFIG_SLUB_DEBUG_ON +#if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; +#elif defined(CONFIG_KASAN) +static int slub_debug = SLAB_STORE_USER; #else static int slub_debug; #endif @@ -1063,11 +1065,15 @@ bad: return 0; } +/* Supports checking bulk free of a constructed freelist */ static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, unsigned long addr, unsigned long *flags) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; raw_spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); @@ -1075,6 +1081,9 @@ static noinline struct kmem_cache_node *free_debug_processing( if (!check_slab(s, page)) goto fail; +next_object: + cnt++; + if (!check_valid_pointer(s, page, object)) { slab_err(s, 
page, "Invalid object pointer 0x%p", object); goto fail; @@ -1105,8 +1114,19 @@ static noinline struct kmem_cache_node *free_debug_processing( if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } out: + if (cnt != bulk_cnt) + slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", + bulk_cnt, cnt); + slab_unlock(page); /* * Keep node_lock to preserve integrity @@ -1202,7 +1222,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, return flags; } -#else +#else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} @@ -1210,7 +1230,8 @@ static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } static inline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, unsigned long addr, unsigned long *flags) { return NULL; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) @@ -1269,7 +1290,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); if (should_failslab(s->object_size, flags, s->flags)) return NULL; @@ -1277,14 +1298,21 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, return memcg_kmem_get_cache(s, flags); } -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t 
flags, + size_t size, void **p) { + size_t i; + flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } memcg_kmem_put_cache(s); - kasan_slab_alloc(s, object); } static inline void slab_free_hook(struct kmem_cache *s, void *x) @@ -1312,6 +1340,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static inline void slab_free_freelist_hook(struct kmem_cache *s, + void *head, void *tail) +{ +/* + * Compiler cannot detect this function can be removed if slab_free_hook() + * evaluates to nothing. Thus, catch all relevant config debug options here. + */ +#if defined(CONFIG_KMEMCHECK) || \ + defined(CONFIG_LOCKDEP) || \ + defined(CONFIG_DEBUG_KMEMLEAK) || \ + defined(CONFIG_DEBUG_OBJECTS_FREE) || \ + defined(CONFIG_KASAN) + + void *object = head; + void *tail_obj = tail ? 
: head; + + do { + slab_free_hook(s, object); + } while ((object != tail_obj) && + (object = get_freepointer(s, object))); +#endif +} + static void setup_object(struct kmem_cache *s, struct page *page, void *object) { @@ -1334,16 +1385,15 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, flags |= __GFP_NOTRACK; - if (memcg_charge_slab(s, flags, order)) - return NULL; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else - page = alloc_pages_exact_node(node, flags, order); + page = __alloc_pages_node(node, flags, order); - if (!page) - memcg_uncharge_slab(s, order); + if (page && memcg_charge_slab(page, flags, order, s)) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -1355,13 +1405,15 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) gfp_t alloc_gfp; void *start, *p; int idx, order; - bool enableirqs; + bool enableirqs = false; flags &= gfp_allowed_mask; - enableirqs = (flags & __GFP_WAIT) != 0; + if (gfpflags_allow_blocking(flags)) + enableirqs = true; #ifdef CONFIG_PREEMPT_RT_FULL - enableirqs |= system_state == SYSTEM_RUNNING; + if (system_state == SYSTEM_RUNNING) + enableirqs = true; #endif if (enableirqs) local_irq_enable(); @@ -1373,6 +1425,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. 
*/ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1485,8 +1539,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_pages(page, order); - memcg_uncharge_slab(s, order); + __free_kmem_pages(page, order); } static void free_delayed(struct list_head *h) @@ -1526,10 +1579,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(s->reserved != sizeof(*head)); head = page_address(page) + offset; } else { - /* - * RCU free overloads the RCU head over the LRU - */ - head = (void *)&page->lru; + head = &page->rcu_head; } call_rcu(head, rcu_free_slab); @@ -2345,25 +2395,17 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * And if we were unable to get a new slab from the partial slab lists then * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. + * + * Version of __slab_alloc to use when we know that interrupts are + * already disabled (which is the case for bulk allocation). */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c, + struct list_head *to_free) { struct slub_free_list *f; void *freelist; struct page *page; - unsigned long flags; - LIST_HEAD(tofree); - - local_irq_save(flags); -#ifdef CONFIG_PREEMPT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area - * pointer. 
- */ - c = this_cpu_ptr(s->cpu_slab); -#endif page = c->page; if (!page) @@ -2421,13 +2463,13 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); + out: f = this_cpu_ptr(&slub_free_list); raw_spin_lock(&f->lock); - list_splice_init(&f->list, &tofree); + list_splice_init(&f->list, to_free); raw_spin_unlock(&f->lock); - local_irq_restore(flags); - free_delayed(&tofree); + return freelist; new_slab: @@ -2444,7 +2486,7 @@ new_slab: if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); - goto out; + return NULL; } page = c->page; @@ -2463,6 +2505,33 @@ new_slab: } /* + * Another one that disabled interrupt and compensates for possible + * cpu changes by refetching the per cpu area pointer. + */ +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) +{ + void *p; + unsigned long flags; + LIST_HEAD(tofree); + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif + + p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree); + local_irq_restore(flags); + free_delayed(&tofree); + return p; +} + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. 
@@ -2475,7 +2544,7 @@ new_slab: static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { - void **object; + void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; @@ -2554,7 +2623,7 @@ redo: if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); - slab_post_alloc_hook(s, gfpflags, object); + slab_post_alloc_hook(s, gfpflags, 1, &object); return object; } @@ -2625,10 +2694,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr) + void *head, void *tail, int cnt, + unsigned long addr) + { void *prior; - void **object = (void *)x; int was_frozen; struct page new; unsigned long counters; @@ -2638,7 +2708,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, x, addr, &flags))) + !(n = free_debug_processing(s, page, head, tail, cnt, + addr, &flags))) return; do { @@ -2648,10 +2719,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } prior = page->freelist; counters = page->counters; - set_freepointer(s, object, prior); + set_freepointer(s, tail, prior); new.counters = counters; was_frozen = new.frozen; - new.inuse--; + new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { if (kmem_cache_has_cpu_partial(s) && !prior) { @@ -2682,7 +2753,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } while (!cmpxchg_double_slab(s, page, prior, counters, - object, new.counters, + head, new.counters, "__slab_free")); if (likely(!n)) { @@ -2747,22 +2818,27 @@ slab_empty: * * If fastpath is not possible then fall back to __slab_free where we deal * with all sorts of special processing. 
+ * + * Bulk free of a freelist with several objects (all pointing to the + * same page) possible by specifying head and tail ptr, plus objects + * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, unsigned long addr) +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) { - void **object = (void *)x; + void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - slab_free_hook(s, x); + slab_free_freelist_hook(s, head, tail); redo: /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. */ do { tid = this_cpu_read(s->cpu_slab->tid); @@ -2774,19 +2850,19 @@ redo: barrier(); if (likely(page == c->page)) { - set_freepointer(s, object, c->freelist); + set_freepointer(s, tail_obj, c->freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, c->freelist, tid, - object, next_tid(tid)))) { + head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr); + __slab_free(s, page, head, tail_obj, cnt, addr); } @@ -2795,11 +2871,168 @@ void kmem_cache_free(struct kmem_cache *s, void *x) s = cache_from_obj(s, x); if (!s) return; - slab_free(s, virt_to_head_page(x), x, _RET_IP_); + slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); +struct detached_freelist { + struct page *page; + void *tail; + void *freelist; + int cnt; +}; + +/* + * This function progressively scans the array with free objects (with + * a limited look ahead) and extract objects belonging to the same + 
* page. It builds a detached freelist directly within the given + * page/objects. This can happen without any need for + * synchronization, because the objects are owned by running process. + * The freelist is build up as a single linked list in the objects. + * The idea is, that this detached freelist can then be bulk + * transferred to the real freelist(s), but only requiring a single + * synchronization primitive. Look ahead in the array is limited due + * to performance reasons. + */ +static int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) +{ + size_t first_skipped_index = 0; + int lookahead = 3; + void *object; + + /* Always re-init detached_freelist */ + df->page = NULL; + + do { + object = p[--size]; + } while (!object && size); + + if (!object) + return 0; + + /* Start new detached freelist */ + set_freepointer(s, object, NULL); + df->page = virt_to_head_page(object); + df->tail = object; + df->freelist = object; + p[size] = NULL; /* mark object processed */ + df->cnt = 1; + + while (size) { + object = p[--size]; + if (!object) + continue; /* Skip processed objects */ + + /* df->page is always set at this point */ + if (df->page == virt_to_head_page(object)) { + /* Opportunity build freelist */ + set_freepointer(s, object, df->freelist); + df->freelist = object; + df->cnt++; + p[size] = NULL; /* mark object processed */ + + continue; + } + + /* Limit look ahead search */ + if (!--lookahead) + break; + + if (!first_skipped_index) + first_skipped_index = size + 1; + } + + return first_skipped_index; +} + + +/* Note that interrupts must be enabled when calling this function. 
*/ +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + if (WARN_ON(!size)) + return; + + do { + struct detached_freelist df; + struct kmem_cache *s; + + /* Support for memcg */ + s = cache_from_obj(orig_s, p[size - 1]); + + size = build_detached_freelist(s, size, p, &df); + if (unlikely(!df.page)) + continue; + + slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + } while (likely(size)); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* Note that interrupts must be enabled when calling this function. */ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + LIST_HEAD(to_free); + int i; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) + return false; + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (unlikely(!object)) { + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c, &to_free); + if (unlikely(!p[i])) + goto error; + + c = this_cpu_ptr(s->cpu_slab); + continue; /* goto for-loop */ + } + c->freelist = get_freepointer(s, object); + p[i] = object; + } + c->tid = next_tid(c->tid); + local_irq_enable(); + free_delayed(&to_free); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); + return i; +error: + local_irq_enable(); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object 
placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -2858,20 +3091,15 @@ static inline int slab_order(int size, int min_objects, if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); + for (order = max(min_order, get_order(min_objects * size + reserved)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size + reserved) - continue; - rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; - } return order; @@ -2889,7 +3117,7 @@ static inline int calculate_order(int size, int reserved) * works by first attempting to generate a layout with * the best configuration and backing off gradually. * - * First we reduce the acceptable waste in a slab. Then + * First we increase the acceptable waste in a slab. Then * we reduce the minimum objects required in a slab. 
*/ min_objects = slub_min_objects; @@ -3465,7 +3693,7 @@ void kfree(const void *x) __free_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab_cache, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3756,6 +3984,7 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); #ifdef CONFIG_SMP @@ -5236,7 +5465,7 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) - goto out_put_kobj; + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -5263,8 +5492,6 @@ out: return err; out_del_kobj: kobject_del(&s->kobj); -out_put_kobj: - kobject_put(&s->kobj); goto out; } diff --git a/kernel/mm/swap.c b/kernel/mm/swap.c index 1785ac603..ca194aeb4 100644 --- a/kernel/mm/swap.c +++ b/kernel/mm/swap.c @@ -31,8 +31,9 @@ #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> -#include <linux/hugetlb.h> #include <linux/locallock.h> +#include <linux/hugetlb.h> +#include <linux/page_idle.h> #include "internal.h" @@ -135,7 +136,6 @@ void put_unrefcounted_compound_page(struct page *page_head, struct page *page) * here, see the comment above this function. 
*/ VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); if (put_page_testzero(page_head)) { /* * If this is the tail of a slab THP page, @@ -205,7 +205,7 @@ out_put_single: __put_single_page(page); return; } - VM_BUG_ON_PAGE(page_head != page->first_page, page); + VM_BUG_ON_PAGE(page_head != compound_head(page), page); /* * We can release the refcount taken by * get_page_unless_zero() now that @@ -266,7 +266,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. */ - page_head = compound_head_by_tail(page); + page_head = compound_head(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else @@ -628,6 +628,8 @@ void mark_page_accessed(struct page *page) } else if (!PageReferenced(page)) { SetPageReferenced(page); } + if (page_is_idle(page)) + clear_page_idle(page); } EXPORT_SYMBOL(mark_page_accessed); diff --git a/kernel/mm/swap_state.c b/kernel/mm/swap_state.c index 8bc8e6613..d504adb7f 100644 --- a/kernel/mm/swap_state.c +++ b/kernel/mm/swap_state.c @@ -288,17 +288,14 @@ struct page * lookup_swap_cache(swp_entry_t entry) return page; } -/* - * Locate a page of swap in physical memory, reserving swap cache space - * and reading the disk if it is not already cached. - * A failure return means that either the page allocation failed or that - * the swap entry is no longer in use. 
- */ -struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) +struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr, + bool *new_page_allocated) { struct page *found_page, *new_page = NULL; + struct address_space *swapper_space = swap_address_space(entry); int err; + *new_page_allocated = false; do { /* @@ -306,8 +303,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swap_address_space(entry), - entry.val); + found_page = find_get_page(swapper_space, entry.val); if (found_page) break; @@ -366,7 +362,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Initiate read into locked page and return. */ lru_cache_add_anon(new_page); - swap_readpage(new_page); + *new_page_allocated = true; return new_page; } radix_tree_preload_end(); @@ -384,6 +380,25 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return found_page; } +/* + * Locate a page of swap in physical memory, reserving swap cache space + * and reading the disk if it is not already cached. + * A failure return means that either the page allocation failed or that + * the swap entry is no longer in use. 
+ */ +struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + struct vm_area_struct *vma, unsigned long addr) +{ + bool page_was_allocated; + struct page *retpage = __read_swap_cache_async(entry, gfp_mask, + vma, addr, &page_was_allocated); + + if (page_was_allocated) + swap_readpage(retpage); + + return retpage; +} + static unsigned long swapin_nr_pages(unsigned long offset) { static unsigned long prev_offset; diff --git a/kernel/mm/swapfile.c b/kernel/mm/swapfile.c index a7e72103f..58877312c 100644 --- a/kernel/mm/swapfile.c +++ b/kernel/mm/swapfile.c @@ -875,6 +875,48 @@ int page_swapcount(struct page *page) } /* + * How many references to @entry are currently swapped out? + * This considers COUNT_CONTINUED so it returns exact answer. + */ +int swp_swapcount(swp_entry_t entry) +{ + int count, tmp_count, n; + struct swap_info_struct *p; + struct page *page; + pgoff_t offset; + unsigned char *map; + + p = swap_info_get(entry); + if (!p) + return 0; + + count = swap_count(p->swap_map[swp_offset(entry)]); + if (!(count & COUNT_CONTINUED)) + goto out; + + count &= ~COUNT_CONTINUED; + n = SWAP_MAP_MAX + 1; + + offset = swp_offset(entry); + page = vmalloc_to_page(p->swap_map + offset); + offset &= ~PAGE_MASK; + VM_BUG_ON(page_private(page) != SWP_CONTINUED); + + do { + page = list_entry(page->lru.next, struct page, lru); + map = kmap_atomic(page); + tmp_count = map[offset]; + kunmap_atomic(map); + + count += (tmp_count & ~COUNT_CONTINUED) * n; + n *= (SWAP_CONT_MAX + 1); + } while (tmp_count & COUNT_CONTINUED); +out: + spin_unlock(&p->lock); + return count; +} + +/* * We can write to an anon page without COW if there are no other references * to it. 
And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content @@ -2032,7 +2074,7 @@ static int swap_show(struct seq_file *swap, void *v) } file = si->swap_file; - len = seq_path(swap, &file->f_path, " \t\n\\"); + len = seq_file_path(swap, file, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file_inode(file)->i_mode) ? @@ -2143,11 +2185,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(p->bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - sys_swapon); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); if (error < 0) { p->bdev = NULL; - return -EINVAL; + return error; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); @@ -2348,7 +2389,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; - int i; int prio; int error; union swap_header *swap_header; @@ -2388,19 +2428,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; - - if (q == p || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) { - error = -EBUSY; - goto bad_swap; - } - } - inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) @@ -2433,6 +2462,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + p->flags |= SWP_SOLIDSTATE; /* * select a random position to start with to help wear leveling @@ -2451,9 +2482,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, 
specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } - for_each_possible_cpu(i) { + for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; - cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } } diff --git a/kernel/mm/truncate.c b/kernel/mm/truncate.c index 09598db42..5f1964200 100644 --- a/kernel/mm/truncate.c +++ b/kernel/mm/truncate.c @@ -119,9 +119,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) * the VM has canceled the dirty bit (eg ext3 journaling). * Hence dirty accounting check is placed after invalidation. */ - if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping); - + cancel_dirty_page(page); ClearPageMappedToDisk(page); delete_from_page_cache(page); return 0; @@ -515,19 +513,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg; + unsigned long flags; + if (page->mapping != mapping) return 0; if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, NULL, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -535,7 +538,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) page_cache_release(page); /* pagecache ref */ return 1; failed: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } diff --git a/kernel/mm/userfaultfd.c b/kernel/mm/userfaultfd.c new file mode 
100644 index 000000000..77fee9325 --- /dev/null +++ b/kernel/mm/userfaultfd.c @@ -0,0 +1,308 @@ +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/userfaultfd_k.h> +#include <linux/mmu_notifier.h> +#include <asm/tlbflush.h> +#include "internal.h" + +static int mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct mem_cgroup *memcg; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + void *page_kaddr; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + ret = -ENOMEM; + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + goto out_release; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + inc_mm_counter(dst_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, dst_vma); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); + mem_cgroup_cancel_charge(page, memcg); +out_release: + page_cache_release(page); + goto out; +} + +static int mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + int ret; + + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_unlock; + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (pud) + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a 
trans_huge_pmd. + */ + pmd = pmd_alloc(mm, pud, address); + return pmd; +} + +static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + struct vm_area_struct *dst_vma; + ssize_t err; + pmd_t *dst_pmd; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + + /* + * Sanitize the command parameters: + */ + BUG_ON(dst_start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(src_start + len <= src_start); + BUG_ON(dst_start + len <= dst_start); + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; +retry: + down_read(&dst_mm->mmap_sem); + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + /* + * Be strict and only allow __mcopy_atomic on userfaultfd + * registered ranges to prevent userland errors going + * unnoticed. As far as the VM consistency is concerned, it + * would be perfectly safe to remove this check, but there's + * no useful usage for __mcopy_atomic ouside of userfaultfd + * registered ranges. This is after all why these are ioctls + * belonging to the userfaultfd and not syscalls. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + /* + * FIXME: only allow copying on anonymous vmas, tmpfs should + * be added. + */ + if (dst_vma->vm_ops) + goto out_unlock; + + /* + * Ensure the dst_vma has a anon_vma or this page + * would get a NULL anon_vma when moved in the + * dst_vma. 
+ */ + err = -ENOMEM; + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + + while (src_addr < src_start + len) { + pmd_t dst_pmdval; + + BUG_ON(dst_addr >= dst_start + len); + + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmd_read_atomic(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't + * override it and just be strict. + */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, + dst_addr))) { + err = -ENOMEM; + break; + } + /* If an huge pmd materialized from under us fail */ + if (unlikely(pmd_trans_huge(*dst_pmd))) { + err = -EFAULT; + break; + } + + BUG_ON(pmd_none(*dst_pmd)); + BUG_ON(pmd_trans_huge(*dst_pmd)); + + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, + dst_addr); + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + void *page_kaddr; + + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + page_kaddr = kmap(page); + err = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap(page); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += PAGE_SIZE; + src_addr += PAGE_SIZE; + copied += PAGE_SIZE; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) + page_cache_release(page); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? 
copied : err; +} + +ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len) +{ + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); +} + +ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, + unsigned long len) +{ + return __mcopy_atomic(dst_mm, start, 0, len, true); +} diff --git a/kernel/mm/util.c b/kernel/mm/util.c index 68ff8a536..9af1c12b3 100644 --- a/kernel/mm/util.c +++ b/kernel/mm/util.c @@ -309,7 +309,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, { if (unlikely(offset + PAGE_ALIGN(len) < offset)) return -EINVAL; - if (unlikely(offset & ~PAGE_MASK)) + if (unlikely(offset_in_page(offset))) return -EINVAL; return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); diff --git a/kernel/mm/vmacache.c b/kernel/mm/vmacache.c index b6e3662fe..fd09dc9c6 100644 --- a/kernel/mm/vmacache.c +++ b/kernel/mm/vmacache.c @@ -52,7 +52,7 @@ void vmacache_flush_all(struct mm_struct *mm) * Also handle the case where a kernel thread has adopted this mm via use_mm(). * That kernel thread's vmacache is not applicable to this mm. 
*/ -static bool vmacache_valid_mm(struct mm_struct *mm) +static inline bool vmacache_valid_mm(struct mm_struct *mm) { return current->mm == mm && !(current->flags & PF_KTHREAD); } diff --git a/kernel/mm/vmalloc.c b/kernel/mm/vmalloc.c index f87a29f1e..68740314a 100644 --- a/kernel/mm/vmalloc.c +++ b/kernel/mm/vmalloc.c @@ -35,6 +35,8 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include "internal.h" + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -358,7 +360,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *first; BUG_ON(!size); - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); va = kmalloc_node(sizeof(struct vmap_area), @@ -938,7 +940,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) unsigned int order; int cpu; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); if (WARN_ON(size == 0)) { /* @@ -992,7 +994,7 @@ static void vb_free(const void *addr, unsigned long size) unsigned int order; struct vmap_block *vb; - BUG_ON(size & ~PAGE_MASK); + BUG_ON(offset_in_page(size)); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size); @@ -1444,7 +1446,6 @@ struct vm_struct *remove_vm_area(const void *addr) vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; return vm; } @@ -1469,8 +1470,8 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, area->size); - debug_check_no_obj_freed(addr, area->size); + debug_check_no_locks_freed(addr, get_vm_area_size(area)); + debug_check_no_obj_freed(addr, get_vm_area_size(area)); if (deallocate_pages) { int i; @@ -1620,7 +1621,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfp_mask & __GFP_WAIT) + if 
(gfpflags_allow_blocking(gfp_mask)) cond_resched(); } @@ -1905,7 +1906,7 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -1944,7 +1945,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) while (count) { unsigned long offset, length; - offset = (unsigned long)addr & ~PAGE_MASK; + offset = offset_in_page(addr); length = PAGE_SIZE - offset; if (length > count) length = count; @@ -2395,7 +2396,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, bool purged = false; /* verify parameters and allocate data structures */ - BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + BUG_ON(offset_in_page(align) || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { start = offsets[area]; end = start + sizes[area]; @@ -2691,52 +2692,5 @@ static int __init proc_vmalloc_init(void) } module_init(proc_vmalloc_init); -void get_vmalloc_info(struct vmalloc_info *vmi) -{ - struct vmap_area *va; - unsigned long free_area_size; - unsigned long prev_end; - - vmi->used = 0; - vmi->largest_chunk = 0; - - prev_end = VMALLOC_START; - - rcu_read_lock(); - - if (list_empty(&vmap_area_list)) { - vmi->largest_chunk = VMALLOC_TOTAL; - goto out; - } - - list_for_each_entry_rcu(va, &vmap_area_list, list) { - unsigned long addr = va->va_start; - - /* - * Some archs keep another range for modules in vmalloc space - */ - if (addr < VMALLOC_START) - continue; - if (addr >= VMALLOC_END) - break; - - if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) - continue; - - vmi->used += (va->va_end - va->va_start); - - free_area_size = addr - prev_end; - if (vmi->largest_chunk < free_area_size) - vmi->largest_chunk = free_area_size; - - prev_end = va->va_end; - } - - if (VMALLOC_END - prev_end > vmi->largest_chunk) - vmi->largest_chunk 
= VMALLOC_END - prev_end; - -out: - rcu_read_unlock(); -} #endif diff --git a/kernel/mm/vmscan.c b/kernel/mm/vmscan.c index 1a17bd7c0..2aec4241b 100644 --- a/kernel/mm/vmscan.c +++ b/kernel/mm/vmscan.c @@ -154,16 +154,47 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } + +/** + * sane_reclaim - is the usual dirty throttling mechanism operational? + * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_page_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. + */ +static bool sane_reclaim(struct scan_control *sc) +{ + struct mem_cgroup *memcg = sc->target_mem_cgroup; + + if (!memcg) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; +#endif + return false; +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool sane_reclaim(struct scan_control *sc) +{ + return true; +} #endif static unsigned long zone_reclaimable_pages(struct zone *zone) { - int nr; + unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); @@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi, - struct scan_control *sc) +static int may_write_to_inode(struct inode *inode, struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(inode)) return 1; - if (bdi == current->backing_dev_info) + if (inode_to_bdi(inode) == 
current->backing_dev_info) return 1; return 0; } @@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) + if (!may_write_to_inode(mapping->host, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { + unsigned long flags; + struct mem_cgroup *memcg; + BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. * @@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, shadow, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } @@ 
-917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(inode_to_bdi(mapping->host))) || + inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) nr_congested++; @@ -935,11 +972,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, * note that the LRU is being scanned too quickly and the * caller can stall after page list has been processed. * - * 2) Global reclaim encounters a page, memcg encounters a - * page that is not marked for immediate reclaim or - * the caller does not have __GFP_FS (or __GFP_IO if it's - * simply going to swap, not to fs). In this case mark - * the page for immediate reclaim and continue scanning. + * 2) Global or new memcg reclaim encounters a page that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate + * reclaim and continue scanning. * * Require may_enter_fs because we would wait on fs, which * may not have submitted IO yet. And the loop driver might @@ -948,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * 3) memcg encounters a page that is not already marked + * 3) Legacy memcg encounters a page that is already marked * PageReclaim. 
memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to @@ -963,7 +1000,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Case 2 above */ - } else if (global_reclaim(sc) || + } else if (sane_reclaim(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() @@ -978,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; /* Case 3 above */ } else { + unlock_page(page); wait_on_page_writeback(page); + /* then go back and try same page again */ + list_add_tail(&page->lru, page_list); + continue; } } @@ -1020,7 +1060,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1060,7 +1101,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here. 
+ */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; @@ -1171,6 +1217,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -1315,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long scan; - for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && + !list_empty(src); scan++) { struct page *page; int nr_pages; @@ -1412,7 +1460,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!global_reclaim(sc)) + if (!sane_reclaim(sc)) return 0; if (file) { @@ -1428,7 +1476,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -1604,10 +1652,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, set_bit(ZONE_WRITEBACK, &zone->flags); /* - * memcg will stall in page writeback so only consider forcibly - * stalling for global reclaim + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling here. */ - if (global_reclaim(sc)) { + if (sane_reclaim(sc)) { /* * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. 
@@ -1811,17 +1859,14 @@ static void shrink_active_list(unsigned long nr_to_scan, } #ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static bool inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; active = zone_page_state(zone, NR_ACTIVE_ANON); inactive = zone_page_state(zone, NR_INACTIVE_ANON); - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; + return inactive * zone->inactive_ratio < active; } /** @@ -1831,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct lruvec *lruvec) +static bool inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation * is pointless. */ if (!total_swap_pages) - return 0; + return false; if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(lruvec); @@ -1846,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec) return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct lruvec *lruvec) +static inline bool inactive_anon_is_low(struct lruvec *lruvec) { - return 0; + return false; } #endif @@ -1866,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. 
*/ -static int inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_file_is_low(struct lruvec *lruvec) { unsigned long inactive; unsigned long active; @@ -1877,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec) return active > inactive; } -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { if (is_file_lru(lru)) return inactive_file_is_low(lruvec); @@ -2114,6 +2159,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 
*/ @@ -2148,6 +2210,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2413,7 +2477,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where @@ -2642,7 +2706,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2895,7 +2960,7 @@ static bool zone_balanced(struct zone *zone, int order, unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, @@ -3592,7 +3657,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for ZONE_RECLAIM. 
This determines the fraction of pages @@ -3628,18 +3693,18 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long zone_pagecache_reclaimable(struct zone *zone) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and zone_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) + if (zone_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); else nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); @@ -3670,15 +3735,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .order = order, .priority = ZONE_RECLAIM_PRIORITY, .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, }; cond_resched(); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP + * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * and RECLAIM_UNMAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); @@ -3726,7 +3791,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * Do not scan if the allocation should not be delayed. 
*/ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* diff --git a/kernel/mm/vmstat.c b/kernel/mm/vmstat.c index 86f0e2e3f..64416fd7c 100644 --- a/kernel/mm/vmstat.c +++ b/kernel/mm/vmstat.c @@ -219,7 +219,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -324,8 +324,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); * 1 Overstepping half of threshold * -1 Overstepping minus half of threshold */ -static inline void mod_state(struct zone *zone, - enum zone_stat_item item, int delta, int overstep_mode) +static inline void mod_state(struct zone *zone, enum zone_stat_item item, + long delta, int overstep_mode) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -363,7 +363,7 @@ static inline void mod_state(struct zone *zone, } void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { mod_state(zone, item, delta, 0); } @@ -390,7 +390,7 @@ EXPORT_SYMBOL(dec_zone_page_state); * Use interrupt disable to serialize counter updates */ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { unsigned long flags; @@ -597,6 +597,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) else __inc_zone_state(z, NUMA_OTHER); } + +/* + * Determine the per node value of a stat item. 
+ */ +unsigned long node_page_state(int node, enum zone_stat_item item) +{ + struct zone *zones = NODE_DATA(node)->node_zones; + + return +#ifdef CONFIG_ZONE_DMA + zone_page_state(&zones[ZONE_DMA], item) + +#endif +#ifdef CONFIG_ZONE_DMA32 + zone_page_state(&zones[ZONE_DMA32], item) + +#endif +#ifdef CONFIG_HIGHMEM + zone_page_state(&zones[ZONE_HIGHMEM], item) + +#endif + zone_page_state(&zones[ZONE_NORMAL], item) + + zone_page_state(&zones[ZONE_MOVABLE], item); +} + #endif #ifdef CONFIG_COMPACTION @@ -905,9 +927,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #ifdef CONFIG_PROC_FS static char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", - "Reclaimable", "Movable", - "Reserve", + "Reclaimable", + "HighAtomic", #ifdef CONFIG_CMA "CMA", #endif @@ -1363,21 +1385,23 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP +static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) + if (refresh_cpu_vm_stats()) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. */ - schedule_delayed_work(this_cpu_ptr(&vmstat_work), + queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); - else { + } else { /* * We did not update any counters so the app may be in * a mode where it does not cause counter updates. 
@@ -1443,7 +1467,7 @@ static void vmstat_shepherd(struct work_struct *w) if (need_update(cpu) && cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - schedule_delayed_work_on(cpu, + queue_delayed_work_on(cpu, vmstat_wq, &per_cpu(vmstat_work, cpu), 0); put_online_cpus(); @@ -1465,6 +1489,7 @@ static void __init start_shepherd_timer(void) BUG(); cpumask_copy(cpu_stat_off, cpu_online_mask); + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } diff --git a/kernel/mm/zbud.c b/kernel/mm/zbud.c index 2ee4e4520..d8a181fd7 100644 --- a/kernel/mm/zbud.c +++ b/kernel/mm/zbud.c @@ -96,7 +96,11 @@ struct zbud_pool { struct list_head buddied; struct list_head lru; u64 pages_nr; - struct zbud_ops *ops; + const struct zbud_ops *ops; +#ifdef CONFIG_ZPOOL + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +#endif }; /* @@ -123,17 +127,28 @@ struct zbud_header { static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) { - return zpool_evict(pool, handle); + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; } -static struct zbud_ops zbud_zpool_ops = { +static const struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(char *name, gfp_t gfp, - struct zpool_ops *zpool_ops) +static void *zbud_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) { - return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); + struct zbud_pool *pool; + + pool = zbud_create_pool(gfp, zpool_ops ? 
&zbud_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; } static void zbud_zpool_destroy(void *pool) @@ -287,12 +302,12 @@ static int num_free_chunks(struct zbud_header *zhdr) * Return: pointer to the new zbud pool or NULL if the metadata allocation * failed. */ -struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) +struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) { struct zbud_pool *pool; int i; - pool = kmalloc(sizeof(struct zbud_pool), gfp); + pool = kzalloc(sizeof(struct zbud_pool), gfp); if (!pool) return NULL; spin_lock_init(&pool->lock); diff --git a/kernel/mm/zpool.c b/kernel/mm/zpool.c index bacdab6e4..fd3ff719c 100644 --- a/kernel/mm/zpool.c +++ b/kernel/mm/zpool.c @@ -18,11 +18,9 @@ #include <linux/zpool.h> struct zpool { - char *type; - struct zpool_driver *driver; void *pool; - struct zpool_ops *ops; + const struct zpool_ops *ops; struct list_head list; }; @@ -73,34 +71,8 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); -/** - * zpool_evict() - evict callback from a zpool implementation. - * @pool: pool to evict from. - * @handle: handle to evict. - * - * This can be used by zpool implementations to call the - * user's evict zpool_ops struct evict callback. - */ -int zpool_evict(void *pool, unsigned long handle) -{ - struct zpool *zpool; - - spin_lock(&pools_lock); - list_for_each_entry(zpool, &pools_head, list) { - if (zpool->pool == pool) { - spin_unlock(&pools_lock); - if (!zpool->ops || !zpool->ops->evict) - return -EINVAL; - return zpool->ops->evict(zpool, handle); - } - } - spin_unlock(&pools_lock); - - return -ENOENT; -} -EXPORT_SYMBOL(zpool_evict); - -static struct zpool_driver *zpool_get_driver(char *type) +/* this assumes @type is null-terminated. 
*/ +static struct zpool_driver *zpool_get_driver(const char *type) { struct zpool_driver *driver; @@ -127,6 +99,41 @@ static void zpool_put_driver(struct zpool_driver *driver) } /** + * zpool_has_pool() - Check if the pool driver is available + * @type The type of the zpool to check (e.g. zbud, zsmalloc) + * + * This checks if the @type pool driver is available. This will try to load + * the requested module, if needed, but there is no guarantee the module will + * still be loaded and available immediately after calling. If this returns + * true, the caller should assume the pool is available, but must be prepared + * to handle the @zpool_create_pool() returning failure. However if this + * returns false, the caller should assume the requested pool type is not + * available; either the requested pool type module does not exist, or could + * not be loaded, and calling @zpool_create_pool() with the pool type will + * fail. + * + * The @type string must be null-terminated. + * + * Returns: true if @type pool is available, false if not + */ +bool zpool_has_pool(char *type) +{ + struct zpool_driver *driver = zpool_get_driver(type); + + if (!driver) { + request_module("zpool-%s", type); + driver = zpool_get_driver(type); + } + + if (!driver) + return false; + + zpool_put_driver(driver); + return true; +} +EXPORT_SYMBOL(zpool_has_pool); + +/** * zpool_create_pool() - Create a new zpool * @type The type of the zpool to create (e.g. zbud, zsmalloc) * @name The name of the zpool (e.g. zram0, zswap) @@ -139,15 +146,17 @@ static void zpool_put_driver(struct zpool_driver *driver) * * Implementations must guarantee this to be thread-safe. * + * The @type and @name strings must be null-terminated. + * * Returns: New zpool on success, NULL on failure. 
*/ -struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, - struct zpool_ops *ops) +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, + const struct zpool_ops *ops) { struct zpool_driver *driver; struct zpool *zpool; - pr_info("creating pool type %s\n", type); + pr_debug("creating pool type %s\n", type); driver = zpool_get_driver(type); @@ -168,9 +177,8 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - zpool->type = driver->type; zpool->driver = driver; - zpool->pool = driver->create(name, gfp, ops); + zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; if (!zpool->pool) { @@ -180,7 +188,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - pr_info("created %s pool\n", type); + pr_debug("created pool type %s\n", type); spin_lock(&pools_lock); list_add(&zpool->list, &pools_head); @@ -202,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_info("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->driver->type); spin_lock(&pools_lock); list_del(&zpool->list); @@ -222,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool) * * Returns: The type of zpool. 
*/ -char *zpool_get_type(struct zpool *zpool) +const char *zpool_get_type(struct zpool *zpool) { - return zpool->type; + return zpool->driver->type; } /** @@ -347,20 +355,6 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } -static int __init init_zpool(void) -{ - pr_info("loaded\n"); - return 0; -} - -static void __exit exit_zpool(void) -{ - pr_info("unloaded\n"); -} - -module_init(init_zpool); -module_exit(exit_zpool); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/kernel/mm/zsmalloc.c b/kernel/mm/zsmalloc.c index a8b5e749e..18cc59fb1 100644 --- a/kernel/mm/zsmalloc.c +++ b/kernel/mm/zsmalloc.c @@ -16,7 +16,7 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -26,8 +26,7 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page * If the page is first_page for huge object, it stores handle. * Look at size_class->huge. * page->freelist: points to the first free object in zspage. @@ -38,6 +37,7 @@ * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. 
* page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -45,10 +45,6 @@ * */ -#ifdef CONFIG_ZSMALLOC_DEBUG -#define DEBUG -#endif - #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> @@ -62,7 +58,7 @@ #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> -#include <linux/hardirq.h> +#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/debugfs.h> @@ -170,17 +166,20 @@ enum zs_stat_type { OBJ_USED, CLASS_ALMOST_FULL, CLASS_ALMOST_EMPTY, - NR_ZS_STAT_TYPE, }; #ifdef CONFIG_ZSMALLOC_STAT - -static struct dentry *zs_stat_root; +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; +#ifdef CONFIG_ZSMALLOC_STAT +static struct dentry *zs_stat_root; #endif /* @@ -205,6 +204,8 @@ static int zs_size_classes; static const int fullness_threshold_frac = 4; struct size_class { + spinlock_t lock; + struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. 
@@ -214,16 +215,10 @@ struct size_class { /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; - /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ - bool huge; - -#ifdef CONFIG_ZSMALLOC_STAT struct zs_size_stat stats; -#endif - spinlock_t lock; - - struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; + /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ + bool huge; }; /* @@ -247,7 +242,7 @@ struct link_free { }; struct zs_pool { - char *name; + const char *name; struct size_class **size_class; struct kmem_cache *handle_cachep; @@ -255,6 +250,15 @@ struct zs_pool { gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; + struct zs_pool_stats stats; + + /* Compact classes */ + struct shrinker shrinker; + /* + * To signify that register_shrinker() was successful + * and unregister_shrinker() will not Oops. + */ + bool shrinker_enabled; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -289,8 +293,7 @@ static int create_handle_cache(struct zs_pool *pool) static void destroy_handle_cache(struct zs_pool *pool) { - if (pool->handle_cachep) - kmem_cache_destroy(pool->handle_cachep); + kmem_cache_destroy(pool->handle_cachep); } static unsigned long alloc_handle(struct zs_pool *pool) @@ -306,14 +309,21 @@ static void free_handle(struct zs_pool *pool, unsigned long handle) static void record_obj(unsigned long handle, unsigned long obj) { - *(unsigned long *)handle = obj; + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. 
+ */ + WRITE_ONCE(*(unsigned long *)handle, obj); } /* zpool driver */ #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) +static void *zs_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) { return zs_create_pool(name, gfp); } @@ -444,26 +454,30 @@ static int get_size_class_index(int size) return min(zs_size_classes - 1, idx); } -#ifdef CONFIG_ZSMALLOC_STAT - static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] += cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] -= cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - return class->stats.objs[type]; + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; } +#ifdef CONFIG_ZSMALLOC_STAT + static int __init zs_stat_init(void) { if (!debugfs_initialized()) @@ -548,7 +562,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(char *name, struct zs_pool *pool) +static int zs_pool_stat_create(const char *name, struct zs_pool *pool) { struct dentry *entry; @@ -579,23 +593,6 @@ static void zs_pool_stat_destroy(struct zs_pool *pool) } #else /* CONFIG_ZSMALLOC_STAT */ - -static inline void zs_stat_inc(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline void zs_stat_dec(struct size_class *class, - enum zs_stat_type type, unsigned long cnt) -{ -} - -static inline unsigned long zs_stat_get(struct size_class *class, - enum zs_stat_type type) -{ - return 0; -} - static int __init zs_stat_init(void) { return 0; @@ -605,7 +602,7 @@ static void __exit zs_stat_exit(void) { } 
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) { return 0; } @@ -613,7 +610,6 @@ static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) static inline void zs_pool_stat_destroy(struct zs_pool *pool) { } - #endif @@ -661,13 +657,22 @@ static void insert_zspage(struct page *page, struct size_class *class, if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; - head = &class->fullness_list[fullness]; - if (*head) - list_add_tail(&page->lru, &(*head)->lru); - - *head = page; zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); + + head = &class->fullness_list[fullness]; + if (!*head) { + *head = page; + return; + } + + /* + * We want to see more ZS_FULL pages and less almost + * empty/full. Put pages with higher ->inuse first. + */ + list_add_tail(&page->lru, &(*head)->lru); + if (page->inuse >= (*head)->inuse) + *head = page; } /* @@ -773,7 +778,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -833,7 +838,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, { if (class->huge) { VM_BUG_ON(!is_first_page(page)); - return *(unsigned long *)page_private(page); + return page_private(page); } else return *(unsigned long *)obj; } @@ -958,7 +963,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. 
Also, we set PG_private to identify the first page @@ -983,7 +988,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) if (i == 1) set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ @@ -1284,7 +1289,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &get_cpu_var(zs_map_area); + area = per_cpu_ptr(&zs_map_area, get_cpu_light()); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1337,7 +1342,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + put_cpu_light(); unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1437,8 +1442,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - enum fullness_group fullness; BUG_ON(!obj); @@ -1446,7 +1449,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); - get_zspage_mapping(first_page, &class_idx, &fullness); f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); vaddr = kmap_atomic(f_page); @@ -1498,7 +1500,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_free); -static void zs_object_copy(unsigned long src, unsigned long dst, +static void zs_object_copy(unsigned long dst, unsigned long src, struct size_class *class) { struct page *s_page, *d_page; @@ -1605,8 +1607,6 @@ struct zs_compact_control { /* Starting object index within @s_page which used for live object * in 
the subpage. */ int index; - /* how many of objects are migrated */ - int nr_migrated; }; static int migrate_zspage(struct zs_pool *pool, struct size_class *class, @@ -1617,7 +1617,6 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, struct page *s_page = cc->s_page; struct page *d_page = cc->d_page; unsigned long index = cc->index; - int nr_migrated = 0; int ret = 0; while (1) { @@ -1639,23 +1638,28 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, used_obj = handle_to_obj(handle); free_obj = obj_malloc(d_page, class, handle); - zs_object_copy(used_obj, free_obj, class); + zs_object_copy(free_obj, used_obj, class); index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. + */ + free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); - nr_migrated++; } /* Remember last position in this iteration */ cc->s_page = s_page; cc->index = index; - cc->nr_migrated = nr_migrated; return ret; } -static struct page *alloc_target_page(struct size_class *class) +static struct page *isolate_target_page(struct size_class *class) { int i; struct page *page; @@ -1671,8 +1675,17 @@ static struct page *alloc_target_page(struct size_class *class) return page; } -static void putback_zspage(struct zs_pool *pool, struct size_class *class, - struct page *first_page) +/* + * putback_zspage - add @first_page into right class's fullness list + * @pool: target pool + * @class: destination class + * @first_page: target page + * + * Return @fist_page's fullness_group + */ +static enum fullness_group putback_zspage(struct zs_pool *pool, + struct size_class *class, + struct page *first_page) { enum fullness_group fullness; @@ -1690,50 +1703,72 @@ static void putback_zspage(struct zs_pool *pool, struct size_class *class, 
free_zspage(first_page); } + + return fullness; } static struct page *isolate_source_page(struct size_class *class) { - struct page *page; + int i; + struct page *page = NULL; + + for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { + page = class->fullness_list[i]; + if (!page) + continue; - page = class->fullness_list[ZS_ALMOST_EMPTY]; - if (page) - remove_zspage(page, class, ZS_ALMOST_EMPTY); + remove_zspage(page, class, i); + break; + } return page; } -static unsigned long __zs_compact(struct zs_pool *pool, - struct size_class *class) +/* + * + * Based on the number of unused allocated objects calculate + * and return the number of pages that we can free. + */ +static unsigned long zs_can_compact(struct size_class *class) +{ + unsigned long obj_wasted; + + obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - + zs_stat_get(class, OBJ_USED); + + obj_wasted /= get_maxobj_per_zspage(class->size, + class->pages_per_zspage); + + return obj_wasted * class->pages_per_zspage; +} + +static void __zs_compact(struct zs_pool *pool, struct size_class *class) { - int nr_to_migrate; struct zs_compact_control cc; struct page *src_page; struct page *dst_page = NULL; - unsigned long nr_total_migrated = 0; spin_lock(&class->lock); while ((src_page = isolate_source_page(class))) { BUG_ON(!is_first_page(src_page)); - /* The goal is to migrate all live objects in source page */ - nr_to_migrate = src_page->inuse; + if (!zs_can_compact(class)) + break; + cc.index = 0; cc.s_page = src_page; - while ((dst_page = alloc_target_page(class))) { + while ((dst_page = isolate_target_page(class))) { cc.d_page = dst_page; /* - * If there is no more space in dst_page, try to - * allocate another zspage. + * If there is no more space in dst_page, resched + * and see if anyone had allocated another zspage. 
*/ if (!migrate_zspage(pool, class, &cc)) break; putback_zspage(pool, class, dst_page); - nr_total_migrated += cc.nr_migrated; - nr_to_migrate -= cc.nr_migrated; } /* Stop if we couldn't find slot */ @@ -1741,9 +1776,9 @@ static unsigned long __zs_compact(struct zs_pool *pool, break; putback_zspage(pool, class, dst_page); - putback_zspage(pool, class, src_page); + if (putback_zspage(pool, class, src_page) == ZS_EMPTY) + pool->stats.pages_compacted += class->pages_per_zspage; spin_unlock(&class->lock); - nr_total_migrated += cc.nr_migrated; cond_resched(); spin_lock(&class->lock); } @@ -1752,14 +1787,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, putback_zspage(pool, class, src_page); spin_unlock(&class->lock); - - return nr_total_migrated; } unsigned long zs_compact(struct zs_pool *pool) { int i; - unsigned long nr_migrated = 0; struct size_class *class; for (i = zs_size_classes - 1; i >= 0; i--) { @@ -1768,13 +1800,77 @@ unsigned long zs_compact(struct zs_pool *pool) continue; if (class->index != i) continue; - nr_migrated += __zs_compact(pool, class); + __zs_compact(pool, class); } - return nr_migrated; + return pool->stats.pages_compacted; } EXPORT_SYMBOL_GPL(zs_compact); +void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) +{ + memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); +} +EXPORT_SYMBOL_GPL(zs_pool_stats); + +static unsigned long zs_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long pages_freed; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + pages_freed = pool->stats.pages_compacted; + /* + * Compact classes and calculate compaction delta. + * Can run concurrently with a manually triggered + * (by user) compaction. + */ + pages_freed = zs_compact(pool) - pages_freed; + + return pages_freed ? 
pages_freed : SHRINK_STOP; +} + +static unsigned long zs_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int i; + struct size_class *class; + unsigned long pages_to_free = 0; + struct zs_pool *pool = container_of(shrinker, struct zs_pool, + shrinker); + + for (i = zs_size_classes - 1; i >= 0; i--) { + class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) + continue; + + pages_to_free += zs_can_compact(class); + } + + return pages_to_free; +} + +static void zs_unregister_shrinker(struct zs_pool *pool) +{ + if (pool->shrinker_enabled) { + unregister_shrinker(&pool->shrinker); + pool->shrinker_enabled = false; + } +} + +static int zs_register_shrinker(struct zs_pool *pool) +{ + pool->shrinker.scan_objects = zs_shrinker_scan; + pool->shrinker.count_objects = zs_shrinker_count; + pool->shrinker.batch = 0; + pool->shrinker.seeks = DEFAULT_SEEKS; + + return register_shrinker(&pool->shrinker); +} + /** * zs_create_pool - Creates an allocation pool to work from. * @flags: allocation flags used to allocate pool metadata @@ -1785,7 +1881,7 @@ EXPORT_SYMBOL_GPL(zs_compact); * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name, gfp_t flags) { int i; struct zs_pool *pool; @@ -1860,6 +1956,12 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) if (zs_pool_stat_create(name, pool)) goto err; + /* + * Not critical, we still can use the pool + * and user can trigger compaction manually. 
+ */ + if (zs_register_shrinker(pool) == 0) + pool->shrinker_enabled = true; return pool; err: @@ -1872,6 +1974,7 @@ void zs_destroy_pool(struct zs_pool *pool) { int i; + zs_unregister_shrinker(pool); zs_pool_stat_destroy(pool); for (i = 0; i < zs_size_classes; i++) { diff --git a/kernel/mm/zswap.c b/kernel/mm/zswap.c index 4249e82ff..bf14508af 100644 --- a/kernel/mm/zswap.c +++ b/kernel/mm/zswap.c @@ -75,89 +75,53 @@ static u64 zswap_duplicate_entry; /********************************* * tunables **********************************/ -/* Enable/disable zswap (disabled by default, fixed at boot for now) */ -static bool zswap_enabled __read_mostly; -module_param_named(enabled, zswap_enabled, bool, 0444); -/* Compressor to be used by zswap (fixed at boot for now) */ +/* Enable/disable zswap (disabled by default) */ +static bool zswap_enabled; +module_param_named(enabled, zswap_enabled, bool, 0644); + +/* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; -module_param_named(compressor, zswap_compressor, charp, 0444); - -/* The maximum percentage of memory that the compressed pool can occupy */ -static unsigned int zswap_max_pool_percent = 20; -module_param_named(max_pool_percent, - zswap_max_pool_percent, uint, 0644); +static int zswap_compressor_param_set(const char *, + const struct kernel_param *); +static struct kernel_param_ops zswap_compressor_param_ops = { + .set = zswap_compressor_param_set, + .get = param_get_charp, + .free = param_free_charp, +}; +module_param_cb(compressor, &zswap_compressor_param_ops, + &zswap_compressor, 0644); -/* Compressed storage to use */ +/* Compressed storage zpool to use */ #define ZSWAP_ZPOOL_DEFAULT "zbud" static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; -module_param_named(zpool, zswap_zpool_type, charp, 0444); +static int zswap_zpool_param_set(const char *, const struct kernel_param *); +static struct kernel_param_ops zswap_zpool_param_ops = { + .set = 
zswap_zpool_param_set, + .get = param_get_charp, + .free = param_free_charp, +}; +module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); -/* zpool is shared by all of zswap backend */ -static struct zpool *zswap_pool; +/* The maximum percentage of memory that the compressed pool can occupy */ +static unsigned int zswap_max_pool_percent = 20; +module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); /********************************* -* compression functions +* data structures **********************************/ -/* per-cpu compression transforms */ -static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; -enum comp_op { - ZSWAP_COMPOP_COMPRESS, - ZSWAP_COMPOP_DECOMPRESS +struct zswap_pool { + struct zpool *zpool; + struct crypto_comp * __percpu *tfm; + struct kref kref; + struct list_head list; + struct rcu_head rcu_head; + struct notifier_block notifier; + char tfm_name[CRYPTO_MAX_ALG_NAME]; }; -static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, - u8 *dst, unsigned int *dlen) -{ - struct crypto_comp *tfm; - int ret; - - tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); - switch (op) { - case ZSWAP_COMPOP_COMPRESS: - ret = crypto_comp_compress(tfm, src, slen, dst, dlen); - break; - case ZSWAP_COMPOP_DECOMPRESS: - ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); - break; - default: - ret = -EINVAL; - } - - put_cpu(); - return ret; -} - -static int __init zswap_comp_init(void) -{ - if (!crypto_has_comp(zswap_compressor, 0, 0)) { - pr_info("%s compressor not available\n", zswap_compressor); - /* fall back to default compressor */ - zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; - if (!crypto_has_comp(zswap_compressor, 0, 0)) - /* can't even load the default compressor */ - return -ENODEV; - } - pr_info("using %s compressor\n", zswap_compressor); - - /* alloc percpu transforms */ - zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); - if (!zswap_comp_pcpu_tfms) - return -ENOMEM; - return 0; 
-} - -static void __init zswap_comp_exit(void) -{ - /* free percpu transforms */ - free_percpu(zswap_comp_pcpu_tfms); -} - -/********************************* -* data structures -**********************************/ /* * struct zswap_entry * @@ -165,22 +129,24 @@ static void __init zswap_comp_exit(void) * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type + * offset - the swap offset for the entry. Index into the red-black tree. * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code * concurrent calls to load, invalidate, and writeback. The lock * for the zswap_tree structure that contains the entry must * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. - * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. 
Needed during * decompression + * pool - the zswap_pool the entry's data is in + * handle - zpool allocation handle that stores the compressed page data */ struct zswap_entry { struct rb_node rbnode; pgoff_t offset; int refcount; unsigned int length; + struct zswap_pool *pool; unsigned long handle; }; @@ -200,6 +166,51 @@ struct zswap_tree { static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; +/* RCU-protected iteration */ +static LIST_HEAD(zswap_pools); +/* protects zswap_pools list modification */ +static DEFINE_SPINLOCK(zswap_pools_lock); + +/* used by param callback function */ +static bool zswap_init_started; + +/********************************* +* helpers and fwd declarations +**********************************/ + +#define zswap_pool_debug(msg, p) \ + pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ + zpool_get_type((p)->zpool)) + +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle); +static int zswap_pool_get(struct zswap_pool *pool); +static void zswap_pool_put(struct zswap_pool *pool); + +static const struct zpool_ops zswap_zpool_ops = { + .evict = zswap_writeback_entry +}; + +static bool zswap_is_full(void) +{ + return totalram_pages * zswap_max_pool_percent / 100 < + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); +} + +static void zswap_update_total_size(void) +{ + struct zswap_pool *pool; + u64 total = 0; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + total += zpool_get_total_size(pool->zpool); + + rcu_read_unlock(); + + zswap_pool_total_size = total; +} + /********************************* * zswap entry functions **********************************/ @@ -293,10 +304,11 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - zpool_free(zswap_pool, entry->handle); + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - 
zswap_pool_total_size = zpool_get_total_size(zswap_pool); + zswap_update_total_size(); } /* caller must hold the tree lock */ @@ -324,7 +336,7 @@ static void zswap_entry_put(struct zswap_tree *tree, static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { - struct zswap_entry *entry = NULL; + struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) @@ -338,35 +350,21 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, **********************************/ static DEFINE_PER_CPU(u8 *, zswap_dstmem); -static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) +static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu) { - struct crypto_comp *tfm; u8 *dst; switch (action) { case CPU_UP_PREPARE: - tfm = crypto_alloc_comp(zswap_compressor, 0, 0); - if (IS_ERR(tfm)) { - pr_err("can't allocate compressor transform\n"); - return NOTIFY_BAD; - } - *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); - crypto_free_comp(tfm); - *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; return NOTIFY_BAD; } per_cpu(zswap_dstmem, cpu) = dst; break; case CPU_DEAD: case CPU_UP_CANCELED: - tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); - if (tfm) { - crypto_free_comp(tfm); - *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; - } dst = per_cpu(zswap_dstmem, cpu); kfree(dst); per_cpu(zswap_dstmem, cpu) = NULL; @@ -377,43 +375,404 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_OK; } -static int zswap_cpu_notifier(struct notifier_block *nb, - unsigned long action, void *pcpu) +static int zswap_cpu_dstmem_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) { - unsigned long cpu = (unsigned long)pcpu; - return __zswap_cpu_notifier(action, cpu); + return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu); } -static 
struct notifier_block zswap_cpu_notifier_block = { - .notifier_call = zswap_cpu_notifier +static struct notifier_block zswap_dstmem_notifier = { + .notifier_call = zswap_cpu_dstmem_notifier, }; -static int __init zswap_cpu_init(void) +static int __init zswap_cpu_dstmem_init(void) { unsigned long cpu; cpu_notifier_register_begin(); for_each_online_cpu(cpu) - if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) == + NOTIFY_BAD) goto cleanup; - __register_cpu_notifier(&zswap_cpu_notifier_block); + __register_cpu_notifier(&zswap_dstmem_notifier); cpu_notifier_register_done(); return 0; cleanup: for_each_online_cpu(cpu) - __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); + __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); cpu_notifier_register_done(); return -ENOMEM; } +static void zswap_cpu_dstmem_destroy(void) +{ + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&zswap_dstmem_notifier); + cpu_notifier_register_done(); +} + +static int __zswap_cpu_comp_notifier(struct zswap_pool *pool, + unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + + switch (action) { + case CPU_UP_PREPARE: + if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) + break; + tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); + if (IS_ERR_OR_NULL(tfm)) { + pr_err("could not alloc crypto comp %s : %ld\n", + pool->tfm_name, PTR_ERR(tfm)); + return NOTIFY_BAD; + } + *per_cpu_ptr(pool->tfm, cpu) = tfm; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(pool->tfm, cpu); + if (!IS_ERR_OR_NULL(tfm)) + crypto_free_comp(tfm); + *per_cpu_ptr(pool->tfm, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zswap_cpu_comp_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + struct zswap_pool *pool = container_of(nb, 
typeof(*pool), notifier); + + return __zswap_cpu_comp_notifier(pool, action, cpu); +} + +static int zswap_cpu_comp_init(struct zswap_pool *pool) +{ + unsigned long cpu; + + memset(&pool->notifier, 0, sizeof(pool->notifier)); + pool->notifier.notifier_call = zswap_cpu_comp_notifier; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) == + NOTIFY_BAD) + goto cleanup; + __register_cpu_notifier(&pool->notifier); + cpu_notifier_register_done(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); + cpu_notifier_register_done(); + return -ENOMEM; +} + +static void zswap_cpu_comp_destroy(struct zswap_pool *pool) +{ + unsigned long cpu; + + cpu_notifier_register_begin(); + for_each_online_cpu(cpu) + __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu); + __unregister_cpu_notifier(&pool->notifier); + cpu_notifier_register_done(); +} + /********************************* -* helpers +* pool functions **********************************/ -static bool zswap_is_full(void) + +static struct zswap_pool *__zswap_pool_current(void) { - return totalram_pages * zswap_max_pool_percent / 100 < - DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); + struct zswap_pool *pool; + + pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list); + WARN_ON(!pool); + + return pool; +} + +static struct zswap_pool *zswap_pool_current(void) +{ + assert_spin_locked(&zswap_pools_lock); + + return __zswap_pool_current(); +} + +static struct zswap_pool *zswap_pool_current_get(void) +{ + struct zswap_pool *pool; + + rcu_read_lock(); + + pool = __zswap_pool_current(); + if (!pool || !zswap_pool_get(pool)) + pool = NULL; + + rcu_read_unlock(); + + return pool; +} + +static struct zswap_pool *zswap_pool_last_get(void) +{ + struct zswap_pool *pool, *last = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(pool, &zswap_pools, list) + last = pool; + if (!WARN_ON(!last) && 
!zswap_pool_get(last)) + last = NULL; + + rcu_read_unlock(); + + return last; +} + +/* type and compressor must be null-terminated */ +static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +{ + struct zswap_pool *pool; + + assert_spin_locked(&zswap_pools_lock); + + list_for_each_entry_rcu(pool, &zswap_pools, list) { + if (strcmp(pool->tfm_name, compressor)) + continue; + if (strcmp(zpool_get_type(pool->zpool), type)) + continue; + /* if we can't get it, it's about to be destroyed */ + if (!zswap_pool_get(pool)) + continue; + return pool; + } + + return NULL; +} + +static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +{ + struct zswap_pool *pool; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; + + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) { + pr_err("pool alloc failed\n"); + return NULL; + } + + pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops); + if (!pool->zpool) { + pr_err("%s zpool not available\n", type); + goto error; + } + pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); + + strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); + pool->tfm = alloc_percpu(struct crypto_comp *); + if (!pool->tfm) { + pr_err("percpu alloc failed\n"); + goto error; + } + + if (zswap_cpu_comp_init(pool)) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); + + /* being the current pool takes 1 ref; this func expects the + * caller to always add the new pool as the current pool + */ + kref_init(&pool->kref); + INIT_LIST_HEAD(&pool->list); + + zswap_pool_debug("created", pool); + + return pool; + +error: + free_percpu(pool->tfm); + if (pool->zpool) + zpool_destroy_pool(pool->zpool); + kfree(pool); + return NULL; +} + +static __init struct zswap_pool *__zswap_pool_create_fallback(void) +{ + if (!crypto_has_comp(zswap_compressor, 0, 0)) { + if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("default compressor %s not available\n", + 
zswap_compressor); + return NULL; + } + pr_err("compressor %s not available, using default %s\n", + zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + } + if (!zpool_has_pool(zswap_zpool_type)) { + if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + return NULL; + } + pr_err("zpool %s not available, using default %s\n", + zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + } + + return zswap_pool_create(zswap_zpool_type, zswap_compressor); +} + +static void zswap_pool_destroy(struct zswap_pool *pool) +{ + zswap_pool_debug("destroying", pool); + + zswap_cpu_comp_destroy(pool); + free_percpu(pool->tfm); + zpool_destroy_pool(pool->zpool); + kfree(pool); +} + +static int __must_check zswap_pool_get(struct zswap_pool *pool) +{ + return kref_get_unless_zero(&pool->kref); +} + +static void __zswap_pool_release(struct rcu_head *head) +{ + struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head); + + /* nobody should have been able to get a kref... */ + WARN_ON(kref_get_unless_zero(&pool->kref)); + + /* pool is now off zswap_pools list and has no references. 
*/ + zswap_pool_destroy(pool); +} + +static void __zswap_pool_empty(struct kref *kref) +{ + struct zswap_pool *pool; + + pool = container_of(kref, typeof(*pool), kref); + + spin_lock(&zswap_pools_lock); + + WARN_ON(pool == zswap_pool_current()); + + list_del_rcu(&pool->list); + call_rcu(&pool->rcu_head, __zswap_pool_release); + + spin_unlock(&zswap_pools_lock); +} + +static void zswap_pool_put(struct zswap_pool *pool) +{ + kref_put(&pool->kref, __zswap_pool_empty); +} + +/********************************* +* param callbacks +**********************************/ + +/* val must be a null-terminated string */ +static int __zswap_param_set(const char *val, const struct kernel_param *kp, + char *type, char *compressor) +{ + struct zswap_pool *pool, *put_pool = NULL; + char *s = strstrip((char *)val); + int ret; + + /* no change required */ + if (!strcmp(s, *(char **)kp->arg)) + return 0; + + /* if this is load-time (pre-init) param setting, + * don't create a pool; that's done during init. + */ + if (!zswap_init_started) + return param_set_charp(s, kp); + + if (!type) { + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); + return -ENOENT; + } + type = s; + } else if (!compressor) { + if (!crypto_has_comp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; + } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; + } + + spin_lock(&zswap_pools_lock); + + pool = zswap_pool_find_get(type, compressor); + if (pool) { + zswap_pool_debug("using existing", pool); + list_del_rcu(&pool->list); + } else { + spin_unlock(&zswap_pools_lock); + pool = zswap_pool_create(type, compressor); + spin_lock(&zswap_pools_lock); + } + + if (pool) + ret = param_set_charp(s, kp); + else + ret = -EINVAL; + + if (!ret) { + put_pool = zswap_pool_current(); + list_add_rcu(&pool->list, &zswap_pools); + } else if (pool) { + /* add the possibly pre-existing pool to the end of the pools + * list; if it's new (and empty) then it'll be removed and + * 
destroyed by the put after we drop the lock + */ + list_add_tail_rcu(&pool->list, &zswap_pools); + put_pool = pool; + } + + spin_unlock(&zswap_pools_lock); + + /* drop the ref from either the old current pool, + * or the new pool we failed to add + */ + if (put_pool) + zswap_pool_put(put_pool); + + return ret; +} + +static int zswap_compressor_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, zswap_zpool_type, NULL); +} + +static int zswap_zpool_param_set(const char *val, + const struct kernel_param *kp) +{ + return __zswap_param_set(val, kp, NULL, zswap_compressor); } /********************************* @@ -445,75 +804,14 @@ enum zswap_get_swap_ret { static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); - int err; - - *retpage = NULL; - do { - /* - * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling - * that would confuse statistics. - */ - found_page = find_get_page(swapper_space, entry.val); - if (found_page) - break; - - /* - * Get a new page to read into from swap. - */ - if (!new_page) { - new_page = alloc_page(GFP_KERNEL); - if (!new_page) - break; /* Out of memory */ - } - - /* - * call radix_tree_preload() while we can wait. - */ - err = radix_tree_preload(GFP_KERNEL); - if (err) - break; - - /* - * Swap entry may have been freed since our caller observed it. - */ - err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ - radix_tree_preload_end(); - continue; - } - if (err) { /* swp entry is obsolete ? */ - radix_tree_preload_end(); - break; - } - - /* May fail (-ENOMEM) if radix-tree node allocation failed. 
*/ - __set_page_locked(new_page); - SetPageSwapBacked(new_page); - err = __add_to_swap_cache(new_page, entry); - if (likely(!err)) { - radix_tree_preload_end(); - lru_cache_add_anon(new_page); - *retpage = new_page; - return ZSWAP_SWAPCACHE_NEW; - } - radix_tree_preload_end(); - ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ - swapcache_free(entry); - } while (err != -ENOMEM); + bool page_was_allocated; - if (new_page) - page_cache_release(new_page); - if (!found_page) + *retpage = __read_swap_cache_async(entry, GFP_KERNEL, + NULL, 0, &page_was_allocated); + if (page_was_allocated) + return ZSWAP_SWAPCACHE_NEW; + if (!*retpage) return ZSWAP_SWAPCACHE_FAIL; - *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; } @@ -537,6 +835,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) pgoff_t offset; struct zswap_entry *entry; struct page *page; + struct crypto_comp *tfm; u8 *src, *dst; unsigned int dlen; int ret; @@ -577,13 +876,15 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); - ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, - entry->length, dst, &dlen); + tfm = *get_cpu_ptr(entry->pool->tfm); + ret = crypto_comp_decompress(tfm, src, entry->length, + dst, &dlen); + put_cpu_ptr(entry->pool->tfm); kunmap_atomic(dst); - zpool_unmap_handle(zswap_pool, entry->handle); + zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -632,6 +933,22 @@ end: return ret; } +static int zswap_shrink(void) +{ + struct zswap_pool *pool; + int ret; + + pool = zswap_pool_last_get(); + if (!pool) + 
return -ENOENT; + + ret = zpool_shrink(pool->zpool, 1, NULL); + + zswap_pool_put(pool); + + return ret; +} + /********************************* * frontswap hooks **********************************/ @@ -641,6 +958,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; + struct crypto_comp *tfm; int ret; unsigned int dlen = PAGE_SIZE, len; unsigned long handle; @@ -648,7 +966,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, u8 *src, *dst; struct zswap_header *zhdr; - if (!tree) { + if (!zswap_enabled || !tree) { ret = -ENODEV; goto reject; } @@ -656,7 +974,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zpool_shrink(zswap_pool, 1, NULL)) { + if (zswap_shrink()) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -671,33 +989,43 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto reject; } + /* if entry is successfully added, it keeps the reference */ + entry->pool = zswap_pool_current_get(); + if (!entry->pool) { + ret = -EINVAL; + goto freepage; + } + /* compress */ dst = get_cpu_var(zswap_dstmem); + tfm = *get_cpu_ptr(entry->pool->tfm); src = kmap_atomic(page); - ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); + ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); kunmap_atomic(src); + put_cpu_ptr(entry->pool->tfm); if (ret) { ret = -EINVAL; - goto freepage; + goto put_dstmem; } /* store */ len = dlen + sizeof(struct zswap_header); - ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, - &handle); + ret = zpool_malloc(entry->pool->zpool, len, + __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, + &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; - goto freepage; + goto put_dstmem; } if (ret) { zswap_reject_alloc_fail++; - goto freepage; + goto put_dstmem; } - 
zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); + zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zpool_unmap_handle(zswap_pool, handle); + zpool_unmap_handle(entry->pool->zpool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -720,12 +1048,14 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_total_size = zpool_get_total_size(zswap_pool); + zswap_update_total_size(); return 0; -freepage: +put_dstmem: put_cpu_var(zswap_dstmem); + zswap_pool_put(entry->pool); +freepage: zswap_entry_cache_free(entry); reject: return ret; @@ -740,6 +1070,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; + struct crypto_comp *tfm; u8 *src, *dst; unsigned int dlen; int ret; @@ -756,13 +1087,14 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); - ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, - dst, &dlen); + tfm = *get_cpu_ptr(entry->pool->tfm); + ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); + put_cpu_ptr(entry->pool->tfm); kunmap_atomic(dst); - zpool_unmap_handle(zswap_pool, entry->handle); + zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -815,10 +1147,6 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static struct zpool_ops zswap_zpool_ops = { - .evict = zswap_writeback_entry -}; - static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; @@ -899,52 +1227,40 @@ static void __exit 
zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + struct zswap_pool *pool; - if (!zswap_enabled) - return 0; - - pr_info("loading zswap\n"); - - zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, - &zswap_zpool_ops); - if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { - pr_info("%s zpool not available\n", zswap_zpool_type); - zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; - zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, - &zswap_zpool_ops); - } - if (!zswap_pool) { - pr_err("%s zpool not available\n", zswap_zpool_type); - pr_err("zpool creation failed\n"); - goto error; - } - pr_info("using %s pool\n", zswap_zpool_type); + zswap_init_started = true; if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); - goto cachefail; + goto cache_fail; } - if (zswap_comp_init()) { - pr_err("compressor initialization failed\n"); - goto compfail; + + if (zswap_cpu_dstmem_init()) { + pr_err("dstmem alloc failed\n"); + goto dstmem_fail; } - if (zswap_cpu_init()) { - pr_err("per-cpu initialization failed\n"); - goto pcpufail; + + pool = __zswap_pool_create_fallback(); + if (!pool) { + pr_err("pool creation failed\n"); + goto pool_fail; } + pr_info("loaded using pool %s/%s\n", pool->tfm_name, + zpool_get_type(pool->zpool)); + + list_add(&pool->list, &zswap_pools); frontswap_register_ops(&zswap_frontswap_ops); if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); return 0; -pcpufail: - zswap_comp_exit(); -compfail: + +pool_fail: + zswap_cpu_dstmem_destroy(); +dstmem_fail: zswap_entry_cache_destroy(); -cachefail: - zpool_destroy_pool(zswap_pool); -error: +cache_fail: return -ENOMEM; } /* must be late so crypto has time to come up */ |