Diffstat (limited to 'kernel/mm')
31 files changed, 666 insertions, 325 deletions
diff --git a/kernel/mm/backing-dev.c b/kernel/mm/backing-dev.c index 6871838f0..826fed55c 100644 --- a/kernel/mm/backing-dev.c +++ b/kernel/mm/backing-dev.c @@ -825,6 +825,20 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) +{ + int rc; + + rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), + MINOR(owner->devt)); + if (rc) + return rc; + bdi->owner = owner; + get_device(owner); + return 0; +} +EXPORT_SYMBOL(bdi_register_owner); + /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ @@ -849,6 +863,11 @@ void bdi_unregister(struct backing_dev_info *bdi) device_unregister(bdi->dev); bdi->dev = NULL; } + + if (bdi->owner) { + put_device(bdi->owner); + bdi->owner = NULL; + } } void bdi_exit(struct backing_dev_info *bdi) diff --git a/kernel/mm/cma.c b/kernel/mm/cma.c index ea506eb18..bd0e14124 100644 --- a/kernel/mm/cma.c +++ b/kernel/mm/cma.c @@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, return -EINVAL; /* ensure minimal alignment required by mm core */ - alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + alignment = PAGE_SIZE << + max_t(unsigned long, MAX_ORDER - 1, pageblock_order); /* alignment should be aligned with order_per_bit */ if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) @@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base, * migratetype page by page allocator's buddy algorithm. In the case, * you couldn't get a contiguous memory, which is not what we want. */ - alignment = max(alignment, - (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + alignment = max(alignment, (phys_addr_t)PAGE_SIZE << + max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); diff --git a/kernel/mm/compaction.c b/kernel/mm/compaction.c index ba0f146d8..51963f58a 100644 --- a/kernel/mm/compaction.c +++ b/kernel/mm/compaction.c @@ -475,25 +475,23 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); + if (!isolated) + break; + total_isolated += isolated; + cc->nr_freepages += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); page++; } - - /* If a page was split, advance to the end of it */ - if (isolated) { - cc->nr_freepages += isolated; - if (!strict && - cc->nr_migratepages <= cc->nr_freepages) { - blockpfn += isolated; - break; - } - - blockpfn += isolated - 1; - cursor += isolated - 1; - continue; + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; } + /* Advance to the end of split page */ + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; isolate_fail: if (strict) @@ -503,6 +501,9 @@ isolate_fail: } + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + /* * There is a tiny chance that we have read bogus compound_order(), * so be careful to not go outside of the pageblock. 
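As a rough illustration of the cma.c change above (not part of the patch): the kernel's max() refuses mixed-type arguments, and MAX_ORDER - 1 is a plain int while pageblock_order may be a different unsigned type, hence max_t(). A minimal userspace sketch of the resulting alignment arithmetic, with illustrative constants (real values are architecture and config dependent):

#include <stdio.h>

/* Illustrative constants; the real ones depend on architecture/config. */
#define PAGE_SIZE	4096UL
#define MAX_ORDER	11
static unsigned int pageblock_order = 9;

/* Simplified stand-in for the kernel's max_t(): compare in one common type. */
#define max_t(type, a, b)	((type)(a) > (type)(b) ? (type)(a) : (type)(b))

int main(void)
{
	/* Same computation as cma_init_reserved_mem() after the change. */
	unsigned long alignment = PAGE_SIZE <<
		max_t(unsigned long, MAX_ORDER - 1, pageblock_order);

	/* 4096 << 10 = 4 MiB minimum alignment for these example values. */
	printf("minimum CMA alignment: %lu bytes (%lu MiB)\n",
	       alignment, alignment >> 20);
	return 0;
}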
@@ -524,9 +525,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - if (locked) - spin_unlock_irqrestore(&cc->zone->lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); @@ -880,16 +878,8 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, ISOLATE_UNEVICTABLE); - /* - * In case of fatal failure, release everything that might - * have been isolated in the previous iteration, and signal - * the failure back to caller. - */ - if (!pfn) { - putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; + if (!pfn) break; - } if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; @@ -974,7 +964,6 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { - /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need @@ -998,32 +987,30 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); + isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, + freelist, false); /* - * If we isolated enough freepages, or aborted due to async - * compaction being contended, terminate the loop. - * Remember where the free scanner should restart next time, - * which is where isolate_freepages_block() left off. - * But if it scanned the whole pageblock, isolate_start_pfn - * now points at block_end_pfn, which is the start of the next - * pageblock. - * In that case we will however want to restart at the start - * of the previous pageblock. + * If we isolated enough freepages, or aborted due to lock + * contention, terminate. */ if ((cc->nr_freepages >= cc->nr_migratepages) || cc->contended) { - if (isolate_start_pfn >= block_end_pfn) + if (isolate_start_pfn >= block_end_pfn) { + /* + * Restart at previous pageblock if more + * freepages can be isolated next time. + */ isolate_start_pfn = block_start_pfn - pageblock_nr_pages; + } break; - } else { + } else if (isolate_start_pfn < block_end_pfn) { /* - * isolate_freepages_block() should not terminate - * prematurely unless contended, or isolated enough + * If isolation failed early, do not continue + * needlessly. */ - VM_BUG_ON(isolate_start_pfn < block_end_pfn); + break; } } diff --git a/kernel/mm/filemap.c b/kernel/mm/filemap.c index 44301361c..91dc614cb 100644 --- a/kernel/mm/filemap.c +++ b/kernel/mm/filemap.c @@ -109,6 +109,51 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. 
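The isolate_freepages() comment above is easier to see with numbers: when a whole pageblock has been scanned, isolate_start_pfn has advanced to block_end_pfn, so the next invocation must step back one pageblock. A tiny sketch with made-up pfn values (pageblock_nr_pages is illustrative):

#include <stdio.h>

#define pageblock_nr_pages	512UL	/* illustrative: 2 MiB blocks of 4 KiB pages */

/* Simplified model of where the free scanner resumes next time. */
static unsigned long next_scan_start(unsigned long isolate_start_pfn,
				     unsigned long block_start_pfn,
				     unsigned long block_end_pfn)
{
	if (isolate_start_pfn >= block_end_pfn)
		return block_start_pfn - pageblock_nr_pages;	/* previous block */
	return isolate_start_pfn;	/* stopped mid-block: resume right there */
}

int main(void)
{
	/* Whole block [0x1000, 0x1200) consumed: restart one block earlier. */
	printf("restart pfn: %#lx\n", next_scan_start(0x1200, 0x1000, 0x1200));
	/* Stopped early at 0x1180: resume there. */
	printf("restart pfn: %#lx\n", next_scan_start(0x1180, 0x1000, 0x1200));
	return 0;
}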
+ * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) { + local_lock(workingset_shadow_lock); + list_lru_del(&__workingset_shadow_nodes, + &node->private_list); + local_unlock(workingset_shadow_lock); + } + } + return 0; +} + static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { @@ -122,6 +167,14 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + if (!node) { + /* + * We need a node to properly account shadow + * entries. Don't plant any without. XXX + */ + shadow = NULL; + } + if (shadow) { mapping->nrshadows++; /* @@ -540,9 +593,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) memcg = mem_cgroup_begin_page_stat(old); spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(old, NULL, memcg); - error = radix_tree_insert(&mapping->page_tree, offset, new); + error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); - mapping->nrpages++; /* * hugetlb pages do not participate in page cache accounting. @@ -564,51 +616,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -static int page_cache_tree_insert(struct address_space *mapping, - struct page *page, void **shadowp) -{ - struct radix_tree_node *node; - void **slot; - int error; - - error = __radix_tree_create(&mapping->page_tree, page->index, - &node, &slot); - if (error) - return error; - if (*slot) { - void *p; - - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); - if (!radix_tree_exceptional_entry(p)) - return -EEXIST; - if (shadowp) - *shadowp = p; - mapping->nrshadows--; - if (node) - workingset_node_shadows_dec(node); - } - radix_tree_replace_slot(slot, page); - mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) { - local_lock(workingset_shadow_lock); - list_lru_del(&__workingset_shadow_nodes, - &node->private_list); - local_unlock(workingset_shadow_lock); - } - } - return 0; -} - static int __add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask, @@ -1557,6 +1564,11 @@ static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, cond_resched(); find_page: + if (fatal_signal_pending(current)) { + error = -EINTR; + goto out; + } + page = find_get_page(mapping, index); if (!page) { page_cache_sync_readahead(mapping, diff --git a/kernel/mm/gup.c b/kernel/mm/gup.c index deafa2c91..4b0b7e7d1 100644 --- a/kernel/mm/gup.c +++ b/kernel/mm/gup.c @@ -58,6 +58,16 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } +/* + * FOLL_FORCE can write to even unwritable pte's, but only + * after we've gone through a COW cycle and they are dirty. 
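A rough model of the bookkeeping done by page_cache_tree_insert() above when a real page replaces a shadow entry; the structures are toy stand-ins, not the radix-tree internals:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the radix-tree node and address_space counters. */
struct toy_node {
	unsigned int pages;	/* real pages hanging off this node */
	unsigned int shadows;	/* shadow (exceptional) entries */
	bool on_shadow_lru;	/* queued for shadow-node reclaim? */
};

struct toy_mapping {
	unsigned long nrpages;
	unsigned long nrshadows;
};

/* The accounting when a page is inserted over an existing shadow entry. */
static void insert_over_shadow(struct toy_mapping *m, struct toy_node *n)
{
	m->nrshadows--;
	n->shadows--;
	m->nrpages++;
	n->pages++;
	/* A node holding real pages must not sit on the shadow LRU. */
	if (n->on_shadow_lru)
		n->on_shadow_lru = false;	/* list_lru_del() in the kernel */
}

int main(void)
{
	struct toy_mapping m = { .nrpages = 10, .nrshadows = 3 };
	struct toy_node n = { .pages = 0, .shadows = 3, .on_shadow_lru = true };

	insert_over_shadow(&m, &n);
	printf("nrpages=%lu nrshadows=%lu on_shadow_lru=%d\n",
	       m.nrpages, m.nrshadows, n.on_shadow_lru);
	return 0;
}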
+ */ +static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +{ + return pte_write(pte) || + ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -92,7 +102,7 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) { + if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { pte_unmap_unlock(ptep, ptl); return NULL; } @@ -352,7 +362,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags &= ~FOLL_WRITE; + *flags |= FOLL_COW; return 0; } diff --git a/kernel/mm/huge_memory.c b/kernel/mm/huge_memory.c index 62fe06bb7..530e6427f 100644 --- a/kernel/mm/huge_memory.c +++ b/kernel/mm/huge_memory.c @@ -2134,10 +2134,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, * page fault if needed. */ return 0; - if (vma->vm_ops) + if (vma->vm_ops || (vm_flags & VM_NO_THP)) /* khugepaged not yet working on file or special mappings */ return 0; - VM_BUG_ON_VMA(vm_flags & VM_NO_THP, vma); hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart < hend) @@ -2498,8 +2497,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) return false; if (is_vma_temporary_stack(vma)) return false; - VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); - return true; + return !(vma->vm_flags & VM_NO_THP); } static void collapse_huge_page(struct mm_struct *mm, diff --git a/kernel/mm/hugetlb.c b/kernel/mm/hugetlb.c index ef6963b57..ea11123a9 100644 --- a/kernel/mm/hugetlb.c +++ b/kernel/mm/hugetlb.c @@ -1416,12 +1416,13 @@ static void dissolve_free_huge_page(struct page *page) { spin_lock(&hugetlb_lock); if (PageHuge(page) && !page_count(page)) { - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - list_del(&page->lru); + struct page *head = compound_head(page); + struct hstate *h = page_hstate(head); + int nid = page_to_nid(head); + list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; - update_and_free_page(h, page); + update_and_free_page(h, head); } spin_unlock(&hugetlb_lock); } @@ -1429,7 +1430,8 @@ static void dissolve_free_huge_page(struct page *page) /* * Dissolve free hugepages in a given pfn range. Used by memory hotplug to * make specified memory blocks removable from the system. - * Note that start_pfn should aligned with (minimum) hugepage size. + * Note that this will dissolve a free gigantic hugepage completely, if any + * part of it lies within the given range. */ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) { @@ -1438,7 +1440,6 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) if (!hugepages_supported()) return; - VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) dissolve_free_huge_page(pfn_to_page(pfn)); } @@ -1722,23 +1723,32 @@ free: } /* - * When releasing a hugetlb pool reservation, any surplus pages that were - * allocated to satisfy the reservation must be explicitly freed if they were - * never used. - * Called with hugetlb_lock held. + * This routine has two main purposes: + * 1) Decrement the reservation count (resv_huge_pages) by the value passed + * in unused_resv_pages. 
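The gup.c hunks above replace "clear FOLL_WRITE after a forced COW break" with a FOLL_COW flag, so a non-writable but dirty PTE may be followed for a forced write only once the COW cycle has happened. A small truth-table sketch of that predicate; the flag bits and the pte model are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits, not necessarily the kernel's FOLL_* values. */
#define FOLL_WRITE	0x01
#define FOLL_FORCE	0x10
#define FOLL_COW	0x4000

struct toy_pte { bool write; bool dirty; };

/* Same shape as can_follow_write_pte() in the hunk above. */
static bool can_follow_write(struct toy_pte pte, unsigned int flags)
{
	return pte.write ||
	       ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte.dirty);
}

int main(void)
{
	struct toy_pte ro_dirty = { .write = false, .dirty = true };

	/* Before the COW cycle: a forced write on a read-only PTE must fault. */
	printf("%d\n", can_follow_write(ro_dirty, FOLL_WRITE | FOLL_FORCE));
	/* After faultin_page() has set FOLL_COW: the dirty PTE may be used. */
	printf("%d\n", can_follow_write(ro_dirty,
					FOLL_WRITE | FOLL_FORCE | FOLL_COW));
	return 0;
}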
This corresponds to the prior adjustments made + * to the associated reservation map. + * 2) Free any unused surplus pages that may have been allocated to satisfy + * the reservation. As many as unused_resv_pages may be freed. + * + * Called with hugetlb_lock held. However, the lock could be dropped (and + * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, + * we must make sure nobody else can claim pages we are in the process of + * freeing. Do this by ensuring resv_huge_page always is greater than the + * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - /* Uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) - return; + goto out; + /* + * Part (or even all) of the reservation could have been backed + * by pre-allocated pages. Only free surplus pages. + */ nr_pages = min(unused_resv_pages, h->surplus_huge_pages); /* @@ -1748,12 +1758,22 @@ static void return_unused_surplus_pages(struct hstate *h, * when the nodes with surplus pages have no free pages. * free_pool_huge_page() will balance the the freed pages across the * on-line nodes with memory and will handle the hstate accounting. + * + * Note that we decrement resv_huge_pages as we free the pages. If + * we drop the lock, resv_huge_pages will still be sufficiently large + * to cover subsequent pages we may free. */ while (nr_pages--) { + h->resv_huge_pages--; + unused_resv_pages--; if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) - break; + goto out; cond_resched_lock(&hugetlb_lock); } + +out: + /* Fully uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; } @@ -2170,6 +2190,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. 
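A schematic of the invariant described in the return_unused_surplus_pages() comment above: the reservation is decremented page by page, in lockstep with each freed surplus page, so the count stays large enough even if hugetlb_lock is dropped at cond_resched_lock(). Plain counters only, no locking shown; purely illustrative:

#include <stdio.h>

static unsigned long resv_huge_pages;
static unsigned long surplus_huge_pages;

static void return_unused_surplus(unsigned long unused_resv_pages)
{
	unsigned long nr_pages = unused_resv_pages < surplus_huge_pages ?
				 unused_resv_pages : surplus_huge_pages;

	while (nr_pages--) {
		resv_huge_pages--;	/* always >= pages still to be freed */
		unused_resv_pages--;
		surplus_huge_pages--;	/* stands in for free_pool_huge_page() */
		/* the lock may be dropped and reacquired here */
	}
	resv_huge_pages -= unused_resv_pages;	/* uncommit whatever remains */
}

int main(void)
{
	resv_huge_pages = 8;
	surplus_huge_pages = 3;
	return_unused_surplus(5);
	printf("resv=%lu surplus=%lu\n", resv_huge_pages, surplus_huge_pages);
	return 0;
}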
*/ spin_unlock(&hugetlb_lock); + + /* yield cpu to avoid soft lockup */ + cond_resched(); + if (hstate_is_gigantic(h)) ret = alloc_fresh_gigantic_page(h, nodes_allowed); else @@ -4209,7 +4233,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { - mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -4224,9 +4247,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); + mm_inc_nr_pmds(mm); } else { put_page(virt_to_page(spte)); - mm_inc_nr_pmds(mm); } spin_unlock(ptl); out: diff --git a/kernel/mm/init-mm.c b/kernel/mm/init-mm.c index a56a85190..975e49f00 100644 --- a/kernel/mm/init-mm.c +++ b/kernel/mm/init-mm.c @@ -6,6 +6,7 @@ #include <linux/cpumask.h> #include <linux/atomic.h> +#include <linux/user_namespace.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -21,5 +22,6 @@ struct mm_struct init_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), + .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) }; diff --git a/kernel/mm/internal.h b/kernel/mm/internal.h index 38e24b89e..6979b2bd3 100644 --- a/kernel/mm/internal.h +++ b/kernel/mm/internal.h @@ -22,7 +22,8 @@ */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ + __GFP_ATOMIC) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/kernel/mm/kasan/kasan.h b/kernel/mm/kasan/kasan.h index 4f6c62e5c..37ff0ab6a 100644 --- a/kernel/mm/kasan/kasan.h +++ b/kernel/mm/kasan/kasan.h @@ -52,6 +52,9 @@ struct kasan_global { #if KASAN_ABI_VERSION >= 4 struct kasan_source_location *location; #endif +#if KASAN_ABI_VERSION >= 5 + char *odr_indicator; +#endif }; static inline const void *kasan_shadow_to_mem(const void *shadow_addr) diff --git a/kernel/mm/ksm.c b/kernel/mm/ksm.c index b5cd647da..2f028e6d0 100644 --- a/kernel/mm/ksm.c +++ b/kernel/mm/ksm.c @@ -283,7 +283,8 @@ static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; - rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | + __GFP_NORETRY | __GFP_NOWARN); if (rmap_item) ksm_rmap_items++; return rmap_item; diff --git a/kernel/mm/list_lru.c b/kernel/mm/list_lru.c index afc71ea9a..5d8dffd5b 100644 --- a/kernel/mm/list_lru.c +++ b/kernel/mm/list_lru.c @@ -554,6 +554,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, err = memcg_init_list_lru(lru, memcg_aware); if (err) { kfree(lru->node); + /* Do this so a list_lru_destroy() doesn't crash: */ + lru->node = NULL; goto out; } diff --git a/kernel/mm/memcontrol.c b/kernel/mm/memcontrol.c index 095d20f60..b5f5984aa 100644 --- a/kernel/mm/memcontrol.c +++ b/kernel/mm/memcontrol.c @@ -199,6 +199,7 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { spinlock_t lock; /* for from, to */ + struct mm_struct *mm; struct mem_cgroup *from; struct mem_cgroup *to; unsigned long flags; @@ -274,21 +275,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) static inline 
unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); + return memcg->id.id; } /* Writing them here to avoid exposing memcg's inner layout */ @@ -1335,7 +1322,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1413,6 +1400,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -2073,6 +2061,15 @@ retry: current->flags & PF_EXITING)) goto force; + /* + * Prevent unbounded recursion when reclaim operations need to + * allocate memory. This might exceed the limits temporarily, + * but we prefer facilitating memory reclaim and getting back + * under the limit over triggering OOM kills in these cases. + */ + if (unlikely(current->flags & PF_MEMALLOC)) + goto force; + if (unlikely(task_in_memcg_oom(current))) goto nomem; @@ -3665,6 +3662,7 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) * ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). */ + rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ css_for_each_descendant_pre(css, &memcg->css) { child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); @@ -3672,6 +3670,8 @@ static void memcg_deactivate_kmem(struct mem_cgroup *memcg) if (!memcg->use_hierarchy) break; } + rcu_read_unlock(); + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); memcg_free_cache_id(kmemcg_id); @@ -4125,6 +4125,88 @@ static struct cftype mem_cgroup_legacy_files[] = { { }, /* terminate */ }; +/* + * Private memory cgroup IDR + * + * Swap-out records and page cache shadow entries need to store memcg + * references in constrained space, so we maintain an ID space that is + * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of + * memory-controlled cgroups to 64k. + * + * However, there usually are many references to the oflline CSS after + * the cgroup has been destroyed, such as page cache or reclaimable + * slab objects, that don't need to hang on to the ID. We want to keep + * those dead CSS from occupying IDs, or we might quickly exhaust the + * relatively small ID space and prevent the creation of new cgroups + * even when there are much fewer than 64k cgroups - possibly none. + * + * Maintain a private 16-bit ID space for memcg, and allow the ID to + * be freed and recycled when it's no longer needed, which is usually + * when the CSS is offlined. + * + * The only exception to that are records of swapped out tmpfs/shmem + * pages that need to be attributed to live ancestors on swapin. But + * those references are manageable from userspace. 
+ */ + +static DEFINE_IDR(mem_cgroup_idr); + +static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) +{ + atomic_add(n, &memcg->id.ref); +} + +static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +{ + while (!atomic_inc_not_zero(&memcg->id.ref)) { + /* + * The root cgroup cannot be destroyed, so it's refcount must + * always be >= 1. + */ + if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + VM_BUG_ON(1); + break; + } + memcg = parent_mem_cgroup(memcg); + if (!memcg) + memcg = root_mem_cgroup; + } + return memcg; +} + +static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) +{ + if (atomic_sub_and_test(n, &memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + + /* Memcg ID pins CSS */ + css_put(&memcg->css); + } +} + +static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) +{ + mem_cgroup_id_get_many(memcg, 1); +} + +static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) +{ + mem_cgroup_id_put_many(memcg, 1); +} + +/** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up + * + * Caller must hold rcu_read_lock(). + */ +struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return idr_find(&mem_cgroup_idr, id); +} + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; @@ -4179,6 +4261,12 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg_wb_domain_init(memcg, GFP_KERNEL)) goto out_free_stat; + memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, + 1, MEM_CGROUP_ID_MAX, + GFP_KERNEL); + if (memcg->id.id < 0) + goto out_free_stat; + return memcg; out_free_stat: @@ -4264,9 +4352,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif + idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return &memcg->css; free_out: + idr_remove(&mem_cgroup_idr, memcg->id.id); __mem_cgroup_free(memcg); return ERR_PTR(error); } @@ -4278,8 +4368,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); int ret; - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; + /* Online state pins memcg ID, memcg ID pins CSS */ + mem_cgroup_id_get(mem_cgroup_from_css(css)); + css_get(css); if (!parent) return 0; @@ -4353,6 +4444,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_deactivate_kmem(memcg); wb_memcg_offline(memcg); + + mem_cgroup_id_put(memcg); } static void mem_cgroup_css_released(struct cgroup_subsys_state *css) @@ -4409,9 +4502,9 @@ static int mem_cgroup_do_precharge(unsigned long count) return ret; } - /* Try charges one by one with reclaim */ + /* Try charges one by one with reclaim, but do not retry */ while (count--) { - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); if (ret) return ret; mc.precharge++; @@ -4786,6 +4879,8 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + mem_cgroup_id_put_many(mc.from, mc.moved_swap); + /* * we charged both to->memory and to->memsw, so we * should uncharge to->memory. 
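The private memcg ID space introduced above ties each 16-bit ID to its own refcount, so the ID (and the CSS reference it pins) can be recycled as soon as the last swap or shadow reference goes away. A compact userspace model of the get/put pattern using C11 atomics and a toy ID table; mem_cgroup_idr is stood in by a plain array and all names here are illustrative:

#include <stdatomic.h>
#include <stdio.h>

#define ID_MAX 16

struct toy_memcg {
	unsigned short id;
	atomic_int id_ref;
};

static struct toy_memcg *toy_idr[ID_MAX];	/* stand-in for mem_cgroup_idr */

static void id_get_many(struct toy_memcg *m, int n)
{
	atomic_fetch_add(&m->id_ref, n);
}

static void id_put_many(struct toy_memcg *m, int n)
{
	/* atomic_sub_and_test() equivalent: release the ID on the last put. */
	if (atomic_fetch_sub(&m->id_ref, n) == n) {
		toy_idr[m->id] = NULL;	/* idr_remove(): the ID may be reused */
		m->id = 0;
		/* the kernel also drops the CSS reference pinned by the ID */
	}
}

int main(void)
{
	struct toy_memcg memcg = { .id = 3 };

	atomic_init(&memcg.id_ref, 1);	/* the online state holds one reference */
	toy_idr[3] = &memcg;

	id_get_many(&memcg, 2);		/* e.g. two pages swapped out */
	id_put_many(&memcg, 1);		/* cgroup goes offline */
	id_put_many(&memcg, 2);		/* the swap entries are freed */
	printf("slot 3 still in use: %d\n", toy_idr[3] != NULL);
	return 0;
}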
@@ -4793,9 +4888,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.to)) page_counter_uncharge(&mc.to->memory, mc.moved_swap); - css_put_many(&mc.from->css, mc.moved_swap); + mem_cgroup_id_get_many(mc.to, mc.moved_swap); + css_put_many(&mc.to->css, mc.moved_swap); - /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); @@ -4805,6 +4900,8 @@ static void __mem_cgroup_clear_mc(void) static void mem_cgroup_clear_mc(void) { + struct mm_struct *mm = mc.mm; + /* * we must clear moving_task before waking up waiters at the end of * task migration. @@ -4814,7 +4911,10 @@ static void mem_cgroup_clear_mc(void) spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; + mc.mm = NULL; spin_unlock(&mc.lock); + + mmput(mm); } static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -4871,6 +4971,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(mc.moved_swap); spin_lock(&mc.lock); + mc.mm = mm; mc.from = from; mc.to = memcg; mc.flags = move_flags; @@ -4880,8 +4981,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); + } else { + mmput(mm); } - mmput(mm); return ret; } @@ -4990,11 +5092,11 @@ put: /* get_mctgt_type() gets the page */ return ret; } -static void mem_cgroup_move_charge(struct mm_struct *mm) +static void mem_cgroup_move_charge(void) { struct mm_walk mem_cgroup_move_charge_walk = { .pmd_entry = mem_cgroup_move_charge_pte_range, - .mm = mm, + .mm = mc.mm, }; lru_add_drain_all(); @@ -5006,7 +5108,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) atomic_inc(&mc.from->moving_account); synchronize_rcu(); retry: - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { /* * Someone who are holding the mmap_sem might be waiting in * waitq. So we cancel all extra charges, wake up all waiters, @@ -5023,23 +5125,16 @@ retry: * additional charge, the page walk just aborts. 
*/ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); - up_read(&mm->mmap_sem); + up_read(&mc.mm->mmap_sem); atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { - struct cgroup_subsys_state *css; - struct task_struct *p = cgroup_taskset_first(tset, &css); - struct mm_struct *mm = get_task_mm(p); - - if (mm) { - if (mc.to) - mem_cgroup_move_charge(mm); - mmput(mm); - } - if (mc.to) + if (mc.to) { + mem_cgroup_move_charge(); mem_cgroup_clear_mc(); + } } #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) @@ -5049,7 +5144,7 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_taskset *tset) +static void mem_cgroup_move_task(void) { } #endif @@ -5127,6 +5222,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; unsigned long high; int err; @@ -5137,6 +5233,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + nr_pages = page_counter_read(&memcg->memory); + if (nr_pages > high) + try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5158,6 +5259,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5166,9 +5269,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; @@ -5228,7 +5358,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, - .attach = mem_cgroup_move_task, + .post_attach = mem_cgroup_move_task, .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, @@ -5636,7 +5766,7 @@ subsys_initcall(mem_cgroup_init); */ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *swap_memcg; unsigned short oldid; unsigned long flags; @@ -5652,15 +5782,27 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!memcg) return; - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* + * In case the memcg owning these pages has been offlined and doesn't + * have an ID allocated to it anymore, charge the closest online + * ancestor for the swap instead and transfer the memory+swap charge. 
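memory_max_write() above now applies the new limit immediately and then escalates: drain per-cpu charge caches once, retry direct reclaim a bounded number of times, and only then invoke the memcg OOM killer (which is why mem_cgroup_out_of_memory() now reports whether a victim was chosen). A schematic of that escalation order; the helpers are stubs invented for the sketch, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_RETRIES 5	/* plays the role of MEM_CGROUP_RECLAIM_RETRIES */

/* Stubbed usage counter and helpers, purely for illustration. */
static unsigned long usage = 120;
static void drain_stock(void)         { if (usage > 110) usage = 110; }
static bool reclaim(unsigned long nr) { usage -= nr > 4 ? 4 : nr; return true; }
static bool oom_kill(void)            { usage = 0; return true; }

static int enforce_max(unsigned long max)
{
	int nr_reclaims = RECLAIM_RETRIES;
	bool drained = false;

	for (;;) {
		if (usage <= max)
			return 0;
		if (!drained) {		/* 1) cheap: flush per-cpu charge caches */
			drain_stock();
			drained = true;
			continue;
		}
		if (nr_reclaims) {	/* 2) bounded direct reclaim attempts */
			if (!reclaim(usage - max))
				nr_reclaims--;	/* only count fruitless passes */
			continue;
		}
		if (!oom_kill())	/* 3) last resort: OOM kill in the group */
			return -1;	/* nobody left to kill, give up */
	}
}

int main(void)
{
	printf("enforce_max -> %d, usage = %lu\n", enforce_max(100), usage);
	return 0;
}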
+ */ + swap_memcg = mem_cgroup_id_get_online(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(swap_memcg, true); page->mem_cgroup = NULL; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); + if (memcg != swap_memcg) { + if (!mem_cgroup_is_root(swap_memcg)) + page_counter_charge(&swap_memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, 1); + } + /* * Interrupts should be disabled here because the caller holds the * mapping->tree_lock lock which is taken with interrupts-off. It is @@ -5673,6 +5815,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) #endif mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); + + if (!mem_cgroup_is_root(memcg)) + css_put(&memcg->css); local_unlock_irqrestore(event_lock, flags); } @@ -5697,7 +5842,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memsw, 1); mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); + mem_cgroup_id_put(memcg); } rcu_read_unlock(); } diff --git a/kernel/mm/memory.c b/kernel/mm/memory.c index b80bf4746..76dcee317 100644 --- a/kernel/mm/memory.c +++ b/kernel/mm/memory.c @@ -797,6 +797,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range diff --git a/kernel/mm/memory_hotplug.c b/kernel/mm/memory_hotplug.c index a042a9d53..a18923e43 100644 --- a/kernel/mm/memory_hotplug.c +++ b/kernel/mm/memory_hotplug.c @@ -1371,17 +1371,20 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) } /* - * Confirm all pages in a range [start, end) is belongs to the same zone. + * Confirm all pages in a range [start, end) belong to the same zone. + * When true, return its valid [start, end). 
*/ -int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *valid_start, unsigned long *valid_end) { unsigned long pfn, sec_end_pfn; + unsigned long start, end; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1); pfn < end_pfn; - pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) { /* Make sure the memory section is present first */ if (!present_section_nr(pfn_to_section_nr(pfn))) continue; @@ -1397,10 +1400,20 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) page = pfn_to_page(pfn + i); if (zone && page_zone(page) != zone) return 0; + if (!zone) + start = pfn + i; zone = page_zone(page); + end = pfn + MAX_ORDER_NR_PAGES; } } - return 1; + + if (zone) { + *valid_start = start; + *valid_end = end; + return 1; + } else { + return 0; + } } /* @@ -1718,6 +1731,7 @@ static int __ref __offline_pages(unsigned long start_pfn, long offlined_pages; int ret, drain, retry_max, node; unsigned long flags; + unsigned long valid_start, valid_end; struct zone *zone; struct memory_notify arg; @@ -1728,10 +1742,10 @@ static int __ref __offline_pages(unsigned long start_pfn, return -EINVAL; /* This makes hotplug much easier...and readable. we assume this for now. .*/ - if (!test_pages_in_a_zone(start_pfn, end_pfn)) + if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) return -EINVAL; - zone = page_zone(pfn_to_page(start_pfn)); + zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; diff --git a/kernel/mm/mempolicy.c b/kernel/mm/mempolicy.c index 87a177917..a4217fe60 100644 --- a/kernel/mm/mempolicy.c +++ b/kernel/mm/mempolicy.c @@ -2006,8 +2006,8 @@ retry_cpuset: nmask = policy_nodemask(gfp, pol); zl = policy_zonelist(gfp, pol, node); - mpol_cond_put(pol); page = __alloc_pages_nodemask(gfp, order, zl, nmask); + mpol_cond_put(pol); out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; diff --git a/kernel/mm/migrate.c b/kernel/mm/migrate.c index 6d17e0ab4..72c09dea6 100644 --- a/kernel/mm/migrate.c +++ b/kernel/mm/migrate.c @@ -429,6 +429,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } +EXPORT_SYMBOL(migrate_page_move_mapping); /* * The expected number of remaining references is the same as that @@ -579,6 +580,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } +EXPORT_SYMBOL(migrate_page_copy); /************************************************************ * Migration functions @@ -963,7 +965,13 @@ out: dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); /* Soft-offlined page shouldn't go through lru cache list */ - if (reason == MR_MEMORY_FAILURE) { + if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { + /* + * With this release, we free successfully migrated + * page and set PG_HWPoison on just freed page + * intentionally. Although it's rather weird, it's how + * HWPoison flag works at the moment. 
+ */ put_page(page); if (!test_set_page_hwpoison(page)) num_poisoned_pages_inc(); diff --git a/kernel/mm/page-writeback.c b/kernel/mm/page-writeback.c index d15d88c8e..fd51ebfc4 100644 --- a/kernel/mm/page-writeback.c +++ b/kernel/mm/page-writeback.c @@ -359,8 +359,9 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; - unsigned long ratio = vm_dirty_ratio; - unsigned long bg_ratio = dirty_background_ratio; + /* convert ratios to per-PAGE_SIZE for higher precision */ + unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; + unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; @@ -372,26 +373,28 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against - * globally available memory. + * globally available memory. As the ratios are in + * per-PAGE_SIZE, they can be obtained by dividing bytes by + * number of pages. */ if (bytes) - ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / - global_avail, 100UL); + ratio = min(DIV_ROUND_UP(bytes, global_avail), + PAGE_SIZE); if (bg_bytes) - bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 / - global_avail, 100UL); + bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), + PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - thresh = (ratio * available_memory) / 100; + thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - bg_thresh = (bg_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; if (bg_thresh >= thresh) bg_thresh = thresh / 2; @@ -1899,7 +1902,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(gdtc->wb, gdtc->bg_thresh)) return true; if (mdtc) { @@ -1913,7 +1917,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc)) + if (wb_stat(wb, WB_RECLAIMABLE) > + wb_calc_thresh(mdtc->wb, mdtc->bg_thresh)) return true; } diff --git a/kernel/mm/page_alloc.c b/kernel/mm/page_alloc.c index d002418fc..abb0cdb5b 100644 --- a/kernel/mm/page_alloc.c +++ b/kernel/mm/page_alloc.c @@ -288,7 +288,9 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat) /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { - if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn) + int nid = early_pfn_to_nid(pfn); + + if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) return true; return false; @@ -675,34 +677,28 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - unsigned int max_order = MAX_ORDER; + unsigned int max_order; + + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (is_migrate_isolate(migratetype)) { - /* - * We restrict max 
order of merging to prevent merge - * between freepages on isolate pageblock and normal - * pageblock. Without this, pageblock isolation - * could cause incorrect freepage accounting. - */ - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); - } else { + if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - } - page_idx = pfn & ((1 << max_order) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); +continue_merging: while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) - break; + goto done_merging; /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -719,6 +715,32 @@ static inline void __free_one_page(struct page *page, page_idx = combined_idx; order++; } + if (max_order < MAX_ORDER) { + /* If we are here, it means order is >= pageblock_order. + * We want to prevent merge between freepages on isolate + * pageblock and normal pageblock. Without this, pageblock + * isolation could cause incorrect freepage or CMA accounting. + * + * We don't want to hit this code for the more frequent + * low-order merging. + */ + if (unlikely(has_isolate_pageblock(zone))) { + int buddy_mt; + + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (is_migrate_isolate(migratetype) || + is_migrate_isolate(buddy_mt))) + goto done_merging; + } + max_order++; + goto continue_merging; + } + +done_merging: set_page_order(page, order); /* @@ -969,7 +991,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. 
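The domain_dirty_limits() hunk above keeps ratios in units of 1/PAGE_SIZE rather than whole percent, which matters when a byte-based limit is scaled against a memcg's available memory: anything below 1% used to truncate to zero. A quick arithmetic comparison with illustrative numbers:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* Illustrative domain: 16 GiB available, vm.dirty_bytes = 16 MiB. */
	unsigned long global_avail = 4UL << 20;		/* in pages */
	unsigned long bytes = 16UL << 20;

	/* Old conversion: bytes -> whole percent -> threshold (truncates to 0). */
	unsigned long old_ratio = DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / global_avail;
	unsigned long old_thresh = old_ratio * global_avail / 100;

	/* New conversion: the per-PAGE_SIZE ratio preserves the small limit. */
	unsigned long new_ratio = DIV_ROUND_UP(bytes, global_avail);
	unsigned long new_thresh = new_ratio * global_avail / PAGE_SIZE;

	printf("old threshold: %lu pages, new threshold: %lu pages (%lu MiB)\n",
	       old_thresh, new_thresh, new_thresh * PAGE_SIZE >> 20);
	return 0;
}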
*/ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -1075,7 +1097,7 @@ int __meminit early_pfn_to_nid(unsigned long pfn) spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid < 0) - nid = 0; + nid = first_online_node; spin_unlock(&early_pfn_lock); return nid; @@ -5735,15 +5757,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) sizeof(arch_zone_lowest_possible_pfn)); memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); - arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; - for (i = 1; i < MAX_NR_ZONES; i++) { + + start_pfn = find_min_pfn_with_active_regions(); + + for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - arch_zone_lowest_possible_pfn[i] = - arch_zone_highest_possible_pfn[i-1]; - arch_zone_highest_possible_pfn[i] = - max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); + + end_pfn = max(max_zone_pfn[i], start_pfn); + arch_zone_lowest_possible_pfn[i] = start_pfn; + arch_zone_highest_possible_pfn[i] = end_pfn; + + start_pfn = end_pfn; } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; @@ -6235,7 +6260,7 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_wmark_min) +core_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so diff --git a/kernel/mm/page_isolation.c b/kernel/mm/page_isolation.c index 4568fd58f..00c96462c 100644 --- a/kernel/mm/page_isolation.c +++ b/kernel/mm/page_isolation.c @@ -283,11 +283,11 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * now as a simple work-around, we use the next node for destination. */ if (PageHuge(page)) { - nodemask_t src = nodemask_of_node(page_to_nid(page)); - nodemask_t dst; - nodes_complement(dst, src); + int node = next_online_node(page_to_nid(page)); + if (node == MAX_NUMNODES) + node = first_online_node; return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node(page_to_nid(page), dst)); + node); } if (PageHighMem(page)) diff --git a/kernel/mm/percpu.c b/kernel/mm/percpu.c index 8a943b97a..1f376bce4 100644 --- a/kernel/mm/percpu.c +++ b/kernel/mm/percpu.c @@ -110,7 +110,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct work_struct map_extend_work;/* async ->map[] extension */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -160,10 +160,13 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* chunks which need their map areas extended, protected by pcpu_lock */ +static LIST_HEAD(pcpu_map_extend_chunks); + /* * The number of empty populated pages, protected by pcpu_lock. 
The * reserved chunk doesn't contribute to the count. @@ -397,13 +400,19 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { int margin, new_alloc; + lockdep_assert_held(&pcpu_lock); + if (is_atomic) { margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && - pcpu_async_enabled) - schedule_work(&chunk->map_extend_work); + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { + if (list_empty(&chunk->map_extend_list)) { + list_add_tail(&chunk->map_extend_list, + &pcpu_map_extend_chunks); + pcpu_schedule_balance_work(); + } + } } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; } @@ -437,6 +446,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; + lockdep_assert_held(&pcpu_alloc_mutex); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -469,20 +480,6 @@ out_unlock: return 0; } -static void pcpu_map_extend_workfn(struct work_struct *work) -{ - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, - map_extend_work); - int new_alloc; - - spin_lock_irq(&pcpu_lock); - new_alloc = pcpu_need_to_extend(chunk, false); - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); -} - /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -742,7 +739,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&chunk->map_extend_list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -897,6 +894,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } + if (!is_atomic) + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -969,12 +969,9 @@ restart: if (is_atomic) goto fail; - mutex_lock(&pcpu_alloc_mutex); - if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { - mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } @@ -985,7 +982,6 @@ restart: spin_lock_irqsave(&pcpu_lock, flags); } - mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: @@ -995,8 +991,6 @@ area_found: if (!is_atomic) { int page_start, page_end, rs, re; - mutex_lock(&pcpu_alloc_mutex); - page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); @@ -1007,7 +1001,6 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; @@ -1047,6 +1040,8 @@ fail: /* see the flag handling in pcpu_blance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); + } else { + mutex_unlock(&pcpu_alloc_mutex); } return NULL; } @@ -1131,6 +1126,7 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; + list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1148,6 +1144,25 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* service chunks which requested async area map extension */ + do { + int new_alloc = 0; + + spin_lock_irq(&pcpu_lock); + + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, + struct pcpu_chunk, map_extend_list); + if 
(chunk) { + list_del_init(&chunk->map_extend_list); + new_alloc = pcpu_need_to_extend(chunk, false); + } + + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); + } while (chunk); + /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic @@ -1646,7 +1661,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&schunk->map_extend_list); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1675,7 +1690,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&dchunk->map_extend_list); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); diff --git a/kernel/mm/shmem.c b/kernel/mm/shmem.c index ea5a70cfc..1b11ccc0a 100644 --- a/kernel/mm/shmem.c +++ b/kernel/mm/shmem.c @@ -2153,9 +2153,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, NULL); if (error) { /* Remove the !PageUptodate pages we added */ - shmem_undo_range(inode, - (loff_t)start << PAGE_CACHE_SHIFT, - (loff_t)index << PAGE_CACHE_SHIFT, true); + if (index > start) { + shmem_undo_range(inode, + (loff_t)start << PAGE_CACHE_SHIFT, + ((loff_t)index << PAGE_CACHE_SHIFT) - 1, true); + } goto undone; } diff --git a/kernel/mm/slab_common.c b/kernel/mm/slab_common.c index 3c6a86b4e..bec2fce9f 100644 --- a/kernel/mm/slab_common.c +++ b/kernel/mm/slab_common.c @@ -521,8 +521,8 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); - cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - css->id, memcg_name_buf); + cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name, + css->serial_nr, memcg_name_buf); if (!cache_name) goto out_unlock; diff --git a/kernel/mm/slub.c b/kernel/mm/slub.c index d304d8802..feb4a445a 100644 --- a/kernel/mm/slub.c +++ b/kernel/mm/slub.c @@ -2881,6 +2881,7 @@ struct detached_freelist { void *tail; void *freelist; int cnt; + struct kmem_cache *s; }; /* @@ -2895,8 +2896,9 @@ struct detached_freelist { * synchronization primitive. Look ahead in the array is limited due * to performance reasons. 
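The slub.c changes in this patch make the detached freelist remember its own cache (df->s) and batch together objects that share a slab page. A simplified userspace model of the grouping idea only: objects are attributed to a "page" by masking their address, and one page's worth is freed per pass (page size, lookahead and names are all illustrative):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_MASK (~(uintptr_t)0xfff)	/* pretend 4 KiB slab pages */

static uintptr_t page_of(void *p)
{
	return (uintptr_t)p & TOY_PAGE_MASK;
}

/*
 * One pass: take the last unprocessed object, collect every other object
 * from the same page into the batch, and mark them processed (NULL).
 * Real slub additionally links the batch into a freelist and frees it with
 * one locked operation per page.
 */
static size_t free_one_page_batch(size_t size, void **p)
{
	size_t i, batch = 1;
	void *first = NULL;

	while (size && !first)
		first = p[--size];		/* skip already-processed slots */
	if (!first)
		return 0;
	p[size] = NULL;

	for (i = 0; i <= size; i++) {
		if (p[i] && page_of(p[i]) == page_of(first)) {
			p[i] = NULL;		/* object processed */
			batch++;
		}
	}
	printf("freed %zu object(s) from page %#lx\n",
	       batch, (unsigned long)page_of(first));
	return size + 1;			/* slots still worth scanning */
}

int main(void)
{
	char *a = aligned_alloc(4096, 4096), *b = aligned_alloc(4096, 4096);
	void *objs[] = { a, a + 64, b, a + 128, b + 64 };
	size_t remaining = 5;

	while (remaining)
		remaining = free_one_page_batch(remaining, objs);
	free(a);
	free(b);
	return 0;
}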
*/ -static int build_detached_freelist(struct kmem_cache *s, size_t size, - void **p, struct detached_freelist *df) +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { size_t first_skipped_index = 0; int lookahead = 3; @@ -2912,8 +2914,11 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, if (!object) return 0; + /* Support for memcg, compiler can optimize this out */ + df->s = cache_from_obj(s, object); + /* Start new detached freelist */ - set_freepointer(s, object, NULL); + set_freepointer(df->s, object, NULL); df->page = virt_to_head_page(object); df->tail = object; df->freelist = object; @@ -2928,7 +2933,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* df->page is always set at this point */ if (df->page == virt_to_head_page(object)) { /* Opportunity build freelist */ - set_freepointer(s, object, df->freelist); + set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; p[size] = NULL; /* mark object processed */ @@ -2947,25 +2952,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, return first_skipped_index; } - /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; - struct kmem_cache *s; - - /* Support for memcg */ - s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) continue; - slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); diff --git a/kernel/mm/swap.c b/kernel/mm/swap.c index ca194aeb4..ad1664922 100644 --- a/kernel/mm/swap.c +++ b/kernel/mm/swap.c @@ -821,9 +821,15 @@ void lru_add_drain_cpu(int cpu) unsigned long flags; /* No harm done if a racing interrupt already did this */ +#ifdef CONFIG_PREEMPT_RT_BASE + local_lock_irqsave_on(rotate_lock, flags, cpu); + pagevec_move_tail(pvec); + local_unlock_irqrestore_on(rotate_lock, flags, cpu); +#else local_lock_irqsave(rotate_lock, flags); pagevec_move_tail(pvec); local_unlock_irqrestore(rotate_lock, flags); +#endif } pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); @@ -866,12 +872,32 @@ void lru_add_drain(void) local_unlock_cpu(swapvec_lock); } + +#ifdef CONFIG_PREEMPT_RT_BASE +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work) +{ + local_lock_on(swapvec_lock, cpu); + lru_add_drain_cpu(cpu); + local_unlock_on(swapvec_lock, cpu); +} + +#else + static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); } static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); +static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work) +{ + struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); + + INIT_WORK(work, lru_add_drain_per_cpu); + schedule_work_on(cpu, work); + cpumask_set_cpu(cpu, has_work); +} +#endif void lru_add_drain_all(void) { @@ -884,20 +910,17 @@ void lru_add_drain_all(void) cpumask_clear(&has_work); for_each_online_cpu(cpu) { - struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); - if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - 
need_activate_page_drain(cpu)) { - INIT_WORK(work, lru_add_drain_per_cpu); - schedule_work_on(cpu, work); - cpumask_set_cpu(cpu, &has_work); - } + need_activate_page_drain(cpu)) + remote_lru_add_drain(cpu, &has_work); } +#ifndef CONFIG_PREEMPT_RT_BASE for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); +#endif put_online_cpus(); mutex_unlock(&lock); diff --git a/kernel/mm/swapfile.c b/kernel/mm/swapfile.c index 58877312c..c1a0f3dea 100644 --- a/kernel/mm/swapfile.c +++ b/kernel/mm/swapfile.c @@ -2225,6 +2225,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, swab32s(&swap_header->info.version); swab32s(&swap_header->info.last_page); swab32s(&swap_header->info.nr_badpages); + if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) + return 0; for (i = 0; i < swap_header->info.nr_badpages; i++) swab32s(&swap_header->info.badpages[i]); } diff --git a/kernel/mm/util.c b/kernel/mm/util.c index 9af1c12b3..d5259b62f 100644 --- a/kernel/mm/util.c +++ b/kernel/mm/util.c @@ -199,36 +199,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). - */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { diff --git a/kernel/mm/vmscan.c b/kernel/mm/vmscan.c index 2aec4241b..bfc5050cb 100644 --- a/kernel/mm/vmscan.c +++ b/kernel/mm/vmscan.c @@ -277,6 +277,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -298,7 +299,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; - } + next_deferred = nr; + } else + next_deferred = total_scan; /* * We need to avoid excessive windup on filesystem shrinkers @@ -355,17 +358,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; + scanned += nr_to_scan; cond_resched(); } + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. 
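The do_shrink_slab() hunk above replaces "put the remaining total_scan back" with explicit next_deferred bookkeeping: whatever was pending but not actually scanned carries over to the next invocation and is clamped at zero. The arithmetic in isolation, with made-up numbers:

#include <stdio.h>

/* Carry over the unscanned part of (deferred + newly requested) work. */
static long carry_over(long next_deferred, long scanned)
{
	return next_deferred >= scanned ? next_deferred - scanned : 0;
}

int main(void)
{
	long nr_deferred = 300;	/* left over from earlier shrinker calls */
	long requested = 200;	/* new work computed for this call */
	long scanned = 450;	/* what the scan loop actually processed */

	printf("new nr_deferred = %ld\n",
	       carry_over(nr_deferred + requested, scanned));

	/* Scanning more than was pending must not go negative. */
	printf("new nr_deferred = %ld\n", carry_over(100, 450));
	return 0;
}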
*/ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); @@ -2159,23 +2167,6 @@ out: } } -#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH -static void init_tlb_ubc(void) -{ - /* - * This deliberately does not clear the cpumask as it's expensive - * and unnecessary. If there happens to be data in there then the - * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and - * then will be cleared. - */ - current->tlb_ubc.flush_required = false; -} -#else -static inline void init_tlb_ubc(void) -{ -} -#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -2210,8 +2201,6 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); - init_tlb_ubc(); - blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2534,7 +2523,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask |= __GFP_HIGHMEM; for_each_zone_zonelist_nodemask(zone, z, zonelist, - requested_highidx, sc->nodemask) { + gfp_zone(sc->gfp_mask), sc->nodemask) { enum zone_type classzone_idx; if (!populated_zone(zone)) @@ -2929,7 +2918,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.may_writepage, sc.gfp_mask); + current->flags |= PF_MEMALLOC; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + current->flags &= ~PF_MEMALLOC; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); diff --git a/kernel/mm/workingset.c b/kernel/mm/workingset.c index 263d01947..6db7b243f 100644 --- a/kernel/mm/workingset.c +++ b/kernel/mm/workingset.c @@ -342,21 +342,19 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - - BUG_ON(!node->count); - BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + BUG_ON(!workingset_node_shadows(node)); + BUG_ON(workingset_node_pages(node)); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); node->slots[i] = NULL; - BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + workingset_node_shadows_dec(node); BUG_ON(!mapping->nrshadows); mapping->nrshadows--; } } - BUG_ON(node->count); + BUG_ON(workingset_node_shadows(node)); inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); if (!__radix_tree_delete_node(&mapping->page_tree, node)) BUG(); diff --git a/kernel/mm/zsmalloc.c b/kernel/mm/zsmalloc.c index 18cc59fb1..529552c37 100644 --- a/kernel/mm/zsmalloc.c +++ b/kernel/mm/zsmalloc.c @@ -64,6 +64,7 @@ #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> +#include <linux/locallock.h> /* * This must be power of 2 and greater than of equal to sizeof(link_free). 
@@ -403,6 +404,7 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); +static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock); static int is_first_page(struct page *page) { @@ -1289,7 +1291,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, class = pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = per_cpu_ptr(&zs_map_area, get_cpu_light()); + area = &get_locked_var(zs_map_area_lock, zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1342,7 +1344,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_light(); + put_locked_var(zs_map_area_lock, zs_map_area); unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1732,10 +1734,13 @@ static struct page *isolate_source_page(struct size_class *class) static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); - obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - - zs_stat_get(class, OBJ_USED); + if (obj_allocated <= obj_used) + return 0; + obj_wasted = obj_allocated - obj_used; obj_wasted /= get_maxobj_per_zspage(class->size, class->pages_per_zspage); diff --git a/kernel/mm/zswap.c b/kernel/mm/zswap.c index bf14508af..45476f429 100644 --- a/kernel/mm/zswap.c +++ b/kernel/mm/zswap.c @@ -78,7 +78,13 @@ static u64 zswap_duplicate_entry; /* Enable/disable zswap (disabled by default) */ static bool zswap_enabled; -module_param_named(enabled, zswap_enabled, bool, 0644); +static int zswap_enabled_param_set(const char *, + const struct kernel_param *); +static struct kernel_param_ops zswap_enabled_param_ops = { + .set = zswap_enabled_param_set, + .get = param_get_bool, +}; +module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" @@ -170,10 +176,15 @@ static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); +/* pool counter to provide unique names to zpool */ +static atomic_t zswap_pools_count = ATOMIC_INIT(0); /* used by param callback function */ static bool zswap_init_started; +/* fatal error during init */ +static bool zswap_init_failed; + /********************************* * helpers and fwd declarations **********************************/ @@ -565,6 +576,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; + char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); @@ -573,7 +585,10 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) return NULL; } - pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops); + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + + pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops); if (!pool->zpool) { pr_err("%s zpool not available\n", 
type); goto error; @@ -696,6 +711,11 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *s = strstrip((char *)val); int ret; + if (zswap_init_failed) { + pr_err("can't set param, initialization failed\n"); + return -ENODEV; + } + /* no change required */ if (!strcmp(s, *(char **)kp->arg)) return 0; @@ -775,6 +795,17 @@ static int zswap_zpool_param_set(const char *val, return __zswap_param_set(val, kp, NULL, zswap_compressor); } +static int zswap_enabled_param_set(const char *val, + const struct kernel_param *kp) +{ + if (zswap_init_failed) { + pr_err("can't enable, initialization failed\n"); + return -ENODEV; + } + + return param_set_bool(val, kp); +} + /********************************* * writeback code **********************************/ @@ -1261,6 +1292,9 @@ pool_fail: dstmem_fail: zswap_entry_cache_destroy(); cache_fail: + /* if built-in, we aren't unloaded on failure; don't allow use */ + zswap_init_failed = true; + zswap_enabled = false; return -ENOMEM; } /* must be late so crypto has time to come up */ |
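
The zs_can_compact() hunk in kernel/mm/zsmalloc.c above exists because OBJ_ALLOCATED and OBJ_USED are sampled separately, so a racing free can briefly make the used count exceed the allocated count; without the added early return the unsigned subtraction wraps around and compaction sees an enormous bogus obj_wasted. The do_shrink_slab() change in kernel/mm/vmscan.c applies the same clamp when subtracting scanned from next_deferred. The snippet below is a minimal userspace sketch of that guard, not kernel code; the function name and sample values are made up for illustration.

#include <stdio.h>

/*
 * Stand-in for the zsmalloc class statistics: under concurrent updates a
 * reader can momentarily observe obj_used > obj_allocated.
 */
static unsigned long wasted_objects(unsigned long obj_allocated,
                                    unsigned long obj_used)
{
        /* The guard added by the patch: treat a racy negative result as 0. */
        if (obj_allocated <= obj_used)
                return 0;
        return obj_allocated - obj_used;
}

int main(void)
{
        unsigned long allocated = 100, used = 102;      /* racy snapshot */

        /* Without the guard, 100UL - 102UL wraps to a huge value. */
        printf("unguarded: %lu\n", allocated - used);
        printf("guarded:   %lu\n", wasted_objects(allocated, used));
        return 0;
}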