diff options
Diffstat (limited to 'kernel/mm/memory.c')
-rw-r--r-- | kernel/mm/memory.c | 90 |
1 files changed, 68 insertions, 22 deletions
diff --git a/kernel/mm/memory.c b/kernel/mm/memory.c index 3fc6efd10..b80bf4746 100644 --- a/kernel/mm/memory.c +++ b/kernel/mm/memory.c @@ -61,6 +61,7 @@ #include <linux/string.h> #include <linux/dma-debug.h> #include <linux/debugfs.h> +#include <linux/userfaultfd_k.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) #ifdef HAVE_GENERIC_MMU_GATHER -static int tlb_next_batch(struct mmu_gather *tlb) +static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; - return 1; + return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return 0; + return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) - return 0; + return false; tlb->batch_count++; batch->next = NULL; @@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb) tlb->active->next = batch; tlb->active = batch; - return 1; + return true; } /* tlb_gather_mmu @@ -2081,11 +2082,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, goto oom; cow_user_page(new_page, old_page, address, vma); } - __SetPageUptodate(new_page); if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; + __SetPageUptodate(new_page); + mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* @@ -2684,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } goto setpte; } @@ -2693,6 +2701,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_zeroed_user_highpage_movable(vma, address); if (!page) goto oom; + + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + goto oom_free_page; + /* * The memory barrier inside __SetPageUptodate makes sure that * preceeding stores to the page contents become visible before @@ -2700,9 +2712,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) - goto oom_free_page; - entry = mk_pte(page, vma->vm_page_prot); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -2711,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); mem_cgroup_commit_charge(page, memcg, false); @@ -3214,6 +3232,27 @@ out: return 0; } +static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + if (vma_is_anonymous(vma)) + return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd, + unsigned int flags) +{ + if (vma_is_anonymous(vma)) + return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); + if (vma->vm_ops->pmd_fault) + return vma->vm_ops->pmd_fault(vma, address, pmd, flags); + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3249,12 +3288,12 @@ static int handle_pte_fault(struct mm_struct *mm, barrier(); if (!pte_present(entry)) { if (pte_none(entry)) { - if (vma->vm_ops) + if (vma_is_anonymous(vma)) + return do_anonymous_page(mm, vma, address, + pte, pmd, flags); + else return do_fault(mm, vma, address, pte, pmd, flags, entry); - - return do_anonymous_page(mm, vma, address, pte, pmd, - flags); } return do_swap_page(mm, vma, address, pte, pmd, flags, entry); @@ -3316,10 +3355,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!pmd) return VM_FAULT_OOM; if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { - int ret = VM_FAULT_FALLBACK; - if (!vma->vm_ops) - ret = do_huge_pmd_anonymous_page(mm, vma, address, - pmd, flags); + int ret = create_huge_pmd(mm, vma, address, pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { @@ -3343,8 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, orig_pmd, pmd); if (dirty && !pmd_write(orig_pmd)) { - ret = do_huge_pmd_wp_page(mm, vma, address, pmd, - orig_pmd); + ret = wp_huge_pmd(mm, vma, address, pmd, + orig_pmd, flags); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { @@ -3363,8 +3399,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(pmd_none(*pmd)) && unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd + * didn't become pmd_trans_huge under us and then back to pmd_none, as + * a result of MADV_DONTNEED running immediately after a huge pmd fault + * in a different thread of this mm, in turn leading to a misleading + * pmd_trans_huge() retval. All we have to ensure is that it is a + * regular pmd that we can walk with pte_offset_map() and we can do that + * through an atomic read in C, which is what pmd_trans_unstable() + * provides. + */ + if (unlikely(pmd_trans_unstable(pmd))) return 0; /* * A regular pmd is established and it can't morph into a huge pmd @@ -3730,7 +3776,7 @@ void print_vma_addr(char *prefix, unsigned long ip) if (buf) { char *p; - p = d_path(&f->f_path, buf, PAGE_SIZE); + p = file_path(f, buf, PAGE_SIZE); if (IS_ERR(p)) p = "?"; printk("%s%s[%lx+%lx]", prefix, kbasename(p), |